00001 #ifndef FW_ANALYZER_H 00002 #define FW_ANALYZER_H 00003 00004 //-------------------------------------------------------------------- 00005 // 00006 // This file is part of PEACE. 00007 // 00008 // PEACE is free software: you can redistribute it and/or modify it 00009 // under the terms of the GNU General Public License as published by 00010 // the Free Software Foundation, either version 3 of the License, or 00011 // (at your option) any later version. 00012 // 00013 // PEACE is distributed in the hope that it will be useful, but 00014 // WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 // General Public License for more details. 00017 // 00018 // You should have received a copy of the GNU General Public License 00019 // along with PEACE. If not, see <http://www.gnu.org/licenses/>. 00020 // 00021 // Miami University makes no representations or warranties about the 00022 // suitability of the software, either express or implied, including 00023 // but not limited to the implied warranties of merchantability, 00024 // fitness for a particular purpose, or non-infringement. Miami 00025 // University shall not be liable for any damages suffered by licensee 00026 // as a result of using, result of using, modifying or distributing 00027 // this software or its derivatives. 00028 // 00029 // By using or copying this Software, Licensee agrees to abide by the 00030 // intellectual property laws, and all other applicable laws of the 00031 // U.S., and the terms of GNU General Public License (version 3). 00032 // 00033 // Authors: Dhananjai M. Rao raodm@muohio.edu 00034 // 00035 //--------------------------------------------------------------------- 00036 00037 #include "ESTAnalyzer.h" 00038 #include <string> 00039 #include <vector> 00040 00041 // Forward declaration to keep compiler happy 00042 class EST; 00043 class ResultLog; 00044 00045 /** FWAnalyzer: Frame and Word based Analyzer. 00046 00047 <p>This analyzer provides a common base class for all EST 00048 analyzers that use a frame and word concept for analyzing ESTs. 00049 The total number of base pairs to be compared is called a Frame. 00050 A frame is broken into a sequence of fixed size (in bp) words. 00051 The frame size and word size (in terms of number of base pairs) is 00052 specified as command line arguments. 00053 00054 <p>This class has been implemented by extending the ESTAnalyzer 00055 base class. The ESTAnalyzer base class provides most of the 00056 standard functionality involved in reading FASTA files and 00057 generating formatted output. This class adds functionality to 00058 compare EST's using the concept of frames and words</p> 00059 00060 \note This class is never directly instantiated. Instead, one of 00061 the derived classes are instantiated (via the 00062 ESTAnalyzerFactory::create) method and used. 00063 */ 00064 class FWAnalyzer : public ESTAnalyzer { 00065 public: 00066 /** The destructor. 00067 00068 The destructor frees up all any dynamic memory allocated by 00069 this object for its operations. 00070 */ 00071 virtual ~FWAnalyzer(); 00072 00073 /** Obtains a frame number of bp from a given EST sequence. 00074 00075 This method is used to obtain a frame number of base pairs 00076 from a given EST sequence. The frame is extracted either from 00077 the beginning or the end of an EST depending on the start 00078 parameter. 00079 00080 \param[in] est The EST from which a frame number of bp must be 00081 extracted. 00082 00083 \param[in] start If this flag is \c true then this method 00084 extracts base pairs from the beginning of the EST 00085 sequence. Otherwise a frame is extracted from the end of the 00086 EST sequence. 00087 */ 00088 std::string getFrame(const EST* est, bool start = true); 00089 00090 /** Display valid command line arguments for this analyzer. 00091 00092 This method must be used to display all valid command line 00093 options that are supported by this analyzer. Note that 00094 derived classes may override this method to display additional 00095 command line options that are applicable to it. This method 00096 is typically used in the main() method when displaying usage 00097 information. 00098 00099 \note Derived EST analyzer classes <b>must</b> override this 00100 method to display help for their custom command line 00101 arguments. When this method is overridden don't forget to 00102 call the corresponding base class implementation to display 00103 common options. 00104 00105 \param[out] os The output stream to which the valid command 00106 line arguments must be written. 00107 */ 00108 virtual void showArguments(std::ostream& os); 00109 00110 /** Process command line arguments. 00111 00112 This method is used to process command line arguments specific 00113 to this EST analyzer. This method is typically used from the 00114 main method just after the EST analyzer has been instantiated. 00115 This method consumes all valid command line arguments. If the 00116 command line arguments were valid and successfully processed, 00117 then this method returns \c true. 00118 00119 \note Derived EST analyzer classes <b>must</b> override this 00120 method to process any command line arguments that are custom 00121 to their operation. When this method is overridden don't 00122 forget to call the corresponding base class implementation to 00123 display common options. 00124 00125 \param[in,out] argc The number of command line arguments to be 00126 processed. 00127 00128 \param[in,out] argv The array of command line arguments. 00129 00130 \return This method returns \c true if the command line 00131 arguments were successfully processed. Otherwise this method 00132 returns \c false. This method checks to ensure that a valid 00133 frame size and a valid word size have been specified. 00134 */ 00135 virtual bool parseArguments(int& argc, char **argv); 00136 00137 /** Method to begin EST analysis. 00138 00139 <p>This method is invoked just before commencement of EST 00140 analysis. This method loads the list of ESTs from a given 00141 input multi-FASTA file and pouplates the list of ESTs.</p> 00142 00143 \return If the ESTs were successfully loaded from the FATA 00144 file then this method returns 0. Otherwise this method 00145 returns with a non-zero error code. 00146 */ 00147 virtual int initialize(); 00148 00149 /** Set the reference EST id for analysis. 00150 00151 This method is invoked just before a batch of ESTs are 00152 analyzed via a call to the analyze(EST *) method. This method 00153 extracts the start frame from the reference EST and sets it in 00154 the referenceFrame instance variable in this class. 00155 00156 \note This method must be called only after the initialize() 00157 method is called. 00158 00159 \return If the extraction of the reference EST frame was 00160 successful, then this method returns 0. Otherwise this method 00161 returns an error code. 00162 */ 00163 virtual int setReferenceEST(const int estIdx); 00164 00165 /** Method to begin EST analysis. 00166 00167 This method is used to perform the core tasks of EST analysis 00168 for all FWAnalyzer classes. This method operates in the 00169 following manner: 00170 00171 <ol> 00172 00173 <li>First it loads the necessary EST information from the 00174 supplied FASTA file using the initialize() method. If the EST 00175 data is not successfully loaded then this method returns right 00176 away with 1.<li> 00177 00178 <li>Upon successfully loading the EST data, the reference EST 00179 is set via the setReferenceEST() method. If the reference EST 00180 is not correctly determined, then this method immediately 00181 returns with 2.</li> 00182 00183 <li>For each EST in the list of ESTs it performs the following 00184 tasks: 00185 00186 <ol> <li> It extracts a frame from the reference EST and 00187 current EST. <li> 00188 00189 <li> Next, it calls the polymorphic analyze() method to obtain 00190 similarity metric. </li> 00191 00192 <li>It logs the similarity metric using suitable methods in 00193 the ESTAnalyzer base class.<li> 00194 00195 </ol> 00196 00197 <li>If all the processing proceeds successfully, this method 00198 returns 0 (zero). 00199 00200 </ol> 00201 00202 \return This method returns zero if all the processing 00203 proceeded successfully. On errors this method returns a 00204 non-zero value. 00205 */ 00206 virtual int analyze(); 00207 00208 /** Determine preferred dummy EST lengths to be used with this 00209 analyzer. 00210 00211 \note For more detailed description of the motivation for 00212 dummy ESTs please refer to the documentation for the 00213 corresponding method in the base class -- 00214 getPreferredDummyESTLength(). 00215 00216 \return This method overrides the default implementation in 00217 the base class to return twice the length of the frame (aka 00218 window) size. 00219 */ 00220 virtual int getPreferredDummyESTLength() const 00221 { return frameSize * 2; } 00222 00223 protected: 00224 /** Analyze and obtain a similarity metric. 00225 00226 This method can be used to compare a given EST with the 00227 reference EST (set via the call to the setReferenceEST()) 00228 method. 00229 00230 \param[in] otherEST The index (zero based) of the EST with 00231 which the reference EST is to be compared. 00232 00233 \return This method must returns a similarity metric by 00234 comparing the ESTs by calling the analyze() method. 00235 */ 00236 virtual float getMetric(const int otherEST); 00237 00238 /** Method to compare two frames and compute similarity. 00239 00240 This method must be overridden by derived Frame-Word analyzers 00241 (see FMWSCA.h) to compare two frames and report a similarity 00242 metric. 00243 00244 \param[in] refFrame The reference frame for comparison 00245 purposes. Note that the reference frame is always a constant 00246 in a given set of caparisons. Consequently, certain analyzers 00247 can pre-compute and reuse metrics to make analysis fast. 00248 00249 \param[in] otherFrame The other frame for comparison. This 00250 frame is always guaranteed to be from a different EST than the 00251 refFrame. 00252 00253 \param[in] wordSize The size of a word within the given frame. 00254 This value is always greater than 0 (zero) and less than frame 00255 size. 00256 00257 \return This method is expected to return a similarity metric 00258 between the given frame and the refFrame. 00259 00260 \note The default implementation of this method simply returns 00261 0. Derived FWAnalyzer-based classes must override this method 00262 to perform the necessary operations. 00263 */ 00264 virtual float analyzeFrame(const std::string& refFrame, 00265 const std::string& otherFrame, 00266 const int wordSize); 00267 00268 /** Helper method to dump result log header. 00269 00270 This is a helper method that is invoked from the analyze() 00271 method to dump a result log header. This method was 00272 introduced to keep the code clustter in the analyze method to 00273 a minimum. 00274 00275 This method dumps some of the analysis parameters to the 00276 supplied log. 00277 00278 \param[out] log The log to which the header is to be dumped. 00279 00280 \param[in] mean The overall mean similarity for this set of 00281 ESTs. 00282 00283 \param[in] variance The overall variance in similarity for the 00284 given set of ESTs currently analyzed. 00285 */ 00286 virtual void dumpHeader(ResultLog& log, const double mean, 00287 const double variance); 00288 00289 /** Helper method to dump post analysis EST list to a log. 00290 00291 This is a helper method that is invoked from the analyze() 00292 method to dump the list of analyzed ESTs to a log. This 00293 method was introduced to keep the code clustter in the analyze 00294 method to a minimum. In addition, it provides the derived 00295 classes a chance to customize the working of the class. 00296 00297 This method dumps the list of ESTs to the supplied log. 00298 00299 \param[in] estList The list of ESTs that must be dumped out. 00300 00301 \param[in] refEST The reference EST. 00302 00303 \param[out] log The log to which the header is to be dumped. 00304 */ 00305 virtual void dumpESTList(const std::vector<EST*>& estList, 00306 const EST* refEST, 00307 ResultLog& log); 00308 00309 /** Dumps a given EST in 3-column format using R. 00310 00311 This method is a helper method that dumps a given EST out to 00312 the log. 00313 00314 \param[out] log The log to which the EST is to be dumped. 00315 00316 \param[in] est The EST to be dumped. This parameter is never 00317 NULL. 00318 00319 \param[in] isReference If this flag is true, then this EST is 00320 the reference EST to be dumped out. 00321 */ 00322 virtual void dumpEST(ResultLog& log, const EST* est, 00323 const bool isReference = false); 00324 00325 /** The frame size to be used by this analyzer. 00326 00327 The frame size (in bp) that must be used for comparisons. The 00328 default value is set to 0. However, the value is changed by 00329 the parseArguments method depending on the actual value 00330 specified by the user. 00331 */ 00332 int frameSize; 00333 00334 /** The frame size supplied by the user as command line input. 00335 00336 A separate, static frame size variable so that it can be 00337 assigned as a command line argument. 00338 00339 /note Deriving analyzers are not in any way required to 00340 utilize the user-supplied frame size -- see specific 00341 analyzer classes for those details. 00342 */ 00343 static int argumentFrameSize; 00344 00345 /** The word size to be used by this analyzer. 00346 00347 The word size (in bp) that must be used for comparisons. The 00348 default value is set to 0. However, the value is changed by 00349 the parseArguments method depending on the actual value 00350 specified by the user. 00351 00352 \note The word size must be smaller than the frame size. 00353 */ 00354 static int wordSize; 00355 00356 /** The reference frame to be used for EST comparisons. 00357 00358 This instance variable is set to the reference frame once the 00359 setReferenceFrame() method is called. This referenceFrame is 00360 used in subsequent analysis() methods. 00361 */ 00362 std::string referenceFrame; 00363 00364 /** The default constructor. 00365 00366 The default constructor for this class. The constructor is 00367 made protected so that this class cannot be directly 00368 instantiated. Instead one of the derived analyzer classes 00369 must be instantiated (via the ESTAnalyzerFactory::create()) 00370 method and used. 00371 00372 \param[in] analyzerName The human readable name for this EST 00373 analyzer. This name is used when generating errors, warnings, 00374 and other output messages for this analyzer. This value is 00375 simply passed-on to the base class without any checks. 00376 00377 \param[in] refESTidx The reference EST index value to be used 00378 when performing EST analysis. This parameter should be >= 0. 00379 This value is simply passed onto the base class. 00380 00381 \param[in] outputFile The name of the output file to which the 00382 EST analysis data is to be written. This parameter is ignored 00383 if this analyzer is used for clustering. If this parameter is 00384 the empty string then output is written to standard output. 00385 This value is simply passed onto the base class. 00386 */ 00387 FWAnalyzer(const std::string& analyzerName, const int refESTidx, 00388 const std::string& outputFile); 00389 00390 private: 00391 /** The set of common arguments for all FWAnalyzer instances. 00392 00393 This instance variable contains a static list of arguments 00394 that are common all the Frame-Word analyzers. The common argument 00395 list is statically defined and shared by all EST instances. 00396 00397 \note This makes FWAnalyzer class hierarchy not MT-safe. 00398 */ 00399 static arg_parser::arg_record commonArgsList[]; 00400 }; 00401 00402 00403 #endif