Main Page | Data Structures | File List | Data Fields | Globals

retroflux.c

Go to the documentation of this file.
00001 /** @mainpage retroflux
00002 * <p>
00003 * @b retroFlux facilitates the virtual retro-synthesis of compounds. The user has to specify an <i>input
00004 * file</i> that contains compounds in SMILES notation and a <i>reaction file</i> that contains the
00005 * reactions in SMIRKS/reaction SMILES notation. All compounds of the <i>input file</i> are exhaustively
00006 * retro-synthesized with all reactions specified in the <i>reaction file</i>. The result is a list of
00007 * fragments. 'Virtual atoms' are added to tag the reaction site and reaction type of the fragments.
00008 * In SMILES notation these 'virtual atoms' look like <tt>[n*]</tt> where n is the number of the
00009 * respective reaction.
00010 * <p>
00011 * This distribution comes
00012 * together with a file called <i>reactions.smi</i> that contains eleven reactions. Both the virtual
00013 * retro-synthesis procedure and the eleven reactions of <i>reactions.smi</i> were originally published
00014 * in a study by Lewell and co-workers [1].
00015 * <p>
00016 * Furthermore, @b retroFlux allows the user to provide a 'filter-file' that contains <i>unwanted fragments</i> in
00017 * SMILES notation. Reactions that lead to one of the fragments specified in the 'filter-file' are not
00018 * carried out. If no such file is specified all resultant fragments are considered as valid.
00019 * <p>
00020 * <b>RetroFlux</b> was presented at EuroMUG 2004, Cambridge, England. If you wish to publish results
00021 * obtained using <b>retroflux</b> please cite [1] and [2].
00022 * <p>
00023 * @b References
00024 * <ol>
00025 * <li>Lewell, X.Q., Judd, D.B., Watson, S.P., and Hann, M.M., RECAP - Retrosynthetic Combinatorial
00026 * Analysis Procedure: A Powerful New Technique for Identifying Privileged Molecular Fragments with
00027 * Useful Applications in Combinatorial Chemistry. <i>J. Chem. Inf. Comput. Sci.</i>, 38 <b>(1998)</b>,
00028 * 511.</li>
00029 * <li>G. Schneider, U. Fechner, Fragment-based de novo design, <i>Presentation at EuroMUG 2004</i>,
00030 * November 5th <b>2004</b>, Cambridge, England,
00031 * <a href="http://daylight.com/meetings/emug04/Schneider/index.html">
00032 * http://daylight.com/meetings/emug04/Schneider/index.html</a>.</li>
00033 * </ol>
00034 * <p>
00035 * &copy; 2003-2004 by Uli Fechner
00036 */
00037 
00038 /** @file
00039 *
00040 * This is the main file of the program retroflux. All other files are included here via the inclusion
00041 * of the file @ref includes.h.
00042 *
00043 * @author Uli Fechner
00044 * @version 10/12/2003 - Uli Fechner - 0.1 - initial stable release
00045 * @version 17/12/2003 - Uli Fechner - 0.2 - changed function prototype of @ref enqueueMolecules; major
00046 * changes in @ref enqueueMolecules; adjusted corresponding function call in @ref main
00047 * @version 07/01/2004 - UF - 0.3 - introduced the filter SMILES file; added function @ref
00048 * filterMolecules, changes in @ref main, @ref parseClp and @ref displayHelpText
00049 * @version 30/01/2004 - UF - 0.3.1 - small changes in @ref enqueueMolecules, @ref filterMolecules
00050 * and @ref readDataFromStream to reflect changes in the file smilesCompound.c
00051 * @version 23/02/2004 - UF - 0.3.2 - added a reaction counter in the stderr output (@ref main)
00052 * @version 16/03/2004 - UF - 0.3.3 - function SmilesCompound_getDaylightHandle was renamed to @ref
00053 * SmilesCompound_getMoleculeHandle; this change is now reflected in retroflux
00054 * @version 06/08/2004 - UF - 0.3.4 - changes in @ref readDataFromFile and @ref readDataFromStream
00055 */
00056 
00057 /* preprocessor includes */
00058 
00059 #include "includes.h"
00060 
00061 /* just some information about this file */
00062 #define RETROFLUX_VERSION       "0.3.4"
00063 #define RETROFLUX_DATE  "06-Aug-2004"
00064 
00065 /* function prototypes */
00066 
00067 int main( int argc, char *argv[] );
00068 
00069 int filterMolecules( char* reactants, const List_Ptr filterSmilesPtr, FILE* errorLogFile );
00070 
00071 void enqueueMolecules( char* reactants, char* reactionName, const SmilesCompound_Ptr product, \
00072         const List_Ptr listPtr, FILE* errorLogFile );
00073 
00074 void parseClp( const int argc, char* argv[], CLP_Ptr clpPtr );
00075 
00076 List_Ptr readDataFromFile( const char* const filename, const int daylight_type, const int uniqueData, \
00077         FILE* errorLogFile );
00078 
00079 List_Ptr readDataFromStream( FILE* inputStream, const char* const nameOfStream, \
00080         const int daylight_type, const int uniqueData, FILE* errorLogFile );
00081 
00082 DoubleArrayPtr getFileProperties( FILE* inputStream, const char* const nameOfStream );
00083 
00084 void displayHelpText( const CLP_Ptr clpPtr );
00085 
00086 void displayVersionInformation( void );
00087 
00088 /* functions */
00089 
00090 /** Main function of the program.
00091 *
00092 * The main function just calls the other functions. It is reduced to contain only function calls
00093 * whenever possible.
00094 *
00095 * @param argc the number of arguments provided via the command line by executing the program
00096 * @param *argv[] string array containing the command line arguments
00097 * @retval int exit code of the program
00098 * @author Uli Fechner
00099 * @version 10/12/2003 - Uli Fechner - initial release
00100 * @version 17/12/2003 - Uli Fechner - adjusted function call of @ref enqueueMolecules
00101 * @version 07/01/2004 - UF - major change: introduced the filter SMILES file; this leads to several
00102 * changes in the nested while loops as well as the addition of the reading of the filter SMILES file
00103 * @version 23/03/2004 - UF - added a reaction counter in the stderr output
00104 */
00105 int main ( int argc, char *argv[] )
00106 {
00107         CLP_Ptr clpPtr; /* pointer on CommandLineParameters */
00108         FILE* outFile; /* file pointer on the output file */
00109         FILE* errorLogFile; /* file pointer on the SMILES error log file */
00110         List_Ptr inputDataPtr; /* pointer on a structure List that stores the data of the input file */
00111         List_Ptr reactionSmilesPtr; /* pointer on a List that stores the data of the reaction smiles file */
00112         List_Ptr filterSmilesPtr = NULL; /* pointer on a List that stores the data of the filter SMILES file */
00113         SmilesCompound_Ptr currentReactionPtr; /* pointer on current reaction SMILES in OUTER LOOP */
00114         SmilesCompound_Ptr currentSmilesPtr; /* pointer on current input SMILES in INNER LOOP */
00115         int totalNumberOfCompounds = 0; /* total number of compounds in input file */
00116         /* boolean variable indicating whether a SMILES of inputDataPtr was successfully transformed and does
00117         not lead to reactants that are contained in the filter SMILES file */
00118         int moleculeEnqueued;
00119         int reactionCounter; /* counts the number each reaction takes place */
00120         
00121         /***** Daylight Toolkit variables *****/
00122         dt_Handle errorSequenceHandle; /* dt_Handle on a stream that contains the error messages of the DT */
00123         dt_Handle errorHandle; /* dt_Handle on one error message of errorSequenceHandle */
00124         dt_Integer length; /* needed by dt_cansmiles, dt_typename and dt_stringvalue */
00125         /* Daylight Toolkit variables of the OUTER LOOP */
00126         dt_Handle reactionSmilesHandle; /* dt_Handle on current reaction SMILES */
00127         /* Daylight Toolkit variables of the INNER LOOP */
00128         dt_Handle currentSmilesHandle; /* dt_Handle on the current input SMILES */
00129         dt_Handle transformSequenceHandle; /* dt_Handle on current transform sequence */
00130         dt_Handle reactionHandle; /* dt_Handle on the reactions retrieved from transformSequenceHandle */
00131         dt_Handle moleculeStreamHandle; /* dt_Handle on the molecule stream retrieved from the reactionHandle */
00132         dt_Handle moleculeHandle; /* dt_Handle on the reactants retrieved from moleculeStreamHandle */
00133         
00134         /* creating a CommandLineParameters structure and initialize it with default values */
00135         clpPtr = CLP_create( );
00136         parseClp( argc, argv, clpPtr ); /* parsing the command line arguments */
00137         /* the command line arguments are shown on standard error */
00138         fprintf( stderr, "\nretroflux %s ( %s )\n\n", RETROFLUX_VERSION, RETROFLUX_DATE );
00139         CLP_display( clpPtr, stderr );
00140         
00141         /* opening the output file; if an error occurs the program aborts */
00142         if( !( outFile = fopen( CLP_getOutputFile( clpPtr ), "w" ) ) )
00143                 FileWriteError( CLP_getOutputFile( clpPtr ) );
00144         
00145         /* opening the SMILES error log file; if an error occurs the program aborts */
00146         if( !( errorLogFile = fopen( CLP_getErrorLogFile( clpPtr ), "w" ) ) )
00147                 FileWriteError( CLP_getErrorLogFile( clpPtr ) );
00148 
00149         /* reading the data from stdin */
00150         fprintf( stderr, "Reading file from stdin......." );
00151         inputDataPtr = readDataFromStream( stdin, "<stdin>", DAYLIGHT_SMILES, BOOLEAN_TRUE, errorLogFile );
00152         List_setName( inputDataPtr, "Database" );
00153         List_rewind( inputDataPtr );
00154         fprintf( stderr, "Done\n" );
00155         while( List_hasNext( inputDataPtr ) )
00156                 totalNumberOfCompounds += SmilesCompound_getCounter( List_getNext( inputDataPtr ) );
00157         fprintf( stderr, "Total number of compounds in input file: %d\n", totalNumberOfCompounds );
00158         fprintf( stderr, "Number of unique compounds in input file: %d\n\n", \
00159                 List_getNumberOfNodes( inputDataPtr ) );
00160         #if RETROFLUX_DEBUG
00161         List_display( inputDataPtr, outFile );
00162         #endif
00163 
00164         /* reading reaction SMILES file (provided with '-r' option) */
00165         fprintf( stderr, "Reading reaction SMILES file %s...", CLP_getReactionFile( clpPtr ) );
00166         reactionSmilesPtr = readDataFromFile( CLP_getReactionFile( clpPtr ), DAYLIGHT_SMIRKS, BOOLEAN_TRUE, \
00167                 errorLogFile );
00168         List_setName( reactionSmilesPtr, "Reaction SMILES" );
00169         fprintf( stderr, "Done\n" );
00170         fprintf( stderr, "Number of reactions: %d\n", List_getNumberOfNodes( reactionSmilesPtr ) );
00171         #if RETROFLUX_DEBUG
00172         List_display( reactionSmilesPtr, outFile );
00173         #endif
00174         
00175         /* reading filter SMILES file (provided with '-f' option) */
00176         if( CLP_getFilterFile( clpPtr ) != NULL )
00177         {
00178                 fprintf( stderr, "Reading filter SMILES file %s...", CLP_getFilterFile( clpPtr ) );
00179                 filterSmilesPtr = readDataFromFile( CLP_getFilterFile( clpPtr ), DAYLIGHT_SMILES, BOOLEAN_TRUE, \
00180                         errorLogFile );
00181                 List_setName( filterSmilesPtr, "Filter SMILES" );
00182                 fprintf( stderr, "Done\n" );
00183                 fprintf( stderr, "Number of filter SMILES: %d\n", List_getNumberOfNodes( filterSmilesPtr ) );
00184                 #if RETROFLUX_DEBUG
00185                 List_display( filterSmilesPtr, outFile );
00186                 #endif
00187         }
00188 
00189         /* rewinding the lists inputDataPtr and reactionSmilesPtr */
00190         List_rewind( inputDataPtr );
00191         List_rewind( reactionSmilesPtr );
00192         /***** OUTER LOOP: LOOPING OVER THE REACTIONS *****/
00193         while( List_hasNext( reactionSmilesPtr ) )
00194         {
00195                 reactionCounter = 0; // re-setting the reaction counter
00196                 currentReactionPtr = List_getNext( reactionSmilesPtr );
00197                 fprintf( stderr, "Processing reaction %s....", \
00198                 StringArray_getElement( SmilesCompound_getStringArrayOfNames( currentReactionPtr ), 0 ) );
00199                 /*SmilesCompound_display( currentReactionPtr, outFile ); */
00200                 reactionSmilesHandle = SmilesCompound_getMoleculeHandle( currentReactionPtr );
00201                 /***** INNER LOOP: LOOPING OVER THE INPUT DATA *****/
00202                 while( List_hasNext( inputDataPtr ) )
00203                 {
00204                         moleculeEnqueued = BOOLEAN_FALSE;
00205                         currentSmilesPtr = List_getNext( inputDataPtr );
00206                         currentSmilesHandle = SmilesCompound_getMoleculeHandle( currentSmilesPtr );
00207                         /*SmilesCompound_display( currentSmilesPtr, outFile ); */
00208                         transformSequenceHandle = dt_utransform( reactionSmilesHandle, currentSmilesHandle, DX_REVERSE, 1 );
00209                         
00210                         /* Looping over the individual REACTIONS of the transform sequence */
00211                         while( NULL_OB != ( reactionHandle = dt_next( transformSequenceHandle ) ) )
00212                         {
00213                                 /*fprintf( outFile, "Complete Reaction: %s\n", dt_cansmiles( &length, reactionHandle, 1 ) );*/
00214                                 /* Looping over the individual MOLECULES of one reaction */
00215                                 moleculeStreamHandle = dt_stream( reactionHandle, TYP_MOLECULE );
00216                                 /*fprintf( stderr, "%d\n", dt_count( moleculeStreamHandle, TYP_MOLECULE ) );*/
00217                                 while( NULL_OB != ( moleculeHandle = dt_next( moleculeStreamHandle ) ) )
00218                                 {
00219                                         if( dt_getrole( moleculeHandle, reactionHandle ) != DX_ROLE_REACTANT )
00220                                                 break; //LOOP EXITS HERE
00221                                         /* fprintf( outFile, "%s\n%s\n", StringArray_getElement( \
00222                                                 SmilesCompound_getName( currentSmilesPtr ), 0 ), \
00223                                                 dt_cansmiles( &length, moleculeHandle, 1 ) ); fflush( NULL ); */
00224                                         /* if one of the reactants is contained in the filter file the reactants are not enqueued */
00225                                         if( ( CLP_getFilterFile( clpPtr ) != NULL ) && \
00226                                                 ( filterMolecules( dt_cansmiles( &length, moleculeHandle, 1 ), \
00227                                                 filterSmilesPtr, errorLogFile ) == BOOLEAN_TRUE ) )
00228                                                 break; //LOOP EXITS HERE
00229                                         enqueueMolecules( dt_cansmiles( &length, moleculeHandle, 1 ), \
00230                                                 StringArray_getElement( SmilesCompound_getStringArrayOfNames( currentReactionPtr ), 0 ), \
00231                                                 currentSmilesPtr, inputDataPtr, errorLogFile );
00232                                         moleculeEnqueued = BOOLEAN_TRUE;
00233                                         reactionCounter++;
00234                                         /*List_display( inputDataPtr, outFile ); fflush( NULL ); */
00235                                 }
00236                         }
00237                         /* the current SMILES in the input file is removed if the transform was applied successfully
00238                         AND the reactants are not contained in the filter SMILES file */
00239                         if( ( dt_count( transformSequenceHandle, TYP_REACTION ) ) != -1 && \
00240                                 moleculeEnqueued == BOOLEAN_TRUE )
00241                                 List_remove( inputDataPtr );
00242                         dt_dealloc( transformSequenceHandle );
00243                 }
00244                 /***** END OF INNER LOOP: LOOPING OVER THE INPUT DATA *****/
00245                 /*fprintf( outFile, "FINISHED: %s\n", \
00246                         StringArray_getElement( SmilesCompound_getStringArrayOfNames( currentReactionPtr ), 0 ) );*/
00247                 //List_display( inputDataPtr, outFile ); fflush( NULL ); //DEBUG
00248                 List_rewind( inputDataPtr );
00249                 fprintf( stderr, "%d\n", reactionCounter );
00250         }
00251         /***** END OF OUTER LOOP: LOOPING OVER THE REACTIONS *****/
00252 
00253         /* writing the Daylight Toolkit errors to the error log file */
00254         errorSequenceHandle = dt_errors( DX_ERR_NONE );
00255         while( NULL_OB != ( errorHandle = dt_next( errorSequenceHandle ) ) )
00256                 fprintf( errorLogFile, "%s\n", dt_stringvalue( &length, errorHandle ) );
00257         
00258         /* writing the final file to outFile */
00259         List_display( inputDataPtr, outFile );
00260         
00261         /* cleaning up */
00262         List_destroy( inputDataPtr );
00263         List_destroy( reactionSmilesPtr );
00264         if( CLP_getFilterFile( clpPtr ) != NULL )
00265                 List_destroy( filterSmilesPtr );
00266         fclose( outFile );
00267         CLP_destroy( clpPtr );
00268         
00269         fprintf( stderr, "Adios.\n\n");
00270         return EXIT_SUCCESS;
00271 }
00272 
00273 /** Takes a SMILES string with several molecules and checks if one of these molecules is contained in @c filterSmilesPtr.
00274 *
00275 * This function takes a SMILES string with several molecules that are separated by a '.' (@c 
00276 * reactants) and stores each molecule individually in the List @c listPtr points on. Molecules
00277 * are inserted at the tail of the list. @c reactionName and @c errorLogFile are used as a function
00278 * argument to @ref SmilesCompound_create.
00279 *
00280 * @param reactants SMILES string that contains several molecules
00281 * @param filterSmilesPtr pointer on a List that stores the molecules that should be filtered
00282 * @param errorLogFile file pointer on the error log file
00283 * @author Uli Fechner
00284 * @version 07/01/2004 - Uli Fechner - initial release
00285 */
00286 int filterMolecules( char* reactants, const List_Ptr filterSmilesPtr, FILE* errorLogFile )
00287 {
00288         char* substring; /* used with strstr to go from '.' to '.' in reactants */
00289         char* copyStart; /* used together with substring to retrieve individual molecules out of reactants */
00290         char* tempSmiles; /* temporary storage for the SMILES contained in reactants */
00291         /* set to BOOLEAN_TRUE if one of the molecules of reactants is contained in filterMoleculesPtr */
00292         int containsFilterMolecule = BOOLEAN_FALSE;
00293         SmilesCompound_Ptr scPtr; /* pointer on a newly created structure SmilesCompound */
00294         
00295         if( !( tempSmiles = calloc( strlen( reactants ) + 1, sizeof( char ) ) ) )
00296                 MemoryError( "tempSmiles", "filterMolecules" );
00297                 
00298         copyStart = reactants;
00299         /* looping over all molecules contained in reactants (separated by a '.') */
00300         while( ( substring = strstr( copyStart, "." ) ) != NULL )
00301         {
00302                 strncpy( tempSmiles, copyStart, (substring - copyStart ) );
00303                 tempSmiles[ substring - copyStart ] = '\0';
00304                 copyStart = substring + 1;
00305                 /* insertion of the molecule at the tail of the list */
00306                 scPtr = SmilesCompound_create( NULL, tempSmiles, DAYLIGHT_SMILES, errorLogFile );
00307                 if( List_isContained( filterSmilesPtr, scPtr ) == BOOLEAN_TRUE )
00308                         containsFilterMolecule = BOOLEAN_TRUE;
00309                 SmilesCompound_destroy( scPtr );
00310         }
00311         /* copying the last molecule contained in reactants */
00312         strncpy( tempSmiles, copyStart, ( ( reactants + strlen( reactants ) ) - ( copyStart ) ) );
00313         tempSmiles[ ( ( reactants + strlen( reactants ) ) - ( copyStart ) ) ] = '\0';
00314         /* insertion of the last molecule at the tail of the list */
00315         scPtr = SmilesCompound_create( NULL, tempSmiles, DAYLIGHT_SMILES, errorLogFile );
00316         if( List_isContained( filterSmilesPtr, scPtr ) == BOOLEAN_TRUE )
00317                 containsFilterMolecule = BOOLEAN_TRUE;
00318         SmilesCompound_destroy( scPtr );
00319 
00320         /* cleaning up */
00321         free( tempSmiles );
00322         
00323         return containsFilterMolecule;
00324 }
00325 
00326 /** Takes a SMILES string with several molecules and adds each molecule to @c reactants.
00327 *
00328 * This function takes a SMILES string with several molecules that are separated by a '.' (@c 
00329 * reactants) and stores each molecule individually in the List @c listPtr points on. Molecules
00330 * are inserted at the tail of the list. @c reactionName and @c errorLogFile are used as a function
00331 * argument to @ref SmilesCompound_create.
00332 *
00333 * @param reactants SMILES string that contains several molecules
00334 * @param reactionName name of the reaction the SMILES string resulted from
00335 * @param product 
00336 * @param listPtr pointer on a List that stores the individual molecules that result from @c reactants
00337 * @param errorLogFile file pointer on the error log file
00338 * @author Uli Fechner
00339 * @version 04/12/2003 - Uli Fechner - initial release
00340 * @version 07/12/2003 - Uli Fechner - the reactants are now named according to the scheme
00341 * compoundName.reactionName, where compoundName is the name of the product
00342 * @version 17/12/2003 - Uli Fechner - major changes related to the creation of the reactant
00343 * @ref SmilesCompound
00344 */
00345 void enqueueMolecules( char* reactants, char* reactionName, const SmilesCompound_Ptr product, \
00346         const List_Ptr listPtr, FILE* errorLogFile )
00347 {
00348         char* substring; /* used with strstr to go from '.' to '.' in reactants */
00349         char* copyStart; /* used together with substring to retrieve individual molecules out of reactants */
00350         char* tempSmiles; /* temporary storage for the SMILES contained in reactants */
00351         char* dotPlusReactionName; /* a dot plus the name of the reaction (=.reactionName) */
00352         SmilesCompound_Ptr scPtr; /* pointer on a newly created structure SmilesCompound */
00353         
00354         if( !( tempSmiles = calloc( strlen( reactants ) + 1, sizeof( char ) ) ) )
00355                 MemoryError( "tempSmiles", "enqueueMolecules" );
00356         if( !( dotPlusReactionName = calloc( strlen( reactionName ) + 2, sizeof( char ) ) ) )
00357                 MemoryError( "dotPlusReactionName", "enqueueMolecules" );
00358         strcpy( dotPlusReactionName, "." );
00359         strcat( dotPlusReactionName, reactionName );
00360                 
00361         copyStart = reactants;
00362         /* looping over all molecules contained in reactants (separated by a '.') */
00363         while( ( substring = strstr( copyStart, "." ) ) != NULL )
00364         {
00365                 strncpy( tempSmiles, copyStart, (substring - copyStart ) );
00366                 tempSmiles[ substring - copyStart ] = '\0';
00367                 copyStart = substring + 1;
00368                 /* insertion of the molecule at the tail of the list */
00369                 scPtr = SmilesCompound_copy( product );
00370                 SmilesCompound_setSmiles( scPtr, tempSmiles, DAYLIGHT_SMILES, errorLogFile );
00371                 StringArray_mapConcatenate( SmilesCompound_getStringArrayOfNames( scPtr ), dotPlusReactionName );
00372                 if( List_insertTail( listPtr, scPtr ) == BOOLEAN_FALSE )
00373                         SmilesCompound_destroy( scPtr );
00374         }
00375         /* copying the last molecule contained in reactants */
00376         strncpy( tempSmiles, copyStart, ( ( reactants + strlen( reactants ) ) - ( copyStart ) ) );
00377         tempSmiles[ ( ( reactants + strlen( reactants ) ) - ( copyStart ) ) ] = '\0';
00378         /* insertion of the last molecule at the tail of the list */
00379         scPtr = SmilesCompound_copy( product );
00380         SmilesCompound_setSmiles( scPtr, tempSmiles, DAYLIGHT_SMILES, errorLogFile );
00381         StringArray_mapConcatenate( SmilesCompound_getStringArrayOfNames( scPtr ), dotPlusReactionName );
00382         if( List_insertTail( listPtr, scPtr ) == BOOLEAN_FALSE )
00383                 SmilesCompound_destroy( scPtr );
00384 
00385         /* cleaning up */
00386         free( tempSmiles );
00387         free( dotPlusReactionName );
00388 }
00389 
00390 /** Parses the command line parameters.
00391 *
00392 * The command line parameters are parsed here. Additionally, it is checked whether the standard in is
00393 * connected to a terminal. If this is the case, no input file is provided and the help text of the
00394 * program is printed on standard error. It is also checked here if there are any incompatibilities
00395 * of the options provided at the command line. If so, the program aborts with an appropriate error
00396 * message.
00397 *
00398 * @param argc the number of arguments provided via the command line by executing the program
00399 * @param *argv[] string array containing the command line arguments
00400 * @param clpPtr pointer on structure @ref CommandLineParameters that is used to store all the command line parameters
00401 * @author Uli Fechner
00402 * @version 24/11/2003 - Uli Fechner - initial release
00403 * @version 02/12/2003 - Uli Fechner - changed option '-r' to mandatory
00404 * @version 03/12/2003 - Uli Fechner - now automatically setting the name of the error log file
00405 * @version 01/07/2004 - UF - added support for the f option (filter file)
00406 */
00407 void parseClp( const int argc, char* argv[], CLP_Ptr clpPtr )
00408 {       
00409         int c; /* gets the arguments via getopt */
00410         char* errorLogFile; /* temporary storage of the error log file name */
00411         Given_CLP_Ptr givenClpPtr; /* pointer on structure GivenCommandLineParameters */        
00412 
00413         givenClpPtr = GivenClp_create( ); /* creating and initializing a structure GivenCommandLineParameters */
00414         
00415         opterr = 0; /* disable the error message getopt normaly prints on stderr */
00416         
00417         while ( ( c = getopt( argc, argv, "hvo:r:f:" ) ) != -1 )
00418         {
00419                 switch( c )
00420                 {
00421                 /* checking if to provide the help text */
00422                 case 'h':
00423                         displayHelpText( clpPtr );
00424                         break;
00425 
00426                 /* checking if to provide version information text */
00427                 case 'v':
00428                         displayVersionInformation();
00429                         break;
00430 
00431                 /* the argument of the argument o defines the name of the output file */
00432                 case 'o':
00433                         CLP_setOutputFile( clpPtr, optarg );
00434                         GivenClp_setOption( givenClpPtr, 'o', BOOLEAN_TRUE );
00435                         break;
00436 
00437                 /* the argument of the argument o defines the name of the reaction SMILES file */
00438                 case 'r':
00439                         CLP_setReactionFile( clpPtr, optarg );
00440                         GivenClp_setOption( givenClpPtr, 'r', BOOLEAN_TRUE );
00441                         break;
00442 
00443                 /* the argument of the argument f defines the name of the filter SMILES file */
00444                 case 'f':
00445                         CLP_setFilterFile( clpPtr, optarg );
00446                         GivenClp_setOption( givenClpPtr, 'f', BOOLEAN_TRUE );
00447                         break;
00448 
00449                 case '?':
00450                         fprintf( stderr, "\n\nERROR: There is given either an option without an argument that\n" );
00451                         fprintf( stderr, "requires one or an unknown option!\n" );
00452                         fprintf( stderr, "Type 'retroflux -h' for a detailed help text!\n");
00453                         AbortProgram;
00454                         break;                                  
00455                 }
00456         }
00457         
00458         /* testing whether the standard in is connected to the terminal; if so display help */
00459         if( isatty( STDIN_FILENO ) )
00460                 displayHelpText( clpPtr );
00461 
00462         /* the '-o' argument is mandatory; if itisn't provided the program aborts with an appropriate message */
00463         if( GivenClp_getOption( givenClpPtr, 'o' ) == BOOLEAN_FALSE )
00464                 MandatoryOption( "'-o'" );
00465 
00466         /* the '-r' argument is mandatory; if itisn't provided the program aborts with an appropriate message */
00467         if( GivenClp_getOption( givenClpPtr, 'r' ) == BOOLEAN_FALSE )
00468                 MandatoryOption( "'-r'" );
00469         
00470         /* generating the name of the error log file and storing it in the structure CommandLineParameters */
00471         if( !( errorLogFile = calloc( strlen( CLP_getOutputFile( clpPtr ) ) + 5, sizeof( char ) ) ) )
00472                 MemoryError( "parseClp", "errorLogFile" );
00473         strncpy( errorLogFile, CLP_getOutputFile( clpPtr ), strlen( CLP_getOutputFile( clpPtr ) ) + 1 );
00474         CLP_setErrorLogFile( clpPtr, strncat( errorLogFile, ".log", 4 ) );
00475         if( errorLogFile != NULL )
00476                 free( errorLogFile );
00477         
00478         GivenClp_destroy( givenClpPtr ); /* destroying structure GivenCommandLineParameters */
00479 }
00480 
00481 /** Reads the data of a file and stores it in a structure @ref List.
00482 * 
00483 * @attention
00484 * This is mainly a wrapper for function @ref readDataFromStream!
00485 *
00486 * @param filename the name of the file to read
00487 * @param daylight_type indicates the file type (@ref DAYLIGHT_SMILES, @ref DAYLIGHT_SMIRKS, @ref DAYLIGHT_SMARTS)
00488 * @param uniqueData if set to @ref BOOLEAN_TRUE input SMILES are filtered to yield unique SMILES
00489 * @param errorLogFile file pointer on the error log file
00490 * @retval List_Ptr points on structure @ref List containing the content of @c filename
00491 * @author Uli Fechner
00492 * @version 28/11/2003 - Uli Fechner - initial release
00493 * @version 09/06/2004 - UF - added new parameter @c uniqueData
00494 */
00495 List_Ptr readDataFromFile( const char* const filename, const int daylight_type, \
00496         const int uniqueData, FILE* errorLogFile )
00497 {
00498         FILE* dataFile = NULL; /* file pointer of the file with 'filename' */
00499 
00500         /* opening of the file and checking if this was successful */
00501         if( !( dataFile = fopen( filename, "r" ) ) )
00502                 FileReadError( filename );
00503         
00504         /* calling of the sub-function that actually reads the file */
00505         return readDataFromStream( dataFile, filename, daylight_type, uniqueData, errorLogFile );
00506 }
00507 
00508 /** Reads the data of a stream and stores it in a structure @ref List.
00509 *
00510 * @param inputStream file pointer of the stream to read
00511 * @param nameOfStream string indicating the name of the stream
00512 * @param daylight_type indicates the file type (@ref DAYLIGHT_SMILES, @ref DAYLIGHT_SMIRKS, @ref DAYLIGHT_SMARTS)
00513 * @param uniqueData if set to @ref BOOLEAN_TRUE input SMILES are filtered to yield unique SMILES
00514 * @param errorLogFile file pointer on the error log file
00515 * @retval List_Ptr points on structure @ref List containing the content of @c inputStream
00516 * @author Uli Fechner
00517 * @version 28/11/2003 - Uli Fechner - initial release
00518 * @version 08/12/2003 - Uli Fechner - considered unsuccessful creations of SmilesCompounds due to
00519 * invalid input SMILES; added error output
00520 * @version 09/06/2004 - UF - added new parameter @c uniqueData; bugfix: SMILES without a name were
00521 * assigned an empty string;
00522 * @version 11/06/2004 - UF - MAJOR change: from now on the first column is the SMILES and the second
00523 * column is the name (if any; before the 1st column was the name and the 2nd column was the SMILES);
00524 * this follows the common format of SMILES files
00525 * @version 05/07/2004 - UF - if no name is provided the SMILES/SMARTS/SMIRKS is taken as a name
00526 */
00527 List_Ptr readDataFromStream( FILE* inputStream, const char* const nameOfStream, \
00528         const int daylight_type, const int uniqueData, FILE* errorLogFile )
00529 {
00530         int numberOfColumns = 0; /* the number of columns in the file provided via 'inputStream' */
00531         /* maximum number of characters per column of the file provided via 'inputStream' */
00532         int maxNumberOfCharsPerRow = 0;
00533         int numberOfRows = 0; /* the number of rows in the file provided via 'inputStream' */
00534         char* tempRow = NULL; /* temporary storage of a row of 'inputStream' */
00535         char* substring = NULL; /* pointer needed by the algorithm that divides a line into columns */
00536         char* tempSmiles = NULL; /* temporary storage for the smiles of the current row */
00537         char* tempName = NULL; /* temporary storage for the name of the current row */
00538         DoubleArrayPtr dAPtr = NULL; /* pointer on DoubleArray containing the results of getFileProperties */
00539         List_Ptr listPtr = NULL; /* pointer on DoubleLinkedList containing the content of the file */
00540         SmilesCompound_Ptr scPtr = NULL; /* temp. pointer on a SmilesCompound during insertion of a compound */
00541         char* errorMessage = NULL; /* for each unsuccessful compound creation a error message is created */
00542         /* pointer on structure stringArray to store the error messages for unsuccessful compound creations */
00543         StringArray_Ptr sAPtr = NULL;
00544         
00545         /* examine the file (number of columns, number of rows, maximum number of chars per row) */
00546         dAPtr = getFileProperties( inputStream, nameOfStream );
00547         
00548         rewind( inputStream );
00549         numberOfColumns = (int) DoubleArray_getValue( dAPtr, 0 );
00550         maxNumberOfCharsPerRow = (int) DoubleArray_getValue( dAPtr, 1 );
00551         numberOfRows = (int) DoubleArray_getValue( dAPtr, 2 );
00552         
00553         #if FLUX_DEBUG
00554         fprintf( stdout, "\nNoColumns = %d\tMaxNoChars = %d\tNoRows = %d\n", \
00555                 numberOfColumns, maxNumberOfCharsPerRow, numberOfRows );
00556         #endif
00557 
00558         if( !( tempRow = calloc( maxNumberOfCharsPerRow + 2, sizeof( char ) ) ) )
00559                 MemoryError( "tempRow", "readDataFromStream" );
00560         if( !( tempSmiles = calloc( maxNumberOfCharsPerRow + 2, sizeof( char ) ) ) )
00561                 MemoryError( "tempSmiles", "readDataFromStream" );
00562         if( !( tempName = calloc( maxNumberOfCharsPerRow + 2, sizeof( char ) ) ) )
00563                 MemoryError( "tempName", "readDataFromStream" );
00564         listPtr = List_create( uniqueData, SmilesCompound_display, SmilesCompound_destroy, \
00565                 SmilesCompound_identical );
00566         sAPtr = StringArray_create( );
00567 
00568         /* in the following while loop each row of the file is read and stored in the strucure SmilesCompound */
00569         /* looping over all rows of 'inputSream' */
00570         while( fgets( tempRow, maxNumberOfCharsPerRow + 2, inputStream ) != NULL )
00571         {
00572                 #if FLUX_DEBUG
00573                 fprintf( stdout, "tempRow = %s", tempRow ); fflush( NULL );
00574                 #endif
00575 
00576                 /* the input file contains only one column - the SMILES string */
00577                 if( numberOfColumns == 1 )
00578                 {
00579                         strncpy( tempSmiles, tempRow, strlen( tempRow ) - 1 );
00580                         tempSmiles[ strlen( tempRow ) - 1 ] = '\0';
00581                         strncpy( tempName, tempRow, strlen( tempRow ) - 1 );
00582                         tempName[ strlen( tempRow ) - 1 ] = '\0';
00583                 }
00584                 /* the input file contains two columns - the name and the SMILES string */
00585                 else
00586                 {
00587                         substring = strstr( tempRow, "\t" );
00588                         strncpy( tempSmiles, tempRow, ( substring - tempRow ) );
00589                         tempSmiles[ substring - tempRow ] = '\0';
00590                         strncpy( tempName, substring + 1, ( tempRow + strlen( tempRow ) - 1 ) - ( substring + 1 ) );
00591                         tempName[ ( tempRow + strlen( tempRow ) - 1 ) - ( substring + 1 ) ] = '\0';
00592                 }
00593                 #if FLUX_DEBUG
00594                 fprintf( stdout, "Smiles: %s\tName: %s\n", tempSmiles, tempName ); fflush( NULL );
00595                 #endif
00596                 /* a SmilesCompound is created and inserted at the tail of the list
00597                 if the insertion fails due to the presence of an identical SmilesCompound it is destroyed */
00598                 scPtr = SmilesCompound_create( tempName, tempSmiles, daylight_type, errorLogFile );
00599                 /* checking if the creation of the SmilesCompound was successful */
00600                 if( scPtr != BOOLEAN_FALSE )
00601                 {
00602                         if( List_insertTail( listPtr, scPtr ) == BOOLEAN_FALSE )
00603                                 SmilesCompound_destroy( scPtr );
00604                 }
00605                 else
00606                 {
00607                         if( !( errorMessage = calloc( 300, sizeof( char ) ) ) )
00608                                 MemoryError( "errorMessage", "readDataFromStream" );
00609                         sprintf( errorMessage, "Error reading compound %s\n", tempSmiles );
00610                         StringArray_addElement( sAPtr, errorMessage );
00611                 }
00612         }
00613         /* if there are any error messages they are printed to stderr */
00614         if( StringArray_getNumberOfElements( sAPtr ) != 0 )
00615                 StringArray_display( sAPtr, stderr );
00616         
00617         /* cleaning up */
00618         free( tempRow );
00619         free( tempSmiles );
00620         free( tempName );
00621         DoubleArray_destroy( dAPtr );
00622         StringArray_destroy( sAPtr );
00623         
00624         return listPtr;
00625 }
00626 
00627 /** Examines the properties of a file stream.
00628 * 
00629 * The number of columns, characters per column and the number of rows are counted and the result
00630 * of this examination are stored in a structure @ref DoubleArray. This @ref DoubleArray contains
00631 * three elements regarded to the properties of dataFile:
00632 *
00633 * @li index 0 - number of columns
00634 * @li index 1 - maximum number of characters per column
00635 * @li index 2 - number of rows
00636 *
00637 * The file is also checked for integrity, i.e. it is checked, if it has the same number of columns in
00638 * all rows. If this is not the case, the define @ref FileIntegrityError is called and the program
00639 * aborts.
00640 *
00641 * @param inputStream file pointer of the stream to examine
00642 * @param nameOfStream string indicating the name of the stream
00643 * @retval DoubleArrayPtr contains the properties of @c inputStream
00644 * @author Uli Fechner
00645 * @version 28/11/2003 - Uli Fechner - initial release
00646 */
00647 DoubleArrayPtr getFileProperties( FILE* inputStream, const char* const nameOfStream )
00648 {
00649         int numberOfColumns = 1; /* number of columns in dataFile */
00650         int maxCharsPerRow = 0; /* maximum number of characters per row in dataFile */
00651         int numberOfRows = 0; /* number of rows in dataFile */
00652         char currentCharacter; /* temporary storage of the character read by fgetc */
00653         int position; /* counter for the position in tempRow */
00654         char* tempRow; /* temporary string containing one row for analysis */
00655         char* substring; /* pointer needed for the 'number of occurences' algorithm */
00656         int counter = 1; /* counter needed by the 'number of occurences' algorithm */
00657         /* dAPtr points on a DoubleArray returning the results.
00658         The DoubleArray contains three values regarding to the properties of dataFile:
00659         index 0 - number of columns
00660         index 1 - maximum number of characters per column
00661         index 2 - number of rows */
00662         DoubleArrayPtr dAPtr; /* the file properties are stored in a DoubleArray structure */
00663         
00664         if( !( tempRow = calloc( 10000, sizeof( char ) ) ) )
00665                 MemoryError( "tempRow", "getFileProperties" );  
00666 
00667         dAPtr = DoubleArray_create( 3 ); /* create a new DoubleArray with 3 elements */
00668         
00669         /* looping over all rows of the file */
00670         while( ( currentCharacter = fgetc( inputStream ) ) != EOF )
00671         {
00672                 /* putting the character back in the stream */
00673                 ungetc( currentCharacter, inputStream );
00674                 
00675                 /* resetting the position counter and the tab counter */
00676                 position = 0;
00677                 counter = 1;
00678                 
00679                 /* getting a line of inputStream */
00680                 while( ( currentCharacter = fgetc( inputStream ) ) != '\n' )
00681                 {
00682                         tempRow[ position ] = currentCharacter;
00683                         position++;
00684                 }
00685                 tempRow[ position ] = '\0';
00686                 numberOfRows++;
00687                 
00688                 /* counting the columns by counting the number of occurences of '\t' */
00689                 substring = strstr( tempRow, "\t" );
00690                 while( substring != NULL)
00691                 {
00692                         counter++;                      
00693                         substring = strstr( substring + 1, "\t" );
00694                 }
00695                 
00696                 /* if the number of columns of the first row are counted the variable 'numberOfColumns' is set */
00697                 if( numberOfRows == 1 )
00698                         numberOfColumns = counter;
00699                 
00700                 /* if the number of columns in one row is different from that in the first row the program aborts */
00701                 if( counter != numberOfColumns )                        
00702                         FileIntegrityError( nameOfStream, numberOfRows, counter, numberOfColumns );
00703                         
00704                 /* keeping the maximum number of chars in one row up to date */
00705                 if( strlen( tempRow ) > maxCharsPerRow )
00706                         maxCharsPerRow = position;
00707         }
00708 
00709         /* checking if there are exactly 2 columns; if not the input file is invalid and the program aborts */
00710         if( numberOfColumns > 2 )
00711         {
00712                 fprintf( stderr, "\nERROR: The number of columns in %s is neither one nor two!\n", nameOfStream );
00713                 fprintf( stderr, "Only input files with the name in the first column and the SMILES string\n" );
00714                 fprintf( stderr, "in the second column or the SMILES string as the only column are valid!\n" );
00715                 AbortProgram;
00716         }
00717         
00718         /* the values regarding to the file property are copied to the struct DoubleArray */
00719         DoubleArray_setValue( dAPtr, 0, numberOfColumns );
00720         DoubleArray_setValue( dAPtr, 1, maxCharsPerRow );
00721         DoubleArray_setValue( dAPtr, 2, numberOfRows );
00722         
00723         free( tempRow );
00724         
00725         return dAPtr;
00726 }
00727 
00728 /** Displays the help text on standard error.
00729 *
00730 * @param clpPtr pointer on structure @ref CommandLineParameters that is used to store all the command line parameters
00731 * @author Uli Fechner
00732 * @version 24/11/2003 - Uli Fechner - initial release
00733 * @version 7/1/2003 - UF - added help text for the f option
00734 */
00735 void displayHelpText( const CLP_Ptr clpPtr )
00736 {
00737         fprintf( stderr, "\nNAME:\n" );
00738   fprintf( stderr, "  retroflux\n" );
00739         fprintf( stderr, "\nFUNCTION:\n" );
00740   fprintf( stderr, "  Facilitates a virtual retro-synthesis of compounds.\n" );
00741         fprintf( stderr, "\nUSAGE:\n" );
00742         fprintf( stderr, "  retroflux [Options] <INFILE\n" );
00743         fprintf( stderr, "     INFILE has to be a file with one SMILES per row or a tab separated file\n" );
00744         fprintf( stderr, "     with SMILES in the first and labels in the second column.\n" );
00745         fprintf( stderr, "\nREMARK:\n" );
00746         fprintf( stderr, "  An error log file is automatically created. Its name is the suffix '.log'\n" );
00747         fprintf( stderr, "  appended to the name of the output file.\n" );
00748         fprintf( stderr, "\nOPTIONS:\n" );
00749         fprintf( stderr, "  -h\n" );
00750         fprintf( stderr, "     Display this help text.\n" );
00751         fprintf( stderr, "  -v\n" );
00752         fprintf( stderr, "     Display detailed version information and exit.\n" );
00753         fprintf( stderr, "  -o STRING\n" );
00754         fprintf( stderr, "     Set STRING as the name of the output file.\n" );
00755         fprintf( stderr, "  -r FILENAME | mandatory\n" );
00756         fprintf( stderr, "     Set FILENAME as the name of the reaction SMILES file. This file has to\n" );
00757         fprintf( stderr, "     contain exactly two tab separated columns, where SMIRKS are in the first\n" );
00758         fprintf( stderr, "     column and reaction names are in the second column.\n" );
00759         fprintf( stderr, "  -f FILENAME\n" );
00760         fprintf( stderr, "     Set FILENAME as the name of the filter SMILES file. This file has to\n" );
00761         fprintf( stderr, "     contain one SMILES per row. All molecules in the filter file are\n" );
00762         fprintf( stderr, "     considered as unwanted fragments, i.e. reactions that lead to one of these\n" );
00763         fprintf( stderr, "     molecules are not carried out.\n" );
00764         fprintf( stderr, "     If no file is specified, all resulting fragments are considered as valid.\n" );
00765         fprintf( stderr, "\nAUTHOR:\n" );
00766         fprintf( stderr, "  Uli Fechner\n" );
00767         fprintf( stderr, "\nVERSION & RELEASE DATE:\n" );
00768   fprintf( stderr, "  %s ( %s )\n", RETROFLUX_VERSION, RETROFLUX_DATE );
00769         fprintf( stderr, "\nBUGS:\n" );
00770   fprintf( stderr, "  Please report bugs to u.fechner@chemie.uni-frankfurt.de\n\n" );
00771         exit( EXIT_SUCCESS );
00772 }
00773 
00774 /** Displays the version information text on standard error.
00775 *
00776 * @author Uli Fechner
00777 * @version 28/11/2003 - Uli Fechner - initial release
00778 */
00779 void displayVersionInformation( void )
00780 {
00781         fprintf( stderr, "\nVERSION & RELEASE DATE:\n\n");
00782         fprintf( stderr, "  includes.h: %s [%s]\n", INCLUDES_VERSION, INCLUDES_DATE );
00783         fprintf( stderr, "  generalDefines.h: %s [%s]\n", GENERALDEFINES_VERSION, GENERALDEFINES_DATE );
00784         fprintf( stderr, "  givenClp.c: %s [%s]\n", GIVENCLP_VERSION, GIVENCLP_DATE );
00785         fprintf( stderr, "  doubleArray.c: %s [%s]\n", DOUBLEARRAY_VERSION, DOUBLEARRAY_DATE );
00786         fprintf( stderr, "  clp.c: %s [%s]\n", CLP_VERSION, CLP_DATE );
00787         fprintf( stderr, "  smilesCompound.c: %s [%s]\n", SMILESCOMPOUND_VERSION, SMILESCOMPOUND_DATE );
00788         fprintf( stderr, "  doubleLinkedList.c: %s [%s]\n", DOUBLELINKEDLIST_VERSION, DOUBLELINKEDLIST_DATE );
00789         fprintf( stderr, "  retroflux core: %s [%s]\n\n", RETROFLUX_VERSION, RETROFLUX_DATE );
00790         exit( EXIT_SUCCESS );
00791 }

Generated on Tue Nov 9 16:27:11 2004 for retroflux by doxygen 1.3.6