/* -*-C-*- */ /*********************************************************************** * This source code is public domain, and may be freely distributed, * * modified, and used for any purpose. IT COMES WITH ABSOLUTELY NO * * WARRANTY OF ANY KIND. * ***********************************************************************/ /*====================================================================== * stigmata() -- program for computing common characteristics * * * Reference: N.E. Shemetulskis, D. Weininger, C. J. Blankley, * J. J. Yang, C. Humblet, "Stigmata: An Algorithm to * Determine Structural Commonalities in Diverse Datasets", * JCICS... ======================================================================= * Author: Dave Weininger * Modified by: Norah Shemetulskis ======================================================================= */ #include#include #include #include #include "dt_smiles.h" #include "dt_smarts.h" #include "dt_finger.h" /*** Default parameters. ***/ #define THRESH 0.5 #define FPSIZE 2048 /*** A "pool" item. ***/ typedef struct _item { dt_Handle mol; /* the molecule */ dt_Handle fp; /* normal molecular (fp) fingerprint */ char *fps; /* string value of fingerprint fp */ int lenfps; /* length of fps */ char *smi; /* input SMILES */ char *nam; /* input name */ float sim; /* normal (sim) */ float mp; /* percent of similarity to modal */ struct _item *next; /* pointer to next item in list */ } ITEM; static int spew = 0; /* debug level */ static int modal = 0; /* modal already exists */ static int db = 0; /* do db search */ static int ascii = 0; /* do ascii output */ static float thresh = THRESH; static int size = FPSIZE; void putusmi(dt_Handle mol) { char *s; int lens; s = dt_cansmiles(&lens, mol, 0); fprintf(stderr, "%.*s", lens, s); return; } void putsymb(dt_Handle atom) { char *s; int lens; s = dt_symbol(&lens, atom); fprintf(stderr, "%.*s", lens, s); if (1 == lens) fprintf(stderr, " "); return; } void sayfp(dt_Handle fp) { char *s; int lens, i, j, k; s = dt_stringvalue(&lens, fp); for (i = 0; i < lens; i += 8) { for (j = 0; j < 8; j++) { k = s[i + j]; fprintf(stderr, "%d", 0 != (k & 0x01)); fprintf(stderr, "%d", 0 != (k & 0x02)); fprintf(stderr, "%d", 0 != (k & 0x04)); fprintf(stderr, "%d", 0 != (k & 0x08)); fprintf(stderr, "%d", 0 != (k & 0x10)); fprintf(stderr, "%d", 0 != (k & 0x20)); fprintf(stderr, "%d", 0 != (k & 0x40)); fprintf(stderr, "%d", 0 != (k & 0x80)); fprintf(stderr, " "); } fprintf(stderr, "\n"); } return; } /*====================================================================== * my_stringvalue() -- like dt_stringvalue(), but returns new string ====================================================================== */ static char *my_stringvalue(int *plen, dt_Handle ob) { char *news; char *str = dt_stringvalue(plen, ob); if (NULL == str) return NULL; news = (char *)malloc(*plen); memcpy(news, str, *plen); return news; } /*====================================================================== * mpcmp() -- compare mp similarities of two ITEMs for sorting. * added 1/11/94 N.S. ====================================================================== */ static int mpcmp(const void *pit1, const void *pit2) { ITEM *it1 = *((ITEM **)pit1); ITEM *it2 = *((ITEM **)pit2); /*** Primary sort, based on mp ***/ if (it1->mp < it2->mp) return (1); if (it1->mp > it2->mp) return (-1); /*** Secondary sort, mp's are the same ***/ if (it1->sim < it2->sim) return (1); if (it1->sim > it2->sim) return (-1); return 0; } /*============================================================================ * options() */ static FILE *options(int *argc, char **argv) { int i, nerr; FILE *fin; char *fn = NULL; /*** Consume arguments. ***/ for (nerr = 0, i = 1; i < *argc; i++) { if (0 == strcmp("-c", argv[i])) { if (0 == (size = atoi(argv[++i]))) nerr += fprintf(stderr, "can't interpret -c size: %s\n", argv[i]); else if (32 > size || 2048 < size) nerr += fprintf(stderr, "-c size must be 32-2048: %d\n", size); } else if (0 == strcmp("-m", argv[i])) modal=1; else if (0 == strcmp("-n", argv[i])) { db=1; ascii=1; modal=1; } else if (0 == strcmp("-o", argv[i])) ascii=1; else if (0 == strcmp("-v", argv[i])) spew = 1; else if (0 == strcmp("-vv", argv[i])) spew = 2; else if (0 == strcmp("-t", argv[i])) { if (0.0 == (thresh = atof(argv[++i]))) nerr += fprintf(stderr, "can't interpret -t threshold: %s\n", argv[i]); else if (0.0 > thresh || 1.0 < thresh) nerr += fprintf(stderr, "-t threshold value must be 0 < val <=1: %.4f\n", thresh); } else if ('-' == *argv[i]) nerr += fprintf(stderr, "unknown option: %s\n", argv[i]); else if (NULL != fn) nerr += fprintf(stderr, "only one filename allowed, 2nd is: %s\n", argv[i]); else fn = argv[i]; if (0 < nerr) break; } if (0 == nerr) { if (NULL == fn) nerr += fprintf(stderr, "name of input SMILES file is required\n"); else if (NULL == (fin = fopen(fn, "r"))) nerr += fprintf(stderr, "can't open %s\n", fn); } if (0 < nerr) { fprintf(stderr, "usage: $ %s [ options ] file.smi\n", argv[0]); fprintf(stderr, "options:\n"); fprintf(stderr, " -c size ...... Fingerprint generation size (%d)\n", FPSIZE); fprintf(stderr, " -m ........... query structures in file " "second.smi\n"); fprintf(stderr, " -n ........... query structures in file " "second.smi, ascii output only\n"); fprintf(stderr, " -o ........... ASCII output (ascii.dat) containing " "name, MODP, MSIM \n"); fprintf(stderr, " -t thresh .... modal threshold (%.4f)\n", THRESH); exit(1); } return fin; } /*============================================================================ * main() */ main(int argc, char **argv) { dt_Handle mol, mfp, atoms, atom, tfp, subs, bonds, bond; char *nl, *sp, *mfps, *s, *maxnam, *minnam; char smi[1000], modf[2048]; int maxfp, minfp, tmp, tmpf1, i, j, k, kk, bc, nin, lens, no; int *ffp; float stig; FILE *mf, *af, *af2; ITEM *it, **pita; FILE *fin = NULL; ITEM *itlist = NULL; /*** 1/9/95 N.S. added percent to flag if kk<=1 ***/ fin = options(&argc, argv); /*** Set fingerprint parameters. ***/ if (spew) { fprintf(stderr, "\n%s -- characteristic attribute generation\n", *argv); fprintf(stderr, "\nPARAMETERS:\n"); if (0 == spew) fprintf(stderr, " verbose: OFF\n"); else if (1 == spew) fprintf(stderr, " verbose: VERBOSE\n"); else fprintf(stderr, " verbose: VERY VERBOSE\n"); fprintf(stderr, " size: %d\n", size); fprintf(stderr, " threshold: %.4f\n", thresh); } /*** Interpret input SMILES as molecules. ***/ nin = 0; while (fgets(smi, 1000, fin)) { if (NULL != (nl = strchr(smi, '\n'))) *nl = '\0'; if (NULL != (mol = dt_smilin(strlen(smi), smi))) { it = (ITEM *) malloc(sizeof(ITEM)); if (NULL == (sp = strchr(smi, ' '))) it->nam = strdup("no name"); else { *sp = '\0'; it->nam = strdup(sp + 1); } if (2 == spew) fprintf(stderr, "fingerprinting %s\n", it->nam); else if (spew) fprintf(stderr, "."); it->smi = strdup(smi); it->mol = mol; it->next = itlist; itlist = it; it->fp = dt_fp_generatefp(mol, 0, 7, size); if (spew) sayfp(it->fp); nin++; } } fclose(fin); if (1 == spew) fprintf(stderr, "\n"); if (spew) fprintf(stderr, "\n%d structures read in.\n", nin); /*** Create array of ITEM pointers for sorting. ***/ pita = (ITEM **) malloc(nin * sizeof(ITEM *)); for (i = 0, it = itlist; NULL != it; it = it->next) pita[i++] = it; /*** Create frequency fingerprint as an array of int's. ***/ if (spew) fprintf(stderr, "\nMeasuring bit frequencies.\n"); ffp = (int *) malloc(size * sizeof(int)); memset((char *) ffp, 0, size * sizeof(int)); /*** Add 'em up. ***/ for (it = itlist; NULL != it; it = it->next) { it->fps = my_stringvalue(&it->lenfps, it->fp); for (i = j = 0; i < it->lenfps; i++, j += 8) { k = it->fps[i]; if (k & 0x01) ffp[j ]++; if (k & 0x02) ffp[j+1]++; if (k & 0x04) ffp[j+2]++; if (k & 0x08) ffp[j+3]++; if (k & 0x10) ffp[j+4]++; if (k & 0x20) ffp[j+5]++; if (k & 0x40) ffp[j+6]++; if (k & 0x80) ffp[j+7]++; } } /*** Make modal fingerprint string or read it in if it exists from prev. run **/ if (2 == modal) { mf = fopen("modal.dat","r"); mfps = (char *) malloc(size); memset((char *) mfps, 0, size); fgets(modf, size, mf); for (i = j = 0; i < size/8; i++, j += 8) { k = modf[i]; if (k & 0x01) mfps[j ]++; if (k & 0x02) mfps[j+1]++; if (k & 0x04) mfps[j+2]++; if (k & 0x08) mfps[j+3]++; if (k & 0x10) mfps[j+4]++; if (k & 0x20) mfps[j+5]++; if (k & 0x40) mfps[j+6]++; if (k & 0x80) mfps[j+7]++; } fclose(mf); } else { mfps = (char *) malloc(size); memset((char *) mfps, 0, size); if (spew) fprintf(stderr, "\nCreating modal fingerprint.\n"); kk = nin * thresh; /* make sure value of kk is at least 2 (N.S. 1/6/95) */ if (kk <= 1) { kk = 2; thresh = 2.0/((float)nin); fprintf(stderr,"Threshold is too small reset to %f", thresh); } for (j = i = 0; i < size/8; i++) { if (ffp[j++] >= kk) mfps[i] |= 0x01; if (ffp[j++] >= kk) mfps[i] |= 0x02; if (ffp[j++] >= kk) mfps[i] |= 0x04; if (ffp[j++] >= kk) mfps[i] |= 0x08; if (ffp[j++] >= kk) mfps[i] |= 0x10; if (ffp[j++] >= kk) mfps[i] |= 0x20; if (ffp[j++] >= kk) mfps[i] |= 0x40; if (ffp[j++] >= kk) mfps[i] |= 0x80; } } /*** Make modal fingerprint object. ***/ mfp = dt_fp_allocfp(size); dt_setstringvalue(mfp, size / 8, mfps); if (2 == spew) { fprintf(stderr, "Modal fingerprint:\n"); sayfp(mfp); } /******* ADDING IN -m option. N.S. ****/ if (1 == modal) { fprintf(stderr, "\n%d structures read in.\n", nin); /* first deallocating all of info from first file */ for (i = 0; i < nin; i++) { it = pita[i]; dt_dealloc(it->mol); dt_dealloc(it->fp); free(ffp); free(it->fps); free(it->smi); free(it->nam); free(it); } free(pita); itlist = NULL; nin = 0; mf = fopen("second.smi","r"); fprintf(stderr,"opened second.smi\n"); while (fgets(smi, 1000, mf)) { if (NULL != (nl = strchr(smi, '\n'))) *nl = '\0'; if (NULL != (mol = dt_smilin(strlen(smi), smi))) { it = (ITEM *) malloc(sizeof(ITEM)); if (NULL == (sp = strchr(smi, ' '))) it->nam = strdup("no name"); else { *sp = '\0'; it->nam = strdup(sp + 1); } if (2 == spew) fprintf(stderr, "fingerprinting %s\n", it->nam); else if (spew) fprintf(stderr, "."); it->smi = strdup(smi); it->mol = mol; it->next = itlist; itlist = it; it->fp = dt_fp_generatefp(mol, 0, 7, size); if (spew) sayfp(it->fp); nin++; } } fclose(mf); if (1 == spew) fprintf(stderr, "\n"); if (spew) fprintf(stderr, "\n%d structures read in.\n", nin); /*** Create array of ITEM pointers for sorting second file. ***/ pita = (ITEM **) malloc(nin * sizeof(ITEM *)); for (i = 0, it = itlist; NULL != it; it = it->next) pita[i++] = it; /*** Create frequency fingerprint as an array of int's. ***/ if (spew) fprintf(stderr, "\nMeasuring bit frequencies.\n"); ffp = (int *) malloc(size * sizeof(int)); memset((char *) ffp, 0, size * sizeof(int)); /*** Add 'em up. ***/ for (it = itlist; NULL != it; it = it->next) { it->fps = my_stringvalue(&it->lenfps, it->fp); for (i = j = 0; i < it->lenfps; i++, j += 8) { k = it->fps[i]; if (k & 0x01) ffp[j ]++; if (k & 0x02) ffp[j+1]++; if (k & 0x04) ffp[j+2]++; if (k & 0x08) ffp[j+3]++; if (k & 0x10) ffp[j+4]++; if (k & 0x20) ffp[j+5]++; if (k & 0x40) ffp[j+6]++; if (k & 0x80) ffp[j+7]++; } } } /*** Loop over input. ***/ if (spew) fprintf(stderr, "\nSorting by similarity to modal fingerprint.\n"); /*** initiallizing the parameters used for threshold analysis ***/ maxfp = 0; minfp = FPSIZE + 1; tmp = 0; for (i = 0, it = itlist; NULL != it; it = it->next) { pita[i++] = it; it->sim = dt_fp_tanimoto(mfp, it->fp); it->mp = dt_fp_tversky(mfp, it->fp, 1.0, 0.0); /* keep track of the max and min bitcounts */ if (db == 0) { tmp = dt_fp_bitcount(it->fp); if (tmp <= minfp) { minfp = tmp; minnam = it->nam; } if (tmp >= maxfp) { maxfp = tmp; maxnam = it->nam; } } } /*** Sort by percent similarity to modal(mp) and overal similarity ***/ qsort(pita, nin, sizeof(ITEM *), mpcmp); if (spew) { fprintf(stderr, "\n" "=== Structures sorted by similarity to modal fingerprint ===\n" "\n" "No.| Tan | Name\n" "--- ------ ----------------------------------------------\n" ); for (i = 0; i < 50 && i < nin; i++) fprintf(stderr, "%2d: %6.4f %s\n", i + 1, pita[i]->sim, pita[i]->nam); fprintf(stderr, "---------------------------------------------------------\n"); } /* Opening ascii output file if -o option was set 2.3.95 N.S. */ if (ascii) { af = fopen("ascii.dat","w"); fprintf(af, "NAME\tTanimoto\tModal Percent \n"); } /** checking if this is a database comparison 5.15.95 N.S. **/ if (db) { af2 = fopen("hits.smi","w"); for (i = 0; i < nin; i++) { it = pita[i]; if (it->mp >= .5) { /* hits.smi output */ fprintf(af2, "%s %s\n",it->smi, it->nam); /* ascii.dat output ***/ fprintf(af, "%s\t%f\t%f\n",it->nam, it->sim, it->mp); } } fclose(af2); } else { /*** Loop over molecules, most similar first. ***/ for (i = 0; i < nin; i++) { it = pita[i]; /*** Write input smiles if dt_cansmiles() fails ***/ s = dt_cansmiles(&lens, it->mol, 0); if (s == NULL) printf("$SMI<%s>",it->smi); else printf("$SMI<%.*s>", lens, s); /*** write rest of data. ***/ printf("NAM<%s>MSIM<%.4f>MODP<%.4f>", it->nam, it->sim, it->mp); if (ascii) fprintf(af, "%s\t%f\t%f\n", it->nam, it->sim, it->mp); if (spew) fprintf(stderr, "%d: %s\n", i + 1, it->nam); /*** Count bits in fingerprint of molecule. ***/ bc = dt_fp_bitcount(it->fp); if (spew) { fprintf(stderr, "USMILES: "); putusmi(it->mol); fprintf(stderr, "\n"); fprintf(stderr, "molecule-based bitcount = %d\n", bc); } /*** Create substruct containing all atoms and bonds. This is not necessary (using the molecule would do) but it will be faster this way. ***/ subs = dt_alloc_substruct(it->mol); atoms = dt_stream(it->mol, TYP_ATOM); while (NULL_OB != (atom = dt_next(atoms))) dt_add(subs, atom); dt_dealloc(atoms); bonds = dt_stream(it->mol, TYP_BOND); while (NULL_OB != (bond = dt_next(bonds))) dt_add(subs, bond); dt_dealloc(bonds); /*** Loop over atoms in molecule in canonical order. Generate the atom contributions to the modal. ***/ printf("ALAB<"); atoms = dt_canatom_stream(it->mol, 0, 0); no = 0; while (NULL_OB != (atom = dt_next(atoms))) { tfp = dt_e_fp_partfp(atom, subs, 0, 7, size); /*** Compute metric. ***/ if (no) printf(","); stig = dt_fp_tversky(tfp, mfp, 1.0, 0.0); printf("%.3f", stig); if (spew) { putsymb(atom); fprintf(stderr, "-based stigma = %.4f\n", stig); } dt_dealloc(tfp); no++; } printf(">|\n"); dt_dealloc(atoms); dt_dealloc(subs); } } /* adding threshold value and mbc/maxfp and mbc/minfp to ascii output */ if (ascii && (db == 0)) { tmpf1 = dt_fp_bitcount(mfp); fprintf(af, "\n"); fprintf(af, " Threshold = %f\n", thresh); fprintf(af, " Percent of Maxfp = %f\n", (float) tmpf1/ (float) maxfp); fprintf(af, " Percent of Minfp = %f\n", (float) tmpf1/ (float) minfp); fprintf(af, " Maxfp Name = %s\n", maxnam); fprintf(af, " Minfp Name = %s\n", minnam); fclose(af); } /*** Tidy up. ***/ for (i = 0; i < nin; i++) { it = pita[i]; dt_dealloc(it->mol); dt_dealloc(it->fp); free(it->smi); free(it->nam); } dt_dealloc(mfp); fprintf(stderr, "handle count: %d\n", dt_vh_count()); free(it); free(ffp); free(pita); return (1); }