/* -*-C-*- */
 
/***********************************************************************
 * This source code is public domain, and may be freely distributed,   *
 * modified, and used for any purpose.  IT COMES WITH ABSOLUTELY NO    *
 * WARRANTY OF ANY KIND.                                               *
 ***********************************************************************/
 
/*======================================================================
 *  stigmata() -- program for computing common characteristics
 *
 * 
 *  Reference:  N.E. Shemetulskis, D. Weininger, C. J. Blankley, 
 *              J. J. Yang,  C. Humblet, "Stigmata:  An Algorithm to 
 *              Determine Structural Commonalities in Diverse Datasets",
 *              JCICS...
 =======================================================================
 * Author:  Dave Weininger
 * Modified by: Norah Shemetulskis
 ======================================================================= */

#include 
#include 
#include 
#include 
#include "dt_smiles.h"
#include "dt_smarts.h"
#include "dt_finger.h"

/*** Default parameters. ***/

#define THRESH    0.5
#define FPSIZE    2048

/*** A "pool" item. ***/

typedef struct _item {
  dt_Handle     mol;		/* the molecule */
  dt_Handle     fp;		/* normal molecular (fp) fingerprint */
  char         *fps;		/* string value of fingerprint fp */
  int           lenfps;		/* length of fps */
  char         *smi;		/* input SMILES */
  char         *nam;		/* input name */
  float         sim;		/* normal (sim) */
  float         mp;		/* percent of similarity to modal */
  struct _item *next;		/* pointer to next item in list */
} ITEM;

static int spew = 0;		/* debug level */
static int modal = 0;		/* modal already exists */
static int db = 0;		/* do db search */
static int ascii = 0;		/* do ascii output */
static float thresh = THRESH;
static int size = FPSIZE;

void putusmi(dt_Handle mol)
{
   char *s;
   int  lens;
   s = dt_cansmiles(&lens, mol, 0);
   fprintf(stderr, "%.*s", lens, s);
   return;
}

void putsymb(dt_Handle atom)
{
   char *s;
   int  lens;
   s = dt_symbol(&lens, atom);
   fprintf(stderr, "%.*s", lens, s);
   if (1 == lens) fprintf(stderr, " ");
   return;
}

void sayfp(dt_Handle fp)
{
   char *s;
   int   lens, i, j, k;

   s = dt_stringvalue(&lens, fp);
   for (i = 0; i < lens; i += 8) {
      for (j = 0; j < 8; j++) {
         k = s[i + j];
         fprintf(stderr, "%d", 0 != (k & 0x01));
         fprintf(stderr, "%d", 0 != (k & 0x02));
         fprintf(stderr, "%d", 0 != (k & 0x04));
         fprintf(stderr, "%d", 0 != (k & 0x08));
         fprintf(stderr, "%d", 0 != (k & 0x10));
         fprintf(stderr, "%d", 0 != (k & 0x20));
         fprintf(stderr, "%d", 0 != (k & 0x40));
         fprintf(stderr, "%d", 0 != (k & 0x80));
         fprintf(stderr, " ");
      }
      fprintf(stderr, "\n");
   }
   return;
}

/*======================================================================
 *  my_stringvalue() -- like dt_stringvalue(), but returns new string
 ====================================================================== */

static char *my_stringvalue(int *plen, dt_Handle ob)
{
   char *news;
   char *str = dt_stringvalue(plen, ob);
   if (NULL == str) return NULL;
   news = (char *)malloc(*plen);
   memcpy(news, str, *plen);
   return news;
}

/*======================================================================
 *  mpcmp() -- compare mp similarities of two ITEMs for sorting. 
 *             added 1/11/94 N.S. 
 ====================================================================== */

static int mpcmp(const void *pit1, const void *pit2)
{
   ITEM *it1 = *((ITEM **)pit1);
   ITEM *it2 = *((ITEM **)pit2);

   /*** Primary sort, based on mp ***/

   if (it1->mp < it2->mp) return (1);
   if (it1->mp > it2->mp) return (-1);

   /*** Secondary sort, mp's are the same ***/
   
   if (it1->sim < it2->sim) return (1);
   if (it1->sim > it2->sim) return (-1);

   return 0;
}

/*============================================================================
 *  options()
 */

static FILE *options(int *argc, char **argv)
{
   int     i, nerr;
   FILE   *fin;
   char   *fn = NULL;

   /*** Consume arguments. ***/

   for (nerr = 0, i = 1; i < *argc; i++)
     {
       if (0 == strcmp("-c", argv[i]))
	 {
	   if (0 == (size = atoi(argv[++i])))
	     nerr += fprintf(stderr,
			     "can't interpret -c size: %s\n", argv[i]);
	   else if (32 > size || 2048 < size)
	     nerr += fprintf(stderr,
			     "-c size must be 32-2048: %d\n", size);
	 }
       else if (0 == strcmp("-m", argv[i]))
	 modal=1;
       else if (0 == strcmp("-n", argv[i]))
	 {
	   db=1;
	   ascii=1;
	   modal=1;
	 }
       else if (0 == strcmp("-o", argv[i]))
	 ascii=1;
       else if (0 == strcmp("-v", argv[i]))
	 spew = 1;
       else if (0 == strcmp("-vv", argv[i]))
	 spew = 2;
       else if (0 == strcmp("-t", argv[i]))
	 {
	   if (0.0 == (thresh = atof(argv[++i])))
	     nerr += fprintf(stderr,
			     "can't interpret -t threshold: %s\n", argv[i]);
	   else if (0.0 > thresh || 1.0 < thresh)
	     nerr += fprintf(stderr,
			     "-t threshold value must be 0 < val <=1: %.4f\n",
			     thresh);
	 }
       else if ('-' == *argv[i])
	 nerr += fprintf(stderr, "unknown option: %s\n", argv[i]);
       else if (NULL != fn)
         nerr += fprintf(stderr, "only one filename allowed, 2nd is: %s\n",
			 argv[i]);
       else
         fn = argv[i];
       
       if (0 < nerr) break;
     }
   
   if (0 == nerr)
     {
       if (NULL == fn) 
         nerr += fprintf(stderr, "name of input SMILES file is required\n");
       else if (NULL == (fin = fopen(fn, "r")))
         nerr += fprintf(stderr, "can't open %s\n", fn);
     }

   if (0 < nerr)
     {
       fprintf(stderr, "usage: $ %s [ options ] file.smi\n", argv[0]);
       fprintf(stderr, "options:\n");
       fprintf(stderr, " -c size ...... Fingerprint generation size (%d)\n",
	       FPSIZE);
       fprintf(stderr, " -m ........... query structures in file "
	       "second.smi\n");
       fprintf(stderr, " -n ........... query structures in file "
	       "second.smi, ascii output only\n");
       fprintf(stderr, " -o ........... ASCII output (ascii.dat) containing "
	       "name, MODP, MSIM \n");
       fprintf(stderr, " -t thresh .... modal threshold (%.4f)\n", THRESH);
       exit(1);
     }
   return fin;
}


/*============================================================================
 *  main()
 */

main(int argc, char **argv)
{
  dt_Handle mol, mfp, atoms, atom, tfp, subs, bonds, bond;
  
  char *nl, *sp, *mfps, *s, *maxnam, *minnam;
  char	smi[1000], modf[2048];
  
  int  maxfp, minfp, tmp, tmpf1, i, j, k, kk, bc, nin, lens, no;
  int *ffp;
  
  float stig;
  
  FILE *mf, *af, *af2;
  ITEM *it, **pita;
  
  FILE *fin = NULL;
  ITEM *itlist = NULL;
  
  /*** 1/9/95 N.S. added percent to flag if kk<=1  ***/
  
  fin = options(&argc, argv);
  
  /*** Set fingerprint parameters. ***/
  
  if (spew)
    {
      fprintf(stderr, "\n%s -- characteristic attribute generation\n", *argv);
      fprintf(stderr, "\nPARAMETERS:\n");
      if      (0 == spew) fprintf(stderr, "   verbose: OFF\n");
      else if (1 == spew) fprintf(stderr, "   verbose: VERBOSE\n");
      else                fprintf(stderr, "   verbose: VERY VERBOSE\n");
      fprintf(stderr, "      size: %d\n", size);
      fprintf(stderr, " threshold: %.4f\n", thresh);
    }
  
  /*** Interpret input SMILES as molecules. ***/
  
  nin = 0;
  
  while (fgets(smi, 1000, fin))
    {
      if (NULL != (nl = strchr(smi, '\n')))
	*nl = '\0';
      if (NULL != (mol = dt_smilin(strlen(smi), smi)))
	{
	  it = (ITEM *) malloc(sizeof(ITEM));
	  if (NULL == (sp = strchr(smi, ' ')))
	    it->nam = strdup("no name");
	  else
	    {
	      *sp = '\0';
	      it->nam = strdup(sp + 1);
	    }
	  if (2 == spew)
	    fprintf(stderr, "fingerprinting %s\n", it->nam);
	  else if (spew)
	    fprintf(stderr, ".");
	  it->smi  = strdup(smi);
	  it->mol  = mol;
	  it->next = itlist;
	  itlist   = it;
	  it->fp   = dt_fp_generatefp(mol, 0, 7, size);
	  if (spew) sayfp(it->fp);
	  nin++;
	}
    }
  fclose(fin);
  if (1 == spew) fprintf(stderr, "\n");
  
  if (spew) fprintf(stderr, "\n%d structures read in.\n", nin);
   
  /*** Create array of ITEM pointers for sorting. ***/
  
  pita = (ITEM **) malloc(nin * sizeof(ITEM *));
  for (i = 0, it = itlist; NULL != it; it = it->next)
    pita[i++] = it;
  
  /*** Create frequency fingerprint as an array of int's. ***/
  
  if (spew) fprintf(stderr, "\nMeasuring bit frequencies.\n");
  
  ffp  = (int *) malloc(size * sizeof(int));
  memset((char *) ffp, 0, size * sizeof(int));
  
  /*** Add 'em up. ***/
  
  for (it = itlist; NULL != it; it = it->next)
    {
      it->fps = my_stringvalue(&it->lenfps, it->fp);
      for (i = j = 0; i < it->lenfps; i++, j += 8)
	{
	  k = it->fps[i];
	  if (k & 0x01) ffp[j  ]++;
	  if (k & 0x02) ffp[j+1]++;
	  if (k & 0x04) ffp[j+2]++;
	  if (k & 0x08) ffp[j+3]++;
	  if (k & 0x10) ffp[j+4]++;
	  if (k & 0x20) ffp[j+5]++;
	  if (k & 0x40) ffp[j+6]++;
	  if (k & 0x80) ffp[j+7]++;
	}
    }
  
  /*** Make modal fingerprint string or read it in if it exists from
    prev. run **/
  
  if (2 == modal)
    {
      mf = fopen("modal.dat","r"); 
      mfps = (char *) malloc(size);
      memset((char *) mfps, 0, size);
      fgets(modf, size, mf);
      for (i = j = 0; i < size/8; i++, j += 8)
	{
	  k = modf[i];
	  if (k & 0x01) mfps[j  ]++;
	  if (k & 0x02) mfps[j+1]++;
	  if (k & 0x04) mfps[j+2]++;
	  if (k & 0x08) mfps[j+3]++;
	  if (k & 0x10) mfps[j+4]++;
	  if (k & 0x20) mfps[j+5]++;
	  if (k & 0x40) mfps[j+6]++;
	  if (k & 0x80) mfps[j+7]++;
	}
      fclose(mf);
    }
  else
    { 
      mfps = (char *) malloc(size);
      memset((char *) mfps, 0, size);
      if (spew) fprintf(stderr, "\nCreating modal fingerprint.\n"); 
      kk = nin * thresh;
      /* make sure value of kk is at least 2 (N.S.  1/6/95) */
      if (kk <= 1)
	{
	  kk = 2;
	  thresh = 2.0/((float)nin);
	  fprintf(stderr,"Threshold is too small reset to %f", thresh); 
	} 
      
      for (j = i = 0; i < size/8; i++)
	{
	  if (ffp[j++] >= kk) mfps[i] |= 0x01;
	  if (ffp[j++] >= kk) mfps[i] |= 0x02;
	  if (ffp[j++] >= kk) mfps[i] |= 0x04;
	  if (ffp[j++] >= kk) mfps[i] |= 0x08;
	  if (ffp[j++] >= kk) mfps[i] |= 0x10;
	  if (ffp[j++] >= kk) mfps[i] |= 0x20;
	  if (ffp[j++] >= kk) mfps[i] |= 0x40;
	  if (ffp[j++] >= kk) mfps[i] |= 0x80;
	}
    }

  /*** Make modal fingerprint object. ***/

  mfp = dt_fp_allocfp(size);
  dt_setstringvalue(mfp, size / 8, mfps);
  if (2 == spew) { fprintf(stderr, "Modal fingerprint:\n"); sayfp(mfp); }
  
  /*******  ADDING IN -m option.  N.S. ****/
  
  if (1 == modal)
    {
      fprintf(stderr, "\n%d structures read in.\n", nin);
      /* first deallocating all  of info from first file */
      for (i = 0; i < nin; i++) 
	{
	  it = pita[i];
	  dt_dealloc(it->mol); 
	  dt_dealloc(it->fp); 
	  free(ffp);
	  free(it->fps);
	  free(it->smi);
	  free(it->nam);
	  free(it);
	}
      free(pita);
      itlist = NULL;
      
      nin = 0;
      mf = fopen("second.smi","r"); 
      fprintf(stderr,"opened second.smi\n");
      while (fgets(smi, 1000, mf))
	{
	  if (NULL != (nl = strchr(smi, '\n'))) *nl = '\0';
	  if (NULL != (mol = dt_smilin(strlen(smi), smi)))
	    {
	      it     = (ITEM *) malloc(sizeof(ITEM));
	      if (NULL == (sp = strchr(smi, ' ')))
		it->nam = strdup("no name");
	      else
		 {
		   *sp = '\0';
		   it->nam = strdup(sp + 1);
		 }
	       if (2 == spew)
		 fprintf(stderr, "fingerprinting %s\n", it->nam);
	       else if (spew)
		 fprintf(stderr, ".");
	       it->smi  = strdup(smi);
	       it->mol  = mol;
	       it->next = itlist;
	       itlist   = it;
	       it->fp   = dt_fp_generatefp(mol, 0, 7, size);
	       if (spew) sayfp(it->fp);
	       nin++;
	     }
	 }
       fclose(mf);
       if (1 == spew) fprintf(stderr, "\n");

       if (spew) fprintf(stderr, "\n%d structures read in.\n", nin);
       
       /*** Create array of ITEM pointers for sorting second file. ***/
       
       pita = (ITEM **) malloc(nin * sizeof(ITEM *));
       for (i = 0, it = itlist; NULL != it; it = it->next)
	 pita[i++] = it;
       /*** Create frequency fingerprint as an array of int's. ***/
       
       if (spew) fprintf(stderr, "\nMeasuring bit frequencies.\n");
       
       ffp  = (int *) malloc(size * sizeof(int));
       memset((char *) ffp, 0, size * sizeof(int));
       
       /*** Add 'em up. ***/
       
       for (it = itlist; NULL != it; it = it->next)
	 {
	   it->fps = my_stringvalue(&it->lenfps, it->fp);
	   for (i = j = 0; i < it->lenfps; i++, j += 8)
	     {
	       k = it->fps[i];
	       if (k & 0x01) ffp[j  ]++;
	       if (k & 0x02) ffp[j+1]++;
	       if (k & 0x04) ffp[j+2]++;
	       if (k & 0x08) ffp[j+3]++;
	       if (k & 0x10) ffp[j+4]++;
	       if (k & 0x20) ffp[j+5]++;
	       if (k & 0x40) ffp[j+6]++;
	       if (k & 0x80) ffp[j+7]++;
	       
	     }
	 }
     }
   /*** Loop over input. ***/
   
   if (spew) fprintf(stderr,
		     "\nSorting by similarity to modal fingerprint.\n");
   
   /*** initiallizing the parameters used for threshold analysis ***/
   
   maxfp = 0; 
   minfp = FPSIZE + 1;
   tmp = 0; 
   for (i = 0, it = itlist; NULL != it; it = it->next)
     {
       pita[i++] = it;
       it->sim   = dt_fp_tanimoto(mfp, it->fp); 
       it->mp    = dt_fp_tversky(mfp, it->fp, 1.0, 0.0);
       
       /* keep track of the max and min bitcounts */
       if (db == 0)
	 {
	   tmp = dt_fp_bitcount(it->fp);    
	   if (tmp <= minfp)
	     {  
	       minfp = tmp;
	       minnam = it->nam;  
	     }  
	   if (tmp >= maxfp)
	     {  
	       maxfp = tmp;  
	       maxnam = it->nam;
	     }  
	 }
     }
   
   /***  Sort by percent similarity to modal(mp) and overal similarity ***/
   
   qsort(pita, nin, sizeof(ITEM *), mpcmp);
   
   if (spew)
     {
       fprintf(stderr,  "\n"
	       "=== Structures sorted by similarity to modal fingerprint ===\n"
	       "\n"
	       "No.| Tan  | Name\n"
	       "--- ------ ----------------------------------------------\n"
	       );
       
       for (i = 0; i < 50 && i < nin; i++)
         fprintf(stderr, "%2d: %6.4f %s\n",
		 i + 1, pita[i]->sim, pita[i]->nam);
       fprintf(stderr,
	       "---------------------------------------------------------\n");
     }
   
   /* Opening ascii output file if -o option was set  2.3.95 N.S. */
   
   if (ascii)
     {
       af = fopen("ascii.dat","w");
       fprintf(af, "NAME\tTanimoto\tModal Percent \n");
     }
   
   /** checking if this is a database comparison 5.15.95 N.S. **/
   
   if (db)
     {
       af2 = fopen("hits.smi","w");
       for (i = 0; i < nin; i++)
	 {
	   it = pita[i];
	   if (it->mp >= .5)
	     {   
	       /* hits.smi output */
	       fprintf(af2, "%s %s\n",it->smi, it->nam);
	       /*  ascii.dat output ***/
	       fprintf(af, "%s\t%f\t%f\n",it->nam, it->sim, it->mp);
	     } 
	 }
       fclose(af2);
     }
   else
     {
       /*** Loop over molecules, most similar first. ***/
       
       for (i = 0; i < nin; i++)
	 {
	   it = pita[i];

	   /*** Write input smiles if dt_cansmiles() fails ***/
	   s = dt_cansmiles(&lens, it->mol, 0);
	   if (s == NULL)
	     printf("$SMI<%s>",it->smi);
	   else
	     printf("$SMI<%.*s>", lens, s);

	   /*** write rest of data. ***/
	   printf("NAM<%s>MSIM<%.4f>MODP<%.4f>", it->nam, it->sim, it->mp);  

	   if (ascii)
	     fprintf(af, "%s\t%f\t%f\n", it->nam, it->sim, it->mp);
	   
	   if (spew) fprintf(stderr, "%d: %s\n", i + 1, it->nam);

	   /*** Count bits in fingerprint of molecule. ***/
	   bc = dt_fp_bitcount(it->fp);

	   if (spew)
	     {
	       fprintf(stderr, "USMILES: ");
	       putusmi(it->mol);
	       fprintf(stderr, "\n");
	       fprintf(stderr, "molecule-based bitcount = %d\n", bc);
	     }
	   
	   /*** Create substruct containing all atoms and bonds.  This
	     is not necessary (using the molecule would do) but it will
	     be faster this way. ***/
	   
	   subs = dt_alloc_substruct(it->mol);
	   
	   atoms = dt_stream(it->mol, TYP_ATOM);
	   while (NULL_OB != (atom = dt_next(atoms)))
	     dt_add(subs, atom);
	   dt_dealloc(atoms);
	   
	   bonds = dt_stream(it->mol, TYP_BOND);
	   while (NULL_OB != (bond = dt_next(bonds)))
	     dt_add(subs, bond);
	   dt_dealloc(bonds);
	   
	   /*** Loop over atoms in molecule in canonical order.  Generate
	     the atom contributions to the modal. ***/
	   
	   printf("ALAB<");
	   atoms = dt_canatom_stream(it->mol, 0, 0);
	   no = 0;
	   while (NULL_OB != (atom = dt_next(atoms)))
	     {
	       tfp = dt_e_fp_partfp(atom, subs, 0, 7, size);
	       
	       /*** Compute metric. ***/
	       if (no) printf(",");
	       stig = dt_fp_tversky(tfp, mfp, 1.0, 0.0);
	       printf("%.3f", stig);
	       
	       if (spew)
		 {
		   putsymb(atom);
		   fprintf(stderr, "-based stigma = %.4f\n", stig);
		 }
	       
	       dt_dealloc(tfp);
	       no++;
	     }
	   printf(">|\n");
	   dt_dealloc(atoms);
	   dt_dealloc(subs);
	   
	 }	
     }

   /*  adding threshold value and mbc/maxfp and mbc/minfp to ascii output */

   if (ascii &&  (db == 0))
     {
       tmpf1 = dt_fp_bitcount(mfp);
       fprintf(af, "\n");
       fprintf(af, "  Threshold = %f\n", thresh);
       fprintf(af, "  Percent of Maxfp = %f\n", (float) tmpf1/ (float) maxfp);
       fprintf(af, "  Percent of Minfp = %f\n", (float) tmpf1/ (float) minfp);
       fprintf(af, "  Maxfp Name = %s\n", maxnam);
       fprintf(af, "  Minfp Name = %s\n", minnam);
       fclose(af); 
     }

   /*** Tidy up. ***/

  for (i = 0; i < nin; i++)
    {
      it = pita[i];
      dt_dealloc(it->mol);
      dt_dealloc(it->fp);
      free(it->smi);
      free(it->nam);
    }

  dt_dealloc(mfp);
  
  fprintf(stderr, "handle count: %d\n", dt_vh_count());
    
  free(it);
  free(ffp);
  free(pita);
  return (1);
}