SeqFuncs.c | searchcode

/code_comm/SeqFuncs.c

Large files files are truncated, but you can click here to view the full file

/* tacg - a command line tool for the analysis of nucleic acids and protein  */

/* Copyright  1994-2001 Harry J Mangalam, tacg Informatics

(mangalam@tacgi.com | mangalam@home.com | 949 856 2847) */



/* $Id: SeqFuncs.c,v 1.13 2001/11/14 01:18:09 hjm Exp $  */



/* The use of this software (except that by Harald T. Alvestrand, which is described in 'udping.c')

   and that by James Knight, which is described in 'seqio.c' is bound by the notice that appears 

   in the file 'tacg.h' which should accompany this file.  In the event that 'tacg.h' is not bundled 

   with this file, please contact the author.

*/



#include <stdio.h>

#include <ctype.h> 

#include <string.h>

#include <stdlib.h>

#include <sys/stat.h>

#include <unistd.h>

#include <regex.h>

  

#include "seqio.h"

#include "tacg.h" 



/* takes care of:

	- reading rules from the file (path validated via SearchPaths()), 

   - parsing the file into the Rules struct

   - validating that the rules are formed correctly (parens match up, logicals are correct)

   - making sure that each of the subpattern names has been loaded in RE (have to hash the

   	names as in satisfying the -x flag in ReadEnz().

   - eventually, create --rules 'thisrule,thatrule,mars,promoter, etc'

     (can be specified w/o --rulefile in which case it will read the deafult 'rules.data'.)

     as opposed to current --rule for specifying a rule at a time from the commandline.

     (which should write out that rule to the tacg.patterns file for incoporation into the 

     main rules file.  require that it be named as well. 

     ie: --rule 'Name,((()()())(())),width'

     

     - open the file, getting path name from F.RuleFile

     - read in the strings, concatting as we go, then validate the logic 

         and stuff all the bits into the Rule_struct

     - at each iter, make sure there's enough mem

     - when finished with the file read, and all the bits are in Rule_struct, go thru

     		Rules[].Name and check against the names in RE[].E_nam.

     - make 

 */    

int tacg_Eval_Rules_from_File(struct SE_struct SelEnz[MAX_SEL_ENZ],  char datestamp[80]) {

                              /*   long ProtoNameHash[], int NumREs, */

	

   FILE *fpRF; 

   char junk[10], *ctemp1, ct, *rulestring, *packed_RS, *rstr;

   short NameCheck[RE_NAME_TAB_SIZ]; /* indices from realhash(PatName) to check against multiple identicals  */

   int SEi=0;

   long j, EOpRS=0,

        window_size = 0,

        NumRules=0, 

   	  rstr_len, linenum=0,

        RS_Cnt = 0, /* RuleStruct Counter */

        EOAM_RS = INIT_NUM_RULES; /* End Of Allocated Memory for the Rule_Struct; allocated in tacg */

        

   short cont = 0; /* continue indicator */



   /* alloc the Rule_struct Rules */

   if ((Rules = calloc(EOAM_RS, sizeof(*Rules))) == NULL) {

      BadMem("Can't init mem for Rules", 1);

   }



	if ((ctemp1 = calloc(512,sizeof(char))) == NULL) { BadMem("Can't init mem for 'ctemp1'", 1); }

	if ((rulestring = calloc(MAX_LEN_RULESTRING,sizeof(char))) == NULL) { BadMem("Can't init mem for 'rulestring'", 1); }

	if ((packed_RS = calloc(MAX_LEN_RULESTRING,sizeof(char))) == NULL) { BadMem("Can't init mem for 'packed_RS'", 1); }

	if ((rstr = calloc(MAX_LEN_RULESTRING,sizeof(char))) == NULL) { BadMem("Can't init mem for 'rstr'", 1); }

   memset(NameCheck,0, sizeof(short)*RE_NAME_TAB_SIZ); /* make sure everything is set to 0 */

   

   if ((fpRF = fopen(F.RuleFile,"r")) == NULL) {  /* or the standard/optional one from SetFlags() */

   	fprintf(stderr,"Cannot open the Rules file \"%s\" for reading!!\n", F.RuleFile); /* DON'T HIDE behind -V */

      exit(1); /* print an error and die gracefully */

   } else if (F.Verbose > 0) { fprintf(stderr,"Rule File %s opened OK for reading\n", F.RuleFile); }

   

	/*  Scan thru comments in rebase file to separator ("..")   */

   junk[0] = 'k';   /* make sure the strcmp below fails the first time */

   while ((feof(fpRF) == 0) && (strncmp(junk, "..",2) != 0))  { 

      fscanf (fpRF, "%2s", junk); ct = 'm'; /* and read to end of line - this needs to be able to deal */

      while (ct != '\n' && ct != '\r' && (feof(fpRF) == 0)) {  /* with end of line conditions for Mac, Win, and unix - what have I missed? */

         ct = fgetc(fpRF); linenum++;

      }

   }



/* we're at "..", so go into file and start slurping directly data into struct

   except for RE_rawsite which has to be filtered a couple ways before being acceptable */

   

   ct = 'm';

   while (fgets(rstr, 512, fpRF) && (NumRules <= MAX_NUM_RULES)) { 

      /* check the size of the rule struct to see if it's going to run into the end of the alloc'ed mem */

      if (RS_Cnt == EOAM_RS) { /* then we need to alloc more mem for the Rules struct */

          EOAM_RS +=  RULES_INCR;

          if ((Rules = realloc(Rules, sizeof(*Rules)*EOAM_RS)) == NULL) {

            BadMem("Failed to get more mem for Rules.\n", 1);

          }  

      } 

      

      /* now the whole line's in rstr; use strsep to parse it, make decisions re: parsing, skipping */

      j = stripwhitespace(rstr,0,0);

      if (rstr[0] != ';'  && (strlen(rstr) != 0)) {  /* if the line hasn't been commented out, get all the bits */

         /* REMEMBER!  strsep chews the string to pieces and changes the start of the string */

      if (j>0 && F.Verbose > 0) {fprintf(stderr,"DEBUG: stripped %ld whitespace characters from RuleName\n", j);} 

         

         strcpy(packed_RS, rstr);  /* copy over to pass to tacg_SlidingWindow - may not be needed */

         



         /* now have to break on ',' to get all the rulestring in 1 shot 

         	- also have to detect if the last char is a continuation char '\' */

            

         /* if last char is '\', then rulestring is incomplete and there won't be a window term  */

         cont = 0;

         EOpRS = strlen(packed_RS) - 1; /* */

         if (packed_RS[EOpRS] == '\\') { /* then there's a continuation char */

         	cont = 1; 

            /* keep on going until there's no more continuation lines */

            while ((cont == 1) && fgets(rstr, 512, fpRF)) {  /*     */

               j = stripwhitespace(rstr,0,0); /* no whitespaces, eols */

            	rstr_len = strlen(rstr); /* length excluding terminating \n */

               EOpRS = strlen(packed_RS);

               if (EOpRS + rstr_len >= MAX_LEN_RULESTRING) {

                  fprintf(stderr, "Total length of the rulestring exceeded; change MAX_LEN_RULESTRING \n"

                  "or check your continuation characters for the rule %s\n", ctemp1);

                  exit (1);

               } 

               /* rstr should be the right size to copy mod the possible final \ and \n */

            	strcpy(packed_RS+EOpRS-1,rstr); /* copy all but the last char over */



/*                fgets(rstr, 512, fpRF); */



               

               fprintf(stderr, "\nend char = %s\n", rstr+(strlen(rstr)-3) );

               if (rstr[(strlen(rstr)-1)] == '\\' ){ cont = 1;}   

               else                                { cont = 0;}   

            }

            fprintf(stderr, "\n\n\npacked_RS = %s\n\n", packed_RS);

	         if ((rstr = calloc(MAX_LEN_RULESTRING,sizeof(char))) == NULL) { BadMem("Can't init mem for 'rstr'", 1); }

            strcpy(rstr,packed_RS);   

            

         }  

         /* coming out of the continuation loop, packed_RS contains the entire Name,RuleString,Window concat

            that the whole thing should look like. rstr has only the last line of a continued rule, so it needs to 

            be synced, so copy packed_RS to rstr again */

                                 

         ctemp1 = strsep(&rstr, ","); /* using ctemp1 (=Name) for congruence with previous code  */

         if (strlen(ctemp1) > 10) {

            fprintf(stderr, "Name %s is longer than allowed (%d) - check it!\n", ctemp1, MAX_PAT_NAME_LEN);

            exit(1);

         } 

            

            

         /* so now packed_RS has the whole packed rulestring in it at this point; rstr has everything 

            BUT the RuleName in it. So now we finish off the line, so break on ',' to get the rulestring

            and ONLY THEN break on the ',' to separate the rule term from the sliding window term 

            (since already have processed the Name) */  

         if (strlen(rstr) >= MAX_LEN_RULESTRING) {

            fprintf(stderr, "Total length of the rulestring exceeded; change MAX_LEN_RULESTRING \n"

                            "or check your continuation characters for the rule %s\n", ctemp1);

            exit (1);

         }

         

         /* break on the comma so we can ignore the window term - this probably needs some more error-checking */

         memset(rulestring, 0, MAX_LEN_RULESTRING);

         rulestring = strsep(&rstr,",");  /* rulestring now contains ONLY the packed rulestring */

         window_size = (long)atoi(strsep(&rstr,","));



         if (window_size < 1) {  /* now test whether windowsize & rulestring are valid  */

            fprintf (stderr, "Window size for %s is less than 1, which is silly.\n", ctemp1);

            exit (1);

         }

         /* Are the component pattern Names valid? ie is this name represented in RE by being loaded 

            from the RE database?  need to hash the Names of the individual patterns and see if they 

            hit any of the Names that were loaded from whatever rebase was being used.

            Soooooo, go thru the rulestring and hash all words breaking on ()|^& to cut out the 'Name:min:Max' stanzas */

         tacg_Fill_SelEnz_fr_RuleString(SelEnz, &SEi, rulestring, NameCheck, ctemp1);

         





         /* stuff ALL the bits (Name, rulestring, window_size) into the struct (the validity of the rule itself 

            tested in the evaluation code) */

         strcpy(Rules[RS_Cnt].Name, ctemp1);

         if ((Rules[RS_Cnt].RuleString = calloc((strlen(rulestring)+1),sizeof(char))) == NULL) { 

            BadMem("Can't init mem for 'Rules[RS_Cnt].RuleString'", 1); 

         }

         strcpy(Rules[RS_Cnt].RuleString, rulestring);

         Rules[RS_Cnt].Window = window_size;         



         if ((rstr = calloc(MAX_LEN_RULESTRING,sizeof(char))) == NULL) { BadMem("Can't init mem for 'rstr'", 1); }

         RS_Cnt++;

		} else {

/*          fprintf(stderr, "\nDEBUG: line ignored: %s\n", rstr); */

      }

   }

   F.Xplicit = -1; /* notify ReadEnz */

   F.Rules = RS_Cnt-1; /* does this have to be decr by one 1st?? */

   return SEi;

}





/* ***************************************************************************************************** */





/* tacg_Fill_SelEnz_fr_RuleString() takes the SelEnz sruct and the rulestring (and a few other housekeeping vars 

   and parses the rulestring passed to it to populate SelEnz and do a namecheck to make sure that the names being 

   parsed out of the rulestring haven't been seen before and aren't otherwise objectionable      

   takes as args:

       SelEnz, 

       the starting SEi (as an int * as it's changed in the fn())

       the rulestring (copied in the function so that the original rulestring is untouched)

       NameCheck (has to survive fn() to check for multiple identicals over multiple rulestrings,

          as well as to check for multiple identicals in the same rulestring (A&B|A&C)

       ctemp1 - Name of the Rule (for error messages)

    and back comes 

       the (more) populated SelEnz

       and the ending SEi.  

*/



void tacg_Fill_SelEnz_fr_RuleString(struct SE_struct SelEnz[MAX_SEL_ENZ], int *SEi, 

            char *rulestring, short NameCheck[RE_NAME_TAB_SIZ],  char *ctemp1) {

   char *Name_mM, *rstr, *PatName = "            ",

   *amin = "           ", 

   *aMax = "           ";

   

   int ind;

 	if ((Name_mM = calloc(33,sizeof(char))) == NULL) { BadMem("Can't init mem for 'Name_mM'", 1); } 

	if ((rstr = calloc(MAX_LEN_RULESTRING,sizeof(char))) == NULL) { BadMem("Can't init mem for 'rstr'", 1); }

   strcpy(rstr,rulestring); /* re-using rstr; mem should be fine after re-calloc above */

   Name_mM[0] = '\0'; /*  to pass on 1st pass */

   while (Name_mM[0] == '\0') { 

   Name_mM = strsep(&rstr, "()|^&"); } /* this should break out a 'Name:#:#' set */



   while (Name_mM != 0L) {

      if (Name_mM[0] != '\0') {

         PatName = strsep(&Name_mM, ":");

         ind = (int) realhash(PatName, RE_NAME_TAB_SIZ);

/*          fprintf(stderr, "DEBUG: realhash '%s' = %d\n", PatName, ind); */

         if (strlen(PatName) < 11) {

            if (NameCheck[ind] == 0) { 

               strcpy(SelEnz[*SEi].PName,PatName);

               NameCheck[ind] = 1; /* now mark it to make sure that we don't take it again */

               if ((amin = strsep(&Name_mM, ":")) == '\0') { 

                 SelEnz[*SEi].min =   0; /* RE[ii].min =  */

               } else { 

                  SelEnz[*SEi].min =  atoi(amin); /* RE[ii].min =  */

               }

               if ((aMax = strsep(&Name_mM, ":")) == '\0') { 

                  SelEnz[*SEi].Max = 32000; /* RE[ii].Max = */

               } else { 

                  SelEnz[*SEi].Max =  atoi(aMax); /* RE[ii].Max =   */

               }

               (*SEi)++; 



            } else { /* else it's already been seen in another rule or sub-rule and we don't care */

/*                fprintf(stderr, "DEBUG: The pattern named %s has already been seen previously; continuing .. \n", PatName); */

            }

         } else { 

            fprintf (stderr, "\nIn Rule named %s, Pattern Name %s is too long (must be <=10 chars)\n", 

            ctemp1, PatName); 

            exit(1);

         }

      }

      Name_mM = strsep(&rstr, "()|^&");  /* get the next one */

   }/* @ end of loop, all constituent patterns are loaded into SelEnz or the app has exited on an err */

}





/* ***************************************************************************************************** */











/* RE_Filter returns 1 if the RE indexed by Proto[REi] matches the tests for

	Magnitude

   Number of Cuts

   Cost

   Overlap

   etc that are can be used to fine-tune the selection of REs */

int RE_Filters_OK(int REi) {  /* REi should be the ProtoCutters index, as that is the only RE el that we have to check */

	int OK=0;

/*    fprintf(stdout, "\n REi = %d, RE name = %s \n", REi, RE[REi].E_nam); */

   if (((RE[REi].E_mag   >= (int)F.Mag)       &&    /* E_mag must be >= to -n flag */

        (REi >= F.RE_ST)                              &&    /* Don't EVER want dam/dcm sites printed */

        (RE[REi].E_Ncuts >= (int)F.Min)       &&    /* Ncuts >= -m, */

        (RE[REi].E_Ncuts <= (int)F.Max)       &&    /* Ncuts <= -M */

        (RE[REi].Cost    >= (int)F.Cost)      && 

/* the dam/dcm part of this COULD be rolled in if the RWC (aka base_real_pos) was included as an optional 

	argument (or if -1, ignore it), but right now it doesn't seem to be used in the same way, so I'll let

   the code stay but commented out for now */

/*        (DamDcmOK(base_real_pos, REi) == 1)   &&     print only RE's !masked by dam/dcm methylases */

        

    (F.Overhang == 1))      || /* if want all, only 1st has to be true to short circuit */

       ((RE[REi].E_olap > 0) && (F.Overhang == 5)) || /* if olap < 0 && overhang = 5' */

       ((RE[REi].E_olap < 0) && (F.Overhang == 3)) || /* if olap > 0 && overhang = 3' */

       ((RE[REi].E_olap ==0) && (F.Overhang == 0)))  /* if olap = 0 && overhang = blunt */ 

	   { OK = 1; } 

   else { OK = 0; }

   return OK;

}



/* if either the --dam or --dcm flags have been used, DamDcmOK() checks to see 

	if the REi under consideration should be printed - has it been masked or is 

   it OK to print it.  What it does: passed the RE index of the enzyme, it

   checks to see if that RE is possibly affected and if so, whether the 'raw'

   hit recorded in DD.Dat is present in RE[].Sites.  The index to the

   last-checked position in RE[].Sites is kept in: 

   RE[].Sites[RE[].Sites[0]]       as 'RE[].Sites[0]' points to the end of the array.

   In this way, only have to incr by 1 to get to the next check point.

   RWC = Real World Coordinates

   REi = RE index

   Hmmm - this could be the prototype for an overall RE filter to be used in a number of cases..

*/



/* Clone() goes thru RE[] and figures out which REs match the conditions of the option flag, where

   --clone '#_#,#x#,#x#,#_#' (up to MAX_NUM_CLONE_RANGES).  '_' indicates that the range cannot be cut;

   'x' indicates that it can be cut.  

   There will probably be some interactions between --clone and other options, but

   for now, finish the basic functionality and sort out the interactions later.  In fact,

   RE[].. should not be a problem as it will be populated only with sites that are requested, 

   so that if only a subset or special file of REs are used, only they will be used for 

   for the clone() analysis.

   This fn() is called AFTER the digest is done in the normal way and should only 

   need the number of REs (NumREs - not modified by this so it doesn;t ave to be 

   passed as a pointer) as a max to go thru to determine the ranges that agree with the 

   ranges specified.

*/

/* should really be running thru Protos[] from 0->ProtoCutters */

void Clone(int ProtoCutters, int *Protos) {

	long i, c, j, m, p, r, 

   Best[F.Clone+2][ProtoCutters], 	/* Best = array that stores and sorts the best REs for a set of cloning criteria

   												last el is for storing summary truths to check if ALL rules have been met */

   BadProtos[ProtoCutters];  	/* BadProtos keep track of Protos that map inside a forbidden zone and therefore have to 

   										be removed from contention after the fact.  If a Protos index shows up in BadProtos,

                                 it cannot be a possible cloning RE (unless cloner wants to try a partial digest.. */

   short N_Crits;

   int LadderProtos[ProtoCutters];

   

   N_Crits =  F.Clone; /*  simplicity; F.Clone stores the # of ranges in the --clone option string */ 

   for (i=0; i<= F.Clone; i++) {

   	memset(Best[i],0,sizeof(int)*ProtoCutters);

   }

   memset(BadProtos,0,sizeof(long)*ProtoCutters);

   memset(LadderProtos,0,sizeof(int)*ProtoCutters);

   

   /* Need additional rules to elim REs.  

   	If an RE hits in the forbidden zone, that RE is then added to a FORBIDDEN array to be checked post-processing.

   	now #x# = MUST HIT range  Should there be an option to set prefs for this? Or just show how many

      	there are in that zone?

      How should REs that hit in zones NOT explicitly marked as allowed (x) or forbidden (_) be treated?  In typical 

      	cloning, don't want lots of extra cuts as it increases ends to contend with.   

   */

   

   for (r=0; r<=N_Crits; r++) { /* for each range parsed out in SetFlags() */

   	for (i=0; i<ProtoCutters; i++) {

	      p = Protos[i]; /* p is the alias for the RE prototype index, so RE[p].etc will hit only prototypes */

         if (Clone_S[r].to_cut == 1) { /* if this is a range where we REQUIRE hits... */  

            m = Clone_S[r].matching_REs[1]; /* [1] is the 1st free position of the array */

				for (j=1; j<RE[p].Sites[0]; j++) { /* RE[p].Sites[0] holds the index of the 1st FREE position in the Sites array */

               if (RE[p].Sites[j] >= Clone_S[r].begin &&     /* site must match BOTH conditions */

                   RE[p].Sites[j] <= Clone_S[r].end ) {

                 	/* do we need a per-range log for positives & negatives? */

               	Clone_S[r].matching_REs[m++] = i;  		/* incr m, store the Proto index */

                  Clone_S[r].matching_REs[1] = m;      	/* keep track of the 1st free el as well */

                  /* and check for realloc conditions */

                  if ((m+1) == Clone_S[r].matching_REs[0]) { realloc_Clone_matching_REs(m, r); }

               } /* else we silently ignore the bad hits */

            }

            Clone_S[r].matching_REs[1] = m; /* and then store the position back into the array */

         } else { /* this is a range we where we will NOT TOLERATE hits */

            m = Clone_S[r].matching_REs[1]; /* [1] is the 1st free position of the array  */

            for (j=1; j<RE[p].Sites[0]; j++) { /* RE[p].Sites[0] holds the index of the 1st FREE position in the Sites array */

            	if (RE[p].Sites[j] <= Clone_S[r].begin ||      /* site can match either condition */

                   RE[p].Sites[j] >= Clone_S[r].end ) {

               	Clone_S[r].matching_REs[m++] = i;  /* incr m, store the Proto index */

                  Clone_S[r].matching_REs[1] = m;            /* keep track of the 1st free el as well */

                  /* and check for realloc conditions  */

                  if ((m+1) == Clone_S[r].matching_REs[0]) {realloc_Clone_matching_REs(m, r); }

               } else { /* else we have to track the REs to mask out the results by adding their index to BadProtos[] */

               	BadProtos[i]++; /* if an el is > 0, it's EVIL */

               }

            }

            Clone_S[r].matching_REs[1] = m; /* and then store the position back into the array */                     	

         }

      }

   }

   

      /* now we've run thru all the criteria for judging whether the REs map into the ranges 

   	now have to output the data in some usable form.  1st, are there any REs that match ALL requirements?

      2nd output all ranges with all matching REs, sorted alphabetically.  Should --clone automatically set

      --sites, so can output all the sites at the same time??  Hmmm...

   */

   /* 1st are there any that match ALL crits? for all crits, are there any indices that appear in all of them??  

   	set up array of length N_Crits and increment each RE index as go thru all the ranges.  Also has the effect of

      being able to sort those REs that match the MOST crits as well. */

   for (i=0; i<=N_Crits; i++) {   /*  cycle thru all the crits */

   	for (j=2; j<Clone_S[i].matching_REs[1]; j++) { /* cycle thru all the prototype REs that match for each crit */

      	Best[i][Clone_S[i].matching_REs[j]]++; 	/* this will count the # of times a proto came up positive in the criteria  */

      }

   } /*  so now have all the REs counted. */

   

   /* and print the results in some kind of visually appealing way. */

   /* 1st print out any REs that met ALL crits.. */

  	fprintf(stdout,"\n\n== Here's the Clone output for the %d Criteria:\n", (F.Clone+1));

	for (i=0; i<=F.Clone; i++) { fprintf(stdout, "    Rule %ld:  %s\n", i+1, Clone_S[i].range_rule); }



   for (j=0; j <= F.Clone; j++) {  /* sum the rules results to generate a summary */

   	for (i=0; i<ProtoCutters; i++) {

   	   if (Best[j][i] > 0 && BadProtos[i] == 0) {

         	Best[F.Clone+1][i]++; /* for each rule passed, incr the truth element once */

         }

      }

   }

   /* and finally print out the Protos that match ALL the rules BUT this should probably be mod to allow selection and

   	filtering by the standard rules -o -n -m -M --cost, etc but later... this will require a test fn() that tests all the

      Protos against all the various restrictions, but it will be applicable to at least 4 cases */

   fprintf(stdout,"\nREs that met ALL criteria:\n");

/* 

	for (i=0; i<ProtoCutters; i++) {   

      if (Best[F.Clone+1][i] == F.Clone+1) {

   	   fprintf(stdout,"\n  RE[Protos[i]].E_nam);

      }

   } 

*/

   i = c = m = 0;

   while (i<ProtoCutters) {

      if (m >= F.Width) { m = 0; fprintf(stdout, "\n"); }

   	if (Best[F.Clone+1][i] == F.Clone+1 ) { 

   		fprintf(stdout,"%10s", RE[Protos[i]].E_nam);

         LadderProtos[i] = Protos[i];

         c++; m += 10;

      }

      i++;

   } 

   if (c == 0) { fprintf(stdout,"There were no REs that met ALL criteria\n"); }

	else {

   	PrintGelLadderMap(i, F.Seq_Len, LadderProtos, 2);

   }

   memset(LadderProtos,0,sizeof(int)*ProtoCutters);

   

   fprintf(stdout,"\n\nResults (listed by Rules) that met SOME criteria & did not violate a hits-forbidden range.");

   for (j=0; j<=F.Clone; j++) {

   	fprintf(stdout, "\n\n--- For Rule %ld (%s), the following REs matched:\n\n", j+1, Clone_S[j].range_rule);

   	i = c = m = 0;

      while (i<ProtoCutters) {

      	if (m >= F.Width) { m = 0; fprintf(stdout, "\n"); }

   	   if (Best[j][i] > 0 && BadProtos[i] == 0) { 

   		   fprintf(stdout,"%10s", RE[Protos[i]].E_nam);

            LadderProtos[i] = Protos[i];

            c++; m += 10; 

         }

      	i++;

      }

      if (c == 0) { fprintf(stdout,"There were no REs that met EVEN SOME of the criteria\n"); }

	   else {

   	   PrintGelLadderMap(i, F.Seq_Len, LadderProtos, 2);

      }

      memset(LadderProtos,0,sizeof(int)*ProtoCutters);

   }

   

   fprintf(stdout,"\n\n--- Results (listed by RE) that met SOME criteria & did not violate a hits-forbidden range\n");

   fprintf(stdout,"\n   RE Name");

   for (j=0; j<=F.Clone; j++) { fprintf(stdout,"  Rule %ld", j+1); }

   fprintf(stdout,"\n");

   for (i=0; i<ProtoCutters; i++) {

   	if (BadProtos[i] == 0) { 

   	   fprintf(stdout,"\n%10s", RE[Protos[i]].E_nam);

   	   for (j=0; j<=F.Clone; j++) {

   	      if (Best[j][i] > 0) { 

   		      fprintf(stdout,"    X   ");

            } else {

            	fprintf(stdout,"        ");

            }

         }

      }

   }

   fprintf(stdout,"\n\n");

};





void realloc_Clone_matching_REs(long current, long index) {

	long newsize;

   /* and check for realloc conditions */

   if ((current+1) == Clone_S[index].matching_REs[0]) { /* if we're nearly up to the alloc'ed mem, need to realloc */

      newsize = Clone_S[index].matching_REs[0] + CLONE_INCR_MRE; /* this is how big the new size should be  */

      if ((Clone_S[index].matching_REs = (long *) realloc(Clone_S[index].matching_REs, sizeof(long) * newsize)) == NULL) {

         BadMem("Can't realloc mem for Clone_S[].matching_REs", 1); 

      } else { Clone_S[index].matching_REs[0] = newsize; }   /* new alloc range */

   }

};

   



/* the example function included below is a mock-up to show how to include your own function.

	Obviously, there are 10s of functions included with this source code and some of them may be

   immediately applicable to your work.  Feel free to bash them into a useful state for yourself.

   If you think that you've created a widely useful function that you would like to have included 

   with tacg, please give me a shout, and I'll include a pointer to it or provide other information

   about it on my web site.	  

   

   void Example(void){



	}   

*/





/* LinearMap() is simply a functionization of the Linear Mapping chunk that

	should have been functionized long ago */

   

void LinearMap(long seq_len, int basesPerLine, int NumREs, char *sequence, int max_okline, 

				char Codons[][][], int codon_Table, int *Protos) {



	/* Declarations */

   int i, d, k, mm=0, rc, n_letters=0, Xn=0, okline, HTML, spacer, spacermod, stanzaspace,

   	   min_okline, tic_line, Cur_RE, block_Cnt, block_repeat, block_cut_pos, lastline = O_SEQ_LINE+2, hits,

   		ok2wr[MAX_OUTPUT_LINES], p_width, abcp, in_OL=-1, seq_offset, tripclick=0;



   long  *BackCheck, /* BackCheck[] checks proto entries in the Linear map output to remove duplicates */

         DCnt, Cnt_reset=0, base_real_pos, numstart; 

         					

   char  ctemp1[30], *s,  Prot_out[MAX_BASES_PER_LINE],

   		O_txt[MAX_OUTPUT_LINES][O_LMARG+O_RMARG+MAX_BASES_PER_LINE];

   

/*    char Codons[MAX_ORGS][N_CODONS][7]; */

   char os = '~';  /* started out as '<' to mark other strand, but causes problems with HTML generation */



   BackCheck = (long *)calloc((size_t)NumREs+2, sizeof(long));  /* Needs to be '+2' as combination cut will add one and 1 extra */

   if (BackCheck == NULL) BadMem("Can't init mem for BackCheck", 1);



   s = (char *) calloc(256, sizeof(char)); 

   if (s == NULL) BadMem("failed to calloc mem for 's'", 0);

	memset(O_txt,' ',(size_t)((O_LMARG+O_RMARG+MAX_BASES_PER_LINE)*MAX_OUTPUT_LINES));  /* set all of O_txt to blanks - works!*/

	p_width = O_LMARG + O_RMARG + basesPerLine;	  

   

	/* Routine to print out the text of the digestion, with names going where they should go

	a la Strider - have to keep track of how long the names are and where an allowable place 

	to write the name - */



	/* In brief, the following function, does this: 

	It reads thru DD->Dat (the array that keeps track of the linear sequence of cuts in the sequence) 

	and matches the position of the enzymes to the current block being printed, taking into account 

	the cut offset of the RE from the 1st base of the recognition sequence.  The tricky part is that 

	because of this latter functionality, it has to check the sequence that will be the *following* 

	block to see if there are any RE recognition seqs within the 1st few bases that will cause the actual 

	cut site to be in the *previous* block.  Doing this seems to take a great amount of the cpu time - can 

	probably be optimized a good deal more, but it's still >10 faster than GCG ;). */

	

   HTML = F.HTML;         /* simplicity */

   n_letters = F.Lab1or3; /* simplicity */

   Xn = F.XLinFr;         /* simplicity */

   numstart = F.NumStart;  /* simplicity */

   

	fprintf(stdout, "\n\n");

	if (HTML > -1) fprintf(stdout, "<H3><A NAME=\"linmap\">");

	fprintf(stdout, "== Linear Map of Sequence:\n"); /* Write the header */ 

	if (HTML > -1) fprintf(stdout, "</A></H3>\n");



	/* inits, etc */ 

	if (seq_len % basesPerLine == 0)  block_repeat = (int) seq_len/basesPerLine; /*  # times thru the compose/print cycle */

	else  block_repeat = (int) seq_len/basesPerLine + 1;  

	DCnt=2;  /* start the DD->Dat counter at the beginning of REAL DATA [0]=Cnt; [1]=Siz */

	base_real_pos = DD->Dat[DCnt];

  

  /* spacer dictates the line spacing between the 1st 3 and 2nd 3 translation stanzas; if 6 there

      will be no space between the 2 stanzas, if 7 there will be 1 space; could make this another

      option but sheesh... */

  spacer = 7;  spacermod = 0;

  if (F.Strands == 1) { spacermod++; }   

  if (F.Tics == 0) { spacermod++;}

  spacer = spacer - spacermod;

  if (spacer == 7) stanzaspace = spacermod + 1; 



	for (block_Cnt=1; block_Cnt <= block_repeat; block_Cnt++) {

      /* hits is set to 1 if there are hits on the line; if no hits, decr the top buffer line to output 

      	so that there is a constant 1 blank line spacer between stanzas */

   	hits = -1;  

		/* if there was an OVERLAP failure previously, fix it */ 

		if (in_OL == 1) {

         memset(BackCheck,0,sizeof(long)*(NumREs+2)); /* clear BackCheck[] as it's filled with stale data */

         DCnt = Cnt_reset;  /* and reset DCnt to that pointer */ 

		}

      in_OL = -1;  /* and reset 'in OVERLAP' marker for new block */

		base_real_pos = DD->Dat[DCnt];  /* Make sure that ' base_real_pos' is set */

 		okline = min_okline = O_SEQ_LINE-2; /* set the min (highest on page) line of buffer to print out */



		/* Set up the seq and its rev compl in the output buffer */

		memset(ok2wr,0,sizeof(int)*MAX_OUTPUT_LINES);  /* set/reset all of ok2wr to 0's for the new block*/

		/* next line was deleted in a brief fork of tacg.c - note this if get unexlained segfs */

      memset(BackCheck, 0, sizeof(long)*NumREs); /* reset the BackCheck[] */



		/*write the correct numbering in Left margin  */

    if (F.NumStart < 1 && (((block_Cnt-1)*basesPerLine) + 1 + F.NumStart) == 0) {numstart++; tripclick++;}

    sprintf((char *)s, "%7ld",(((block_Cnt-1)*basesPerLine) + 1 + numstart)); /* numstart is just a + offset */

    memcpy(&O_txt[O_SEQ_LINE][0], s, 7);

		/* and in the right margin - combine with the above bit? */

    if (F.NumStart < 0 && (tripclick || (((block_Cnt)*basesPerLine + numstart)) > 0)) {

      if (!tripclick) {tripclick++; numstart++;}

    }

		sprintf((char *)s, "%7ld",(((block_Cnt)*basesPerLine + numstart)));

		memcpy(&O_txt[O_SEQ_LINE][O_LMARG+basesPerLine], s, 7);

		/* and drop in the sequence directly after it. */



		if (block_Cnt != block_repeat) k = basesPerLine; /* test for end of sequence to end gracefully */ 

		else k = (int)seq_len - (block_Cnt-1)*basesPerLine;

		if ((F.SeqBeg != 1 || F.NumStart != 0)  &&

        (F.Strands != 1 || F.Tics != 0)) { /* if subsequence or dif #ing, print the #s of the original seq too */

			/* write the correct numbering in Left margin  */

			sprintf((char *)s, "%7d",((int)(((block_Cnt-1)*basesPerLine)+F.SeqBeg))); /* these 2 lines could be combined...? */

			memcpy(&O_txt[O_SEQ_LINE+1][0], s, 7);

			/* and in the right margin - combine with the above bit? */

			sprintf((char *)s, "%7d",(int)(((block_Cnt)*basesPerLine - 1 + F.SeqBeg)));

			memcpy(&O_txt[O_SEQ_LINE+1][O_LMARG+basesPerLine], s, 7);

		}



		memcpy(&O_txt[O_SEQ_LINE][O_LMARG], &sequence[(block_Cnt-1)*basesPerLine+BASE_OVERLAP+1], (size_t)k); 



      if (F.Strands == 2) { /* if we want the bottom strand as well */

			Rev_Compl(sequence+BASE_OVERLAP+1+((block_Cnt-1) * basesPerLine), s, (long)k);

			/* and then plunk the converted sequence into O_txt with a similar statement */

			memcpy(&O_txt[O_SEQ_LINE+1][10], s, (size_t)k);

      }

		if (F.Tics == 1) {  /* if we want tics  */

         /* write out the minor and major tics below the reverse complement */

			tic_line = O_SEQ_LINE+2;

         /* if there's only 1 strand AND we want tics, have to move them up 1 line */

         if (F.Strands == 1) tic_line -= 1; 

			for (i=0;i<(int)(basesPerLine/10);i++) memcpy (&O_txt[tic_line][i*10+O_LMARG],"    ^    *",10);

      }



		/* Following call to Translate has to calculate the # of bases at each call; otherwise end up with 

			overrun condition at end; "k" in code above 

         This stanza doesn't have to change at all, even after the -T/-t change of 9.2.98 */



      rc = seq_len % 3;  /* rc is the correction factor for the trans, if not a perfect div by 3;

         if it is a perfect div by 3, then rc = 0, and there won't be any change */

		if (n_letters != 0) {

			for (mm=0;(mm<(int)Xn && mm<3); mm++) {

			   if (k % 3 != 0) k = ((int) k /3)*3;   /* also added this code to Translate() for better modularity */

			   /* this chunk doen't care about rc */

            seq_offset = ((block_Cnt-1)*basesPerLine+BASE_OVERLAP+1);

			   Translate(sequence+seq_offset+mm, Prot_out, k, n_letters, Codons, codon_Table); 

			   /* need to place the translation frame labels here before memcpy() call */

            d = 0;

            if (F.Strands == 1) { d += 1; }  

            if (F.Tics == 0) { d += 1; }

 			   sprintf(ctemp1, "%d", mm+1);   O_txt[O_SEQ_LINE+3+mm-d][0] = ctemp1[0]; 

			   memcpy(&O_txt[O_SEQ_LINE+3+mm-d][O_LMARG+mm], &Prot_out, (size_t)k); /* and copy it into place */

			   lastline = O_SEQ_LINE + 3 + mm - d; /* keep track of last line of Xl */

            

            if (Xn == 6) { /* but if it's 6 frames of Xlation, need to do oppo at the same time */

               /* but not EXACTLY the same - seq to be has to be shifted slightly to account for 

                  those seqs not exactly / 3 - should have to only add a correction to the expr below

                  AND add the same correction to the offset for printing it out, so this chunk DOES care

                  about rc */



               Anti_Par(sequence+seq_offset-mm+rc, s, k); /* get the *antipar* sequence */

               /* Translate() below shouldn't have to change */

			      Translate (s, Prot_out, k, n_letters, Codons, codon_Table); /* Xlate it into N-letter code */

			      if (n_letters == 1) Reverse (Prot_out); /* to match the 'backwards' DNA strand */

			      else Triplet_Reverse (Prot_out);

			      /* need to place the translation frame labels here before memcpy() call */

			      sprintf(ctemp1, "%d", mm+4);   O_txt[O_SEQ_LINE+spacer+mm][0] = ctemp1[0];

			      memcpy(&O_txt[O_SEQ_LINE+spacer+mm][O_LMARG-mm+rc], &Prot_out, (size_t)k); /* and copy it into place */

      			lastline = O_SEQ_LINE + spacer + mm; /* this is the last line of the Xl, so last to print */

            }

			}

		} else {

      d = 0;

      if (F.Strands == 1) { d += 1; }  

      if (F.Tics == 0) { d += 1; } 

      lastline = O_SEQ_LINE + 2 - d;

    }



		/* This 'while' loop tests each text block for REs and also prints out those blocks that have no 

		cuts in them at all but which have to be printed anyway....*/

		while ((DD->Dat[DCnt] != -2) && (base_real_pos < ((block_Cnt * basesPerLine) + BASE_OVERLAP))) {

			DCnt++; 

			/* if we're not at the end and the real pos'n <  current block + the overlap.. */

			/* !! Now DD->Dat[DCnt] should point to the corresponding  RE #  */



			 /* if following test is true (they're !=) then it's safe to write it; */

			 /* if it's false, then the same proto has already written to the same space */

			if (BackCheck[labs(DD->Dat[DCnt])] != labs(DD->Dat[DCnt-1])) { /* if !=, do this, but 1st... */

			   BackCheck[labs(DD->Dat[DCnt])] = labs(DD->Dat[DCnt-1]);  /*  ... set it = so that it won't do this one again */

             okline = O_SEQ_LINE-2; 



       /* This is where the RE filter could be inserted so that the DD data is ignored unless the RE that's 

         referenced matches the parameters set in the filters (can take this straight from the ladder-map 

         functions.  */

         /* direct from GelLad.. 

            these are the conditions that have to be passed in order to have the REs placed in the

            linear map. */

            /*  DD->Dat[DCnt] is still referencing a possibly neg #, so have to take the abs() */

            Cur_RE = labs((int)DD->Dat[DCnt]); 

            /*  this whole 'if' test thingie can and should be reduced to one extensible function call 

               (using the embedded DamDcmOK() as a template), but ... not yet... */

            if ((DamDcmOK(base_real_pos, Cur_RE) == 1)   &&  RE_Filters_OK(Cur_RE)) { /* if olap = 0 && overhang = blunt */ 

               /* this is the overall test for whether the RE will be printed out in the Linear Map or not  

						if the RE DOESN'T meet the crit, it just gets ignored here, but will be printed in other outputs */



			      block_cut_pos =  (int)base_real_pos - 1 - ((block_Cnt-1) * basesPerLine); /* see if it's this easy?! */

			      /* Now locate it in the block, checking for under- and over-runs */

			      if (block_cut_pos >= 0) { /* if the position is greater than the beginning of the block */

			         if (block_cut_pos <= basesPerLine) { /* and if it's less than the end of the block... */

			            abcp = O_LMARG + block_cut_pos - 1; /* abcp = adjusted block_cut_pos - used x times below */

			            O_txt[O_SEQ_LINE-1][abcp] = '\\';   /* write the 'cut' indicator */

                     hits = 1;

			            /* locate the position for writing the name */

			            while (block_cut_pos < ok2wr[okline]) okline--; /* then go up to the lowest 'free' line */

			            if (okline != O_SEQ_LINE-2)  { /* but if can't place the name on the lowest line.. */

			               i = O_SEQ_LINE-1; k = 0;   /* check if there's any space lower between previous names */

			               /* following 'while' must be ordered like it is - i will be decr only if k == 0 */

			               while (k == 0 && --i != okline) { /* leaving 1 space before and after the name */

			                  if (O_txt[i][abcp-1] == ' ' &&                /* this can be replaced with something */

			                      O_txt[i][abcp+RE[Cur_RE].E_nam_l] == ' ' &&  /* more efficient later */

			                      O_txt[i][abcp+3] == ' ' &&       /* must check for a space here... */               

			                      O_txt[i][abcp+4] == ' ' )  k = 1;  /* as well as here */

			                    /* i = vertical counter ~= okline, k = 1 when we find enuf space */

			               }

			               okline = i;

			            }

			            if (okline < 2) {

			               fprintf(stderr, "!! Output Buffer Blown around base %ld - degeneracy caused too many Enz's\n"

			                  "to be matched!! Increase the stringency of Enz selection w/ -n -o -M flags\n", base_real_pos);

			               exit(0);

			            }

			            /* and memcpy the RE name from the struct to the output array */

			            memcpy (&O_txt[okline][abcp],&RE[Cur_RE].E_nam, (size_t)RE[Cur_RE].E_nam_l);

                     if (DD->Dat[DCnt] < 0) { /* if the match is in the (-) orientation */

                        memcpy (&O_txt[okline][abcp+RE[Cur_RE].E_nam_l], &os, 1);

                     }



			            /* and incr the ok2wr posn for that line by the length of the RE name + 1 */

			            /* 'ok2wr[okline]' below is modified only if we didn't find any lower spaces */

			            if (block_cut_pos+RE[Cur_RE].E_nam_l+1 > ok2wr[okline]) {

			               ok2wr[okline] = block_cut_pos+RE[Cur_RE].E_nam_l + 1; 

			            }

			         } /* if (block_cut_pos < basesPerLine).. */

			         else {  /* otherwise it's in the overlap area from the next block */

			            if (in_OL == -1) {   /* if this is the 1st time in OVERLAP, mark it */

			               in_OL = 1;        /* by setting in_OL to 1  */

			               Cnt_reset = DCnt - 1;  /* Have to back up to 1st RE in the OVERLAP that did not 

			                                                resolve into the current block */

			            }   /* The above marker *should* back the DCnt to the position of the Cur_RE */ 

			         }

			      }     /* if (block_cut_pos >= 0) ... */

			   }

			   base_real_pos = DD->Dat[++DCnt]; /* make sure that this is set before the next loop */

			   if (okline < min_okline) min_okline = okline;

			} else {

			   base_real_pos = DD->Dat[++DCnt]; /* this gonna fix it? */

			}

		}  /*  while (DD->Dat..... */



		/* Now print out the block we've composed, from min_okline to MAX_OUTPUT_LINES */

		/* adj to min_okline is nec to provide a 1 blank line between stanzas  */

		for (i=min_okline-hits; i<= lastline; i++) fprintf(stdout, "%.*s\n",p_width,O_txt[i]);



		/* And clean out the lines written into O_txt as well */

		memset(O_txt,' ',(size_t)((O_LMARG + O_RMARG + MAX_BASES_PER_LINE) * MAX_OUTPUT_LINES));  /* set all of O_txt to blanks */

	}   /* for (block_Cnt=1; block_Cnt<block_repeat; block_Cnt++) ... */

	free(BackCheck); /* this line was also deleted from the bike fork of tacg.c at one point

                       along with the corresponding memset at ~ line 36 */

}







   



/* GenerateTOC() simply checks the status of all the major output headers and

	generates the appro HTML to make up a Table of Contents for the results that 

   follow.  It does not actually check that they have completed correctly, but

   it does keep everything in one place.  Just a series of if()'s  */

   

void GenerateTOC(char *datestamp) {



	/* print a header */

   fprintf(stdout, "<h2>Table of Results</h2>\n");

   /* maybe print some date, version, file, machine run on, domain run on? etc info */

   fprintf(stdout, "<b>tacg version %s; Date: %s </b><br>\n"

   "<i>Contact Harry Mangalam (mangalam@home.com) with comments, critiques</i><br>\n",TACG_VERSION, datestamp);

   

   /* Start the UNordered list */

   fprintf(stdout, "<ul>\n");

   

   if (F.Summary != 0) { /* this generates no-cuts and all-cuts */

   	fprintf(stdout, "<li><a href=\"#nocuts\">Patterns that DO NOT MAP to this sequence</a>\n");

   	fprintf(stdout, "<li><a href=\"#allcuts\">Total Number of Hits per Pattern</a>\n");

   }

   if (F.Ladder != 0) { /* generates ladder map and infrequent matches */

   	fprintf(stdout, "<li><a href=\"#ladmap\">Ladder Map of Pattern Matches</a>\n");

   	fprintf(stdout, "<li><a href=\"#summarycuts\">Summary Map of Infrequent Matches</a>\n");

   }

   if (F.GelLO != 0 || F.GelHI != 0) {

   	fprintf(stdout, "<li><a href=\"#gelmap\">Pseudo Gel Map</a>\n");

   }

   if (F.Sites != 0) {

   	fprintf(stdout, "<li><a href=\"#sitetab\">Sites by Pattern</a>\n");

   }

   if (F.Frags == 1) {

   	fprintf(stdout, "<li><a href=\"#fragunsort\">Fragment Sizes (UNSorted)</a>\n");

   }

   if (F.Frags == 2) {

   	fprintf(stdout, "<li><a href=\"#fragsort\">Fragment Sizes (Sorted)</a>\n");

   }

   if (F.ORFs != -1) {

   	fprintf(stdout, "<li><a href=\"#orfs\">Open Reading Frame Analysis</a>\n");

   }

   if (F.LinMap != 0) {

   	fprintf(stdout, "<li><a href=\"#linmap\">Linear Map</a>\n");

   }

   if (F.GrfBBin != 0) {

   	fprintf(stdout, "<li><a href=\"#graphics\">Graphics Plotting Data</a>\n");

   }

   if (F.Prox != 0) {

   	fprintf(stdout, "<li><a href=\"#pair_proximity\">Pairwise Proximity</a>\n");

   }

   if (F.Rules != 0) {

   	fprintf(stdout, "<li><a href=\"#rules_proximity\">Complex Rules-based Proximity</a>\n");

   }

   if (F.Hookey != -1) {

   	fprintf(stdout, "<li><a href=\"#hookeyunsort\">AFLP Analysis - UNSORTED</a>\n");

   	fprintf(stdout, "<li><a href=\"#hookeysort\">AFLP Analysis - SORTED</a>\n");

   }



   if (F.GrafMap != -1) {

   	if (F.PDF == 1) {

   		fprintf(stdout, "<li><a href=\"%s/tacg_Map.pdf \">PDF Plasmid Map</a>\n", F.tmppath);

      } else {

      	fprintf(stdout, "<li><a href=\"%s/tacg_Map.ps \">Postscript Plasmid Map</a>\n", F.tmppath);

      }

   }

/* spares below */

/*

   if (F. != ) {

   	fprintf(stdout, "<li><a href=\"#LABEL\">DESCRIPTION</a>\n");

   }

 */   

   /* below marks the end of the UNordered list and the beginning of the PRE-formatted output */

   fprintf(stdout, "</ul><HR noshade size=5><pre>\n"); 

/*    fprintf(stdout, "<>  </>\n"); */



}

   





/* if either the --dam or --dcm flags have been used, DamDcmOK() checks to see 

	if the REi under consideration should be printed - has it been masked or is 

   it OK to print it.  What it does: passed the RE index of the enzyme, it

   checks to see if that RE is possibly affected and if so, whether the 'raw'

   hit recorded in DD.Dat is present in RE[].Sites.  The index to the

   last-checked position in RE[].Sites is kept in: 

   RE[].Sites[RE[].Sites[0]]       as 'RE[].Sites[0]' points to the end of the array.

   In this way, only have to incr by 1 to get to the next check point.

   RWC = Real World Coordinates

   REi = RE index

   Hmmm - this could be the prototype for an overall RE filter to be used in a number of cases..

*/

int DamDcmOK(long RWC, int REi) {

	int i, OK=0;

   /* 1st, check to see if this RE is sensitive at all and if not, skip it */

   /* also skip if it doesn't have ANY cuts at all */

   if ((RE[REi].E_Ncuts > 0) && ((F.dam == 1 && RE[REi].dam != 100) || (F.dcm == 1 && RE[REi].dcm != 100))) {

      if (RE[REi].Sites[RE[REi].Sites[0]] == 0) {

   	   RE[REi].Sites[RE[REi].Sites[0]] = 1; /* 1st time thru, set this to 1 (point it at the 1st real value */

      }

      OK = 0; 

      i = RE[REi].Sites[RE[REi].Sites[0]]; /* shorten: i = the last el to have been checked */

      while (RWC >= RE[REi].Sites[i] && OK == 0) {

      	if (RWC == RE[REi].Sites[i]) {

         	OK = 1; 

         }

         i++; 

      }

      RE[REi].Sites[RE[REi].Sites[0]] = i;

   } else OK = 1;

   if (RE[REi].E_Ncuts == 0) OK = 0; /* just in case */

   return OK;

}



/* Add_dam and Add_dcm just fill the RE[1] and RE[2] entries for these special 

   methylation enzymes. */



void Add_dam(int NumREs) {

   strcpy(RE[NumREs].E_nam,"dam\0");

   if ((RE[NumREs].E_raw_sit = calloc(5, sizeof(char))) == NULL) BadMem("Add_dam-1", 1);

   RE[NumREs].E_raw_sit = "GATC\0"; 

   if ((RE[NumREs].E_wsit[1] = calloc(5, sizeof(char))) == NULL) BadMem("Add_dam-2", 1);

   RE[NumREs].E_wsit[1] = "gatc\0"; 

   strcpy(RE[NumREs].E_hex[1],"gatcnn\0");

   RE[NumREs].E_tcut[1] = 0;

   RE[NumREs].E_olap = 0;

   RE[NumREs].E_len = 4;

   RE[NumREs].E_pal = 1;

   RE[NumREs].E_dgen = 16;

   RE[NumREs].E_mag = 4;

   RE[NumREs].proto = NumREs;

   RE[NumREs].Err = 0;

   RE[NumREs].E_nam_l = 3;

}



void Add_dcm(int NumREs) {

   strcpy(RE[NumREs].E_nam,"dcm\0");

   if ((RE[NumREs].E_raw_sit = calloc(6, sizeof(char))) == NULL) BadMem("Add_dcm-1", 1);

   RE[NumREs].E_raw_sit = "CCWGG\0"; 

   if ((RE[NumREs].E_wsit[1] = calloc(6, sizeof(char))) == NULL) BadMem("Add_dcm-2", 1);

   RE[NumREs].E_wsit[1] = "ccwgg\0"; 

   strcpy(RE[NumREs].E_hex[1],"ccwggn\0");

   RE[NumREs].E_tcut[1] = 0;

   RE[NumREs].E_olap = 0;

   RE[NumREs].E_len = 5;

   RE[NumREs].E_pal = 1;

   RE[NumREs].E_dgen = 8;

   RE[NumREs].E_mag = 4;

   RE[NumREs].proto = NumREs;

   RE[NumREs].Err = 0;

   RE[NumREs].E_nam_l = 3;

}



/***************************************************************************************************/

/* The following code performs the "pad the beginning sequence with the end and the end 

   with the beginning" regardless of whether it's the whole sequence or a subsequence, 

   using the vars 'begin'  and 'end'.  *Bare_Seq_Len is the the exactly "Real Sequence 

   Length" as determined from the SEQIO routines, no padding added or subtracted - 

   it's incremented to TotPaddedSeqLen in here and passed back to calling fn() via

   *Pad_Seq_Cnt  */



/* TotPaddedSeqLen    length of sequence + bracketing overlaps

   Bare_Seq_Len       actual number of bases read

*/

/* will try to contruct this so that you give it the exact sequence you want padded, and it

   does all the calculations, work to alloc a new sequence space and do the padding, returning 

   exactly what you want */



char *PadEndForEnd(char *SeqIn, long *Pad_Seq_Len, long *Bare_Seq_Len) {

   long m, j, l, BareLen, TotPaddedSeqLen, FrPadSeqLen;

/*   int  f1=0, iseq; */

   char *sequence, b…
Large files files are truncated, but you can click here to view the full file