/code_comm/SeqFuncs.c
C | 1221 lines | 715 code | 146 blank | 360 comment | 357 complexity | 4f1d097f6c9a33be26bb695adc201b9a MD5 | raw file
Large files files are truncated, but you can click here to view the full file
- /* tacg - a command line tool for the analysis of nucleic acids and protein */
- /* Copyright 1994-2001 Harry J Mangalam, tacg Informatics
- (mangalam@tacgi.com | mangalam@home.com | 949 856 2847) */
-
- /* $Id: SeqFuncs.c,v 1.13 2001/11/14 01:18:09 hjm Exp $ */
-
- /* The use of this software (except that by Harald T. Alvestrand, which is described in 'udping.c')
- and that by James Knight, which is described in 'seqio.c' is bound by the notice that appears
- in the file 'tacg.h' which should accompany this file. In the event that 'tacg.h' is not bundled
- with this file, please contact the author.
- */
-
- #include <stdio.h>
- #include <ctype.h>
- #include <string.h>
- #include <stdlib.h>
- #include <sys/stat.h>
- #include <unistd.h>
- #include <regex.h>
-
- #include "seqio.h"
- #include "tacg.h"
-
- /* takes care of:
- - reading rules from the file (path validated via SearchPaths()),
- - parsing the file into the Rules struct
- - validating that the rules are formed correctly (parens match up, logicals are correct)
- - making sure that each of the subpattern names has been loaded in RE (have to hash the
- names as in satisfying the -x flag in ReadEnz().
- - eventually, create --rules 'thisrule,thatrule,mars,promoter, etc'
- (can be specified w/o --rulefile in which case it will read the deafult 'rules.data'.)
- as opposed to current --rule for specifying a rule at a time from the commandline.
- (which should write out that rule to the tacg.patterns file for incoporation into the
- main rules file. require that it be named as well.
- ie: --rule 'Name,((()()())(())),width'
-
- - open the file, getting path name from F.RuleFile
- - read in the strings, concatting as we go, then validate the logic
- and stuff all the bits into the Rule_struct
- - at each iter, make sure there's enough mem
- - when finished with the file read, and all the bits are in Rule_struct, go thru
- Rules[].Name and check against the names in RE[].E_nam.
- - make
- */
- int tacg_Eval_Rules_from_File(struct SE_struct SelEnz[MAX_SEL_ENZ], char datestamp[80]) {
- /* long ProtoNameHash[], int NumREs, */
-
- FILE *fpRF;
- char junk[10], *ctemp1, ct, *rulestring, *packed_RS, *rstr;
- short NameCheck[RE_NAME_TAB_SIZ]; /* indices from realhash(PatName) to check against multiple identicals */
- int SEi=0;
- long j, EOpRS=0,
- window_size = 0,
- NumRules=0,
- rstr_len, linenum=0,
- RS_Cnt = 0, /* RuleStruct Counter */
- EOAM_RS = INIT_NUM_RULES; /* End Of Allocated Memory for the Rule_Struct; allocated in tacg */
-
- short cont = 0; /* continue indicator */
-
- /* alloc the Rule_struct Rules */
- if ((Rules = calloc(EOAM_RS, sizeof(*Rules))) == NULL) {
- BadMem("Can't init mem for Rules", 1);
- }
-
- if ((ctemp1 = calloc(512,sizeof(char))) == NULL) { BadMem("Can't init mem for 'ctemp1'", 1); }
- if ((rulestring = calloc(MAX_LEN_RULESTRING,sizeof(char))) == NULL) { BadMem("Can't init mem for 'rulestring'", 1); }
- if ((packed_RS = calloc(MAX_LEN_RULESTRING,sizeof(char))) == NULL) { BadMem("Can't init mem for 'packed_RS'", 1); }
- if ((rstr = calloc(MAX_LEN_RULESTRING,sizeof(char))) == NULL) { BadMem("Can't init mem for 'rstr'", 1); }
- memset(NameCheck,0, sizeof(short)*RE_NAME_TAB_SIZ); /* make sure everything is set to 0 */
-
- if ((fpRF = fopen(F.RuleFile,"r")) == NULL) { /* or the standard/optional one from SetFlags() */
- fprintf(stderr,"Cannot open the Rules file \"%s\" for reading!!\n", F.RuleFile); /* DON'T HIDE behind -V */
- exit(1); /* print an error and die gracefully */
- } else if (F.Verbose > 0) { fprintf(stderr,"Rule File %s opened OK for reading\n", F.RuleFile); }
-
- /* Scan thru comments in rebase file to separator ("..") */
- junk[0] = 'k'; /* make sure the strcmp below fails the first time */
- while ((feof(fpRF) == 0) && (strncmp(junk, "..",2) != 0)) {
- fscanf (fpRF, "%2s", junk); ct = 'm'; /* and read to end of line - this needs to be able to deal */
- while (ct != '\n' && ct != '\r' && (feof(fpRF) == 0)) { /* with end of line conditions for Mac, Win, and unix - what have I missed? */
- ct = fgetc(fpRF); linenum++;
- }
- }
-
- /* we're at "..", so go into file and start slurping directly data into struct
- except for RE_rawsite which has to be filtered a couple ways before being acceptable */
-
- ct = 'm';
- while (fgets(rstr, 512, fpRF) && (NumRules <= MAX_NUM_RULES)) {
- /* check the size of the rule struct to see if it's going to run into the end of the alloc'ed mem */
- if (RS_Cnt == EOAM_RS) { /* then we need to alloc more mem for the Rules struct */
- EOAM_RS += RULES_INCR;
- if ((Rules = realloc(Rules, sizeof(*Rules)*EOAM_RS)) == NULL) {
- BadMem("Failed to get more mem for Rules.\n", 1);
- }
- }
-
- /* now the whole line's in rstr; use strsep to parse it, make decisions re: parsing, skipping */
- j = stripwhitespace(rstr,0,0);
- if (rstr[0] != ';' && (strlen(rstr) != 0)) { /* if the line hasn't been commented out, get all the bits */
- /* REMEMBER! strsep chews the string to pieces and changes the start of the string */
- if (j>0 && F.Verbose > 0) {fprintf(stderr,"DEBUG: stripped %ld whitespace characters from RuleName\n", j);}
-
- strcpy(packed_RS, rstr); /* copy over to pass to tacg_SlidingWindow - may not be needed */
-
-
- /* now have to break on ',' to get all the rulestring in 1 shot
- - also have to detect if the last char is a continuation char '\' */
-
- /* if last char is '\', then rulestring is incomplete and there won't be a window term */
- cont = 0;
- EOpRS = strlen(packed_RS) - 1; /* */
- if (packed_RS[EOpRS] == '\\') { /* then there's a continuation char */
- cont = 1;
- /* keep on going until there's no more continuation lines */
- while ((cont == 1) && fgets(rstr, 512, fpRF)) { /* */
- j = stripwhitespace(rstr,0,0); /* no whitespaces, eols */
- rstr_len = strlen(rstr); /* length excluding terminating \n */
- EOpRS = strlen(packed_RS);
- if (EOpRS + rstr_len >= MAX_LEN_RULESTRING) {
- fprintf(stderr, "Total length of the rulestring exceeded; change MAX_LEN_RULESTRING \n"
- "or check your continuation characters for the rule %s\n", ctemp1);
- exit (1);
- }
- /* rstr should be the right size to copy mod the possible final \ and \n */
- strcpy(packed_RS+EOpRS-1,rstr); /* copy all but the last char over */
-
- /* fgets(rstr, 512, fpRF); */
-
-
- fprintf(stderr, "\nend char = %s\n", rstr+(strlen(rstr)-3) );
- if (rstr[(strlen(rstr)-1)] == '\\' ){ cont = 1;}
- else { cont = 0;}
- }
- fprintf(stderr, "\n\n\npacked_RS = %s\n\n", packed_RS);
- if ((rstr = calloc(MAX_LEN_RULESTRING,sizeof(char))) == NULL) { BadMem("Can't init mem for 'rstr'", 1); }
- strcpy(rstr,packed_RS);
-
- }
- /* coming out of the continuation loop, packed_RS contains the entire Name,RuleString,Window concat
- that the whole thing should look like. rstr has only the last line of a continued rule, so it needs to
- be synced, so copy packed_RS to rstr again */
-
- ctemp1 = strsep(&rstr, ","); /* using ctemp1 (=Name) for congruence with previous code */
- if (strlen(ctemp1) > 10) {
- fprintf(stderr, "Name %s is longer than allowed (%d) - check it!\n", ctemp1, MAX_PAT_NAME_LEN);
- exit(1);
- }
-
-
- /* so now packed_RS has the whole packed rulestring in it at this point; rstr has everything
- BUT the RuleName in it. So now we finish off the line, so break on ',' to get the rulestring
- and ONLY THEN break on the ',' to separate the rule term from the sliding window term
- (since already have processed the Name) */
- if (strlen(rstr) >= MAX_LEN_RULESTRING) {
- fprintf(stderr, "Total length of the rulestring exceeded; change MAX_LEN_RULESTRING \n"
- "or check your continuation characters for the rule %s\n", ctemp1);
- exit (1);
- }
-
- /* break on the comma so we can ignore the window term - this probably needs some more error-checking */
- memset(rulestring, 0, MAX_LEN_RULESTRING);
- rulestring = strsep(&rstr,","); /* rulestring now contains ONLY the packed rulestring */
- window_size = (long)atoi(strsep(&rstr,","));
-
- if (window_size < 1) { /* now test whether windowsize & rulestring are valid */
- fprintf (stderr, "Window size for %s is less than 1, which is silly.\n", ctemp1);
- exit (1);
- }
- /* Are the component pattern Names valid? ie is this name represented in RE by being loaded
- from the RE database? need to hash the Names of the individual patterns and see if they
- hit any of the Names that were loaded from whatever rebase was being used.
- Soooooo, go thru the rulestring and hash all words breaking on ()|^& to cut out the 'Name:min:Max' stanzas */
- tacg_Fill_SelEnz_fr_RuleString(SelEnz, &SEi, rulestring, NameCheck, ctemp1);
-
-
-
- /* stuff ALL the bits (Name, rulestring, window_size) into the struct (the validity of the rule itself
- tested in the evaluation code) */
- strcpy(Rules[RS_Cnt].Name, ctemp1);
- if ((Rules[RS_Cnt].RuleString = calloc((strlen(rulestring)+1),sizeof(char))) == NULL) {
- BadMem("Can't init mem for 'Rules[RS_Cnt].RuleString'", 1);
- }
- strcpy(Rules[RS_Cnt].RuleString, rulestring);
- Rules[RS_Cnt].Window = window_size;
-
- if ((rstr = calloc(MAX_LEN_RULESTRING,sizeof(char))) == NULL) { BadMem("Can't init mem for 'rstr'", 1); }
- RS_Cnt++;
- } else {
- /* fprintf(stderr, "\nDEBUG: line ignored: %s\n", rstr); */
- }
- }
- F.Xplicit = -1; /* notify ReadEnz */
- F.Rules = RS_Cnt-1; /* does this have to be decr by one 1st?? */
- return SEi;
- }
-
-
- /* ***************************************************************************************************** */
-
-
- /* tacg_Fill_SelEnz_fr_RuleString() takes the SelEnz sruct and the rulestring (and a few other housekeeping vars
- and parses the rulestring passed to it to populate SelEnz and do a namecheck to make sure that the names being
- parsed out of the rulestring haven't been seen before and aren't otherwise objectionable
- takes as args:
- SelEnz,
- the starting SEi (as an int * as it's changed in the fn())
- the rulestring (copied in the function so that the original rulestring is untouched)
- NameCheck (has to survive fn() to check for multiple identicals over multiple rulestrings,
- as well as to check for multiple identicals in the same rulestring (A&B|A&C)
- ctemp1 - Name of the Rule (for error messages)
- and back comes
- the (more) populated SelEnz
- and the ending SEi.
- */
-
- void tacg_Fill_SelEnz_fr_RuleString(struct SE_struct SelEnz[MAX_SEL_ENZ], int *SEi,
- char *rulestring, short NameCheck[RE_NAME_TAB_SIZ], char *ctemp1) {
- char *Name_mM, *rstr, *PatName = " ",
- *amin = " ",
- *aMax = " ";
-
- int ind;
- if ((Name_mM = calloc(33,sizeof(char))) == NULL) { BadMem("Can't init mem for 'Name_mM'", 1); }
- if ((rstr = calloc(MAX_LEN_RULESTRING,sizeof(char))) == NULL) { BadMem("Can't init mem for 'rstr'", 1); }
- strcpy(rstr,rulestring); /* re-using rstr; mem should be fine after re-calloc above */
- Name_mM[0] = '\0'; /* to pass on 1st pass */
- while (Name_mM[0] == '\0') {
- Name_mM = strsep(&rstr, "()|^&"); } /* this should break out a 'Name:#:#' set */
-
- while (Name_mM != 0L) {
- if (Name_mM[0] != '\0') {
- PatName = strsep(&Name_mM, ":");
- ind = (int) realhash(PatName, RE_NAME_TAB_SIZ);
- /* fprintf(stderr, "DEBUG: realhash '%s' = %d\n", PatName, ind); */
- if (strlen(PatName) < 11) {
- if (NameCheck[ind] == 0) {
- strcpy(SelEnz[*SEi].PName,PatName);
- NameCheck[ind] = 1; /* now mark it to make sure that we don't take it again */
- if ((amin = strsep(&Name_mM, ":")) == '\0') {
- SelEnz[*SEi].min = 0; /* RE[ii].min = */
- } else {
- SelEnz[*SEi].min = atoi(amin); /* RE[ii].min = */
- }
- if ((aMax = strsep(&Name_mM, ":")) == '\0') {
- SelEnz[*SEi].Max = 32000; /* RE[ii].Max = */
- } else {
- SelEnz[*SEi].Max = atoi(aMax); /* RE[ii].Max = */
- }
- (*SEi)++;
-
- } else { /* else it's already been seen in another rule or sub-rule and we don't care */
- /* fprintf(stderr, "DEBUG: The pattern named %s has already been seen previously; continuing .. \n", PatName); */
- }
- } else {
- fprintf (stderr, "\nIn Rule named %s, Pattern Name %s is too long (must be <=10 chars)\n",
- ctemp1, PatName);
- exit(1);
- }
- }
- Name_mM = strsep(&rstr, "()|^&"); /* get the next one */
- }/* @ end of loop, all constituent patterns are loaded into SelEnz or the app has exited on an err */
- }
-
-
- /* ***************************************************************************************************** */
-
-
-
-
-
- /* RE_Filter returns 1 if the RE indexed by Proto[REi] matches the tests for
- Magnitude
- Number of Cuts
- Cost
- Overlap
- etc that are can be used to fine-tune the selection of REs */
- int RE_Filters_OK(int REi) { /* REi should be the ProtoCutters index, as that is the only RE el that we have to check */
- int OK=0;
- /* fprintf(stdout, "\n REi = %d, RE name = %s \n", REi, RE[REi].E_nam); */
- if (((RE[REi].E_mag >= (int)F.Mag) && /* E_mag must be >= to -n flag */
- (REi >= F.RE_ST) && /* Don't EVER want dam/dcm sites printed */
- (RE[REi].E_Ncuts >= (int)F.Min) && /* Ncuts >= -m, */
- (RE[REi].E_Ncuts <= (int)F.Max) && /* Ncuts <= -M */
- (RE[REi].Cost >= (int)F.Cost) &&
- /* the dam/dcm part of this COULD be rolled in if the RWC (aka base_real_pos) was included as an optional
- argument (or if -1, ignore it), but right now it doesn't seem to be used in the same way, so I'll let
- the code stay but commented out for now */
- /* (DamDcmOK(base_real_pos, REi) == 1) && print only RE's !masked by dam/dcm methylases */
-
- (F.Overhang == 1)) || /* if want all, only 1st has to be true to short circuit */
- ((RE[REi].E_olap > 0) && (F.Overhang == 5)) || /* if olap < 0 && overhang = 5' */
- ((RE[REi].E_olap < 0) && (F.Overhang == 3)) || /* if olap > 0 && overhang = 3' */
- ((RE[REi].E_olap ==0) && (F.Overhang == 0))) /* if olap = 0 && overhang = blunt */
- { OK = 1; }
- else { OK = 0; }
- return OK;
- }
-
- /* if either the --dam or --dcm flags have been used, DamDcmOK() checks to see
- if the REi under consideration should be printed - has it been masked or is
- it OK to print it. What it does: passed the RE index of the enzyme, it
- checks to see if that RE is possibly affected and if so, whether the 'raw'
- hit recorded in DD.Dat is present in RE[].Sites. The index to the
- last-checked position in RE[].Sites is kept in:
- RE[].Sites[RE[].Sites[0]] as 'RE[].Sites[0]' points to the end of the array.
- In this way, only have to incr by 1 to get to the next check point.
- RWC = Real World Coordinates
- REi = RE index
- Hmmm - this could be the prototype for an overall RE filter to be used in a number of cases..
- */
-
- /* Clone() goes thru RE[] and figures out which REs match the conditions of the option flag, where
- --clone '#_#,#x#,#x#,#_#' (up to MAX_NUM_CLONE_RANGES). '_' indicates that the range cannot be cut;
- 'x' indicates that it can be cut.
- There will probably be some interactions between --clone and other options, but
- for now, finish the basic functionality and sort out the interactions later. In fact,
- RE[].. should not be a problem as it will be populated only with sites that are requested,
- so that if only a subset or special file of REs are used, only they will be used for
- for the clone() analysis.
- This fn() is called AFTER the digest is done in the normal way and should only
- need the number of REs (NumREs - not modified by this so it doesn;t ave to be
- passed as a pointer) as a max to go thru to determine the ranges that agree with the
- ranges specified.
- */
- /* should really be running thru Protos[] from 0->ProtoCutters */
- void Clone(int ProtoCutters, int *Protos) {
- long i, c, j, m, p, r,
- Best[F.Clone+2][ProtoCutters], /* Best = array that stores and sorts the best REs for a set of cloning criteria
- last el is for storing summary truths to check if ALL rules have been met */
- BadProtos[ProtoCutters]; /* BadProtos keep track of Protos that map inside a forbidden zone and therefore have to
- be removed from contention after the fact. If a Protos index shows up in BadProtos,
- it cannot be a possible cloning RE (unless cloner wants to try a partial digest.. */
- short N_Crits;
- int LadderProtos[ProtoCutters];
-
- N_Crits = F.Clone; /* simplicity; F.Clone stores the # of ranges in the --clone option string */
- for (i=0; i<= F.Clone; i++) {
- memset(Best[i],0,sizeof(int)*ProtoCutters);
- }
- memset(BadProtos,0,sizeof(long)*ProtoCutters);
- memset(LadderProtos,0,sizeof(int)*ProtoCutters);
-
- /* Need additional rules to elim REs.
- If an RE hits in the forbidden zone, that RE is then added to a FORBIDDEN array to be checked post-processing.
- now #x# = MUST HIT range Should there be an option to set prefs for this? Or just show how many
- there are in that zone?
- How should REs that hit in zones NOT explicitly marked as allowed (x) or forbidden (_) be treated? In typical
- cloning, don't want lots of extra cuts as it increases ends to contend with.
- */
-
- for (r=0; r<=N_Crits; r++) { /* for each range parsed out in SetFlags() */
- for (i=0; i<ProtoCutters; i++) {
- p = Protos[i]; /* p is the alias for the RE prototype index, so RE[p].etc will hit only prototypes */
- if (Clone_S[r].to_cut == 1) { /* if this is a range where we REQUIRE hits... */
- m = Clone_S[r].matching_REs[1]; /* [1] is the 1st free position of the array */
- for (j=1; j<RE[p].Sites[0]; j++) { /* RE[p].Sites[0] holds the index of the 1st FREE position in the Sites array */
- if (RE[p].Sites[j] >= Clone_S[r].begin && /* site must match BOTH conditions */
- RE[p].Sites[j] <= Clone_S[r].end ) {
- /* do we need a per-range log for positives & negatives? */
- Clone_S[r].matching_REs[m++] = i; /* incr m, store the Proto index */
- Clone_S[r].matching_REs[1] = m; /* keep track of the 1st free el as well */
- /* and check for realloc conditions */
- if ((m+1) == Clone_S[r].matching_REs[0]) { realloc_Clone_matching_REs(m, r); }
- } /* else we silently ignore the bad hits */
- }
- Clone_S[r].matching_REs[1] = m; /* and then store the position back into the array */
- } else { /* this is a range we where we will NOT TOLERATE hits */
- m = Clone_S[r].matching_REs[1]; /* [1] is the 1st free position of the array */
- for (j=1; j<RE[p].Sites[0]; j++) { /* RE[p].Sites[0] holds the index of the 1st FREE position in the Sites array */
- if (RE[p].Sites[j] <= Clone_S[r].begin || /* site can match either condition */
- RE[p].Sites[j] >= Clone_S[r].end ) {
- Clone_S[r].matching_REs[m++] = i; /* incr m, store the Proto index */
- Clone_S[r].matching_REs[1] = m; /* keep track of the 1st free el as well */
- /* and check for realloc conditions */
- if ((m+1) == Clone_S[r].matching_REs[0]) {realloc_Clone_matching_REs(m, r); }
- } else { /* else we have to track the REs to mask out the results by adding their index to BadProtos[] */
- BadProtos[i]++; /* if an el is > 0, it's EVIL */
- }
- }
- Clone_S[r].matching_REs[1] = m; /* and then store the position back into the array */
- }
- }
- }
-
- /* now we've run thru all the criteria for judging whether the REs map into the ranges
- now have to output the data in some usable form. 1st, are there any REs that match ALL requirements?
- 2nd output all ranges with all matching REs, sorted alphabetically. Should --clone automatically set
- --sites, so can output all the sites at the same time?? Hmmm...
- */
- /* 1st are there any that match ALL crits? for all crits, are there any indices that appear in all of them??
- set up array of length N_Crits and increment each RE index as go thru all the ranges. Also has the effect of
- being able to sort those REs that match the MOST crits as well. */
- for (i=0; i<=N_Crits; i++) { /* cycle thru all the crits */
- for (j=2; j<Clone_S[i].matching_REs[1]; j++) { /* cycle thru all the prototype REs that match for each crit */
- Best[i][Clone_S[i].matching_REs[j]]++; /* this will count the # of times a proto came up positive in the criteria */
- }
- } /* so now have all the REs counted. */
-
- /* and print the results in some kind of visually appealing way. */
- /* 1st print out any REs that met ALL crits.. */
- fprintf(stdout,"\n\n== Here's the Clone output for the %d Criteria:\n", (F.Clone+1));
- for (i=0; i<=F.Clone; i++) { fprintf(stdout, " Rule %ld: %s\n", i+1, Clone_S[i].range_rule); }
-
- for (j=0; j <= F.Clone; j++) { /* sum the rules results to generate a summary */
- for (i=0; i<ProtoCutters; i++) {
- if (Best[j][i] > 0 && BadProtos[i] == 0) {
- Best[F.Clone+1][i]++; /* for each rule passed, incr the truth element once */
- }
- }
- }
- /* and finally print out the Protos that match ALL the rules BUT this should probably be mod to allow selection and
- filtering by the standard rules -o -n -m -M --cost, etc but later... this will require a test fn() that tests all the
- Protos against all the various restrictions, but it will be applicable to at least 4 cases */
- fprintf(stdout,"\nREs that met ALL criteria:\n");
- /*
- for (i=0; i<ProtoCutters; i++) {
- if (Best[F.Clone+1][i] == F.Clone+1) {
- fprintf(stdout,"\n RE[Protos[i]].E_nam);
- }
- }
- */
- i = c = m = 0;
- while (i<ProtoCutters) {
- if (m >= F.Width) { m = 0; fprintf(stdout, "\n"); }
- if (Best[F.Clone+1][i] == F.Clone+1 ) {
- fprintf(stdout,"%10s", RE[Protos[i]].E_nam);
- LadderProtos[i] = Protos[i];
- c++; m += 10;
- }
- i++;
- }
- if (c == 0) { fprintf(stdout,"There were no REs that met ALL criteria\n"); }
- else {
- PrintGelLadderMap(i, F.Seq_Len, LadderProtos, 2);
- }
- memset(LadderProtos,0,sizeof(int)*ProtoCutters);
-
- fprintf(stdout,"\n\nResults (listed by Rules) that met SOME criteria & did not violate a hits-forbidden range.");
- for (j=0; j<=F.Clone; j++) {
- fprintf(stdout, "\n\n--- For Rule %ld (%s), the following REs matched:\n\n", j+1, Clone_S[j].range_rule);
- i = c = m = 0;
- while (i<ProtoCutters) {
- if (m >= F.Width) { m = 0; fprintf(stdout, "\n"); }
- if (Best[j][i] > 0 && BadProtos[i] == 0) {
- fprintf(stdout,"%10s", RE[Protos[i]].E_nam);
- LadderProtos[i] = Protos[i];
- c++; m += 10;
- }
- i++;
- }
- if (c == 0) { fprintf(stdout,"There were no REs that met EVEN SOME of the criteria\n"); }
- else {
- PrintGelLadderMap(i, F.Seq_Len, LadderProtos, 2);
- }
- memset(LadderProtos,0,sizeof(int)*ProtoCutters);
- }
-
- fprintf(stdout,"\n\n--- Results (listed by RE) that met SOME criteria & did not violate a hits-forbidden range\n");
- fprintf(stdout,"\n RE Name");
- for (j=0; j<=F.Clone; j++) { fprintf(stdout," Rule %ld", j+1); }
- fprintf(stdout,"\n");
- for (i=0; i<ProtoCutters; i++) {
- if (BadProtos[i] == 0) {
- fprintf(stdout,"\n%10s", RE[Protos[i]].E_nam);
- for (j=0; j<=F.Clone; j++) {
- if (Best[j][i] > 0) {
- fprintf(stdout," X ");
- } else {
- fprintf(stdout," ");
- }
- }
- }
- }
- fprintf(stdout,"\n\n");
- };
-
-
- void realloc_Clone_matching_REs(long current, long index) {
- long newsize;
- /* and check for realloc conditions */
- if ((current+1) == Clone_S[index].matching_REs[0]) { /* if we're nearly up to the alloc'ed mem, need to realloc */
- newsize = Clone_S[index].matching_REs[0] + CLONE_INCR_MRE; /* this is how big the new size should be */
- if ((Clone_S[index].matching_REs = (long *) realloc(Clone_S[index].matching_REs, sizeof(long) * newsize)) == NULL) {
- BadMem("Can't realloc mem for Clone_S[].matching_REs", 1);
- } else { Clone_S[index].matching_REs[0] = newsize; } /* new alloc range */
- }
- };
-
-
- /* the example function included below is a mock-up to show how to include your own function.
- Obviously, there are 10s of functions included with this source code and some of them may be
- immediately applicable to your work. Feel free to bash them into a useful state for yourself.
- If you think that you've created a widely useful function that you would like to have included
- with tacg, please give me a shout, and I'll include a pointer to it or provide other information
- about it on my web site.
-
- void Example(void){
-
- }
- */
-
-
- /* LinearMap() is simply a functionization of the Linear Mapping chunk that
- should have been functionized long ago */
-
- void LinearMap(long seq_len, int basesPerLine, int NumREs, char *sequence, int max_okline,
- char Codons[][][], int codon_Table, int *Protos) {
-
- /* Declarations */
- int i, d, k, mm=0, rc, n_letters=0, Xn=0, okline, HTML, spacer, spacermod, stanzaspace,
- min_okline, tic_line, Cur_RE, block_Cnt, block_repeat, block_cut_pos, lastline = O_SEQ_LINE+2, hits,
- ok2wr[MAX_OUTPUT_LINES], p_width, abcp, in_OL=-1, seq_offset, tripclick=0;
-
- long *BackCheck, /* BackCheck[] checks proto entries in the Linear map output to remove duplicates */
- DCnt, Cnt_reset=0, base_real_pos, numstart;
-
- char ctemp1[30], *s, Prot_out[MAX_BASES_PER_LINE],
- O_txt[MAX_OUTPUT_LINES][O_LMARG+O_RMARG+MAX_BASES_PER_LINE];
-
- /* char Codons[MAX_ORGS][N_CODONS][7]; */
- char os = '~'; /* started out as '<' to mark other strand, but causes problems with HTML generation */
-
- BackCheck = (long *)calloc((size_t)NumREs+2, sizeof(long)); /* Needs to be '+2' as combination cut will add one and 1 extra */
- if (BackCheck == NULL) BadMem("Can't init mem for BackCheck", 1);
-
- s = (char *) calloc(256, sizeof(char));
- if (s == NULL) BadMem("failed to calloc mem for 's'", 0);
- memset(O_txt,' ',(size_t)((O_LMARG+O_RMARG+MAX_BASES_PER_LINE)*MAX_OUTPUT_LINES)); /* set all of O_txt to blanks - works!*/
- p_width = O_LMARG + O_RMARG + basesPerLine;
-
- /* Routine to print out the text of the digestion, with names going where they should go
- a la Strider - have to keep track of how long the names are and where an allowable place
- to write the name - */
-
- /* In brief, the following function, does this:
- It reads thru DD->Dat (the array that keeps track of the linear sequence of cuts in the sequence)
- and matches the position of the enzymes to the current block being printed, taking into account
- the cut offset of the RE from the 1st base of the recognition sequence. The tricky part is that
- because of this latter functionality, it has to check the sequence that will be the *following*
- block to see if there are any RE recognition seqs within the 1st few bases that will cause the actual
- cut site to be in the *previous* block. Doing this seems to take a great amount of the cpu time - can
- probably be optimized a good deal more, but it's still >10 faster than GCG ;). */
-
- HTML = F.HTML; /* simplicity */
- n_letters = F.Lab1or3; /* simplicity */
- Xn = F.XLinFr; /* simplicity */
- numstart = F.NumStart; /* simplicity */
-
- fprintf(stdout, "\n\n");
- if (HTML > -1) fprintf(stdout, "<H3><A NAME=\"linmap\">");
- fprintf(stdout, "== Linear Map of Sequence:\n"); /* Write the header */
- if (HTML > -1) fprintf(stdout, "</A></H3>\n");
-
- /* inits, etc */
- if (seq_len % basesPerLine == 0) block_repeat = (int) seq_len/basesPerLine; /* # times thru the compose/print cycle */
- else block_repeat = (int) seq_len/basesPerLine + 1;
- DCnt=2; /* start the DD->Dat counter at the beginning of REAL DATA [0]=Cnt; [1]=Siz */
- base_real_pos = DD->Dat[DCnt];
-
- /* spacer dictates the line spacing between the 1st 3 and 2nd 3 translation stanzas; if 6 there
- will be no space between the 2 stanzas, if 7 there will be 1 space; could make this another
- option but sheesh... */
- spacer = 7; spacermod = 0;
- if (F.Strands == 1) { spacermod++; }
- if (F.Tics == 0) { spacermod++;}
- spacer = spacer - spacermod;
- if (spacer == 7) stanzaspace = spacermod + 1;
-
- for (block_Cnt=1; block_Cnt <= block_repeat; block_Cnt++) {
- /* hits is set to 1 if there are hits on the line; if no hits, decr the top buffer line to output
- so that there is a constant 1 blank line spacer between stanzas */
- hits = -1;
- /* if there was an OVERLAP failure previously, fix it */
- if (in_OL == 1) {
- memset(BackCheck,0,sizeof(long)*(NumREs+2)); /* clear BackCheck[] as it's filled with stale data */
- DCnt = Cnt_reset; /* and reset DCnt to that pointer */
- }
- in_OL = -1; /* and reset 'in OVERLAP' marker for new block */
- base_real_pos = DD->Dat[DCnt]; /* Make sure that ' base_real_pos' is set */
- okline = min_okline = O_SEQ_LINE-2; /* set the min (highest on page) line of buffer to print out */
-
- /* Set up the seq and its rev compl in the output buffer */
- memset(ok2wr,0,sizeof(int)*MAX_OUTPUT_LINES); /* set/reset all of ok2wr to 0's for the new block*/
- /* next line was deleted in a brief fork of tacg.c - note this if get unexlained segfs */
- memset(BackCheck, 0, sizeof(long)*NumREs); /* reset the BackCheck[] */
-
- /*write the correct numbering in Left margin */
- if (F.NumStart < 1 && (((block_Cnt-1)*basesPerLine) + 1 + F.NumStart) == 0) {numstart++; tripclick++;}
- sprintf((char *)s, "%7ld",(((block_Cnt-1)*basesPerLine) + 1 + numstart)); /* numstart is just a + offset */
- memcpy(&O_txt[O_SEQ_LINE][0], s, 7);
- /* and in the right margin - combine with the above bit? */
- if (F.NumStart < 0 && (tripclick || (((block_Cnt)*basesPerLine + numstart)) > 0)) {
- if (!tripclick) {tripclick++; numstart++;}
- }
- sprintf((char *)s, "%7ld",(((block_Cnt)*basesPerLine + numstart)));
- memcpy(&O_txt[O_SEQ_LINE][O_LMARG+basesPerLine], s, 7);
- /* and drop in the sequence directly after it. */
-
- if (block_Cnt != block_repeat) k = basesPerLine; /* test for end of sequence to end gracefully */
- else k = (int)seq_len - (block_Cnt-1)*basesPerLine;
- if ((F.SeqBeg != 1 || F.NumStart != 0) &&
- (F.Strands != 1 || F.Tics != 0)) { /* if subsequence or dif #ing, print the #s of the original seq too */
- /* write the correct numbering in Left margin */
- sprintf((char *)s, "%7d",((int)(((block_Cnt-1)*basesPerLine)+F.SeqBeg))); /* these 2 lines could be combined...? */
- memcpy(&O_txt[O_SEQ_LINE+1][0], s, 7);
- /* and in the right margin - combine with the above bit? */
- sprintf((char *)s, "%7d",(int)(((block_Cnt)*basesPerLine - 1 + F.SeqBeg)));
- memcpy(&O_txt[O_SEQ_LINE+1][O_LMARG+basesPerLine], s, 7);
- }
-
- memcpy(&O_txt[O_SEQ_LINE][O_LMARG], &sequence[(block_Cnt-1)*basesPerLine+BASE_OVERLAP+1], (size_t)k);
-
- if (F.Strands == 2) { /* if we want the bottom strand as well */
- Rev_Compl(sequence+BASE_OVERLAP+1+((block_Cnt-1) * basesPerLine), s, (long)k);
- /* and then plunk the converted sequence into O_txt with a similar statement */
- memcpy(&O_txt[O_SEQ_LINE+1][10], s, (size_t)k);
- }
- if (F.Tics == 1) { /* if we want tics */
- /* write out the minor and major tics below the reverse complement */
- tic_line = O_SEQ_LINE+2;
- /* if there's only 1 strand AND we want tics, have to move them up 1 line */
- if (F.Strands == 1) tic_line -= 1;
- for (i=0;i<(int)(basesPerLine/10);i++) memcpy (&O_txt[tic_line][i*10+O_LMARG]," ^ *",10);
- }
-
- /* Following call to Translate has to calculate the # of bases at each call; otherwise end up with
- overrun condition at end; "k" in code above
- This stanza doesn't have to change at all, even after the -T/-t change of 9.2.98 */
-
- rc = seq_len % 3; /* rc is the correction factor for the trans, if not a perfect div by 3;
- if it is a perfect div by 3, then rc = 0, and there won't be any change */
- if (n_letters != 0) {
- for (mm=0;(mm<(int)Xn && mm<3); mm++) {
- if (k % 3 != 0) k = ((int) k /3)*3; /* also added this code to Translate() for better modularity */
- /* this chunk doen't care about rc */
- seq_offset = ((block_Cnt-1)*basesPerLine+BASE_OVERLAP+1);
- Translate(sequence+seq_offset+mm, Prot_out, k, n_letters, Codons, codon_Table);
- /* need to place the translation frame labels here before memcpy() call */
- d = 0;
- if (F.Strands == 1) { d += 1; }
- if (F.Tics == 0) { d += 1; }
- sprintf(ctemp1, "%d", mm+1); O_txt[O_SEQ_LINE+3+mm-d][0] = ctemp1[0];
- memcpy(&O_txt[O_SEQ_LINE+3+mm-d][O_LMARG+mm], &Prot_out, (size_t)k); /* and copy it into place */
- lastline = O_SEQ_LINE + 3 + mm - d; /* keep track of last line of Xl */
-
- if (Xn == 6) { /* but if it's 6 frames of Xlation, need to do oppo at the same time */
- /* but not EXACTLY the same - seq to be has to be shifted slightly to account for
- those seqs not exactly / 3 - should have to only add a correction to the expr below
- AND add the same correction to the offset for printing it out, so this chunk DOES care
- about rc */
-
- Anti_Par(sequence+seq_offset-mm+rc, s, k); /* get the *antipar* sequence */
- /* Translate() below shouldn't have to change */
- Translate (s, Prot_out, k, n_letters, Codons, codon_Table); /* Xlate it into N-letter code */
- if (n_letters == 1) Reverse (Prot_out); /* to match the 'backwards' DNA strand */
- else Triplet_Reverse (Prot_out);
- /* need to place the translation frame labels here before memcpy() call */
- sprintf(ctemp1, "%d", mm+4); O_txt[O_SEQ_LINE+spacer+mm][0] = ctemp1[0];
- memcpy(&O_txt[O_SEQ_LINE+spacer+mm][O_LMARG-mm+rc], &Prot_out, (size_t)k); /* and copy it into place */
- lastline = O_SEQ_LINE + spacer + mm; /* this is the last line of the Xl, so last to print */
- }
- }
- } else {
- d = 0;
- if (F.Strands == 1) { d += 1; }
- if (F.Tics == 0) { d += 1; }
- lastline = O_SEQ_LINE + 2 - d;
- }
-
- /* This 'while' loop tests each text block for REs and also prints out those blocks that have no
- cuts in them at all but which have to be printed anyway....*/
- while ((DD->Dat[DCnt] != -2) && (base_real_pos < ((block_Cnt * basesPerLine) + BASE_OVERLAP))) {
- DCnt++;
- /* if we're not at the end and the real pos'n < current block + the overlap.. */
- /* !! Now DD->Dat[DCnt] should point to the corresponding RE # */
-
- /* if following test is true (they're !=) then it's safe to write it; */
- /* if it's false, then the same proto has already written to the same space */
- if (BackCheck[labs(DD->Dat[DCnt])] != labs(DD->Dat[DCnt-1])) { /* if !=, do this, but 1st... */
- BackCheck[labs(DD->Dat[DCnt])] = labs(DD->Dat[DCnt-1]); /* ... set it = so that it won't do this one again */
- okline = O_SEQ_LINE-2;
-
- /* This is where the RE filter could be inserted so that the DD data is ignored unless the RE that's
- referenced matches the parameters set in the filters (can take this straight from the ladder-map
- functions. */
- /* direct from GelLad..
- these are the conditions that have to be passed in order to have the REs placed in the
- linear map. */
- /* DD->Dat[DCnt] is still referencing a possibly neg #, so have to take the abs() */
- Cur_RE = labs((int)DD->Dat[DCnt]);
- /* this whole 'if' test thingie can and should be reduced to one extensible function call
- (using the embedded DamDcmOK() as a template), but ... not yet... */
- if ((DamDcmOK(base_real_pos, Cur_RE) == 1) && RE_Filters_OK(Cur_RE)) { /* if olap = 0 && overhang = blunt */
- /* this is the overall test for whether the RE will be printed out in the Linear Map or not
- if the RE DOESN'T meet the crit, it just gets ignored here, but will be printed in other outputs */
-
- block_cut_pos = (int)base_real_pos - 1 - ((block_Cnt-1) * basesPerLine); /* see if it's this easy?! */
- /* Now locate it in the block, checking for under- and over-runs */
- if (block_cut_pos >= 0) { /* if the position is greater than the beginning of the block */
- if (block_cut_pos <= basesPerLine) { /* and if it's less than the end of the block... */
- abcp = O_LMARG + block_cut_pos - 1; /* abcp = adjusted block_cut_pos - used x times below */
- O_txt[O_SEQ_LINE-1][abcp] = '\\'; /* write the 'cut' indicator */
- hits = 1;
- /* locate the position for writing the name */
- while (block_cut_pos < ok2wr[okline]) okline--; /* then go up to the lowest 'free' line */
- if (okline != O_SEQ_LINE-2) { /* but if can't place the name on the lowest line.. */
- i = O_SEQ_LINE-1; k = 0; /* check if there's any space lower between previous names */
- /* following 'while' must be ordered like it is - i will be decr only if k == 0 */
- while (k == 0 && --i != okline) { /* leaving 1 space before and after the name */
- if (O_txt[i][abcp-1] == ' ' && /* this can be replaced with something */
- O_txt[i][abcp+RE[Cur_RE].E_nam_l] == ' ' && /* more efficient later */
- O_txt[i][abcp+3] == ' ' && /* must check for a space here... */
- O_txt[i][abcp+4] == ' ' ) k = 1; /* as well as here */
- /* i = vertical counter ~= okline, k = 1 when we find enuf space */
- }
- okline = i;
- }
- if (okline < 2) {
- fprintf(stderr, "!! Output Buffer Blown around base %ld - degeneracy caused too many Enz's\n"
- "to be matched!! Increase the stringency of Enz selection w/ -n -o -M flags\n", base_real_pos);
- exit(0);
- }
- /* and memcpy the RE name from the struct to the output array */
- memcpy (&O_txt[okline][abcp],&RE[Cur_RE].E_nam, (size_t)RE[Cur_RE].E_nam_l);
- if (DD->Dat[DCnt] < 0) { /* if the match is in the (-) orientation */
- memcpy (&O_txt[okline][abcp+RE[Cur_RE].E_nam_l], &os, 1);
- }
-
- /* and incr the ok2wr posn for that line by the length of the RE name + 1 */
- /* 'ok2wr[okline]' below is modified only if we didn't find any lower spaces */
- if (block_cut_pos+RE[Cur_RE].E_nam_l+1 > ok2wr[okline]) {
- ok2wr[okline] = block_cut_pos+RE[Cur_RE].E_nam_l + 1;
- }
- } /* if (block_cut_pos < basesPerLine).. */
- else { /* otherwise it's in the overlap area from the next block */
- if (in_OL == -1) { /* if this is the 1st time in OVERLAP, mark it */
- in_OL = 1; /* by setting in_OL to 1 */
- Cnt_reset = DCnt - 1; /* Have to back up to 1st RE in the OVERLAP that did not
- resolve into the current block */
- } /* The above marker *should* back the DCnt to the position of the Cur_RE */
- }
- } /* if (block_cut_pos >= 0) ... */
- }
- base_real_pos = DD->Dat[++DCnt]; /* make sure that this is set before the next loop */
- if (okline < min_okline) min_okline = okline;
- } else {
- base_real_pos = DD->Dat[++DCnt]; /* this gonna fix it? */
- }
- } /* while (DD->Dat..... */
-
- /* Now print out the block we've composed, from min_okline to MAX_OUTPUT_LINES */
- /* adj to min_okline is nec to provide a 1 blank line between stanzas */
- for (i=min_okline-hits; i<= lastline; i++) fprintf(stdout, "%.*s\n",p_width,O_txt[i]);
-
- /* And clean out the lines written into O_txt as well */
- memset(O_txt,' ',(size_t)((O_LMARG + O_RMARG + MAX_BASES_PER_LINE) * MAX_OUTPUT_LINES)); /* set all of O_txt to blanks */
- } /* for (block_Cnt=1; block_Cnt<block_repeat; block_Cnt++) ... */
- free(BackCheck); /* this line was also deleted from the bike fork of tacg.c at one point
- along with the corresponding memset at ~ line 36 */
- }
-
-
-
-
-
- /* GenerateTOC() simply checks the status of all the major output headers and
- generates the appro HTML to make up a Table of Contents for the results that
- follow. It does not actually check that they have completed correctly, but
- it does keep everything in one place. Just a series of if()'s */
-
- void GenerateTOC(char *datestamp) {
-
- /* print a header */
- fprintf(stdout, "<h2>Table of Results</h2>\n");
- /* maybe print some date, version, file, machine run on, domain run on? etc info */
- fprintf(stdout, "<b>tacg version %s; Date: %s </b><br>\n"
- "<i>Contact Harry Mangalam (mangalam@home.com) with comments, critiques</i><br>\n",TACG_VERSION, datestamp);
-
- /* Start the UNordered list */
- fprintf(stdout, "<ul>\n");
-
- if (F.Summary != 0) { /* this generates no-cuts and all-cuts */
- fprintf(stdout, "<li><a href=\"#nocuts\">Patterns that DO NOT MAP to this sequence</a>\n");
- fprintf(stdout, "<li><a href=\"#allcuts\">Total Number of Hits per Pattern</a>\n");
- }
- if (F.Ladder != 0) { /* generates ladder map and infrequent matches */
- fprintf(stdout, "<li><a href=\"#ladmap\">Ladder Map of Pattern Matches</a>\n");
- fprintf(stdout, "<li><a href=\"#summarycuts\">Summary Map of Infrequent Matches</a>\n");
- }
- if (F.GelLO != 0 || F.GelHI != 0) {
- fprintf(stdout, "<li><a href=\"#gelmap\">Pseudo Gel Map</a>\n");
- }
- if (F.Sites != 0) {
- fprintf(stdout, "<li><a href=\"#sitetab\">Sites by Pattern</a>\n");
- }
- if (F.Frags == 1) {
- fprintf(stdout, "<li><a href=\"#fragunsort\">Fragment Sizes (UNSorted)</a>\n");
- }
- if (F.Frags == 2) {
- fprintf(stdout, "<li><a href=\"#fragsort\">Fragment Sizes (Sorted)</a>\n");
- }
- if (F.ORFs != -1) {
- fprintf(stdout, "<li><a href=\"#orfs\">Open Reading Frame Analysis</a>\n");
- }
- if (F.LinMap != 0) {
- fprintf(stdout, "<li><a href=\"#linmap\">Linear Map</a>\n");
- }
- if (F.GrfBBin != 0) {
- fprintf(stdout, "<li><a href=\"#graphics\">Graphics Plotting Data</a>\n");
- }
- if (F.Prox != 0) {
- fprintf(stdout, "<li><a href=\"#pair_proximity\">Pairwise Proximity</a>\n");
- }
- if (F.Rules != 0) {
- fprintf(stdout, "<li><a href=\"#rules_proximity\">Complex Rules-based Proximity</a>\n");
- }
- if (F.Hookey != -1) {
- fprintf(stdout, "<li><a href=\"#hookeyunsort\">AFLP Analysis - UNSORTED</a>\n");
- fprintf(stdout, "<li><a href=\"#hookeysort\">AFLP Analysis - SORTED</a>\n");
- }
-
- if (F.GrafMap != -1) {
- if (F.PDF == 1) {
- fprintf(stdout, "<li><a href=\"%s/tacg_Map.pdf \">PDF Plasmid Map</a>\n", F.tmppath);
- } else {
- fprintf(stdout, "<li><a href=\"%s/tacg_Map.ps \">Postscript Plasmid Map</a>\n", F.tmppath);
- }
- }
- /* spares below */
- /*
- if (F. != ) {
- fprintf(stdout, "<li><a href=\"#LABEL\">DESCRIPTION</a>\n");
- }
- */
- /* below marks the end of the UNordered list and the beginning of the PRE-formatted output */
- fprintf(stdout, "</ul><HR noshade size=5><pre>\n");
- /* fprintf(stdout, "<> </>\n"); */
-
- }
-
-
-
- /* if either the --dam or --dcm flags have been used, DamDcmOK() checks to see
- if the REi under consideration should be printed - has it been masked or is
- it OK to print it. What it does: passed the RE index of the enzyme, it
- checks to see if that RE is possibly affected and if so, whether the 'raw'
- hit recorded in DD.Dat is present in RE[].Sites. The index to the
- last-checked position in RE[].Sites is kept in:
- RE[].Sites[RE[].Sites[0]] as 'RE[].Sites[0]' points to the end of the array.
- In this way, only have to incr by 1 to get to the next check point.
- RWC = Real World Coordinates
- REi = RE index
- Hmmm - this could be the prototype for an overall RE filter to be used in a number of cases..
- */
- int DamDcmOK(long RWC, int REi) {
- int i, OK=0;
- /* 1st, check to see if this RE is sensitive at all and if not, skip it */
- /* also skip if it doesn't have ANY cuts at all */
- if ((RE[REi].E_Ncuts > 0) && ((F.dam == 1 && RE[REi].dam != 100) || (F.dcm == 1 && RE[REi].dcm != 100))) {
- if (RE[REi].Sites[RE[REi].Sites[0]] == 0) {
- RE[REi].Sites[RE[REi].Sites[0]] = 1; /* 1st time thru, set this to 1 (point it at the 1st real value */
- }
- OK = 0;
- i = RE[REi].Sites[RE[REi].Sites[0]]; /* shorten: i = the last el to have been checked */
- while (RWC >= RE[REi].Sites[i] && OK == 0) {
- if (RWC == RE[REi].Sites[i]) {
- OK = 1;
- }
- i++;
- }
- RE[REi].Sites[RE[REi].Sites[0]] = i;
- } else OK = 1;
- if (RE[REi].E_Ncuts == 0) OK = 0; /* just in case */
- return OK;
- }
-
- /* Add_dam and Add_dcm just fill the RE[1] and RE[2] entries for these special
- methylation enzymes. */
-
- void Add_dam(int NumREs) {
- strcpy(RE[NumREs].E_nam,"dam\0");
- if ((RE[NumREs].E_raw_sit = calloc(5, sizeof(char))) == NULL) BadMem("Add_dam-1", 1);
- RE[NumREs].E_raw_sit = "GATC\0";
- if ((RE[NumREs].E_wsit[1] = calloc(5, sizeof(char))) == NULL) BadMem("Add_dam-2", 1);
- RE[NumREs].E_wsit[1] = "gatc\0";
- strcpy(RE[NumREs].E_hex[1],"gatcnn\0");
- RE[NumREs].E_tcut[1] = 0;
- RE[NumREs].E_olap = 0;
- RE[NumREs].E_len = 4;
- RE[NumREs].E_pal = 1;
- RE[NumREs].E_dgen = 16;
- RE[NumREs].E_mag = 4;
- RE[NumREs].proto = NumREs;
- RE[NumREs].Err = 0;
- RE[NumREs].E_nam_l = 3;
- }
-
- void Add_dcm(int NumREs) {
- strcpy(RE[NumREs].E_nam,"dcm\0");
- if ((RE[NumREs].E_raw_sit = calloc(6, sizeof(char))) == NULL) BadMem("Add_dcm-1", 1);
- RE[NumREs].E_raw_sit = "CCWGG\0";
- if ((RE[NumREs].E_wsit[1] = calloc(6, sizeof(char))) == NULL) BadMem("Add_dcm-2", 1);
- RE[NumREs].E_wsit[1] = "ccwgg\0";
- strcpy(RE[NumREs].E_hex[1],"ccwggn\0");
- RE[NumREs].E_tcut[1] = 0;
- RE[NumREs].E_olap = 0;
- RE[NumREs].E_len = 5;
- RE[NumREs].E_pal = 1;
- RE[NumREs].E_dgen = 8;
- RE[NumREs].E_mag = 4;
- RE[NumREs].proto = NumREs;
- RE[NumREs].Err = 0;
- RE[NumREs].E_nam_l = 3;
- }
-
- /***************************************************************************************************/
- /* The following code performs the "pad the beginning sequence with the end and the end
- with the beginning" regardless of whether it's the whole sequence or a subsequence,
- using the vars 'begin' and 'end'. *Bare_Seq_Len is the the exactly "Real Sequence
- Length" as determined from the SEQIO routines, no padding added or subtracted -
- it's incremented to TotPaddedSeqLen in here and passed back to calling fn() via
- *Pad_Seq_Cnt */
-
- /* TotPaddedSeqLen length of sequence + bracketing overlaps
- Bare_Seq_Len actual number of bases read
- */
- /* will try to contruct this so that you give it the exact sequence you want padded, and it
- does all the calculations, work to alloc a new sequence space and do the padding, returning
- exactly what you want */
-
- char *PadEndForEnd(char *SeqIn, long *Pad_Seq_Len, long *Bare_Seq_Len) {
- long m, j, l, BareLen, TotPaddedSeqLen, FrPadSeqLen;
- /* int f1=0, iseq; */
- char *sequence, b…
Large files files are truncated, but you can click here to view the full file