PageRenderTime 61ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 1ms

/code_comm/SeqFuncs.c

http://research-code-base-animesh.googlecode.com/
C | 1221 lines | 715 code | 146 blank | 360 comment | 357 complexity | 4f1d097f6c9a33be26bb695adc201b9a MD5 | raw file

Large files files are truncated, but you can click here to view the full file

  1. /* tacg - a command line tool for the analysis of nucleic acids and protein */
  2. /* Copyright Š 1994-2001 Harry J Mangalam, tacg Informatics
  3. (mangalam@tacgi.com | mangalam@home.com | 949 856 2847) */
  4. /* $Id: SeqFuncs.c,v 1.13 2001/11/14 01:18:09 hjm Exp $ */
  5. /* The use of this software (except that by Harald T. Alvestrand, which is described in 'udping.c')
  6. and that by James Knight, which is described in 'seqio.c' is bound by the notice that appears
  7. in the file 'tacg.h' which should accompany this file. In the event that 'tacg.h' is not bundled
  8. with this file, please contact the author.
  9. */
  10. #include <stdio.h>
  11. #include <ctype.h>
  12. #include <string.h>
  13. #include <stdlib.h>
  14. #include <sys/stat.h>
  15. #include <unistd.h>
  16. #include <regex.h>
  17. #include "seqio.h"
  18. #include "tacg.h"
  19. /* takes care of:
  20. - reading rules from the file (path validated via SearchPaths()),
  21. - parsing the file into the Rules struct
  22. - validating that the rules are formed correctly (parens match up, logicals are correct)
  23. - making sure that each of the subpattern names has been loaded in RE (have to hash the
  24. names as in satisfying the -x flag in ReadEnz().
  25. - eventually, create --rules 'thisrule,thatrule,mars,promoter, etc'
  26. (can be specified w/o --rulefile in which case it will read the deafult 'rules.data'.)
  27. as opposed to current --rule for specifying a rule at a time from the commandline.
  28. (which should write out that rule to the tacg.patterns file for incoporation into the
  29. main rules file. require that it be named as well.
  30. ie: --rule 'Name,((()()())(())),width'
  31. - open the file, getting path name from F.RuleFile
  32. - read in the strings, concatting as we go, then validate the logic
  33. and stuff all the bits into the Rule_struct
  34. - at each iter, make sure there's enough mem
  35. - when finished with the file read, and all the bits are in Rule_struct, go thru
  36. Rules[].Name and check against the names in RE[].E_nam.
  37. - make
  38. */
  39. int tacg_Eval_Rules_from_File(struct SE_struct SelEnz[MAX_SEL_ENZ], char datestamp[80]) {
  40. /* long ProtoNameHash[], int NumREs, */
  41. FILE *fpRF;
  42. char junk[10], *ctemp1, ct, *rulestring, *packed_RS, *rstr;
  43. short NameCheck[RE_NAME_TAB_SIZ]; /* indices from realhash(PatName) to check against multiple identicals */
  44. int SEi=0;
  45. long j, EOpRS=0,
  46. window_size = 0,
  47. NumRules=0,
  48. rstr_len, linenum=0,
  49. RS_Cnt = 0, /* RuleStruct Counter */
  50. EOAM_RS = INIT_NUM_RULES; /* End Of Allocated Memory for the Rule_Struct; allocated in tacg */
  51. short cont = 0; /* continue indicator */
  52. /* alloc the Rule_struct Rules */
  53. if ((Rules = calloc(EOAM_RS, sizeof(*Rules))) == NULL) {
  54. BadMem("Can't init mem for Rules", 1);
  55. }
  56. if ((ctemp1 = calloc(512,sizeof(char))) == NULL) { BadMem("Can't init mem for 'ctemp1'", 1); }
  57. if ((rulestring = calloc(MAX_LEN_RULESTRING,sizeof(char))) == NULL) { BadMem("Can't init mem for 'rulestring'", 1); }
  58. if ((packed_RS = calloc(MAX_LEN_RULESTRING,sizeof(char))) == NULL) { BadMem("Can't init mem for 'packed_RS'", 1); }
  59. if ((rstr = calloc(MAX_LEN_RULESTRING,sizeof(char))) == NULL) { BadMem("Can't init mem for 'rstr'", 1); }
  60. memset(NameCheck,0, sizeof(short)*RE_NAME_TAB_SIZ); /* make sure everything is set to 0 */
  61. if ((fpRF = fopen(F.RuleFile,"r")) == NULL) { /* or the standard/optional one from SetFlags() */
  62. fprintf(stderr,"Cannot open the Rules file \"%s\" for reading!!\n", F.RuleFile); /* DON'T HIDE behind -V */
  63. exit(1); /* print an error and die gracefully */
  64. } else if (F.Verbose > 0) { fprintf(stderr,"Rule File %s opened OK for reading\n", F.RuleFile); }
  65. /* Scan thru comments in rebase file to separator ("..") */
  66. junk[0] = 'k'; /* make sure the strcmp below fails the first time */
  67. while ((feof(fpRF) == 0) && (strncmp(junk, "..",2) != 0)) {
  68. fscanf (fpRF, "%2s", junk); ct = 'm'; /* and read to end of line - this needs to be able to deal */
  69. while (ct != '\n' && ct != '\r' && (feof(fpRF) == 0)) { /* with end of line conditions for Mac, Win, and unix - what have I missed? */
  70. ct = fgetc(fpRF); linenum++;
  71. }
  72. }
  73. /* we're at "..", so go into file and start slurping directly data into struct
  74. except for RE_rawsite which has to be filtered a couple ways before being acceptable */
  75. ct = 'm';
  76. while (fgets(rstr, 512, fpRF) && (NumRules <= MAX_NUM_RULES)) {
  77. /* check the size of the rule struct to see if it's going to run into the end of the alloc'ed mem */
  78. if (RS_Cnt == EOAM_RS) { /* then we need to alloc more mem for the Rules struct */
  79. EOAM_RS += RULES_INCR;
  80. if ((Rules = realloc(Rules, sizeof(*Rules)*EOAM_RS)) == NULL) {
  81. BadMem("Failed to get more mem for Rules.\n", 1);
  82. }
  83. }
  84. /* now the whole line's in rstr; use strsep to parse it, make decisions re: parsing, skipping */
  85. j = stripwhitespace(rstr,0,0);
  86. if (rstr[0] != ';' && (strlen(rstr) != 0)) { /* if the line hasn't been commented out, get all the bits */
  87. /* REMEMBER! strsep chews the string to pieces and changes the start of the string */
  88. if (j>0 && F.Verbose > 0) {fprintf(stderr,"DEBUG: stripped %ld whitespace characters from RuleName\n", j);}
  89. strcpy(packed_RS, rstr); /* copy over to pass to tacg_SlidingWindow - may not be needed */
  90. /* now have to break on ',' to get all the rulestring in 1 shot
  91. - also have to detect if the last char is a continuation char '\' */
  92. /* if last char is '\', then rulestring is incomplete and there won't be a window term */
  93. cont = 0;
  94. EOpRS = strlen(packed_RS) - 1; /* */
  95. if (packed_RS[EOpRS] == '\\') { /* then there's a continuation char */
  96. cont = 1;
  97. /* keep on going until there's no more continuation lines */
  98. while ((cont == 1) && fgets(rstr, 512, fpRF)) { /* */
  99. j = stripwhitespace(rstr,0,0); /* no whitespaces, eols */
  100. rstr_len = strlen(rstr); /* length excluding terminating \n */
  101. EOpRS = strlen(packed_RS);
  102. if (EOpRS + rstr_len >= MAX_LEN_RULESTRING) {
  103. fprintf(stderr, "Total length of the rulestring exceeded; change MAX_LEN_RULESTRING \n"
  104. "or check your continuation characters for the rule %s\n", ctemp1);
  105. exit (1);
  106. }
  107. /* rstr should be the right size to copy mod the possible final \ and \n */
  108. strcpy(packed_RS+EOpRS-1,rstr); /* copy all but the last char over */
  109. /* fgets(rstr, 512, fpRF); */
  110. fprintf(stderr, "\nend char = %s\n", rstr+(strlen(rstr)-3) );
  111. if (rstr[(strlen(rstr)-1)] == '\\' ){ cont = 1;}
  112. else { cont = 0;}
  113. }
  114. fprintf(stderr, "\n\n\npacked_RS = %s\n\n", packed_RS);
  115. if ((rstr = calloc(MAX_LEN_RULESTRING,sizeof(char))) == NULL) { BadMem("Can't init mem for 'rstr'", 1); }
  116. strcpy(rstr,packed_RS);
  117. }
  118. /* coming out of the continuation loop, packed_RS contains the entire Name,RuleString,Window concat
  119. that the whole thing should look like. rstr has only the last line of a continued rule, so it needs to
  120. be synced, so copy packed_RS to rstr again */
  121. ctemp1 = strsep(&rstr, ","); /* using ctemp1 (=Name) for congruence with previous code */
  122. if (strlen(ctemp1) > 10) {
  123. fprintf(stderr, "Name %s is longer than allowed (%d) - check it!\n", ctemp1, MAX_PAT_NAME_LEN);
  124. exit(1);
  125. }
  126. /* so now packed_RS has the whole packed rulestring in it at this point; rstr has everything
  127. BUT the RuleName in it. So now we finish off the line, so break on ',' to get the rulestring
  128. and ONLY THEN break on the ',' to separate the rule term from the sliding window term
  129. (since already have processed the Name) */
  130. if (strlen(rstr) >= MAX_LEN_RULESTRING) {
  131. fprintf(stderr, "Total length of the rulestring exceeded; change MAX_LEN_RULESTRING \n"
  132. "or check your continuation characters for the rule %s\n", ctemp1);
  133. exit (1);
  134. }
  135. /* break on the comma so we can ignore the window term - this probably needs some more error-checking */
  136. memset(rulestring, 0, MAX_LEN_RULESTRING);
  137. rulestring = strsep(&rstr,","); /* rulestring now contains ONLY the packed rulestring */
  138. window_size = (long)atoi(strsep(&rstr,","));
  139. if (window_size < 1) { /* now test whether windowsize & rulestring are valid */
  140. fprintf (stderr, "Window size for %s is less than 1, which is silly.\n", ctemp1);
  141. exit (1);
  142. }
  143. /* Are the component pattern Names valid? ie is this name represented in RE by being loaded
  144. from the RE database? need to hash the Names of the individual patterns and see if they
  145. hit any of the Names that were loaded from whatever rebase was being used.
  146. Soooooo, go thru the rulestring and hash all words breaking on ()|^& to cut out the 'Name:min:Max' stanzas */
  147. tacg_Fill_SelEnz_fr_RuleString(SelEnz, &SEi, rulestring, NameCheck, ctemp1);
  148. /* stuff ALL the bits (Name, rulestring, window_size) into the struct (the validity of the rule itself
  149. tested in the evaluation code) */
  150. strcpy(Rules[RS_Cnt].Name, ctemp1);
  151. if ((Rules[RS_Cnt].RuleString = calloc((strlen(rulestring)+1),sizeof(char))) == NULL) {
  152. BadMem("Can't init mem for 'Rules[RS_Cnt].RuleString'", 1);
  153. }
  154. strcpy(Rules[RS_Cnt].RuleString, rulestring);
  155. Rules[RS_Cnt].Window = window_size;
  156. if ((rstr = calloc(MAX_LEN_RULESTRING,sizeof(char))) == NULL) { BadMem("Can't init mem for 'rstr'", 1); }
  157. RS_Cnt++;
  158. } else {
  159. /* fprintf(stderr, "\nDEBUG: line ignored: %s\n", rstr); */
  160. }
  161. }
  162. F.Xplicit = -1; /* notify ReadEnz */
  163. F.Rules = RS_Cnt-1; /* does this have to be decr by one 1st?? */
  164. return SEi;
  165. }
  166. /* ***************************************************************************************************** */
  167. /* tacg_Fill_SelEnz_fr_RuleString() takes the SelEnz sruct and the rulestring (and a few other housekeeping vars
  168. and parses the rulestring passed to it to populate SelEnz and do a namecheck to make sure that the names being
  169. parsed out of the rulestring haven't been seen before and aren't otherwise objectionable
  170. takes as args:
  171. SelEnz,
  172. the starting SEi (as an int * as it's changed in the fn())
  173. the rulestring (copied in the function so that the original rulestring is untouched)
  174. NameCheck (has to survive fn() to check for multiple identicals over multiple rulestrings,
  175. as well as to check for multiple identicals in the same rulestring (A&B|A&C)
  176. ctemp1 - Name of the Rule (for error messages)
  177. and back comes
  178. the (more) populated SelEnz
  179. and the ending SEi.
  180. */
  181. void tacg_Fill_SelEnz_fr_RuleString(struct SE_struct SelEnz[MAX_SEL_ENZ], int *SEi,
  182. char *rulestring, short NameCheck[RE_NAME_TAB_SIZ], char *ctemp1) {
  183. char *Name_mM, *rstr, *PatName = " ",
  184. *amin = " ",
  185. *aMax = " ";
  186. int ind;
  187. if ((Name_mM = calloc(33,sizeof(char))) == NULL) { BadMem("Can't init mem for 'Name_mM'", 1); }
  188. if ((rstr = calloc(MAX_LEN_RULESTRING,sizeof(char))) == NULL) { BadMem("Can't init mem for 'rstr'", 1); }
  189. strcpy(rstr,rulestring); /* re-using rstr; mem should be fine after re-calloc above */
  190. Name_mM[0] = '\0'; /* to pass on 1st pass */
  191. while (Name_mM[0] == '\0') {
  192. Name_mM = strsep(&rstr, "()|^&"); } /* this should break out a 'Name:#:#' set */
  193. while (Name_mM != 0L) {
  194. if (Name_mM[0] != '\0') {
  195. PatName = strsep(&Name_mM, ":");
  196. ind = (int) realhash(PatName, RE_NAME_TAB_SIZ);
  197. /* fprintf(stderr, "DEBUG: realhash '%s' = %d\n", PatName, ind); */
  198. if (strlen(PatName) < 11) {
  199. if (NameCheck[ind] == 0) {
  200. strcpy(SelEnz[*SEi].PName,PatName);
  201. NameCheck[ind] = 1; /* now mark it to make sure that we don't take it again */
  202. if ((amin = strsep(&Name_mM, ":")) == '\0') {
  203. SelEnz[*SEi].min = 0; /* RE[ii].min = */
  204. } else {
  205. SelEnz[*SEi].min = atoi(amin); /* RE[ii].min = */
  206. }
  207. if ((aMax = strsep(&Name_mM, ":")) == '\0') {
  208. SelEnz[*SEi].Max = 32000; /* RE[ii].Max = */
  209. } else {
  210. SelEnz[*SEi].Max = atoi(aMax); /* RE[ii].Max = */
  211. }
  212. (*SEi)++;
  213. } else { /* else it's already been seen in another rule or sub-rule and we don't care */
  214. /* fprintf(stderr, "DEBUG: The pattern named %s has already been seen previously; continuing .. \n", PatName); */
  215. }
  216. } else {
  217. fprintf (stderr, "\nIn Rule named %s, Pattern Name %s is too long (must be <=10 chars)\n",
  218. ctemp1, PatName);
  219. exit(1);
  220. }
  221. }
  222. Name_mM = strsep(&rstr, "()|^&"); /* get the next one */
  223. }/* @ end of loop, all constituent patterns are loaded into SelEnz or the app has exited on an err */
  224. }
  225. /* ***************************************************************************************************** */
  226. /* RE_Filter returns 1 if the RE indexed by Proto[REi] matches the tests for
  227. Magnitude
  228. Number of Cuts
  229. Cost
  230. Overlap
  231. etc that are can be used to fine-tune the selection of REs */
  232. int RE_Filters_OK(int REi) { /* REi should be the ProtoCutters index, as that is the only RE el that we have to check */
  233. int OK=0;
  234. /* fprintf(stdout, "\n REi = %d, RE name = %s \n", REi, RE[REi].E_nam); */
  235. if (((RE[REi].E_mag >= (int)F.Mag) && /* E_mag must be >= to -n flag */
  236. (REi >= F.RE_ST) && /* Don't EVER want dam/dcm sites printed */
  237. (RE[REi].E_Ncuts >= (int)F.Min) && /* Ncuts >= -m, */
  238. (RE[REi].E_Ncuts <= (int)F.Max) && /* Ncuts <= -M */
  239. (RE[REi].Cost >= (int)F.Cost) &&
  240. /* the dam/dcm part of this COULD be rolled in if the RWC (aka base_real_pos) was included as an optional
  241. argument (or if -1, ignore it), but right now it doesn't seem to be used in the same way, so I'll let
  242. the code stay but commented out for now */
  243. /* (DamDcmOK(base_real_pos, REi) == 1) && print only RE's !masked by dam/dcm methylases */
  244. (F.Overhang == 1)) || /* if want all, only 1st has to be true to short circuit */
  245. ((RE[REi].E_olap > 0) && (F.Overhang == 5)) || /* if olap < 0 && overhang = 5' */
  246. ((RE[REi].E_olap < 0) && (F.Overhang == 3)) || /* if olap > 0 && overhang = 3' */
  247. ((RE[REi].E_olap ==0) && (F.Overhang == 0))) /* if olap = 0 && overhang = blunt */
  248. { OK = 1; }
  249. else { OK = 0; }
  250. return OK;
  251. }
  252. /* if either the --dam or --dcm flags have been used, DamDcmOK() checks to see
  253. if the REi under consideration should be printed - has it been masked or is
  254. it OK to print it. What it does: passed the RE index of the enzyme, it
  255. checks to see if that RE is possibly affected and if so, whether the 'raw'
  256. hit recorded in DD.Dat is present in RE[].Sites. The index to the
  257. last-checked position in RE[].Sites is kept in:
  258. RE[].Sites[RE[].Sites[0]] as 'RE[].Sites[0]' points to the end of the array.
  259. In this way, only have to incr by 1 to get to the next check point.
  260. RWC = Real World Coordinates
  261. REi = RE index
  262. Hmmm - this could be the prototype for an overall RE filter to be used in a number of cases..
  263. */
  264. /* Clone() goes thru RE[] and figures out which REs match the conditions of the option flag, where
  265. --clone '#_#,#x#,#x#,#_#' (up to MAX_NUM_CLONE_RANGES). '_' indicates that the range cannot be cut;
  266. 'x' indicates that it can be cut.
  267. There will probably be some interactions between --clone and other options, but
  268. for now, finish the basic functionality and sort out the interactions later. In fact,
  269. RE[].. should not be a problem as it will be populated only with sites that are requested,
  270. so that if only a subset or special file of REs are used, only they will be used for
  271. for the clone() analysis.
  272. This fn() is called AFTER the digest is done in the normal way and should only
  273. need the number of REs (NumREs - not modified by this so it doesn;t ave to be
  274. passed as a pointer) as a max to go thru to determine the ranges that agree with the
  275. ranges specified.
  276. */
  277. /* should really be running thru Protos[] from 0->ProtoCutters */
  278. void Clone(int ProtoCutters, int *Protos) {
  279. long i, c, j, m, p, r,
  280. Best[F.Clone+2][ProtoCutters], /* Best = array that stores and sorts the best REs for a set of cloning criteria
  281. last el is for storing summary truths to check if ALL rules have been met */
  282. BadProtos[ProtoCutters]; /* BadProtos keep track of Protos that map inside a forbidden zone and therefore have to
  283. be removed from contention after the fact. If a Protos index shows up in BadProtos,
  284. it cannot be a possible cloning RE (unless cloner wants to try a partial digest.. */
  285. short N_Crits;
  286. int LadderProtos[ProtoCutters];
  287. N_Crits = F.Clone; /* simplicity; F.Clone stores the # of ranges in the --clone option string */
  288. for (i=0; i<= F.Clone; i++) {
  289. memset(Best[i],0,sizeof(int)*ProtoCutters);
  290. }
  291. memset(BadProtos,0,sizeof(long)*ProtoCutters);
  292. memset(LadderProtos,0,sizeof(int)*ProtoCutters);
  293. /* Need additional rules to elim REs.
  294. If an RE hits in the forbidden zone, that RE is then added to a FORBIDDEN array to be checked post-processing.
  295. now #x# = MUST HIT range Should there be an option to set prefs for this? Or just show how many
  296. there are in that zone?
  297. How should REs that hit in zones NOT explicitly marked as allowed (x) or forbidden (_) be treated? In typical
  298. cloning, don't want lots of extra cuts as it increases ends to contend with.
  299. */
  300. for (r=0; r<=N_Crits; r++) { /* for each range parsed out in SetFlags() */
  301. for (i=0; i<ProtoCutters; i++) {
  302. p = Protos[i]; /* p is the alias for the RE prototype index, so RE[p].etc will hit only prototypes */
  303. if (Clone_S[r].to_cut == 1) { /* if this is a range where we REQUIRE hits... */
  304. m = Clone_S[r].matching_REs[1]; /* [1] is the 1st free position of the array */
  305. for (j=1; j<RE[p].Sites[0]; j++) { /* RE[p].Sites[0] holds the index of the 1st FREE position in the Sites array */
  306. if (RE[p].Sites[j] >= Clone_S[r].begin && /* site must match BOTH conditions */
  307. RE[p].Sites[j] <= Clone_S[r].end ) {
  308. /* do we need a per-range log for positives & negatives? */
  309. Clone_S[r].matching_REs[m++] = i; /* incr m, store the Proto index */
  310. Clone_S[r].matching_REs[1] = m; /* keep track of the 1st free el as well */
  311. /* and check for realloc conditions */
  312. if ((m+1) == Clone_S[r].matching_REs[0]) { realloc_Clone_matching_REs(m, r); }
  313. } /* else we silently ignore the bad hits */
  314. }
  315. Clone_S[r].matching_REs[1] = m; /* and then store the position back into the array */
  316. } else { /* this is a range we where we will NOT TOLERATE hits */
  317. m = Clone_S[r].matching_REs[1]; /* [1] is the 1st free position of the array */
  318. for (j=1; j<RE[p].Sites[0]; j++) { /* RE[p].Sites[0] holds the index of the 1st FREE position in the Sites array */
  319. if (RE[p].Sites[j] <= Clone_S[r].begin || /* site can match either condition */
  320. RE[p].Sites[j] >= Clone_S[r].end ) {
  321. Clone_S[r].matching_REs[m++] = i; /* incr m, store the Proto index */
  322. Clone_S[r].matching_REs[1] = m; /* keep track of the 1st free el as well */
  323. /* and check for realloc conditions */
  324. if ((m+1) == Clone_S[r].matching_REs[0]) {realloc_Clone_matching_REs(m, r); }
  325. } else { /* else we have to track the REs to mask out the results by adding their index to BadProtos[] */
  326. BadProtos[i]++; /* if an el is > 0, it's EVIL */
  327. }
  328. }
  329. Clone_S[r].matching_REs[1] = m; /* and then store the position back into the array */
  330. }
  331. }
  332. }
  333. /* now we've run thru all the criteria for judging whether the REs map into the ranges
  334. now have to output the data in some usable form. 1st, are there any REs that match ALL requirements?
  335. 2nd output all ranges with all matching REs, sorted alphabetically. Should --clone automatically set
  336. --sites, so can output all the sites at the same time?? Hmmm...
  337. */
  338. /* 1st are there any that match ALL crits? for all crits, are there any indices that appear in all of them??
  339. set up array of length N_Crits and increment each RE index as go thru all the ranges. Also has the effect of
  340. being able to sort those REs that match the MOST crits as well. */
  341. for (i=0; i<=N_Crits; i++) { /* cycle thru all the crits */
  342. for (j=2; j<Clone_S[i].matching_REs[1]; j++) { /* cycle thru all the prototype REs that match for each crit */
  343. Best[i][Clone_S[i].matching_REs[j]]++; /* this will count the # of times a proto came up positive in the criteria */
  344. }
  345. } /* so now have all the REs counted. */
  346. /* and print the results in some kind of visually appealing way. */
  347. /* 1st print out any REs that met ALL crits.. */
  348. fprintf(stdout,"\n\n== Here's the Clone output for the %d Criteria:\n", (F.Clone+1));
  349. for (i=0; i<=F.Clone; i++) { fprintf(stdout, " Rule %ld: %s\n", i+1, Clone_S[i].range_rule); }
  350. for (j=0; j <= F.Clone; j++) { /* sum the rules results to generate a summary */
  351. for (i=0; i<ProtoCutters; i++) {
  352. if (Best[j][i] > 0 && BadProtos[i] == 0) {
  353. Best[F.Clone+1][i]++; /* for each rule passed, incr the truth element once */
  354. }
  355. }
  356. }
  357. /* and finally print out the Protos that match ALL the rules BUT this should probably be mod to allow selection and
  358. filtering by the standard rules -o -n -m -M --cost, etc but later... this will require a test fn() that tests all the
  359. Protos against all the various restrictions, but it will be applicable to at least 4 cases */
  360. fprintf(stdout,"\nREs that met ALL criteria:\n");
  361. /*
  362. for (i=0; i<ProtoCutters; i++) {
  363. if (Best[F.Clone+1][i] == F.Clone+1) {
  364. fprintf(stdout,"\n RE[Protos[i]].E_nam);
  365. }
  366. }
  367. */
  368. i = c = m = 0;
  369. while (i<ProtoCutters) {
  370. if (m >= F.Width) { m = 0; fprintf(stdout, "\n"); }
  371. if (Best[F.Clone+1][i] == F.Clone+1 ) {
  372. fprintf(stdout,"%10s", RE[Protos[i]].E_nam);
  373. LadderProtos[i] = Protos[i];
  374. c++; m += 10;
  375. }
  376. i++;
  377. }
  378. if (c == 0) { fprintf(stdout,"There were no REs that met ALL criteria\n"); }
  379. else {
  380. PrintGelLadderMap(i, F.Seq_Len, LadderProtos, 2);
  381. }
  382. memset(LadderProtos,0,sizeof(int)*ProtoCutters);
  383. fprintf(stdout,"\n\nResults (listed by Rules) that met SOME criteria & did not violate a hits-forbidden range.");
  384. for (j=0; j<=F.Clone; j++) {
  385. fprintf(stdout, "\n\n--- For Rule %ld (%s), the following REs matched:\n\n", j+1, Clone_S[j].range_rule);
  386. i = c = m = 0;
  387. while (i<ProtoCutters) {
  388. if (m >= F.Width) { m = 0; fprintf(stdout, "\n"); }
  389. if (Best[j][i] > 0 && BadProtos[i] == 0) {
  390. fprintf(stdout,"%10s", RE[Protos[i]].E_nam);
  391. LadderProtos[i] = Protos[i];
  392. c++; m += 10;
  393. }
  394. i++;
  395. }
  396. if (c == 0) { fprintf(stdout,"There were no REs that met EVEN SOME of the criteria\n"); }
  397. else {
  398. PrintGelLadderMap(i, F.Seq_Len, LadderProtos, 2);
  399. }
  400. memset(LadderProtos,0,sizeof(int)*ProtoCutters);
  401. }
  402. fprintf(stdout,"\n\n--- Results (listed by RE) that met SOME criteria & did not violate a hits-forbidden range\n");
  403. fprintf(stdout,"\n RE Name");
  404. for (j=0; j<=F.Clone; j++) { fprintf(stdout," Rule %ld", j+1); }
  405. fprintf(stdout,"\n");
  406. for (i=0; i<ProtoCutters; i++) {
  407. if (BadProtos[i] == 0) {
  408. fprintf(stdout,"\n%10s", RE[Protos[i]].E_nam);
  409. for (j=0; j<=F.Clone; j++) {
  410. if (Best[j][i] > 0) {
  411. fprintf(stdout," X ");
  412. } else {
  413. fprintf(stdout," ");
  414. }
  415. }
  416. }
  417. }
  418. fprintf(stdout,"\n\n");
  419. };
  420. void realloc_Clone_matching_REs(long current, long index) {
  421. long newsize;
  422. /* and check for realloc conditions */
  423. if ((current+1) == Clone_S[index].matching_REs[0]) { /* if we're nearly up to the alloc'ed mem, need to realloc */
  424. newsize = Clone_S[index].matching_REs[0] + CLONE_INCR_MRE; /* this is how big the new size should be */
  425. if ((Clone_S[index].matching_REs = (long *) realloc(Clone_S[index].matching_REs, sizeof(long) * newsize)) == NULL) {
  426. BadMem("Can't realloc mem for Clone_S[].matching_REs", 1);
  427. } else { Clone_S[index].matching_REs[0] = newsize; } /* new alloc range */
  428. }
  429. };
  430. /* the example function included below is a mock-up to show how to include your own function.
  431. Obviously, there are 10s of functions included with this source code and some of them may be
  432. immediately applicable to your work. Feel free to bash them into a useful state for yourself.
  433. If you think that you've created a widely useful function that you would like to have included
  434. with tacg, please give me a shout, and I'll include a pointer to it or provide other information
  435. about it on my web site.
  436. void Example(void){
  437. }
  438. */
  439. /* LinearMap() is simply a functionization of the Linear Mapping chunk that
  440. should have been functionized long ago */
  441. void LinearMap(long seq_len, int basesPerLine, int NumREs, char *sequence, int max_okline,
  442. char Codons[][][], int codon_Table, int *Protos) {
  443. /* Declarations */
  444. int i, d, k, mm=0, rc, n_letters=0, Xn=0, okline, HTML, spacer, spacermod, stanzaspace,
  445. min_okline, tic_line, Cur_RE, block_Cnt, block_repeat, block_cut_pos, lastline = O_SEQ_LINE+2, hits,
  446. ok2wr[MAX_OUTPUT_LINES], p_width, abcp, in_OL=-1, seq_offset, tripclick=0;
  447. long *BackCheck, /* BackCheck[] checks proto entries in the Linear map output to remove duplicates */
  448. DCnt, Cnt_reset=0, base_real_pos, numstart;
  449. char ctemp1[30], *s, Prot_out[MAX_BASES_PER_LINE],
  450. O_txt[MAX_OUTPUT_LINES][O_LMARG+O_RMARG+MAX_BASES_PER_LINE];
  451. /* char Codons[MAX_ORGS][N_CODONS][7]; */
  452. char os = '~'; /* started out as '<' to mark other strand, but causes problems with HTML generation */
  453. BackCheck = (long *)calloc((size_t)NumREs+2, sizeof(long)); /* Needs to be '+2' as combination cut will add one and 1 extra */
  454. if (BackCheck == NULL) BadMem("Can't init mem for BackCheck", 1);
  455. s = (char *) calloc(256, sizeof(char));
  456. if (s == NULL) BadMem("failed to calloc mem for 's'", 0);
  457. memset(O_txt,' ',(size_t)((O_LMARG+O_RMARG+MAX_BASES_PER_LINE)*MAX_OUTPUT_LINES)); /* set all of O_txt to blanks - works!*/
  458. p_width = O_LMARG + O_RMARG + basesPerLine;
  459. /* Routine to print out the text of the digestion, with names going where they should go
  460. a la Strider - have to keep track of how long the names are and where an allowable place
  461. to write the name - */
  462. /* In brief, the following function, does this:
  463. It reads thru DD->Dat (the array that keeps track of the linear sequence of cuts in the sequence)
  464. and matches the position of the enzymes to the current block being printed, taking into account
  465. the cut offset of the RE from the 1st base of the recognition sequence. The tricky part is that
  466. because of this latter functionality, it has to check the sequence that will be the *following*
  467. block to see if there are any RE recognition seqs within the 1st few bases that will cause the actual
  468. cut site to be in the *previous* block. Doing this seems to take a great amount of the cpu time - can
  469. probably be optimized a good deal more, but it's still >10 faster than GCG ;). */
  470. HTML = F.HTML; /* simplicity */
  471. n_letters = F.Lab1or3; /* simplicity */
  472. Xn = F.XLinFr; /* simplicity */
  473. numstart = F.NumStart; /* simplicity */
  474. fprintf(stdout, "\n\n");
  475. if (HTML > -1) fprintf(stdout, "<H3><A NAME=\"linmap\">");
  476. fprintf(stdout, "== Linear Map of Sequence:\n"); /* Write the header */
  477. if (HTML > -1) fprintf(stdout, "</A></H3>\n");
  478. /* inits, etc */
  479. if (seq_len % basesPerLine == 0) block_repeat = (int) seq_len/basesPerLine; /* # times thru the compose/print cycle */
  480. else block_repeat = (int) seq_len/basesPerLine + 1;
  481. DCnt=2; /* start the DD->Dat counter at the beginning of REAL DATA [0]=Cnt; [1]=Siz */
  482. base_real_pos = DD->Dat[DCnt];
  483. /* spacer dictates the line spacing between the 1st 3 and 2nd 3 translation stanzas; if 6 there
  484. will be no space between the 2 stanzas, if 7 there will be 1 space; could make this another
  485. option but sheesh... */
  486. spacer = 7; spacermod = 0;
  487. if (F.Strands == 1) { spacermod++; }
  488. if (F.Tics == 0) { spacermod++;}
  489. spacer = spacer - spacermod;
  490. if (spacer == 7) stanzaspace = spacermod + 1;
  491. for (block_Cnt=1; block_Cnt <= block_repeat; block_Cnt++) {
  492. /* hits is set to 1 if there are hits on the line; if no hits, decr the top buffer line to output
  493. so that there is a constant 1 blank line spacer between stanzas */
  494. hits = -1;
  495. /* if there was an OVERLAP failure previously, fix it */
  496. if (in_OL == 1) {
  497. memset(BackCheck,0,sizeof(long)*(NumREs+2)); /* clear BackCheck[] as it's filled with stale data */
  498. DCnt = Cnt_reset; /* and reset DCnt to that pointer */
  499. }
  500. in_OL = -1; /* and reset 'in OVERLAP' marker for new block */
  501. base_real_pos = DD->Dat[DCnt]; /* Make sure that ' base_real_pos' is set */
  502. okline = min_okline = O_SEQ_LINE-2; /* set the min (highest on page) line of buffer to print out */
  503. /* Set up the seq and its rev compl in the output buffer */
  504. memset(ok2wr,0,sizeof(int)*MAX_OUTPUT_LINES); /* set/reset all of ok2wr to 0's for the new block*/
  505. /* next line was deleted in a brief fork of tacg.c - note this if get unexlained segfs */
  506. memset(BackCheck, 0, sizeof(long)*NumREs); /* reset the BackCheck[] */
  507. /*write the correct numbering in Left margin */
  508. if (F.NumStart < 1 && (((block_Cnt-1)*basesPerLine) + 1 + F.NumStart) == 0) {numstart++; tripclick++;}
  509. sprintf((char *)s, "%7ld",(((block_Cnt-1)*basesPerLine) + 1 + numstart)); /* numstart is just a + offset */
  510. memcpy(&O_txt[O_SEQ_LINE][0], s, 7);
  511. /* and in the right margin - combine with the above bit? */
  512. if (F.NumStart < 0 && (tripclick || (((block_Cnt)*basesPerLine + numstart)) > 0)) {
  513. if (!tripclick) {tripclick++; numstart++;}
  514. }
  515. sprintf((char *)s, "%7ld",(((block_Cnt)*basesPerLine + numstart)));
  516. memcpy(&O_txt[O_SEQ_LINE][O_LMARG+basesPerLine], s, 7);
  517. /* and drop in the sequence directly after it. */
  518. if (block_Cnt != block_repeat) k = basesPerLine; /* test for end of sequence to end gracefully */
  519. else k = (int)seq_len - (block_Cnt-1)*basesPerLine;
  520. if ((F.SeqBeg != 1 || F.NumStart != 0) &&
  521. (F.Strands != 1 || F.Tics != 0)) { /* if subsequence or dif #ing, print the #s of the original seq too */
  522. /* write the correct numbering in Left margin */
  523. sprintf((char *)s, "%7d",((int)(((block_Cnt-1)*basesPerLine)+F.SeqBeg))); /* these 2 lines could be combined...? */
  524. memcpy(&O_txt[O_SEQ_LINE+1][0], s, 7);
  525. /* and in the right margin - combine with the above bit? */
  526. sprintf((char *)s, "%7d",(int)(((block_Cnt)*basesPerLine - 1 + F.SeqBeg)));
  527. memcpy(&O_txt[O_SEQ_LINE+1][O_LMARG+basesPerLine], s, 7);
  528. }
  529. memcpy(&O_txt[O_SEQ_LINE][O_LMARG], &sequence[(block_Cnt-1)*basesPerLine+BASE_OVERLAP+1], (size_t)k);
  530. if (F.Strands == 2) { /* if we want the bottom strand as well */
  531. Rev_Compl(sequence+BASE_OVERLAP+1+((block_Cnt-1) * basesPerLine), s, (long)k);
  532. /* and then plunk the converted sequence into O_txt with a similar statement */
  533. memcpy(&O_txt[O_SEQ_LINE+1][10], s, (size_t)k);
  534. }
  535. if (F.Tics == 1) { /* if we want tics */
  536. /* write out the minor and major tics below the reverse complement */
  537. tic_line = O_SEQ_LINE+2;
  538. /* if there's only 1 strand AND we want tics, have to move them up 1 line */
  539. if (F.Strands == 1) tic_line -= 1;
  540. for (i=0;i<(int)(basesPerLine/10);i++) memcpy (&O_txt[tic_line][i*10+O_LMARG]," ^ *",10);
  541. }
  542. /* Following call to Translate has to calculate the # of bases at each call; otherwise end up with
  543. overrun condition at end; "k" in code above
  544. This stanza doesn't have to change at all, even after the -T/-t change of 9.2.98 */
  545. rc = seq_len % 3; /* rc is the correction factor for the trans, if not a perfect div by 3;
  546. if it is a perfect div by 3, then rc = 0, and there won't be any change */
  547. if (n_letters != 0) {
  548. for (mm=0;(mm<(int)Xn && mm<3); mm++) {
  549. if (k % 3 != 0) k = ((int) k /3)*3; /* also added this code to Translate() for better modularity */
  550. /* this chunk doen't care about rc */
  551. seq_offset = ((block_Cnt-1)*basesPerLine+BASE_OVERLAP+1);
  552. Translate(sequence+seq_offset+mm, Prot_out, k, n_letters, Codons, codon_Table);
  553. /* need to place the translation frame labels here before memcpy() call */
  554. d = 0;
  555. if (F.Strands == 1) { d += 1; }
  556. if (F.Tics == 0) { d += 1; }
  557. sprintf(ctemp1, "%d", mm+1); O_txt[O_SEQ_LINE+3+mm-d][0] = ctemp1[0];
  558. memcpy(&O_txt[O_SEQ_LINE+3+mm-d][O_LMARG+mm], &Prot_out, (size_t)k); /* and copy it into place */
  559. lastline = O_SEQ_LINE + 3 + mm - d; /* keep track of last line of Xl */
  560. if (Xn == 6) { /* but if it's 6 frames of Xlation, need to do oppo at the same time */
  561. /* but not EXACTLY the same - seq to be has to be shifted slightly to account for
  562. those seqs not exactly / 3 - should have to only add a correction to the expr below
  563. AND add the same correction to the offset for printing it out, so this chunk DOES care
  564. about rc */
  565. Anti_Par(sequence+seq_offset-mm+rc, s, k); /* get the *antipar* sequence */
  566. /* Translate() below shouldn't have to change */
  567. Translate (s, Prot_out, k, n_letters, Codons, codon_Table); /* Xlate it into N-letter code */
  568. if (n_letters == 1) Reverse (Prot_out); /* to match the 'backwards' DNA strand */
  569. else Triplet_Reverse (Prot_out);
  570. /* need to place the translation frame labels here before memcpy() call */
  571. sprintf(ctemp1, "%d", mm+4); O_txt[O_SEQ_LINE+spacer+mm][0] = ctemp1[0];
  572. memcpy(&O_txt[O_SEQ_LINE+spacer+mm][O_LMARG-mm+rc], &Prot_out, (size_t)k); /* and copy it into place */
  573. lastline = O_SEQ_LINE + spacer + mm; /* this is the last line of the Xl, so last to print */
  574. }
  575. }
  576. } else {
  577. d = 0;
  578. if (F.Strands == 1) { d += 1; }
  579. if (F.Tics == 0) { d += 1; }
  580. lastline = O_SEQ_LINE + 2 - d;
  581. }
  582. /* This 'while' loop tests each text block for REs and also prints out those blocks that have no
  583. cuts in them at all but which have to be printed anyway....*/
  584. while ((DD->Dat[DCnt] != -2) && (base_real_pos < ((block_Cnt * basesPerLine) + BASE_OVERLAP))) {
  585. DCnt++;
  586. /* if we're not at the end and the real pos'n < current block + the overlap.. */
  587. /* !! Now DD->Dat[DCnt] should point to the corresponding RE # */
  588. /* if following test is true (they're !=) then it's safe to write it; */
  589. /* if it's false, then the same proto has already written to the same space */
  590. if (BackCheck[labs(DD->Dat[DCnt])] != labs(DD->Dat[DCnt-1])) { /* if !=, do this, but 1st... */
  591. BackCheck[labs(DD->Dat[DCnt])] = labs(DD->Dat[DCnt-1]); /* ... set it = so that it won't do this one again */
  592. okline = O_SEQ_LINE-2;
  593. /* This is where the RE filter could be inserted so that the DD data is ignored unless the RE that's
  594. referenced matches the parameters set in the filters (can take this straight from the ladder-map
  595. functions. */
  596. /* direct from GelLad..
  597. these are the conditions that have to be passed in order to have the REs placed in the
  598. linear map. */
  599. /* DD->Dat[DCnt] is still referencing a possibly neg #, so have to take the abs() */
  600. Cur_RE = labs((int)DD->Dat[DCnt]);
  601. /* this whole 'if' test thingie can and should be reduced to one extensible function call
  602. (using the embedded DamDcmOK() as a template), but ... not yet... */
  603. if ((DamDcmOK(base_real_pos, Cur_RE) == 1) && RE_Filters_OK(Cur_RE)) { /* if olap = 0 && overhang = blunt */
  604. /* this is the overall test for whether the RE will be printed out in the Linear Map or not
  605. if the RE DOESN'T meet the crit, it just gets ignored here, but will be printed in other outputs */
  606. block_cut_pos = (int)base_real_pos - 1 - ((block_Cnt-1) * basesPerLine); /* see if it's this easy?! */
  607. /* Now locate it in the block, checking for under- and over-runs */
  608. if (block_cut_pos >= 0) { /* if the position is greater than the beginning of the block */
  609. if (block_cut_pos <= basesPerLine) { /* and if it's less than the end of the block... */
  610. abcp = O_LMARG + block_cut_pos - 1; /* abcp = adjusted block_cut_pos - used x times below */
  611. O_txt[O_SEQ_LINE-1][abcp] = '\\'; /* write the 'cut' indicator */
  612. hits = 1;
  613. /* locate the position for writing the name */
  614. while (block_cut_pos < ok2wr[okline]) okline--; /* then go up to the lowest 'free' line */
  615. if (okline != O_SEQ_LINE-2) { /* but if can't place the name on the lowest line.. */
  616. i = O_SEQ_LINE-1; k = 0; /* check if there's any space lower between previous names */
  617. /* following 'while' must be ordered like it is - i will be decr only if k == 0 */
  618. while (k == 0 && --i != okline) { /* leaving 1 space before and after the name */
  619. if (O_txt[i][abcp-1] == ' ' && /* this can be replaced with something */
  620. O_txt[i][abcp+RE[Cur_RE].E_nam_l] == ' ' && /* more efficient later */
  621. O_txt[i][abcp+3] == ' ' && /* must check for a space here... */
  622. O_txt[i][abcp+4] == ' ' ) k = 1; /* as well as here */
  623. /* i = vertical counter ~= okline, k = 1 when we find enuf space */
  624. }
  625. okline = i;
  626. }
  627. if (okline < 2) {
  628. fprintf(stderr, "!! Output Buffer Blown around base %ld - degeneracy caused too many Enz's\n"
  629. "to be matched!! Increase the stringency of Enz selection w/ -n -o -M flags\n", base_real_pos);
  630. exit(0);
  631. }
  632. /* and memcpy the RE name from the struct to the output array */
  633. memcpy (&O_txt[okline][abcp],&RE[Cur_RE].E_nam, (size_t)RE[Cur_RE].E_nam_l);
  634. if (DD->Dat[DCnt] < 0) { /* if the match is in the (-) orientation */
  635. memcpy (&O_txt[okline][abcp+RE[Cur_RE].E_nam_l], &os, 1);
  636. }
  637. /* and incr the ok2wr posn for that line by the length of the RE name + 1 */
  638. /* 'ok2wr[okline]' below is modified only if we didn't find any lower spaces */
  639. if (block_cut_pos+RE[Cur_RE].E_nam_l+1 > ok2wr[okline]) {
  640. ok2wr[okline] = block_cut_pos+RE[Cur_RE].E_nam_l + 1;
  641. }
  642. } /* if (block_cut_pos < basesPerLine).. */
  643. else { /* otherwise it's in the overlap area from the next block */
  644. if (in_OL == -1) { /* if this is the 1st time in OVERLAP, mark it */
  645. in_OL = 1; /* by setting in_OL to 1 */
  646. Cnt_reset = DCnt - 1; /* Have to back up to 1st RE in the OVERLAP that did not
  647. resolve into the current block */
  648. } /* The above marker *should* back the DCnt to the position of the Cur_RE */
  649. }
  650. } /* if (block_cut_pos >= 0) ... */
  651. }
  652. base_real_pos = DD->Dat[++DCnt]; /* make sure that this is set before the next loop */
  653. if (okline < min_okline) min_okline = okline;
  654. } else {
  655. base_real_pos = DD->Dat[++DCnt]; /* this gonna fix it? */
  656. }
  657. } /* while (DD->Dat..... */
  658. /* Now print out the block we've composed, from min_okline to MAX_OUTPUT_LINES */
  659. /* adj to min_okline is nec to provide a 1 blank line between stanzas */
  660. for (i=min_okline-hits; i<= lastline; i++) fprintf(stdout, "%.*s\n",p_width,O_txt[i]);
  661. /* And clean out the lines written into O_txt as well */
  662. memset(O_txt,' ',(size_t)((O_LMARG + O_RMARG + MAX_BASES_PER_LINE) * MAX_OUTPUT_LINES)); /* set all of O_txt to blanks */
  663. } /* for (block_Cnt=1; block_Cnt<block_repeat; block_Cnt++) ... */
  664. free(BackCheck); /* this line was also deleted from the bike fork of tacg.c at one point
  665. along with the corresponding memset at ~ line 36 */
  666. }
  667. /* GenerateTOC() simply checks the status of all the major output headers and
  668. generates the appro HTML to make up a Table of Contents for the results that
  669. follow. It does not actually check that they have completed correctly, but
  670. it does keep everything in one place. Just a series of if()'s */
  671. void GenerateTOC(char *datestamp) {
  672. /* print a header */
  673. fprintf(stdout, "<h2>Table of Results</h2>\n");
  674. /* maybe print some date, version, file, machine run on, domain run on? etc info */
  675. fprintf(stdout, "<b>tacg version %s; Date: %s </b><br>\n"
  676. "<i>Contact Harry Mangalam (mangalam@home.com) with comments, critiques</i><br>\n",TACG_VERSION, datestamp);
  677. /* Start the UNordered list */
  678. fprintf(stdout, "<ul>\n");
  679. if (F.Summary != 0) { /* this generates no-cuts and all-cuts */
  680. fprintf(stdout, "<li><a href=\"#nocuts\">Patterns that DO NOT MAP to this sequence</a>\n");
  681. fprintf(stdout, "<li><a href=\"#allcuts\">Total Number of Hits per Pattern</a>\n");
  682. }
  683. if (F.Ladder != 0) { /* generates ladder map and infrequent matches */
  684. fprintf(stdout, "<li><a href=\"#ladmap\">Ladder Map of Pattern Matches</a>\n");
  685. fprintf(stdout, "<li><a href=\"#summarycuts\">Summary Map of Infrequent Matches</a>\n");
  686. }
  687. if (F.GelLO != 0 || F.GelHI != 0) {
  688. fprintf(stdout, "<li><a href=\"#gelmap\">Pseudo Gel Map</a>\n");
  689. }
  690. if (F.Sites != 0) {
  691. fprintf(stdout, "<li><a href=\"#sitetab\">Sites by Pattern</a>\n");
  692. }
  693. if (F.Frags == 1) {
  694. fprintf(stdout, "<li><a href=\"#fragunsort\">Fragment Sizes (UNSorted)</a>\n");
  695. }
  696. if (F.Frags == 2) {
  697. fprintf(stdout, "<li><a href=\"#fragsort\">Fragment Sizes (Sorted)</a>\n");
  698. }
  699. if (F.ORFs != -1) {
  700. fprintf(stdout, "<li><a href=\"#orfs\">Open Reading Frame Analysis</a>\n");
  701. }
  702. if (F.LinMap != 0) {
  703. fprintf(stdout, "<li><a href=\"#linmap\">Linear Map</a>\n");
  704. }
  705. if (F.GrfBBin != 0) {
  706. fprintf(stdout, "<li><a href=\"#graphics\">Graphics Plotting Data</a>\n");
  707. }
  708. if (F.Prox != 0) {
  709. fprintf(stdout, "<li><a href=\"#pair_proximity\">Pairwise Proximity</a>\n");
  710. }
  711. if (F.Rules != 0) {
  712. fprintf(stdout, "<li><a href=\"#rules_proximity\">Complex Rules-based Proximity</a>\n");
  713. }
  714. if (F.Hookey != -1) {
  715. fprintf(stdout, "<li><a href=\"#hookeyunsort\">AFLP Analysis - UNSORTED</a>\n");
  716. fprintf(stdout, "<li><a href=\"#hookeysort\">AFLP Analysis - SORTED</a>\n");
  717. }
  718. if (F.GrafMap != -1) {
  719. if (F.PDF == 1) {
  720. fprintf(stdout, "<li><a href=\"%s/tacg_Map.pdf \">PDF Plasmid Map</a>\n", F.tmppath);
  721. } else {
  722. fprintf(stdout, "<li><a href=\"%s/tacg_Map.ps \">Postscript Plasmid Map</a>\n", F.tmppath);
  723. }
  724. }
  725. /* spares below */
  726. /*
  727. if (F. != ) {
  728. fprintf(stdout, "<li><a href=\"#LABEL\">DESCRIPTION</a>\n");
  729. }
  730. */
  731. /* below marks the end of the UNordered list and the beginning of the PRE-formatted output */
  732. fprintf(stdout, "</ul><HR noshade size=5><pre>\n");
  733. /* fprintf(stdout, "<> </>\n"); */
  734. }
  735. /* if either the --dam or --dcm flags have been used, DamDcmOK() checks to see
  736. if the REi under consideration should be printed - has it been masked or is
  737. it OK to print it. What it does: passed the RE index of the enzyme, it
  738. checks to see if that RE is possibly affected and if so, whether the 'raw'
  739. hit recorded in DD.Dat is present in RE[].Sites. The index to the
  740. last-checked position in RE[].Sites is kept in:
  741. RE[].Sites[RE[].Sites[0]] as 'RE[].Sites[0]' points to the end of the array.
  742. In this way, only have to incr by 1 to get to the next check point.
  743. RWC = Real World Coordinates
  744. REi = RE index
  745. Hmmm - this could be the prototype for an overall RE filter to be used in a number of cases..
  746. */
  747. int DamDcmOK(long RWC, int REi) {
  748. int i, OK=0;
  749. /* 1st, check to see if this RE is sensitive at all and if not, skip it */
  750. /* also skip if it doesn't have ANY cuts at all */
  751. if ((RE[REi].E_Ncuts > 0) && ((F.dam == 1 && RE[REi].dam != 100) || (F.dcm == 1 && RE[REi].dcm != 100))) {
  752. if (RE[REi].Sites[RE[REi].Sites[0]] == 0) {
  753. RE[REi].Sites[RE[REi].Sites[0]] = 1; /* 1st time thru, set this to 1 (point it at the 1st real value */
  754. }
  755. OK = 0;
  756. i = RE[REi].Sites[RE[REi].Sites[0]]; /* shorten: i = the last el to have been checked */
  757. while (RWC >= RE[REi].Sites[i] && OK == 0) {
  758. if (RWC == RE[REi].Sites[i]) {
  759. OK = 1;
  760. }
  761. i++;
  762. }
  763. RE[REi].Sites[RE[REi].Sites[0]] = i;
  764. } else OK = 1;
  765. if (RE[REi].E_Ncuts == 0) OK = 0; /* just in case */
  766. return OK;
  767. }
  768. /* Add_dam and Add_dcm just fill the RE[1] and RE[2] entries for these special
  769. methylation enzymes. */
  770. void Add_dam(int NumREs) {
  771. strcpy(RE[NumREs].E_nam,"dam\0");
  772. if ((RE[NumREs].E_raw_sit = calloc(5, sizeof(char))) == NULL) BadMem("Add_dam-1", 1);
  773. RE[NumREs].E_raw_sit = "GATC\0";
  774. if ((RE[NumREs].E_wsit[1] = calloc(5, sizeof(char))) == NULL) BadMem("Add_dam-2", 1);
  775. RE[NumREs].E_wsit[1] = "gatc\0";
  776. strcpy(RE[NumREs].E_hex[1],"gatcnn\0");
  777. RE[NumREs].E_tcut[1] = 0;
  778. RE[NumREs].E_olap = 0;
  779. RE[NumREs].E_len = 4;
  780. RE[NumREs].E_pal = 1;
  781. RE[NumREs].E_dgen = 16;
  782. RE[NumREs].E_mag = 4;
  783. RE[NumREs].proto = NumREs;
  784. RE[NumREs].Err = 0;
  785. RE[NumREs].E_nam_l = 3;
  786. }
  787. void Add_dcm(int NumREs) {
  788. strcpy(RE[NumREs].E_nam,"dcm\0");
  789. if ((RE[NumREs].E_raw_sit = calloc(6, sizeof(char))) == NULL) BadMem("Add_dcm-1", 1);
  790. RE[NumREs].E_raw_sit = "CCWGG\0";
  791. if ((RE[NumREs].E_wsit[1] = calloc(6, sizeof(char))) == NULL) BadMem("Add_dcm-2", 1);
  792. RE[NumREs].E_wsit[1] = "ccwgg\0";
  793. strcpy(RE[NumREs].E_hex[1],"ccwggn\0");
  794. RE[NumREs].E_tcut[1] = 0;
  795. RE[NumREs].E_olap = 0;
  796. RE[NumREs].E_len = 5;
  797. RE[NumREs].E_pal = 1;
  798. RE[NumREs].E_dgen = 8;
  799. RE[NumREs].E_mag = 4;
  800. RE[NumREs].proto = NumREs;
  801. RE[NumREs].Err = 0;
  802. RE[NumREs].E_nam_l = 3;
  803. }
  804. /***************************************************************************************************/
  805. /* The following code performs the "pad the beginning sequence with the end and the end
  806. with the beginning" regardless of whether it's the whole sequence or a subsequence,
  807. using the vars 'begin' and 'end'. *Bare_Seq_Len is the the exactly "Real Sequence
  808. Length" as determined from the SEQIO routines, no padding added or subtracted -
  809. it's incremented to TotPaddedSeqLen in here and passed back to calling fn() via
  810. *Pad_Seq_Cnt */
  811. /* TotPaddedSeqLen length of sequence + bracketing overlaps
  812. Bare_Seq_Len actual number of bases read
  813. */
  814. /* will try to contruct this so that you give it the exact sequence you want padded, and it
  815. does all the calculations, work to alloc a new sequence space and do the padding, returning
  816. exactly what you want */
  817. char *PadEndForEnd(char *SeqIn, long *Pad_Seq_Len, long *Bare_Seq_Len) {
  818. long m, j, l, BareLen, TotPaddedSeqLen, FrPadSeqLen;
  819. /* int f1=0, iseq; */
  820. char *sequence, b

Large files files are truncated, but you can click here to view the full file