PageRenderTime 66ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/code_comm/SlidWin.c

http://research-code-base-animesh.googlecode.com/
C | 601 lines | 326 code | 91 blank | 184 comment | 163 complexity | 1d18ef69eb0b3e383d5cce39ff0dc3c5 MD5 | raw file
  1. /* tacg - a command line tool for the analysis of nucleic acids and protein */
  2. /* Copyright Š 1994-2001 Harry J Mangalam, tacg Informatics
  3. (mangalam@tacgi.com | mangalam@home.com | 949 856 2847) */
  4. /* $Id: SlidWin.c,v 1.6 2001/10/04 21:31:28 hjm Exp $ */
  5. /* The use of this software (except that by Harald T. Alvestrand, which is described in 'udping.c')
  6. and that by James Knight, which is described in 'seqio.c' is bound by the notice that appears
  7. in the file 'tacg.h' which should accompany this file. In the event that 'tacg.h' is not bundled
  8. with this file, please contact the author.
  9. */
  10. #include <stdio.h>
  11. #include <ctype.h>
  12. #include <string.h>
  13. #include <stdlib.h>
  14. #include <regex.h>
  15. /* #include "seqio.h" */
  16. #include "tacg.h"
  17. /*********************************** tacg_SlidingWindows ************************************/
  18. /* The following code searches thru DD->Dat, and verifies whether the per-pattern min/Max limits
  19. are being met within that window. It returns the number of Positive Sliding Windows over all
  20. the patterns searched. The struct that holds all the data is global, so it can be accessed
  21. from anywhere. It also handles it's own printing if requested, by setting PrintIf1 = 1 */
  22. /* where rule = the rule to evaluate; allows fn() to evaluate a single rule at a time, so it can be fed
  23. indices of arbitrary rules (to allow it to be configured to eval only selected rules from a file of rules) */
  24. /* this should be changed to handle the --rule as a special (single) case of the --rulefile flag;
  25. then we wouldnt need to pass it rulestring - it would already have it (named) in the Rule struct. */
  26. /* and ProtoNameHash[] shouldn;t be required either, as we don't have to do this kind of backchecking - ReadEnz
  27. will only read those REs that were represented in the Rule files. */
  28. /* shouldn't need the skip hack as all the patterns need to be printed */
  29. int tacg_SlidingWindow(int rule, int NumREs, int PrintIf1, struct SE_struct SelEnz[MAX_SEL_ENZ]) {
  30. short NameCheck[RE_NAME_TAB_SIZ]; /* indices from realhash(PatName) to check against multiple identicals */
  31. int *Counts, /* callocable array that is indexed on # of patterns and holds the counts
  32. per pattern */
  33. PWIncSiz = 20, /* PosWin Incr/Ending Size */
  34. PWEndSiz = 20,
  35. PWCnt=0, p, EOLA, Pspace=0, r, rl, i, j, k, m, LP, RP, SLP, Op, R, ok, L, lo, loopCnt=0,
  36. /* if gonna allow multiple rules to be checked at once, gonna have to make the 2 things following
  37. into 2D arrays ie: **LogicArr, **TruthArr */
  38. *LogicArr, *TruthArr,
  39. SWHits, /* counter for the Sliding Window hits */
  40. SWHBIncSiz = 200, /* the initial and incr size of SWHitBuf (carries the hits in the sliding windows) */
  41. Lparen[10] = {0,0,0,0,0,0,0,0,0,0},
  42. Lo=2, /* the Low end pointer to DD->Dat (bottom of the SlidWin) */
  43. Hi=4, /* the High end pointer to DD->Dat (top of the SlidWin) */
  44. namelen, NameOpen, Matched, /* vars for extracting the Name from the Name:#:# string */
  45. lpi, CLVp, CTruth, RVp, COp, CRV, amt2mov, SEi ;
  46. long seq_len, SW=0,
  47. *SWHitBuf; /* follows DD->Dat, where [0] is current pointer; [1] is the last alloc'ed el */
  48. char *Name, rulematch,
  49. *lcREName, *rulestring, *rulecopy;
  50. if ((rulestring = calloc(MAX_LEN_RULESTRING,sizeof(char))) == NULL) { BadMem("Can't init mem for 'rulestring'", 1); }
  51. if ((Name = calloc(11,sizeof(char))) == NULL) { BadMem("SlidWin.c: Name", 1); }
  52. /* is there any reason to have a separate F.SlidWin now?? */
  53. seq_len = F.Seq_Len;
  54. /* copy in the bits we need from the Rule struct and proceed */
  55. strcpy(rulestring, Rules[rule].RuleString);
  56. strcpy(Name, Rules[rule].Name);
  57. if (F.SlidWin > 0) { SW=F.SlidWin; } /* abbriev. for the Flag */
  58. else if (Rules[rule].Window != 0){
  59. SW = Rules[rule].Window; /* take it from the appro Rule struct value */
  60. } else {
  61. SW = seq_len; /* else it defaults to the entire sequence length */
  62. }
  63. /* Following stanza (or subpart thereof) must be wrapped in a loop that moves the window up DD->Dat
  64. by hits, testing at each correctly sized window, all of the conditions. Seems that after the
  65. initial error checking there should be a way to avoid going thru the whole stanza repeatedly */
  66. if (SW <= seq_len) { /* 'global' if that spans the file - SW has to be smaller than whole sequence*/
  67. /* PW_struct is external, so don't have to return it, but only its size */
  68. if ((SWHitBuf = calloc(SWHBIncSiz, sizeof(long))) == NULL) BadMem("No mem for SWHitBuf", 1);
  69. SWHitBuf[0] = 2; SWHitBuf[1] = SWHBIncSiz; /* set up the initial values */
  70. /* alloc PosWin to a suitable starting size */
  71. if ((PosWin = calloc(PWIncSiz, sizeof(*PosWin))) == NULL) BadMem("No mem for PosWin", 1);
  72. /* alloc PosWin[].PatHits[] (1st dimension) for the initial group */
  73. for (i=0; i<PWIncSiz; i++) {
  74. if ((PosWin[i].PatHits = calloc(NumREs, sizeof(long*))) == NULL) BadMem("No mem for PosWin[]PatHits", 1);
  75. /* and now calloc mem for the 0th el which keeps track of the base of the lst sliding win so we don't
  76. have to count up to it each time we want to figure to how many hits there have been in the window */
  77. if ((PosWin[i].PatHits[0] = calloc(NumREs, sizeof(long))) == NULL) BadMem("No mem for PosWin[].PatHits[0]", 1);
  78. }
  79. /* this is only for the 0th PatHits el of the 0th el of PosWin: PosWin[0].PatHits[0][...] */
  80. for (j=0; j< NumREs; j++) { /* and set them all to 1 (where RE[].Sites begins */
  81. PosWin[0].PatHits[0][j] = 1;
  82. }
  83. rl = strlen(rulestring);
  84. if ((Name = (char *)calloc(MAX_PAT_NAME_LEN+1, sizeof(char))) == NULL) {
  85. BadMem("No mem for Name", 1);
  86. }
  87. if ((lcREName = (char *)calloc(MAX_PAT_NAME_LEN+1, sizeof(char))) == NULL) {
  88. BadMem("No mem for lcREName", 1);
  89. }
  90. if ((TruthArr = (int *)calloc(rl, sizeof(int))) == NULL) {
  91. BadMem("No mem for TruthArr", 1);
  92. }
  93. if ((LogicArr = (int *)calloc(rl, sizeof(int))) == NULL) {
  94. BadMem("No mem for LogicArr", 1);
  95. }
  96. if ((rulecopy = (char *)calloc(rl+1, sizeof(char))) == NULL) {
  97. BadMem("No mem for rulecopy", 1);
  98. }
  99. strcpy(rulecopy, rulestring);
  100. /* ************************ calculate LogicArr ************************* */
  101. /* i = counter for rulestring
  102. j = counter for LogicArr (which will be shorter) */
  103. /* 'bani:2:4&hinfi:3:5' */
  104. i = -1; j = 1; namelen = 0; NameOpen = 1; Matched = 0;
  105. while (i++ < rl) {
  106. if (NameOpen == 0 && Matched == 0) { /* the name was closed by some other concluding character, so now processs the name */
  107. /* by running thru RE to try to match the names, unless it's already been processsed */
  108. k = F.RE_ST; m = 0; /* k has to start @ F.RE_ST, not 1, not 0 */
  109. while (k < NumREs && m == 0) { /* this placement wastes 1 set of loops; maybe move it? */
  110. lcREName = DownCase(RE[k].E_nam);
  111. Name = DownCase(Name);
  112. /* fprintf(stdout, "\n Name=%s\n", Name); */
  113. if (strcmp(lcREName, Name) == 0) {
  114. m = 1; LogicArr[j++] = k;
  115. /* fprintf(stderr,"Matched: %s!\n", lcREName); */
  116. } else {
  117. /* fprintf(stderr,"NOT matched: %s!\n", lcREName); */
  118. }
  119. k++;
  120. }
  121. namelen = 0; Matched = 1; /* reset the Name params to allow a new name to be started */
  122. }
  123. switch (rulestring[i]) {
  124. default: /* name chars - anything other than : ( ) | & or a # */
  125. NameOpen = 1; Matched = 0; /* Name is now open and hasn't been matched yet */
  126. /* process it as normal */
  127. if (namelen <= MAX_PAT_NAME_LEN) { /* there IS a Name being composed, so if it's < Max */
  128. Name[namelen++] = rulestring[i]; /* copy it over as well */
  129. } else {
  130. fprintf(stderr, "In '--rules', a name was too long (must be <= %d chars).\n", MAX_PAT_NAME_LEN);
  131. exit(1);
  132. }
  133. break;
  134. case '|': LogicArr[j++] = -1; Name[namelen] = '\0'; NameOpen = 0; break;
  135. case '&': LogicArr[j++] = -2; Name[namelen] = '\0'; NameOpen = 0; break;
  136. case '^': LogicArr[j++] = -7; Name[namelen] = '\0'; NameOpen = 0; break;
  137. case '(': LogicArr[j++] = -3; Name[namelen] = '\0'; NameOpen = 0; break;
  138. case ')': LogicArr[j++] = -4; Name[namelen] = '\0'; NameOpen = 0; break;
  139. case ':': Name[namelen] = '\0'; NameOpen = 0; break;
  140. case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
  141. /* skip it, unless NameOpen == 1 (in which case, it's part of the Name) */
  142. if (NameOpen == 1){
  143. Name[namelen++] = rulestring[i]; /* copy it over as well */
  144. }
  145. break;
  146. }
  147. }
  148. LogicArr[0] = j; /* sets EOLA */
  149. /* init and zero Counts[] - will only use 1->NumREs */
  150. if ((Counts = calloc(NumREs+1, sizeof(int))) == NULL) BadMem("Can't init mem for Counts", 1);
  151. if (DD->Dat[0] > 3) { /* if > 2 hits (if [0] > 3, there has to be a paired [5]) */
  152. /* Counts[abs(DD->Dat[3])]++; Counts[abs(DD->Dat[5])]++; initial values */
  153. Lo = Hi = 2;
  154. } else {
  155. fprintf(stderr, "Sliding Window: Only 1 match in whole sequence; not enough to calc sliding windows.\n");
  156. /* maybe need some other notes printed as well */
  157. exit(1);
  158. }
  159. /* in following, the Current pattern (aka RE) will be 'DD->Dat[Hi+1]' or 'DD->Dat[Lo+1]',
  160. a long way to represent it, but unfortunately about as short as possible */
  161. /* initialize the Hi pointer to just below the SW length, counting hits as we go.
  162. following will check for mult patterns that map to *same* site, but will NOT check for
  163. extremely jittered sites, if searching for extremely degenerate sites that might map
  164. all over a small area, so hits from degenerate patterns that occur near a boundary
  165. condition might not be completely accurate */
  166. /* Lo = Hi = 2; initial conditions */
  167. Counts[abs(DD->Dat[Lo+1])]++; /* count the 1st hit */
  168. while (Lo > Hi) { /* if Lo has somehow passed Hi (as would happen in crossing a BS) */
  169. Hi += 2; /* keep incr'g the Hi pointer until they're = */
  170. Counts[abs(DD->Dat[Hi+1])]++; /* & incr the counts for the new pattern */
  171. }
  172. while (((DD->Dat[Hi+2] - DD->Dat[Lo]) <= SW) && (DD->Dat[Hi+2] != -2)){
  173. Hi += 2;
  174. Counts[abs(DD->Dat[Hi+1])]++; /* incr the counts for the new pattern */
  175. } /* now both Lo and Hi are set correctly, bracketing the max Win that's <= the SW */
  176. /* this is where the loop for real sliding windows starts. The LogicArray has already
  177. been set up, memory has been alloc'ed for many of the structures that are needed. Just need to set
  178. up the loop tests and keep looping until the end of the seq has been reached. */
  179. /* Next stanza is the BIG LOOP to do until the end of the array */
  180. /* ######################################################################################################## */
  181. SEi = 0; /* for this instance, want to restart it for each rule */
  182. memset(NameCheck,0, sizeof(short)*RE_NAME_TAB_SIZ); /* make sure everything is set to 0 */
  183. tacg_Fill_SelEnz_fr_RuleString(SelEnz, &SEi, rulestring, NameCheck, Name);
  184. while ((Hi <= DD->Dat[0] && DD->Dat[Hi] != -2)) { /* BIG LOOP to do until the end of the array */
  185. /* && (DD->Dat[Hi] - DD->Dat[Lo] <= SW) */
  186. loopCnt++;
  187. if (F.Verbose > 1) fprintf(stdout, "Loop # %d : Window = %d : Hi = %d : Lo = %d : Diff = %ld\n", loopCnt, PWCnt, Hi, Lo, (DD->Dat[Hi] - DD->Dat[Lo]));
  188. /* 1st test if the window (set to be of OK size) is POSITIVE, based on rules */
  189. /* this will have to be redesigned to parse formal logical conditions:
  190. ie: ((1 & 2) | ((3 & 4)) & ((5 & 6) | 7) but it will have to do for now
  191. rules: (1 | 2) & (3 | 4) etc */
  192. /* This code eliminates the need to test for only ANDs or only ORs, as it covers both -
  193. I assume that the phrase contains both | and & so need to use the formal logic rules */
  194. /* check! will this handle flat complex strings? ie: 'T|F&T&T|T|T&F&T|F&F&F&T&F|T&T&F' ? I think so. */
  195. /* 1st, reduce and serialize all the conditions, changing this:
  196. A&((B|C)|(D&E)&(K&G)&H)|(I&J) (where the letter names subst for 'NameX:min:Max') to
  197. 1&((2|3)|(4&5)&(6&7)&8)|(9&10) done in SetFlags() and stored in passed-in 'int LogicArr[]' var
  198. and EOLA (EndOfLogicArray) stored as as LogicArr[0]
  199. EOLA points to .....................................................................v
  200. 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 [array index]
  201. 1 -2 -3 -3 2 -1 3 -4 -1 -3 4 -2 5 -4 -2 -3 6 -2 7 -4 -2 8 -4 -1 -3 9 -2 10 -4 [array values]
  202. 1 & ( ( 2 | 3 ) | ( 4 & 5 ) & ( 6 & 7 ) & 8 ) | ( 9 & 10 ) [equiv chars]
  203. where: positive #s = RE indices and negative #s are defined as:
  204. -1 = '|' = OR , -3 = '(' = LP -5 = TRUE -7 = '^' = XOR (exclusive OR)
  205. -2 = '&' = AND , -4 = ')' = RP -6 = FALSE -8 = reserved for later
  206. so all I do here is set:
  207. T&((F|F)|(T&T)&(T&F)&T)|(F&T) (their equiv truth values in this routine)
  208. - sort of already done -> names and min:Max values are registered as RE entries in SetFlags.
  209. what should also be done in SetFlags is the logic string should be re-written using
  210. the RE indices: as above.
  211. so, given LogicArr as above, calculate the equiv TruthArr,
  212. */
  213. /* p = 1; cuz LogicArr[0] carries EOLA */
  214. EOLA = LogicArr[0]; /* EOLA = EndOfLogicArray, set above */
  215. /* Check the Min:Max values */
  216. /* Following sets the 'truth value' for each pattern by going thru LogicArr, checking each
  217. pos# vs the RE[pos#].min/Max */
  218. for (p=1; p<=EOLA; p++) {
  219. if (LogicArr[p] > 0) { /* for all pos #s (=RE indices) */
  220. r = LogicArr[p]; /* shorten */
  221. if (Counts[r] <= RE[r].Max && Counts[r] >= RE[r].min) {
  222. TruthArr[p] = -5;
  223. }
  224. else TruthArr[p] = -6;
  225. } else { /* this should have an 'else' to allow copying over neg values that indicate logic symbols */
  226. TruthArr[p] = LogicArr[p];
  227. }
  228. }
  229. /* so now the LogicArr has effectively been changed to TruthArr
  230. 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29[array index]
  231. 1 & ( ( 2 | 3 ) | ( 4 & 5 ) & ( 6 & 7 ) & 8 ) | ( 9 & 10 ) [equiv chars]
  232. 1 -2 -3 -3 2 -1 3 -4 -1 -3 4 -2 5 -4 -2 -3 6 -2 7 -4 -2 8 -4 -1 -3 9 -2 10 -4 [logic values]
  233. -5 -2 -3 -3 -6 -1 -6 -4 -1 -3 -5 -2 -5 -4 -2 -3 -6 -2 -6 -4 -2 -5 -4 -1 -3 -6 -2 -5 -4 [truth values]
  234. T & ( ( F | F ) | ( T & T ) & ( T & F ) & T ) | ( F & T ) [equiv chars]
  235. could these just be written right back into LogicArr? YES!, but leave for now */
  236. /* now need to 'flatten' the array to wipe out the parens, substituting a single TRUTH value for
  237. any logical expr; anything preceding a '(' is left where it is for now */
  238. p=1; LP = RP = lpi = 0;
  239. while (p < EOLA) { /* (p <= EOLA) */
  240. r = TruthArr[p];
  241. switch(r) {
  242. case 0: /* reached at end of sweep thru the array - sweep again until only a single Truth value remains */
  243. /* so test to see how much Truth has yet to be extracted , and if there is some, reset EOLA and p and
  244. restart the sweep */
  245. if (EOLA > 4) p = 0; /* then another sweep is required */
  246. else p = EOLA + 10; /* make p > EOLA to break out of the loop */
  247. break;
  248. default: /* there should ONLY be negative numbers here - this should never be hit with a +# */
  249. /* fprintf(stderr, "%d left in place at postion %d\n", r, p); */
  250. break;
  251. /* only paren detection matters at this point; all else is handled inside of the parens */
  252. case -3: /* (, LP */
  253. /* fprintf(stderr, "( "); */
  254. /* LP++; incr LP */
  255. Lparen[LP++] = p; /* note where it lies in Lparen */
  256. break;
  257. case -4: /* ), RP */
  258. /* fprintf(stderr, ") "); */
  259. RP++; /* incr RP */
  260. if (RP > LP) { /* should never be the case - if it is, err */
  261. fprintf(stderr, "Too many ')'s detected (%d) in '--rules' string - try again\n", RP);
  262. exit (1);
  263. } else { /* eval the previously paren'ed subarray to a single TRUTH value */
  264. SLP = Lparen[LP-1]; /* Starting Left Paren - where the expr has to be eval'ed FROM*/
  265. /* define the Left Paren, LeftVal, Operator, RightVal, Right Paren */
  266. L = SLP; /* where the eval'ed expr starts from */
  267. CLVp = SLP+1; /* CLVp = Current Left Value pointer, set to lpi */
  268. CTruth = TruthArr[CLVp]; /* CTruth (=Current Truth) */
  269. if ((p-SLP) == 2) { /* if the paren pair just encloses a single truth value '..|(hpaii:4:9)' */
  270. /* then just collapse it immediately by removing the parens */
  271. TruthArr[SLP] = TruthArr[SLP+1]; /* move the Truth value over */
  272. amt2mov = EOLA-p-1;
  273. if (amt2mov > 0) {
  274. memmove(&TruthArr[SLP+1], &TruthArr[p+1], (sizeof(int)*(EOLA-p-1)));
  275. Pspace = p - Lparen[LP-1];
  276. for (i=EOLA-Pspace; i<=EOLA; i++) TruthArr[i] = 0; /* and zero the rest */
  277. } else { /* just zero the rest of it */
  278. for (i=p-1; i<EOLA; i++) TruthArr[i] = 0; /* and zero the rest of the array to clarify for debugging */
  279. }
  280. p = SLP;
  281. EOLA -= Pspace; /* EOLA = EOLA - p + 1; correct EOLA */
  282. RP--; LP--; /* decr the counters, now that one pair is taken care of */
  283. } else {
  284. while ((RVp = CLVp+2) < p) { /* do it until all the Ops in the paren pair have been eval'ed, L -> R */
  285. /* RVp = CLVp+2; Right Value pointer (Right border of the expr to be eval'ed */
  286. COp = TruthArr[CLVp+1]; /* Current Operator */
  287. CRV = TruthArr[RVp]; /* Current Right Value */
  288. /* ..and CTruth is already set */
  289. if (CTruth == -5) { /* -5 = T, -6 = F */
  290. if (COp == -1) { /* -1 = |, -2 = & , -7 = ^ */
  291. CTruth = -5; /* T | X -> T */
  292. } else if (CRV == -5) { /* if COp = '&' or '^', have to eval the last part */
  293. if (COp == -7) { /* if CRV is T and COp is ^, then expr is F, as XOR only allows 1 truth */
  294. CTruth = -6;
  295. /* fprintf (stderr, " got a ^ "\n); */
  296. } else { /* if COp is '&' and the CRV is T, then expression is true */
  297. CTruth = -5; /* T & T -> T */
  298. }
  299. } else {
  300. CTruth = -6; /* T & F -> F */
  301. }
  302. } else { /* CTruth = F, so have to finish the eval */
  303. if (COp == -2) { /* F & X -> F*/
  304. CTruth = -6;
  305. } else if (CRV == -5) { /* if F [^|] T , have to test for which it is */
  306. if (COp == -7) { /* it's '^' */
  307. CTruth = -5; /* F ^ T -> T (as below, but keep separate for now) */
  308. } else { /* it's '|' */
  309. CTruth = -5; /* F | T -> T */
  310. }
  311. } else {
  312. CTruth = -6; /* F | F -> F */
  313. }
  314. /* now finished with the 1st 2 values, reset the pointers, etc for another round */
  315. }
  316. CLVp = RVp;
  317. } /* when leave loop, last CTruth value is value for the whole parened expr */
  318. /* so drop it on top of TruthArr[SLP] (where it all began) and shift the uneval'ed
  319. part of the string over to the left */
  320. TruthArr[SLP] = CTruth;
  321. memmove(&TruthArr[SLP+1], &TruthArr[p+1], (sizeof(int)*(EOLA-p-1)));
  322. Pspace = p - Lparen[LP-1];
  323. for (i=EOLA-Pspace; i<=EOLA; i++) TruthArr[i] = 0; /* and zero the rest of the array to clarify for debugging */
  324. /* where ( was----^; ^---where ) was; need 1 more to move in the new value */
  325. /* EOLA = EOLA - (p+1-SLP); correct EOLA */
  326. EOLA -= Pspace; /* correct EOLA */ /* EOLA = EOLA - p + 1; */
  327. p = SLP-1; /* bc of the 'p++' at the end of the loop */
  328. RP--; LP--; /* decr the counters, now that one pair is taken care of */
  329. }
  330. }
  331. break;
  332. } /* end of switch(r) statement */
  333. p++;
  334. } /* end of 'while (p <= EOLA)' loop */
  335. /* end of the big 'else' expression that handles complex logic/truth statements */
  336. /* must check to see that there aren't any hanging '(' left over either.. */
  337. if (LP > 0) {
  338. fprintf(stderr, "Problem with too many Left Parens '(' - check the rules string carefully!\n");
  339. exit(1);
  340. }
  341. /* now that all the parens have been elim'ed, go L -> R to calc final Truth, in this case just
  342. setting the Rightmost of the 2 Truth values being tested to the outcome and then starting from there
  343. on the next comparison */
  344. L = 1; Op = 2; R = 3;
  345. while (R <= EOLA) { /* yes, this should be fn()ized and I will as soon as it verifies */
  346. if (TruthArr[Op] == -1) { /* if the Op is an '|' */
  347. if (TruthArr[L] == -5 || TruthArr[R] == -5) TruthArr[R] = -5;
  348. else TruthArr[R] = -6;
  349. } else { /* if the Op is '&' */
  350. if (TruthArr[L] == -6 || TruthArr[R] == -6) TruthArr[R] = -6;
  351. else TruthArr[R] = -5;
  352. }
  353. /* ie: now have the prev 'T|F' triplet replaced by 'T|T' */
  354. /* now set up pointers for next cmp */
  355. L = R; Op = L+1; R = L+2;
  356. }
  357. /* and now test the FINAL TRUTH */
  358. if (TruthArr[L] == -5) {
  359. ok = 1;
  360. if (F.Verbose > 1) fprintf(stdout, "--rules string evals to TRUE\n");
  361. } else {
  362. ok = 0;
  363. if (F.Verbose > 1) fprintf(stdout, "--rules string evals to FALSE\n");
  364. }
  365. /* and that's THAT */
  366. /* at end, if ok = 1, AT LEAST 1 condition has been met and Window is Pos */
  367. /* if in either case ok = 0, then none of the nec conditions have been met and
  368. the test fails -> NOT a Positive Window */
  369. if (ok == 1) { /* then mark the Window as Pos and set all the vars that need to be set */
  370. /* fprintf(stderr, "\nDEBUG:####### New OK == 1 ########\n"); */
  371. /* otherwise just store it in the struct whose pointer gets returned */
  372. PosWin[PWCnt].Lo = Lo;
  373. PosWin[PWCnt].Hi = Hi;
  374. PosWin[PWCnt].SeqLo = DD->Dat[Lo];
  375. PosWin[PWCnt].SeqHi = DD->Dat[Hi];
  376. /* we have to put something in PosWin[].PatHits if we expect to get anything out, right? */
  377. /* Following uses Counts[] as a guide to alloc mem to hold the actual sites */
  378. /* ############################# this is why it prints thru the entire RE[] ############################ */
  379. /* ############ this is now changed to match the new way of doing things ############################ */
  380. for (i=F.RE_ST; (i<NumREs && RE[i].proto == i); i++) {
  381. /* 1st, figure out how many els are needed - how many hits were in each window and then
  382. alloc mem for each RE/row needed */
  383. lo = PosWin[0].PatHits[0][i]; /* lo = index of the appro Sites[] array - base of last sliding window */
  384. while (RE[i].Sites[lo] < PosWin[PWCnt].SeqLo) {
  385. lo++; /* just runs it up to the current base */
  386. }
  387. /* and then record this for later use so we don't have to count up from 0 each time - set BEFORE the counting
  388. loop as this base won't change after it - it's the OLD FLOOR from where we start to count up (just above) */
  389. PosWin[0].PatHits[0][i] = lo; /* rememb to keep track of where the newly set, but OLD floor will be */
  390. m = 2; /* SWHits = 0; */
  391. /* this can be re-written to use PosWin[PWCnt].PatHits directly, instead of SWHitBuf - do the same mem jig */
  392. /* have to calloc PosWin[PWCnt].PatHits[i] based on Counts[i] (+ 2 for accounting + 3 for a jitter buffer), test for overrun */
  393. /* calloc the mem for how many hits it was.. */
  394. if ((PosWin[PWCnt].PatHits[i] = calloc(Counts[i]+5, sizeof(long*))) == NULL) BadMem("No mem for PosWin.PatHits[i]", 1);
  395. PosWin[PWCnt].PatHits[i][1] = Counts[i]+5; /* [1] tracks the end of alloc'ed mem */
  396. while (RE[i].Sites[lo] <= PosWin[PWCnt].SeqHi && lo < RE[i].Sites[0]
  397. && RE[i].E_Ncuts > 0 ) { /* record all the sites in here */
  398. PosWin[PWCnt].PatHits[i][m++] = RE[i].Sites[lo];
  399. /* SWHitBuf[m++] = RE[i].Sites[lo]; */
  400. PosWin[PWCnt].PatHits[i][0] = m; /* [0] tracks next open position/how many */
  401. SWHitBuf[0] = m; /* add it and keep track of it */
  402. lo++; /* SWHits++; count the hits in the window by stepping lo up to the top of the window */
  403. if (PosWin[PWCnt].PatHits[i][1] - PosWin[PWCnt].PatHits[i][0] < 2) { /* check the mem for overrun */
  404. /* since we've approximated it pretty accurately, we shouldn't need a much bigger alloc */
  405. k = PosWin[PWCnt].PatHits[i][1] + 10;
  406. if ((PosWin[PWCnt].PatHits[i] = realloc(PosWin[PWCnt].PatHits[i], sizeof(long)*k)) == NULL) BadMem("Can't realloc mem for SWHitBuf", 1);
  407. PosWin[PWCnt].PatHits[i][1] = k;
  408. }
  409. }
  410. /* RE[i].Sites[lo] ends being <= to PosWin[PWCnt].SeqHi && lo < RE[i].Sites[0] */
  411. /* PosWin[0].PatHits[0][i] (below) holds the RE[].Sites position of the floor (using lo as the index)
  412. so we don't have to count up each time */
  413. }
  414. /*
  415. ii = i; mm = m;
  416. for (p=0; p< PWCnt; p++) {
  417. fprintf(stderr, "\nPWCnt = %d ---\n", p);
  418. for (i=0; i<ii; i++) {
  419. fprintf(stderr, "\n%d (%s); ", i, RE[i].E_nam);
  420. for (m=0;m< PosWin[p].PatHits[i][1]; m++){
  421. fprintf(stderr, "%ld ", PosWin[p].PatHits[i][m]);
  422. }
  423. fprintf(stderr, "\n");
  424. }
  425. fprintf(stderr, "\n");
  426. }
  427. fprintf(stderr, "\n");
  428. */
  429. /* and check to see if we're too close to the end of mem for PosWin */
  430. if ((PWEndSiz - PWCnt) < 2) {
  431. m = PWEndSiz; /* now it's the new start site for alloc'ing new memory */
  432. PWEndSiz += PWIncSiz;
  433. if ((PosWin = realloc(PosWin, sizeof(*PosWin)*PWEndSiz)) == NULL) BadMem("Can't realloc mem for PosWin", 1);
  434. /* now set all the new vars and calloc new mem for all of them */
  435. for (i=m; i<PWEndSiz; i++) {
  436. if ((PosWin[i].PatHits = calloc(NumREs, sizeof(long*))) == NULL) BadMem("No mem for new PosWin", 1);
  437. }
  438. }
  439. /* if output should be printed as well as stored in struct, print the header only once */
  440. if (PrintIf1 == 1 && PWCnt == 0) {
  441. fprintf(stdout, "\n== Positive Sliding Windows for %s (Window Size = %ld) \n\n",
  442. Rules[rule].Name, Rules[rule].Window);
  443. /* and the warning that there may be some funny biz with degenerate sequences */
  444. fprintf(stdout, "NB: When using very degenerate sequences (ie: 'wwwwwwwwww') in conjunction\n"
  445. "with Sliding Windows, inaccuracies in the way that the hits are counted\n"
  446. "may allow hits to be OVERCOUNTED. If you suspect that this is happening,\n"
  447. "check the Sites table (-S) against the Sliding Windows. Sites data has\n"
  448. "been filtered to remove this 'jitter'. \n"
  449. "Each stanza includes header info about the window and one line on each pattern:\n\n"
  450. "Serial# Low Border High Border Diff\n"
  451. " \" Name:min:Max (# hits in window) Pattern sequence Actual sites....\n\n");
  452. }
  453. /* printing section also needs to check current rule and ONLY print that pattern data for the the patterns
  454. in that particular rule (not all patterns in the rule file as is currently the case */
  455. if (PrintIf1 == 1) { /* and if this should be printed as well as stored in struct */
  456. /* Serial # Low Border High Border Difference */
  457. fprintf(stdout, "%d Rule Name = [%s] Req'd Win = [%ld] Lo = [%ld] Hi = [%ld] Disc'd Win = [%ld] \n%d Logic = %s\n",
  458. PWCnt, Rules[rule].Name, Rules[rule].Window, PosWin[PWCnt].SeqLo, PosWin[PWCnt].SeqHi,
  459. (PosWin[PWCnt].SeqHi - PosWin[PWCnt].SeqLo), PWCnt, rulestring);
  460. for (i=F.RE_ST; i<NumREs && RE[i].proto == i; i++) { /* start @ F.RE_ST, not 1, not 0 */
  461. if (PosWin[PWCnt].PatHits[i][0] == 0) SWHits = 0;
  462. else SWHits = PosWin[PWCnt].PatHits[i][0]-2;
  463. if (NameCheck[realhash(RE[i].E_nam, RE_NAME_TAB_SIZ)] == 1) {
  464. if (SWHits >= RE[i].min && SWHits <= RE[i].Max) {
  465. rulematch = 'x';
  466. } else {
  467. rulematch = ' ';
  468. }
  469. fprintf(stdout, "%d %10s:%d:%d %c (%d) %s ",
  470. PWCnt, RE[i].E_nam, RE[i].min, RE[i].Max, rulematch, SWHits, RE[i].E_raw_sit);
  471. /* following is suposed to print out the actual hits but it's currently buggered
  472. it may not be a good idea to reiterate this data for every window as that's a lot of
  473. repeated data. Better to just use the Sites output? */
  474. for (j=2; j< PosWin[PWCnt].PatHits[i][0]; j++) {
  475. fprintf(stdout, " %ld ", PosWin[PWCnt].PatHits[i][j]);
  476. }
  477. fprintf(stdout, "\n");
  478. }
  479. /* this prints out the actual hits in real world coords - WHY?
  480. */
  481. }
  482. fprintf(stdout, "\n");
  483. }
  484. PWCnt++;
  485. } /* if (ok == 1) loop */
  486. /* Now move the SW up. Lo always moves up one position. Hi may or may not, depending on whether the
  487. next hit is too far away. If Lo becomes > Hi, it indicates that we've jumped a BS and Hi has to
  488. be run up to be >= to Lo. */
  489. Counts[abs(DD->Dat[Lo+1])]--; /* decr the count for the current (old) hit */
  490. Lo += 2; /* incr Lo to move up DD->Dat */
  491. /* if (Lo == Hi) {
  492. Counts[abs(DD->Dat[Lo+1])] = 1;
  493. } else {
  494. Counts[abs(DD->Dat[Lo+1])]++;
  495. }
  496. */
  497. while (Lo >= Hi && (DD->Dat[Hi + 2] - DD->Dat[Lo] <= SW)) { /* if Lo has somehow passed Hi (as would happen in crossing a BS) */
  498. Hi += 2; /* keep incr'g the Hi pointer until they're = */
  499. Counts[abs(DD->Dat[Hi+1])]++; /* & incr the counts for the new pattern */
  500. }
  501. while (((DD->Dat[Hi+2] - DD->Dat[Lo]) <= SW) && (DD->Dat[Hi+2] != -2)){
  502. Hi += 2;
  503. Counts[abs(DD->Dat[Hi+1])]++; /* incr the counts for the new pattern */
  504. } /* now both Lo and Hi are set correctly, bracketing the max Win that's <= the SW */
  505. } /* while (Hi <= DD->Dat[0] || (( Hi - Lo == 2) && (DD->Dat[Hi] - DD->Dat[Lo] > SW))) */
  506. } else { /* if (SW <= (int)seq_len) - file spanning if loop */
  507. fprintf(stderr, "Sliding Window (%ld) > entire sequence (%ld), skipping Sliding Window analysis.\n", SW, seq_len);
  508. }
  509. return PWCnt; /* PosWin is already available as it's global */
  510. }