PageRenderTime 49ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 0ms

/SNAPLib/OldAligner.cpp

https://github.com/terhorst/snap
C++ | 1259 lines | 759 code | 132 blank | 368 comment | 205 complexity | b57edd81d0f81e47fc5f2b87477945de MD5 | raw file
Possible License(s): Apache-2.0
  1. /*++
  2. Module Name:
  3. Aligner.cpp
  4. Abstract:
  5. SNAP genome aligner
  6. Authors:
  7. Bill Bolosky, August, 2011
  8. Environment:
  9. User mode service.
  10. This class is NOT thread safe. It's the caller's responsibility to ensure that
  11. at most one thread uses an instance at any time.
  12. Revision History:
  13. Adapted from Matei Zaharia's Scala implementation.
  14. --*/
  15. #include "stdafx.h"
  16. #include "LandauVishkin.h"
  17. #include "OldAligner.h"
  18. _int64 nCandidatesCreated = 0;
  19. _int64 nCandidatesEliminated = 0;
  20. _int64 nScoredCandidatesEliminated = 0;
  21. _int64 nScoredCandidatesSoftEliminated = 0;
  22. OldAligner::OldAligner(
  23. GenomeIndex *i_genomeIndex,
  24. unsigned i_confDiff,
  25. unsigned i_maxHitsToConsider,
  26. unsigned i_maxK,
  27. unsigned i_maxReadSize,
  28. unsigned i_maxSeedsToUse,
  29. unsigned i_lvCutoff) :
  30. genomeIndex(i_genomeIndex), confDiff(i_confDiff), maxHitsToConsider(i_maxHitsToConsider), maxK(i_maxK),
  31. maxReadSize(i_maxReadSize), maxSeedsToUse(i_maxSeedsToUse), lvCutoff(i_lvCutoff)
  32. /*++
  33. Routine Description:
  34. Constructor for the OldAligner class. Aligners align reads against an indexed genome.
  35. Arguments:
  36. i_genomeIndex - The index against which to do the alignments
  37. i_confDiff - The string difference between two matches necessary to believe they're really different
  38. i_maxHitsToConsider - The maximum number of hits to use from a seed lookup. Any lookups that return more
  39. than this are ignored.
  40. i_maxK - The largest string difference to consider for any comparison.
  41. i_maxReadSize - Bound on the number of bases in any read. There's no reason to make it tight, it just affects a little memory allocation.
  42. i_maxSeedsToUse - The maximum number of seeds to use when aligning any read (not counting ones ignored because they resulted in too many
  43. hits). Once we've looked up this many seeds, we just score what we've got.
  44. --*/
  45. {
  46. #if MAINTAIN_HISTOGRAMS
  47. lvHistogram = new Histogram(20, true);
  48. lookupHistogram = new Histogram(20, true);
  49. lvHistogramForMulti = new Histogram(20, true);
  50. lvCountWhenBestFound = new Histogram(maxHitsToConsider*4,false);
  51. #endif // MAINTAIN_HISTOGRAMS
  52. nHashTableLookups = 0;
  53. nLocationsScored = 0;
  54. nHitsIgnoredBecauseOfTooHighPopularity = 0;
  55. nReadsIgnoredBecauseOfTooManyNs = 0;
  56. nIndelsMerged = 0;
  57. genome = genomeIndex->getGenome();
  58. seedLen = genomeIndex->getSeedLength();
  59. #if USE_CAREFUL_BASE_STATUS
  60. //
  61. // Allocate the seed status and candidate arrays. Logically, they are new for each call to AlignRead, but by doing
  62. // the allocations here we avoid a new/delete cycle for each read.
  63. //
  64. bases = new BaseStatus[maxReadSize];
  65. if (NULL == bases) {
  66. fprintf(stderr,"OldAligner: unable to allocate bases.\n");
  67. exit(1);
  68. }
  69. #endif // USE_CAREFUL_BASE_STATUS
  70. candidates = new Candidate[(maxHitsToConsider * (maxReadSize - seedLen + 1)) * 2]; // *2 is for reverse compliment
  71. if (NULL == candidates) {
  72. fprintf(stderr,"OldAligner: unable to allocate candidates.\n");
  73. exit(1);
  74. }
  75. rcReadData = new char[maxReadSize];
  76. if (NULL == rcReadData) {
  77. fprintf(stderr,"OldAligner: unable to allocate rc read data.");
  78. exit(1);
  79. }
  80. rcTranslationTable['A'] = 'T';
  81. rcTranslationTable['G'] = 'C';
  82. rcTranslationTable['C'] = 'G';
  83. rcTranslationTable['T'] = 'A';
  84. rcTranslationTable['N'] = 'N';
  85. for (unsigned i = 0; i < 256; i++) {
  86. nTable[i] = 0;
  87. }
  88. nTable['N'] = 1;
  89. seedUsed = new BYTE[(maxReadSize + 7 + 128) / 8]; // +128 to make sure it extends at both before and after at least an _int64
  90. if (NULL == seedUsed) {
  91. fprintf(stderr,"OldAligner: unable to allocate seedUsed array.\n");
  92. exit(1);
  93. }
  94. seedUsedAsAllocated = seedUsed; // Save the pointer for the delete.
  95. seedUsed += 8; // This moves the pointer up an _int64, so we now have the appropriate before buffer.
  96. }
  97. AlignmentResult
  98. OldAligner::AlignRead(Read *read, unsigned *genomeLocation, bool *hitIsRC, int *finalScore)
  99. /*++
  100. Routine Description:
  101. Align a particular read.
  102. Arguments:
  103. read - the read to align
  104. genomeLocation - if this aligned to a SingleHit, the 0-based offset in the genome that this hit. The aligner treats the entire
  105. genome as a single string, even though it's really a set of chrosomes. It just makes the code simpler.
  106. The caller can convert to chromosome+offset by looking up the piece boudaries using the Genome object.
  107. hitIsRC - the aligner tries to align both the given read and its reverse compliment. If we found a SIngleHit this
  108. bool is set to indicate whether that hit was on the reverse compliment.
  109. finalScore - if a single or confident hit, this is the score of the hit (i.e., the LV distance from the read)
  110. Return Value:
  111. ConfidentHit, SingleHit, MultiHit or NotFound depending on how the alignment went.
  112. --*/
  113. {
  114. unsigned lookupsThisRun = 0;
  115. unsigned lvScores = 0;
  116. unsigned lvScoresAfterBestFound = 0;
  117. bool anyHitsSkippedBecauseOfTooHighPopularityThisRun = false;
  118. AlignmentResult finalResult;
  119. //
  120. // A bitvector for used seeds, indexed on the starting location of the seed within the read.
  121. //
  122. if (read->getDataLength() > maxReadSize) {
  123. fprintf(stderr,"OldAligner:: got too big read (%d > %d)", read->getDataLength(),maxReadSize);
  124. exit(1);
  125. }
  126. if (read->getDataLength() < seedLen) {
  127. //
  128. // Too short to have any seeds, it's hopeless.
  129. //
  130. return NotFound;
  131. }
  132. //
  133. // Clear out the seed used array.
  134. //
  135. memset(seedUsed,0,(read->getDataLength() + 7) / 8);
  136. int rdl = (int)read->getDataLength();
  137. const char *readData = read->getData();
  138. unsigned countOfNs = 0;
  139. for (int i = 0; i < rdl; i++) {
  140. char baseByte = readData[i];
  141. rcReadData[rdl - i - 1] = rcTranslationTable[baseByte];
  142. countOfNs += nTable[baseByte];
  143. }
  144. if (countOfNs > maxK) {
  145. nReadsIgnoredBecauseOfTooManyNs++;
  146. return NotFound;
  147. }
  148. //
  149. // Block off any seeds that would contain an N.
  150. //
  151. if (countOfNs > 0) {
  152. int minSeedToConsiderNing = 0; // In English, any word can be verbed. Including, apparently, "N."
  153. for (int i = 0; i < rdl; i++) {
  154. if (readData[i] == 'N') {
  155. int limit = __max(i + (int)seedLen - 1, rdl-1);
  156. for (int j = __max(minSeedToConsiderNing, i - (int)seedLen + 1); j <= limit; j++) {
  157. SetSeedUsed(j);
  158. }
  159. minSeedToConsiderNing = limit+1;
  160. }
  161. }
  162. }
  163. Read rcRead[1];
  164. rcRead->init(NULL,0,rcReadData,NULL,read->getDataLength());
  165. clearCandidates();
  166. //
  167. // Initialize the bases table, which represents which bases we've checked.
  168. // We have readSize - seeds size + 1 possible seeds.
  169. //
  170. unsigned nPossibleSeeds = read->getDataLength() - seedLen + 1;
  171. #if USE_CAREFUL_BASE_STATUS
  172. memset(bases,0,sizeof(BaseStatus) * read->getDataLength());
  173. #endif // USE_CAREFUL_BASE_STATUS
  174. unsigned nextSeedToTest = 0;
  175. unsigned wrapCount = 0;
  176. unsigned lowestPossibleScoreOfAnyUnseenLocation = 0;
  177. unsigned lowestPossibleRCScoreOfAnyUnseenLocation = 0;
  178. #if USE_CAREFUL_BASE_STATUS
  179. unsigned mostSeedsContainingAnyParticularBase = 0;
  180. unsigned mostRCSeedsContainingAnyParticularBase = 0;
  181. #else // USE_CAREFUL_BASE_STATUS
  182. unsigned mostSeedsContainingAnyParticularBase = 1; // Instead of tracking this for real, we're just conservative and use wrapCount+1. It's faster.
  183. unsigned mostRCSeedsContainingAnyParticularBase = 1;// ditto
  184. #endif // USE_CAREFUL_BASE_STATUS
  185. unsigned bestScore = UnusedScoreValue;
  186. unsigned bestScoreGenomeLocation;
  187. unsigned secondBestScore = UnusedScoreValue;
  188. unsigned nSeedsApplied = 0;
  189. unsigned nRCSeedsApplied = 0;
  190. while (nSeedsApplied + nRCSeedsApplied < maxSeedsToUse) {
  191. //
  192. // Choose the next seed to use. Choose the first one that isn't used
  193. //
  194. if (nextSeedToTest >= nPossibleSeeds) {
  195. //
  196. // We're wrapping. We want to space the seeds out as much as possible, so if we had
  197. // a seed length of 20 we'd want to take 0, 10, 5, 15, 2, 7, 12, 17. To make the computation
  198. // fast, we use use a table lookup.
  199. //
  200. wrapCount++;
  201. if (wrapCount >= seedLen) {
  202. //
  203. // We tried all possible seeds without matching or even getting enough seeds to
  204. // exceed our seed count. Do the best we can with what we have.
  205. //
  206. score(
  207. true,
  208. read,
  209. rcRead,
  210. &finalResult,
  211. finalScore,
  212. genomeLocation,
  213. hitIsRC,
  214. nSeedsApplied,
  215. nRCSeedsApplied,
  216. mostSeedsContainingAnyParticularBase,
  217. mostRCSeedsContainingAnyParticularBase,
  218. lowestPossibleScoreOfAnyUnseenLocation,
  219. lowestPossibleRCScoreOfAnyUnseenLocation,
  220. candidates,
  221. bestScore,
  222. bestScoreGenomeLocation,
  223. secondBestScore,
  224. lvScores,
  225. lvScoresAfterBestFound,
  226. anyHitsSkippedBecauseOfTooHighPopularityThisRun);
  227. #if MAINTAIN_HISTOGRAMS
  228. lvHistogram->addToCount(lvScores,lvScores);
  229. lookupHistogram->addToCount(lookupsThisRun,lookupsThisRun);
  230. if (MultipleHits == finalResult) {
  231. lvHistogramForMulti->addToCount(lvScores,lvScores);
  232. } else if (SingleHit == finalResult) {
  233. lvCountWhenBestFound->addToCount(lvScores-lvScoresAfterBestFound);
  234. }
  235. #endif // MAINTAIN_HISTOGRAMS
  236. return finalResult;
  237. }
  238. nextSeedToTest = getWrappedNextSeedToTest(wrapCount);
  239. #if !USE_CAREFUL_BASE_STATUS
  240. mostSeedsContainingAnyParticularBase = wrapCount + 1;
  241. mostRCSeedsContainingAnyParticularBase = wrapCount + 1;
  242. #endif // !USE_CAREFUL_BASE_STATUS
  243. }
  244. while (nextSeedToTest < nPossibleSeeds && IsSeedUsed(nextSeedToTest)) {
  245. //
  246. // This seed is already used. Try the next one.
  247. //
  248. nextSeedToTest++;
  249. }
  250. if (nextSeedToTest >= nPossibleSeeds) {
  251. //
  252. // Unusable seeds have pushed us past the end of the read. Go back around the outer loop so we wrap properly.
  253. //
  254. continue;
  255. }
  256. Seed seed(read->getData() + nextSeedToTest, seedLen);
  257. unsigned nHits; // Number of times this seed hits in the genome
  258. const unsigned *hits; // The actual hits (of size nHits)
  259. unsigned nRCHits; // Number of hits for the seed's reverse compliment
  260. const unsigned *rcHits; // The actual reverse compliment hits
  261. genomeIndex->lookupSeed(seed, &nHits, &hits, &nRCHits, &rcHits);
  262. nHashTableLookups++;
  263. lookupsThisRun++;
  264. SetSeedUsed(nextSeedToTest);
  265. bool appliedEitherSeed = false;
  266. if (nHits > maxHitsToConsider) {
  267. //
  268. // This seed is matching too many places. Just pretend we never looked and keep going.
  269. //
  270. nHitsIgnoredBecauseOfTooHighPopularity++;
  271. anyHitsSkippedBecauseOfTooHighPopularityThisRun = true;
  272. } else {
  273. //
  274. // Apply this seed. First update the table of seeds per base.
  275. //
  276. #if USE_CAREFUL_BASE_STATUS
  277. for (unsigned i = nextSeedToTest; i < nextSeedToTest + seedLen; i++) {
  278. bases[i].nSeedsIncludingThisBase++;
  279. mostSeedsContainingAnyParticularBase = __max(mostSeedsContainingAnyParticularBase,bases[i].nSeedsIncludingThisBase);
  280. }
  281. #endif // USE_CAREFUL_BASE_STATUS
  282. //
  283. // And now update the candidates list with any hits from this seed. If lowest possible score of any unseen location is
  284. // more than best_score + confDiff then we know that if this location is newly seen then its location won't ever be a
  285. // winner, and we can ignore it.
  286. //
  287. for (unsigned i = 0 ; i < nHits; i++) {
  288. //
  289. // Find the genome location where the beginning of the read would hit, given a match on this seed.
  290. //
  291. unsigned genomeLocationOfThisHit = hits[i] - nextSeedToTest;
  292. Candidate *candidate = findCandidate(genomeLocationOfThisHit,false);
  293. if (NULL != candidate) {
  294. candidate->weight++;
  295. if (candidate->minRange > genomeLocationOfThisHit) {
  296. candidate->minRange = genomeLocationOfThisHit;
  297. } else if (candidate->maxRange < genomeLocationOfThisHit) {
  298. candidate->maxRange = genomeLocationOfThisHit;
  299. }
  300. } else if (bestScore + confDiff > lowestPossibleScoreOfAnyUnseenLocation &&
  301. secondBestScore > lowestPossibleScoreOfAnyUnseenLocation) {
  302. candidate = allocateNewCandidate(genomeLocationOfThisHit, false);
  303. candidate->weight = 1;
  304. candidate->minPossibleScore = lowestPossibleScoreOfAnyUnseenLocation;
  305. candidate->scored = false;
  306. }
  307. }
  308. nSeedsApplied++;
  309. appliedEitherSeed = true;
  310. }
  311. if (nRCHits > maxHitsToConsider) {
  312. //
  313. // This seed is matching too many places. Just pretend we never looked and keep going.
  314. //
  315. nHitsIgnoredBecauseOfTooHighPopularity++;
  316. anyHitsSkippedBecauseOfTooHighPopularityThisRun = true;
  317. } else {
  318. //
  319. // Apply this seed. First update the table of seeds per base.
  320. // The RC seed is at offset ReadSize - SeedSize - seed offset in the RC seed.
  321. //
  322. // To see why, imagine that you had a read that looked like 0123456 (where the digits
  323. // represented some particular bases, and digit' is the base's compliment). Then the
  324. // RC of that read is 6'5'4'3'2'1'. So, when we look up the hits for the seed at
  325. // offset 0 in the forward read (i.e. 012 assuming a seed size of 3) then the index
  326. // will also return the results for the seed's reverse compliment, i.e., 3'2'1'.
  327. // This happens as the last seed in the RC read.
  328. //
  329. unsigned rcOffset = read->getDataLength() - seedLen - nextSeedToTest;
  330. #if USE_CAREFUL_BASE_STATUS
  331. for (unsigned i = rcOffset; i < rcOffset + seedLen; i++) {
  332. bases[i].nRCSeedsIncludingThisBase++;
  333. mostRCSeedsContainingAnyParticularBase = __max(mostRCSeedsContainingAnyParticularBase,bases[i].nRCSeedsIncludingThisBase);
  334. }
  335. #endif // USE_CAREFUL_BASE_STATUS
  336. //
  337. // And now update the candidates list with any hits from this seed. If lowest possible score of any unseen RC location is
  338. // more than best_score + confDiff then we know that if this location is newly seen then its location won't ever be a
  339. // winner, and we can ignore it.
  340. //
  341. for (unsigned i = 0 ; i < nRCHits; i++) {
  342. //
  343. // Find the genome location where the beginning of the read would hit, given a match on this seed.
  344. //
  345. unsigned genomeLocationOfThisHit = rcHits[i] - rcOffset;
  346. Candidate *candidate = findCandidate(genomeLocationOfThisHit,true);
  347. if (NULL != candidate) {
  348. candidate->weight++;
  349. if (candidate->minRange > genomeLocationOfThisHit) {
  350. candidate->minRange = genomeLocationOfThisHit;
  351. } else if (candidate->maxRange < genomeLocationOfThisHit) {
  352. candidate->maxRange = genomeLocationOfThisHit;
  353. }
  354. } else if (bestScore + confDiff > lowestPossibleRCScoreOfAnyUnseenLocation &&
  355. secondBestScore > lowestPossibleRCScoreOfAnyUnseenLocation) {
  356. candidate = allocateNewCandidate(genomeLocationOfThisHit, true);
  357. candidate->weight = 1;
  358. candidate->minPossibleScore = lowestPossibleRCScoreOfAnyUnseenLocation;
  359. candidate->scored = false;
  360. }
  361. }
  362. nRCSeedsApplied++;
  363. appliedEitherSeed = true;
  364. }
  365. //
  366. // Move us along.
  367. //
  368. nextSeedToTest += seedLen;
  369. if (appliedEitherSeed) {
  370. //
  371. // And finally, try scoring.
  372. //
  373. if (score( false,
  374. read,
  375. rcRead,
  376. &finalResult,
  377. finalScore,
  378. genomeLocation,
  379. hitIsRC,
  380. nSeedsApplied,
  381. nRCSeedsApplied,
  382. mostSeedsContainingAnyParticularBase,
  383. mostRCSeedsContainingAnyParticularBase,
  384. lowestPossibleScoreOfAnyUnseenLocation,
  385. lowestPossibleRCScoreOfAnyUnseenLocation,
  386. candidates,
  387. bestScore,
  388. bestScoreGenomeLocation,
  389. secondBestScore,
  390. lvScores,
  391. lvScoresAfterBestFound,
  392. anyHitsSkippedBecauseOfTooHighPopularityThisRun)) {
  393. #if MAINTAIN_HISTOGRAMS
  394. lvHistogram->addToCount(lvScores,lvScores);
  395. lookupHistogram->addToCount(lookupsThisRun,lookupsThisRun);
  396. if (MultipleHits == finalResult) {
  397. lvHistogramForMulti->addToCount(lvScores,lvScores);
  398. } else if (SingleHit == finalResult) {
  399. lvCountWhenBestFound->addToCount(lvScores-lvScoresAfterBestFound);
  400. }
  401. #endif // MAINTAIN_HISTOGRAMS
  402. return finalResult;
  403. }
  404. }
  405. }
  406. //
  407. // Do the best with what we've got.
  408. //
  409. score( true,
  410. read,
  411. rcRead,
  412. &finalResult,
  413. finalScore,
  414. genomeLocation,
  415. hitIsRC,
  416. nSeedsApplied,
  417. nRCSeedsApplied,
  418. mostSeedsContainingAnyParticularBase,
  419. mostRCSeedsContainingAnyParticularBase,
  420. lowestPossibleScoreOfAnyUnseenLocation,
  421. lowestPossibleRCScoreOfAnyUnseenLocation,
  422. candidates,
  423. bestScore,
  424. bestScoreGenomeLocation,
  425. secondBestScore,
  426. lvScores,
  427. lvScoresAfterBestFound,
  428. anyHitsSkippedBecauseOfTooHighPopularityThisRun);
  429. #if MAINTAIN_HISTOGRAMS
  430. lvHistogram->addToCount(lvScores,lvScores);
  431. lookupHistogram->addToCount(lookupsThisRun,lookupsThisRun);
  432. if (MultipleHits == finalResult) {
  433. lvHistogramForMulti->addToCount(lvScores,lvScores);
  434. } else if (SingleHit == finalResult) {
  435. lvCountWhenBestFound->addToCount(lvScores-lvScoresAfterBestFound);
  436. }
  437. #endif // MAINTAIN_HISTOGRAMS
  438. return finalResult;
  439. }
  440. bool
  441. OldAligner::score(
  442. bool forceResult,
  443. Read *read,
  444. Read *rcRead,
  445. AlignmentResult *result,
  446. int *finalScore,
  447. unsigned *singleHitGenomeLocation,
  448. bool *hitIsRC,
  449. unsigned nSeedsApplied,
  450. unsigned nRCSeedsApplied,
  451. unsigned mostSeedsContainingAnyParticularBase,
  452. unsigned mostRCSeedsContainingAnyParticularBase,
  453. unsigned &lowestPossibleScoreOfAnyUnseenLocation,
  454. unsigned &lowestPossibleRCScoreOfAnyUnseenLocation,
  455. Candidate *candidates,
  456. unsigned &bestScore,
  457. unsigned &bestScoreGenomeLocation,
  458. unsigned &secondBestScore,
  459. unsigned &lvScores,
  460. unsigned &lvScoresAfterBestFound,
  461. bool anyHitsSkippedBecauseOfTooHighPopularityThisRun)
  462. /*++
  463. Routine Description:
  464. Make progress in scoring a possibly partial alignment. This is a private method of the OldAligner class that's used
  465. only by AlignRead.
  466. It does a number of things. First, it computes the lowest possible score of any unseen location. This is useful
  467. because once we have a scored hit that's more than confDiff better than all unseen locations, there's no need to
  468. lookup more of them, we can just score what we've got and be sure that the answer is right (unless errors have
  469. pushed the read to be closer to a wrong location than to the correct one, in which case it's hopeless).
  470. It then decides whether it should score a location, and if so what one to score. It chooses the unscored
  471. location that's got the highest weight (i.e., appeared in the most hash table lookups), since that's most
  472. likely to be the winner. If there are multiple candidates with the same best weight, it breaks the tie using
  473. the best possible score for the candidates (which is determined when they're first hit). Remaining ties are
  474. broken arbitrarily.
  475. It merges indels with scored candidates. If there's an insertion or deletion in the read, then we'll get very
  476. close but unequal results out of the hash table lookup for parts of the read on opposite sides of the
  477. insertion or deletion. This throws out the one with the worse score.
  478. It then figures out if we have a definitive answer, and says what that is.
  479. Arguments:
  480. forceResult - should we generate an answer even if it's not definitive?
  481. read - the read we're aligning
  482. rcRead - the reverse compliment of read
  483. result - returns the result if we reach one
  484. singleHitGenomeLocation - returns the location in the genome if we return a single hit
  485. hitIsRC - if we return a single hit, indicates whether it's on the reverse compliment
  486. nSeedsApplied - how may seeds have we looked up and not ignored in this alignment
  487. nRCSeedsApplied - how may reverse compliment seeds have we looked up and not ignored in this alignment
  488. mostSeedsContainingAnyParticularBase - what's the largest number of seeds we've used that contain any single base from the read?
  489. mostRCSeedsContainingAnyParticularBase - same thing for the reverse compliment read
  490. lowestPossibleScoreOfAnyUnseenLocation - in/out the best score that any genome location that we haven't hit could possinly have
  491. lowestPossibleRCScoreOfAnyUnseenLocation- same thing for the revrse compliment read
  492. candidates - in/out the array of candidates that have hit and possibly been scored
  493. bestScore - in/out the best score we've seen so far (recall that score is string distance, so best is least).
  494. bestScoreGenomeLocation - in/out where in the genome was the best score
  495. secondBestScore - in/out what's the second best score we've seen
  496. lvScores - in/out how many calls to LandauVishkin have we made (instrumentation)
  497. lvScoresAfterBestFound - in/out how many calls to LandauVishkin have we made after finding the best score (instrumentation)
  498. Return Value:
  499. true iff we've reached a result. When called with forceResult, we'll always return true.
  500. --*/
  501. {
  502. if (0 == mostSeedsContainingAnyParticularBase && 0 == mostRCSeedsContainingAnyParticularBase) {
  503. //
  504. // The only way we can get here is if we've tried all of the seeds that we're willing
  505. // to try and every one of them generated too many hits to process. Declare
  506. // a multi hit and give up.
  507. //
  508. _ASSERT(forceResult);
  509. *result = MultipleHits;
  510. return true;
  511. }
  512. //
  513. // Recompute lowestPossibleScore.
  514. //
  515. if (0 != mostSeedsContainingAnyParticularBase) {
  516. lowestPossibleScoreOfAnyUnseenLocation =
  517. __max(lowestPossibleScoreOfAnyUnseenLocation,
  518. nSeedsApplied / mostSeedsContainingAnyParticularBase);
  519. }
  520. if (0 != mostRCSeedsContainingAnyParticularBase) {
  521. lowestPossibleRCScoreOfAnyUnseenLocation =
  522. __max(lowestPossibleRCScoreOfAnyUnseenLocation,
  523. nRCSeedsApplied / mostRCSeedsContainingAnyParticularBase);
  524. }
  525. #if 1
  526. //
  527. // If we haven't seen enough seeds to exclude unseen locations (i.e., places
  528. // in the genome where we didn't get a hit), just say no and wait for more
  529. // lookups.
  530. //
  531. if (lowestPossibleScoreOfAnyUnseenLocation < confDiff && lowestPossibleRCScoreOfAnyUnseenLocation < confDiff) {
  532. if (forceResult) {
  533. //
  534. // We can't exclude things that haven't hit, but just blow through and do the best we can.
  535. //
  536. } else {
  537. return false;
  538. }
  539. }
  540. #endif // 0
  541. do {
  542. //
  543. // Run through the candidates doing two things:
  544. // 1) Excluding those that have a score (or minPossibleScore) >= bestScore + confDiff,
  545. // 2) Finding the highest weight candidate that's not been scored (if there is one).
  546. //
  547. unsigned biggestWeight = 0;
  548. unsigned minPossibleScoreOfBiggestWeight = 1000;
  549. Candidate *candidateToScore;
  550. int i = 0;
  551. unsigned nUnscoredCandidates = 0;
  552. int indelPartner1 = nCandidates;
  553. int indelPartner2 = nCandidates;
  554. while (i < nCandidates) { // Use a while loop because sometimes we increase i, sometimes we decrease nCandidates
  555. unsigned minScoreForCandidate;
  556. if (candidates[i].scored) {
  557. minScoreForCandidate = candidates[i].score;
  558. _ASSERT(minScoreForCandidate >= bestScore);
  559. } else {
  560. if (candidates[i].isRC) {
  561. if (lowestPossibleRCScoreOfAnyUnseenLocation >= candidates[i].weight) {
  562. candidates[i].minPossibleScore =
  563. __max(candidates[i].minPossibleScore,lowestPossibleRCScoreOfAnyUnseenLocation - candidates[i].weight);
  564. }
  565. } else {
  566. if (lowestPossibleScoreOfAnyUnseenLocation >= candidates[i].weight) {
  567. candidates[i].minPossibleScore =
  568. __max(candidates[i].minPossibleScore,lowestPossibleScoreOfAnyUnseenLocation - candidates[i].weight);
  569. }
  570. }
  571. minScoreForCandidate = candidates[i].minPossibleScore;
  572. }
  573. if (minScoreForCandidate > bestScore + confDiff || minScoreForCandidate > secondBestScore) {
  574. //
  575. // This can't possibly win, exclude it.
  576. //
  577. nCandidatesEliminated++;
  578. if (candidates[i].scored) {
  579. nScoredCandidatesEliminated++;
  580. if (minScoreForCandidate < bestScore + confDiff) nScoredCandidatesSoftEliminated++;
  581. }
  582. removeCandidateAtIndex(i);
  583. if (indelPartner2 == i) {
  584. indelPartner2 = nCandidates;
  585. }
  586. continue;
  587. }
  588. if (!candidates[i].scored) {
  589. nUnscoredCandidates++;
  590. if (candidates[i].weight > biggestWeight ||
  591. candidates[i].weight == biggestWeight && candidates[i].minPossibleScore < minPossibleScoreOfBiggestWeight) {
  592. biggestWeight = candidates[i].weight;
  593. minPossibleScoreOfBiggestWeight = candidates[i].minPossibleScore;
  594. candidateToScore = &candidates[i]; // This is OK, because we've already done whatever excluding we need to <= i
  595. if (i > 0 &&
  596. candidates[i-1].scored &&
  597. candidates[i-1].genomeLocation + confDiff >= candidates[i].genomeLocation) {
  598. indelPartner1 = i-1;
  599. } else {
  600. indelPartner1 = nCandidates;
  601. }
  602. if (i < nCandidates-1 &&
  603. candidates[i+1].scored &&
  604. candidates[i+1].genomeLocation - confDiff <= candidates[i].genomeLocation) {
  605. indelPartner2 = i+1;
  606. } else {
  607. indelPartner2 = nCandidates;
  608. }
  609. }
  610. }
  611. i++;
  612. }
  613. if (biggestWeight > 0) {
  614. //
  615. // We have an unscored candidate. Score it.
  616. //
  617. unsigned score;
  618. if (candidateToScore->isRC) {
  619. score = candidateToScore->score = landauVishkin.computeEditDistance(
  620. genome->getSubstring(candidateToScore->genomeLocation,rcRead->getDataLength()),
  621. rcRead->getDataLength(),
  622. rcRead->getData(),
  623. rcRead->getDataLength(),
  624. __min(maxK,bestScore)+confDiff-1);
  625. } else {
  626. score = candidateToScore->score = landauVishkin.computeEditDistance(
  627. genome->getSubstring(candidateToScore->genomeLocation,read->getDataLength()),
  628. read->getDataLength(),
  629. read->getData(),
  630. read->getDataLength(),
  631. __min(maxK,bestScore)+confDiff-1);
  632. }
  633. candidateToScore->scored = true;
  634. nLocationsScored++;
  635. lvScores++;
  636. lvScoresAfterBestFound++;
  637. //
  638. // See if there's an indel partner for this candidate. If so, delete the one
  639. // that's got the lower score. We carefully do indelPartner2 before 1 because
  640. // if we delete 1 (or candidateToScore) then the index for indelPartner2
  641. // won't be right anymore.
  642. //
  643. bool deletedCandidateToScore = false;
  644. bool bestScoreWasIndelPartner = false;
  645. if (indelPartner2 < nCandidates) {
  646. _ASSERT(candidates[indelPartner2].scored);
  647. _ASSERT(candidates[indelPartner2].genomeLocation <= candidateToScore->genomeLocation + confDiff);
  648. _ASSERT(&candidates[indelPartner2-1] == candidateToScore);
  649. if (candidates[indelPartner2].score <= score) {
  650. //
  651. // The newly scored candidate is the loser. Delete it.
  652. //
  653. _ASSERT(score >= bestScore);
  654. removeCandidateAtIndex(indelPartner2-1);
  655. deletedCandidateToScore = true;
  656. } else {
  657. bestScoreWasIndelPartner = (bestScore == candidates[indelPartner2].score);
  658. removeCandidateAtIndex(indelPartner2);
  659. }
  660. nIndelsMerged++;
  661. }
  662. if (indelPartner1 < nCandidates) {
  663. _ASSERT(candidates[indelPartner1].scored);
  664. _ASSERT(deletedCandidateToScore || candidates[indelPartner1].genomeLocation + confDiff >= candidateToScore->genomeLocation);
  665. _ASSERT(deletedCandidateToScore || &candidates[indelPartner1+1] == candidateToScore);
  666. if (candidates[indelPartner1].score <= score && !deletedCandidateToScore) {
  667. _ASSERT(score >= bestScore);
  668. removeCandidateAtIndex(indelPartner1+1); // i.e., candidateToScore
  669. deletedCandidateToScore = true;
  670. nIndelsMerged++;
  671. } else if (!deletedCandidateToScore) {
  672. bestScoreWasIndelPartner = (bestScore == candidates[indelPartner1].score);
  673. removeCandidateAtIndex(indelPartner1);
  674. _ASSERT(indelPartner1 >= 0 && indelPartner1 < nCandidates);
  675. candidateToScore = &candidates[indelPartner1]; // We just slid the candidate down into this position.
  676. nIndelsMerged++;
  677. }
  678. }
  679. _ASSERT(nUnscoredCandidates > 0);
  680. nUnscoredCandidates--;
  681. // off until we fix the insert/delete problem _ASSERT(candidates[candidateToScore].score >= candidates[candidateToScore].minPossibleScore); // Else we messed up minPossibleScore (or LV or something)
  682. if (!deletedCandidateToScore) {
  683. if (bestScore > candidateToScore->score) {
  684. //
  685. // We have a new best score.
  686. //
  687. if (!bestScoreWasIndelPartner) {
  688. secondBestScore = bestScore;
  689. }
  690. bestScore = candidateToScore->score;
  691. bestScoreGenomeLocation = candidateToScore->genomeLocation;
  692. *hitIsRC = candidateToScore->isRC;
  693. lvScoresAfterBestFound = 0;
  694. } else if (secondBestScore > candidateToScore->score) {
  695. //
  696. // A new second best.
  697. //
  698. secondBestScore = candidateToScore->score;
  699. }
  700. if (secondBestScore < confDiff) {
  701. //
  702. // We've scored two different places that are good enough that this must be an ambiguous read.
  703. //
  704. *result = MultipleHits;
  705. return true;
  706. }
  707. }
  708. }
  709. if (bestScore + confDiff <= __min(lowestPossibleScoreOfAnyUnseenLocation,lowestPossibleRCScoreOfAnyUnseenLocation) || forceResult) {
  710. if (0 == nUnscoredCandidates) {
  711. //
  712. // We've scored all live candidates and excluded all non-candidates. We have our
  713. // answer.
  714. //
  715. if (nCandidates > 0 && bestScore + confDiff <= secondBestScore && bestScore <= maxK) {
  716. if (!anyHitsSkippedBecauseOfTooHighPopularityThisRun && !forceResult) {
  717. *result = CertainHit;
  718. } else {
  719. *result = SingleHit;
  720. }
  721. *singleHitGenomeLocation = bestScoreGenomeLocation;
  722. if (NULL != finalScore) {
  723. *finalScore = bestScore;
  724. }
  725. return true;
  726. } else if (nCandidates == 0 || bestScore > maxK) {
  727. *result = NotFound;
  728. return true;
  729. } else {
  730. *result = MultipleHits;
  731. return true;
  732. }
  733. }
  734. //
  735. // We don't have an unambiguous winner, but we do know that there's no need to
  736. // look elsewhere in the genome. We can simply score all the remaining candidates
  737. // and go with that, or else we can keep looking for more hits on the theory that it
  738. // will eliminate candidates without scoring them.
  739. //
  740. if (nUnscoredCandidates < 10) {
  741. //
  742. // Let's go with what we've got.
  743. //
  744. forceResult = true;
  745. }
  746. }
  747. if (lvScores >= lvCutoff) {
  748. //
  749. // Don't score any more, just go with what we've got now.
  750. //
  751. if (UnusedScoreValue == bestScore) {
  752. *result = NotFound;
  753. return true;
  754. }
  755. if (bestScore + confDiff <= secondBestScore) {
  756. *result = SingleHit;
  757. *singleHitGenomeLocation = bestScoreGenomeLocation;
  758. if (NULL != finalScore) {
  759. *finalScore = bestScore;
  760. }
  761. return true;
  762. }
  763. *result = MultipleHits;
  764. return true;
  765. }
  766. } while (forceResult);
  767. return false;
  768. }
  769. OldAligner::Candidate *
  770. OldAligner::findCandidate(
  771. unsigned genomeLocation,
  772. bool isRC)
  773. /*++
  774. Routine Description:
  775. Find an existing candidate with the given genome location within the existing set
  776. of candidates.
  777. This is a simple binary search.
  778. Arguments:
  779. genomeLocation - the location of the candidate we'd like to look up
  780. isRC - Are we searching for the ordinary or reverse compliment candidate (they're treated separately)
  781. Return Value:
  782. A pointer to the candidate if found, NULL otherwise.
  783. --*/
  784. {
  785. AssertCandidatesAreSorted();
  786. Candidate candidateToFind;
  787. candidateToFind.genomeLocation = genomeLocation;
  788. candidateToFind.isRC = isRC;
  789. int min = 0;
  790. int max = nCandidates -1;
  791. //
  792. // Binary search.
  793. //
  794. while (min <= max) {
  795. int probe = (min + max) / 2;
  796. _ASSERT(probe >= 0 && probe <= nCandidates);
  797. if (candidates[probe] == candidateToFind) {
  798. return &candidates[probe];
  799. } else if (candidates[probe] < candidateToFind) {
  800. min = probe+1;
  801. } else {
  802. max = probe-1;
  803. }
  804. }
  805. return NULL;
  806. }
  807. OldAligner::Candidate *
  808. OldAligner::allocateNewCandidate(unsigned genomeLocation, bool isRC)
  809. /*++
  810. Routine Description:
  811. Arguments:
  812. Return Value:
  813. --*/
  814. {
  815. _ASSERT(NULL == findCandidate(genomeLocation, isRC)); // Caller must ensure this
  816. nCandidatesCreated ++;
  817. Candidate newCandidate;
  818. newCandidate.genomeLocation = genomeLocation;
  819. newCandidate.isRC = isRC;
  820. int locationToInsertBefore = -1;
  821. if (0 == nCandidates) {
  822. locationToInsertBefore = 0;
  823. } else {
  824. int min = 0;
  825. int max = nCandidates - 1;
  826. //
  827. // Binary search.
  828. //
  829. while (min <= max) {
  830. int probe = (min + max) / 2;
  831. _ASSERT(probe >= 0 && probe < nCandidates);
  832. _ASSERT(candidates[probe] != newCandidate);
  833. if (candidates[probe] < newCandidate) {
  834. if (probe+1 == nCandidates || candidates[probe+1] > newCandidate) {
  835. locationToInsertBefore = probe+1;
  836. break;
  837. }
  838. min = probe+1;
  839. } else {
  840. _ASSERT(candidates[probe] > newCandidate);
  841. if (probe == 0 || candidates[probe-1] < newCandidate) {
  842. locationToInsertBefore = probe;
  843. break;
  844. }
  845. max = probe-1;
  846. }
  847. }
  848. }
  849. _ASSERT(locationToInsertBefore >= 0); // i.e., we should always find a spot
  850. _ASSERT(locationToInsertBefore <= nCandidates + 1);
  851. //
  852. // Slide everything down one location.
  853. //
  854. memmove(candidates + locationToInsertBefore + 1, candidates + locationToInsertBefore,(nCandidates - locationToInsertBefore) * sizeof(Candidate));
  855. (nCandidates)++;
  856. _ASSERT(locationToInsertBefore >= 0 && locationToInsertBefore < nCandidates);
  857. candidates[locationToInsertBefore].genomeLocation = genomeLocation;
  858. candidates[locationToInsertBefore].isRC = isRC;
  859. AssertCandidatesAreSorted();
  860. return &candidates[locationToInsertBefore];
  861. }
  862. void
  863. OldAligner::removeCandidateAtIndex(int index)
  864. /*++
  865. Routine Description:
  866. Arguments:
  867. Return Value:
  868. --*/
  869. {
  870. _ASSERT(index < nCandidates);
  871. memmove(candidates + index, candidates + index + 1, (nCandidates - index -1) * sizeof(Candidate));
  872. nCandidates--;
  873. }
  874. #if DBG
  875. bool
  876. OldAligner::AssertCandidatesAreSorted()
  877. /*++
  878. Routine Description:
  879. Arguments:
  880. Return Value:
  881. --*/
  882. {
  883. for (int i = 1; i < nCandidates; i++) {
  884. _ASSERT(candidates[i].genomeLocation <= candidates[i-1].genomeLocation);
  885. }
  886. }
  887. #endif // DBG
  888. OldAligner::~OldAligner()
  889. /*++
  890. Routine Description:
  891. Arguments:
  892. Return Value:
  893. --*/
  894. {
  895. #if MAINTAIN_HISTOGRAMS
  896. delete lvHistogram;
  897. delete lookupHistogram;
  898. delete lvHistogramForMulti;
  899. delete lvCountWhenBestFound;
  900. #endif // MAINTAIN_HISTOGRAMS
  901. delete [] seedUsedAsAllocated;
  902. #if USE_CAREFUL_BASE_STATUS
  903. delete [] bases;
  904. #endif // USE_CAREFUL_BASE_STATUS
  905. delete [] candidates;
  906. delete [] rcReadData;
  907. }
  908. void
  909. OldAligner::ComputeHitDistribution(
  910. Read *read,
  911. unsigned correctGenomeLocation,
  912. bool correctHitIsRC,
  913. unsigned *hitCountBySeed,
  914. unsigned *rcHitCountBySeed,
  915. unsigned &nSeedsApplied,
  916. unsigned &nRCSeedsApplied,
  917. unsigned *hitsContainingCorrectLocation)
  918. {
  919. nSeedsApplied = 0;
  920. nRCSeedsApplied = 0;
  921. for (unsigned i = 0; i < maxSeedsToUse; i++) {
  922. hitCountBySeed[i] = 0;
  923. rcHitCountBySeed[i] = 0;
  924. hitsContainingCorrectLocation[i] = 0;
  925. }
  926. //
  927. // A bitvector for used seeds, indexed on the starting location of the seed within the read.
  928. //
  929. if (read->getDataLength() > maxReadSize) {
  930. fprintf(stderr,"OldAligner:: got too big read (%d > %d)", read->getDataLength(),maxReadSize);
  931. exit(1);
  932. }
  933. if (read->getDataLength() < seedLen) {
  934. //
  935. // Too short to have any seeds, it's hopeless.
  936. //
  937. return;
  938. }
  939. //
  940. // Clear out the seed used array.
  941. //
  942. memset(seedUsed,0,(read->getDataLength() + 7) / 8);
  943. int rdl = (int)read->getDataLength();
  944. const char *readData = read->getData();
  945. unsigned countOfNs = 0;
  946. for (int i = 0; i < rdl; i++) {
  947. char baseByte = readData[i];
  948. rcReadData[rdl - i - 1] = rcTranslationTable[baseByte];
  949. countOfNs += nTable[baseByte];
  950. }
  951. if (countOfNs > maxK) {
  952. return;
  953. }
  954. //
  955. // Block off any seeds that would contain an N.
  956. //
  957. if (countOfNs > 0) {
  958. int minSeedToConsiderNing = 0; // In English, any word can be verbed. Including, apparently, "N."
  959. for (int i = 0; i < rdl; i++) {
  960. if (readData[i] == 'N') {
  961. int limit = __max(i + (int)seedLen - 1, rdl-1);
  962. for (int j = __max(minSeedToConsiderNing, i - (int)seedLen + 1); j <= limit; j++) {
  963. SetSeedUsed(j);
  964. }
  965. minSeedToConsiderNing = limit+1;
  966. }
  967. }
  968. }
  969. Read rcRead[1];
  970. rcRead->init(NULL,0,rcReadData,NULL,read->getDataLength());
  971. unsigned nPossibleSeeds = read->getDataLength() - seedLen + 1;
  972. unsigned nextSeedToTest = 0;
  973. unsigned wrapCount = 0;
  974. while (nSeedsApplied + nRCSeedsApplied < maxSeedsToUse) {
  975. //
  976. // Choose the next seed to use. Choose the first one that isn't used
  977. //
  978. if (nextSeedToTest >= nPossibleSeeds) {
  979. //
  980. // We're wrapping. We want to space the seeds out as much as possible, so if we had
  981. // a seed length of 20 we'd want to take 0, 10, 5, 15, 2, 7, 12, 17. To make the computation
  982. // fast, we use use a table lookup.
  983. //
  984. wrapCount++;
  985. if (wrapCount >= seedLen) {
  986. //
  987. // We tried all possible seeds without matching or even getting enough seeds to
  988. // exceed our seed count. Do the best we can with what we have.
  989. //
  990. return;
  991. }
  992. nextSeedToTest = getWrappedNextSeedToTest(wrapCount);
  993. }
  994. while (nextSeedToTest < nPossibleSeeds && IsSeedUsed(nextSeedToTest)) {
  995. //
  996. // This seed is already used. Try the next one.
  997. //
  998. nextSeedToTest++;
  999. }
  1000. if (nextSeedToTest >= nPossibleSeeds) {
  1001. //
  1002. // Unusable seeds have pushed us past the end of the read. Go back around the outer loop so we wrap properly.
  1003. //
  1004. continue;
  1005. }
  1006. Seed seed(read->getData() + nextSeedToTest, seedLen);
  1007. const unsigned *hits; // The actual hits (of size nHits)
  1008. const unsigned *rcHits; // The actual reverse compliment hits
  1009. genomeIndex->lookupSeed(seed, &hitCountBySeed[nSeedsApplied], &hits, &rcHitCountBySeed[nRCSeedsApplied], &rcHits);
  1010. SetSeedUsed(nextSeedToTest);
  1011. if (hitCountBySeed[nSeedsApplied] <= maxHitsToConsider) {
  1012. if (!correctHitIsRC) {
  1013. //
  1014. // See if the correct answer is in this hit.
  1015. //
  1016. _int64 min = 0;
  1017. _int64 max = ((_int64)hitCountBySeed[nSeedsApplied]) - 1;
  1018. while (min <= max) {
  1019. _int64 probe = (min + max) / 2;
  1020. _ASSERT(probe >= min && probe <= max);
  1021. if (hits[probe] == correctGenomeLocation + nextSeedToTest) {
  1022. hitsContainingCorrectLocation[nSeedsApplied] = hitCountBySeed[nSeedsApplied];
  1023. break;
  1024. }
  1025. //
  1026. // Slice off half of the region. Recall that the hits in the table are sorted backwards, with
  1027. // hits[0] > hits[1], etc.
  1028. //
  1029. if (hits[probe] > correctGenomeLocation) {
  1030. min = probe+1;
  1031. } else {
  1032. max = probe-1;
  1033. }
  1034. }
  1035. }
  1036. nSeedsApplied++;
  1037. }
  1038. if (rcHitCountBySeed[nRCSeedsApplied] <= maxHitsToConsider) {
  1039. if (correctHitIsRC) {
  1040. //
  1041. // See if the correct answer is in this hit.
  1042. //
  1043. _int64 min = 0;
  1044. _int64 max = ((_int64)rcHitCountBySeed[nRCSeedsApplied]) - 1;
  1045. while (min <= max) {
  1046. _int64 probe = (min + max) / 2;
  1047. _ASSERT(probe >= min && probe <= max);
  1048. if (rcHits[probe] - (read->getDataLength() - seedLen - nextSeedToTest) == correctGenomeLocation) {
  1049. hitsContainingCorrectLocation[nRCSeedsApplied] = rcHitCountBySeed[nRCSeedsApplied];
  1050. break;
  1051. }
  1052. //
  1053. // Slice off half of the region. Recall that the hits in the table are sorted backwards, with
  1054. // hits[0] > hits[1], etc.
  1055. //
  1056. if (rcHits[probe] > correctGenomeLocation) {
  1057. min = probe+1;
  1058. } else {
  1059. max = probe-1;
  1060. }
  1061. }
  1062. }
  1063. nRCSeedsApplied++;
  1064. }
  1065. nextSeedToTest += seedLen;
  1066. }
  1067. return;
  1068. }