PageRenderTime 92ms CodeModel.GetById 33ms RepoModel.GetById 1ms app.codeStats 1ms

/SNAPLib/BaseAligner.cpp

https://github.com/terhorst/snap
C++ | 1474 lines | 927 code | 153 blank | 394 comment | 244 complexity | 056372bd6369dbba03f63fc241ed98b8 MD5 | raw file
Possible License(s): Apache-2.0

Large files files are truncated, but you can click here to view the full file

  1. /*++
  2. Module Name:
  3. BaseAligner.cpp
  4. Abstract:
  5. SNAP genome aligner
  6. Authors:
  7. Bill Bolosky, August, 2011
  8. Environment:
  9. User mode service.
  10. This class is NOT thread safe. It's the caller's responsibility to ensure that
  11. at most one thread uses an instance at any time.
  12. Revision History:
  13. Adapted from Matei Zaharia's Scala implementation.
  14. --*/
  15. #include "stdafx.h"
  16. #include "BaseAligner.h"
  17. #include "Compat.h"
  18. #include "LandauVishkin.h"
  19. #include "BigAlloc.h"
  20. using std::min;
  21. BaseAligner::BaseAligner(
  22. GenomeIndex *i_genomeIndex,
  23. unsigned i_confDiff,
  24. unsigned i_maxHitsToConsider,
  25. unsigned i_maxK,
  26. unsigned i_maxReadSize,
  27. unsigned i_maxSeedsToUse,
  28. unsigned i_lvCutoff,
  29. unsigned i_adaptiveConfDiffThreshold,
  30. LandauVishkin *i_landauVishkin) :
  31. genomeIndex(i_genomeIndex), confDiff(i_confDiff), maxHitsToConsider(i_maxHitsToConsider), maxK(i_maxK),
  32. maxReadSize(i_maxReadSize), maxSeedsToUse(i_maxSeedsToUse), lvCutoff(i_lvCutoff), readId(-1),
  33. adaptiveConfDiffThreshold(i_adaptiveConfDiffThreshold), landauVishkin(i_landauVishkin),
  34. explorePopularSeeds(false), stopOnFirstHit(false)
  35. /*++
  36. Routine Description:
  37. Constructor for the BaseAligner class. Aligners align reads against an indexed genome.
  38. Arguments:
  39. i_genomeIndex - The index against which to do the alignments
  40. i_confDiff - The string difference between two matches necessary to believe they're really different
  41. i_maxHitsToConsider - The maximum number of hits to use from a seed lookup. Any lookups that return more
  42. than this are ignored.
  43. i_maxK - The largest string difference to consider for any comparison.
  44. i_maxReadSize - Bound on the number of bases in any read. There's no reason to make it tight, it just affects a little memory allocation.
  45. i_maxSeedsToUse - The maximum number of seeds to use when aligning any read (not counting ones ignored because they resulted in too many
  46. hits). Once we've looked up this many seeds, we just score what we've got.
  47. --*/
  48. {
  49. #if MAINTAIN_HISTOGRAMS
  50. lvHistogram = new Histogram(20, true);
  51. lookupHistogram = new Histogram(20, true);
  52. lvHistogramForMulti = new Histogram(20, true);
  53. lvCountWhenBestFound = new Histogram(maxHitsToConsider*4,false);
  54. #endif // MAINTAIN_HISTOGRAMS
  55. nHashTableLookups = 0;
  56. nLocationsScored = 0;
  57. nHitsIgnoredBecauseOfTooHighPopularity = 0;
  58. nReadsIgnoredBecauseOfTooManyNs = 0;
  59. nIndelsMerged = 0;
  60. genome = genomeIndex->getGenome();
  61. seedLen = genomeIndex->getSeedLength();
  62. bsd = new BoundedStringDistance<>(3);
  63. if (i_landauVishkin == NULL) {
  64. landauVishkin = new LandauVishkin;
  65. ownLandauVishkin = true;
  66. } else {
  67. ownLandauVishkin = false;
  68. }
  69. unsigned nCandidates = maxHitsToConsider * (maxReadSize - seedLen + 1) * 2; // *2 is for reverse complement
  70. candidates = (Candidate *)BigAlloc(sizeof(Candidate) * nCandidates);
  71. for (unsigned i = 0 ; i < nCandidates; i++) {
  72. candidates[i].init();
  73. }
  74. rcReadData = (char *)BigAlloc(sizeof(char) * maxReadSize);
  75. // treat everything but ACTG like N
  76. for (unsigned i = 0; i < 256; i++) {
  77. nTable[i] = 1;
  78. rcTranslationTable[i] = 'N';
  79. }
  80. rcTranslationTable['A'] = 'T';
  81. rcTranslationTable['G'] = 'C';
  82. rcTranslationTable['C'] = 'G';
  83. rcTranslationTable['T'] = 'A';
  84. nTable['A'] = 0;
  85. nTable['C'] = 0;
  86. nTable['T'] = 0;
  87. nTable['G'] = 0;
  88. seedUsed = (BYTE *)BigAlloc(sizeof(BYTE) * (maxReadSize + 7 + 128) / 8); // +128 to make sure it extends at both before and after at least an _int64
  89. seedUsedAsAllocated = seedUsed; // Save the pointer for the delete.
  90. seedUsed += 8; // This moves the pointer up an _int64, so we now have the appropriate before buffer.
  91. nUsedHashTableElements = 0;
  92. hashTableElementPoolSize = maxHitsToConsider * maxSeedsToUse * 2 ; // *2 for RC
  93. /*
  94. if (maxK*2 > 64) {
  95. fprintf(stderr,"The HashTableElement struct uses a 64 bit bitvector for used candidates. As a result, you can't have maxK > 32. Not that you'd want to anyway.\n");
  96. exit(1);
  97. }
  98. */
  99. candidateHashTablesSize = (maxHitsToConsider * maxSeedsToUse * 3)/2; // *1.5 for hash table slack
  100. hashTableElementPool = (HashTableElement *)BigAlloc(sizeof(HashTableElement) * hashTableElementPoolSize);
  101. candidateHashTable[0] = (HashTableAnchor *)BigAlloc(sizeof(HashTableAnchor) * candidateHashTablesSize);
  102. candidateHashTable[1] = (HashTableAnchor *)BigAlloc(sizeof(HashTableAnchor) * candidateHashTablesSize);
  103. weightLists = (HashTableElement *)BigAlloc(sizeof(HashTableElement) * (maxSeedsToUse + 1));
  104. for (unsigned i = 0; i < hashTableElementPoolSize; i++) {
  105. hashTableElementPool[i].init();
  106. }
  107. for (unsigned i = 0; i < maxSeedsToUse + 1; i++) {
  108. weightLists[i].init();
  109. }
  110. memset(candidateHashTable[0],0,sizeof(HashTableAnchor) * candidateHashTablesSize);
  111. memset(candidateHashTable[1],0,sizeof(HashTableAnchor) * candidateHashTablesSize);
  112. hashTableEpoch = 0;
  113. }
  114. AlignmentResult
  115. BaseAligner::AlignRead(Read *read, unsigned *genomeLocation, bool *hitIsRC, int *finalScore)
  116. {
  117. return AlignRead(read, genomeLocation, hitIsRC, finalScore, 0, 0, false, 0, NULL, NULL, NULL, NULL);
  118. }
  119. AlignmentResult
  120. BaseAligner::AlignRead(
  121. Read *read,
  122. unsigned *genomeLocation,
  123. bool *hitIsRC,
  124. int *finalScore,
  125. unsigned searchRadius,
  126. unsigned searchLocation,
  127. bool searchRC)
  128. {
  129. return AlignRead(read, genomeLocation, hitIsRC, finalScore,
  130. searchRadius, searchLocation, searchRC, 0, NULL, NULL, NULL, NULL);
  131. }
  132. AlignmentResult
  133. BaseAligner::AlignRead(
  134. Read *read,
  135. unsigned *genomeLocation,
  136. bool *hitIsRC,
  137. int *finalScore,
  138. unsigned searchRadius,
  139. unsigned searchLocation,
  140. bool searchRC,
  141. int maxHitsToGet,
  142. int *multiHitsFound,
  143. unsigned *multiHitLocations,
  144. bool *multiHitRCs,
  145. int *multiHitScores)
  146. /*++
  147. Routine Description:
  148. Align a particular read, possibly constraining the search around a given location.
  149. Arguments:
  150. read - the read to align
  151. genomeLocation - if this aligned to a SingleHit, the 0-based offset in the genome that this hit. The aligner treats the entire
  152. genome as a single string, even though it's really a set of chrosomes. It just makes the code simpler.
  153. The caller can convert to chromosome+offset by looking up the piece boudaries using the Genome object.
  154. hitIsRC - the aligner tries to align both the given read and its reverse complement. If we found a SingleHit this
  155. bool is set to indicate whether that hit was on the reverse complement.
  156. finalScore - if a single or confident hit, this is the score of the hit (i.e., the LV distance from the read)
  157. searchRadius - if non-zero, constrain the search to this distance around searchLocation, in orientation searchRC
  158. searchLocation - location to search around if searchRadius is given
  159. searchRC - whether to search in reverse complement orientation if searchRadius is given
  160. maxHitsToGet - if greater than 0, output up to this many hits within confDiff of the best (if any) in multiHitLocations,
  161. writing their count in multiHitsFound, instead of returning MultipleHits immediately
  162. multiHitsFound - output parameter for number of alternative hits found if maxHitsToGet is true
  163. multiHitLocations - output parameter for locations of alternative hits found if maxHitsToGet is true
  164. Return Value:
  165. ConfidentHit, SingleHit, MultiHit or NotFound depending on how the alignment went.
  166. --*/
  167. {
  168. *genomeLocation = 0xFFFFFFFF; // Value to return if we don't find a location.
  169. *hitIsRC = false; // So we deterministically print the read forward in this case.
  170. if (finalScore != NULL) {
  171. *finalScore = UnusedScoreValue;
  172. }
  173. unsigned lookupsThisRun = 0;
  174. unsigned lvScores = 0;
  175. unsigned lvScoresAfterBestFound = 0;
  176. popularSeedsSkipped = 0;
  177. // Range of genome locations to search in.
  178. unsigned minLocation = 0;
  179. unsigned maxLocation = 0xFFFFFFFF;
  180. if (searchRadius != 0) {
  181. minLocation = (searchLocation > searchRadius) ? searchLocation - searchRadius : 0;
  182. maxLocation = (searchLocation < 0xFFFFFFFF - searchRadius) ? searchLocation + searchRadius : 0xFFFFFFFF;
  183. }
  184. // If asked to return the locations of multiple hits, make sure we get sensible counts and results
  185. if (maxHitsToGet > 0) {
  186. memset(hitCount, 0, MAX_K * sizeof(unsigned));
  187. *multiHitsFound = 0;
  188. }
  189. AlignmentResult finalResult;
  190. //
  191. // A bitvector for used seeds, indexed on the starting location of the seed within the read.
  192. //
  193. if (read->getDataLength() > maxReadSize) {
  194. fprintf(stderr,"BaseAligner:: got too big read (%d > %d)", read->getDataLength(),maxReadSize);
  195. exit(1);
  196. }
  197. if ((int)read->getDataLength() < seedLen) {
  198. //
  199. // Too short to have any seeds, it's hopeless.
  200. //
  201. return NotFound;
  202. }
  203. //
  204. // Clear out the seed used array.
  205. //
  206. memset(seedUsed, 0, (read->getDataLength() + 7) / 8);
  207. unsigned readLen = read->getDataLength();
  208. const char *readData = read->getData();
  209. unsigned countOfNs = 0;
  210. for (unsigned i = 0; i < readLen; i++) {
  211. char baseByte = readData[i];
  212. rcReadData[readLen - i - 1] = rcTranslationTable[baseByte];
  213. countOfNs += nTable[baseByte];
  214. }
  215. if (countOfNs > maxK) {
  216. nReadsIgnoredBecauseOfTooManyNs++;
  217. return NotFound;
  218. }
  219. //
  220. // Block off any seeds that would contain an N.
  221. //
  222. if (countOfNs > 0) {
  223. int minSeedToConsiderNing = 0; // In English, any word can be verbed. Including, apparently, "N."
  224. for (int i = 0; i < (int) readLen; i++) {
  225. if (BASE_VALUE[readData[i]] > 3) {
  226. int limit = __min(i + seedLen - 1, readLen-1);
  227. for (int j = __max(minSeedToConsiderNing, i - (int) seedLen + 1); j <= limit; j++) {
  228. SetSeedUsed(j);
  229. }
  230. minSeedToConsiderNing = limit+1;
  231. if (minSeedToConsiderNing >= (int) readLen)
  232. break;
  233. }
  234. }
  235. }
  236. Read rcRead[1];
  237. rcRead->init(NULL, 0, rcReadData, NULL, readLen);
  238. clearCandidates();
  239. //
  240. // Initialize the bases table, which represents which bases we've checked.
  241. // We have readSize - seeds size + 1 possible seeds.
  242. //
  243. unsigned nPossibleSeeds = readLen - seedLen + 1;
  244. unsigned nextSeedToTest = 0;
  245. unsigned wrapCount = 0;
  246. unsigned lowestPossibleScoreOfAnyUnseenLocation = 0;
  247. unsigned lowestPossibleRCScoreOfAnyUnseenLocation = 0;
  248. unsigned mostSeedsContainingAnyParticularBase = 1; // Instead of tracking this for real, we're just conservative and use wrapCount+1. It's faster.
  249. unsigned mostRCSeedsContainingAnyParticularBase = 1;// ditto
  250. unsigned bestScore = UnusedScoreValue;
  251. unsigned bestScoreGenomeLocation;
  252. unsigned secondBestScore = UnusedScoreValue;
  253. unsigned secondBestScoreGenomeOffset;
  254. bool secondBestScoreIsRC;
  255. unsigned nSeedsApplied = 0;
  256. unsigned nRCSeedsApplied = 0;
  257. unsigned scoreLimit;
  258. if (maxHitsToGet > 0) {
  259. scoreLimit = maxK + 3;
  260. } else {
  261. scoreLimit = maxK + confDiff - 1;
  262. }
  263. while (nSeedsApplied + nRCSeedsApplied < maxSeedsToUse) {
  264. //
  265. // Choose the next seed to use. Choose the first one that isn't used
  266. //
  267. if (nextSeedToTest >= nPossibleSeeds) {
  268. //
  269. // We're wrapping. We want to space the seeds out as much as possible, so if we had
  270. // a seed length of 20 we'd want to take 0, 10, 5, 15, 2, 7, 12, 17. To make the computation
  271. // fast, we use use a table lookup.
  272. //
  273. wrapCount++;
  274. if (wrapCount >= seedLen) {
  275. //
  276. // We tried all possible seeds without matching or even getting enough seeds to
  277. // exceed our seed count. Do the best we can with what we have.
  278. //
  279. #ifdef TRACE_ALIGNER
  280. printf("Calling score with force=true because we wrapped around enough\n");
  281. #endif
  282. score(
  283. true,
  284. read,
  285. rcRead,
  286. &finalResult,
  287. finalScore,
  288. genomeLocation,
  289. hitIsRC,
  290. nSeedsApplied,
  291. nRCSeedsApplied,
  292. mostSeedsContainingAnyParticularBase,
  293. mostRCSeedsContainingAnyParticularBase,
  294. lowestPossibleScoreOfAnyUnseenLocation,
  295. lowestPossibleRCScoreOfAnyUnseenLocation,
  296. candidates,
  297. bestScore,
  298. bestScoreGenomeLocation,
  299. secondBestScore,
  300. secondBestScoreGenomeOffset,
  301. secondBestScoreIsRC,
  302. scoreLimit,
  303. lvScores,
  304. lvScoresAfterBestFound,
  305. maxHitsToGet);
  306. #if MAINTAIN_HISTOGRAMS
  307. lvHistogram->addToCount(lvScores,lvScores);
  308. lookupHistogram->addToCount(lookupsThisRun,lookupsThisRun);
  309. if (MultipleHits == finalResult) {
  310. lvHistogramForMulti->addToCount(lvScores,lvScores);
  311. } else if (SingleHit == finalResult) {
  312. lvCountWhenBestFound->addToCount(lvScores-lvScoresAfterBestFound);
  313. }
  314. #endif // MAINTAIN_HISTOGRAMS
  315. fillHitsFound(maxHitsToGet, multiHitsFound,
  316. multiHitLocations, multiHitRCs, multiHitScores);
  317. return finalResult;
  318. }
  319. nextSeedToTest = getWrappedNextSeedToTest(wrapCount);
  320. mostSeedsContainingAnyParticularBase = wrapCount + 1;
  321. mostRCSeedsContainingAnyParticularBase = wrapCount + 1;
  322. }
  323. while (nextSeedToTest < nPossibleSeeds && IsSeedUsed(nextSeedToTest)) {
  324. //
  325. // This seed is already used. Try the next one.
  326. //
  327. nextSeedToTest++;
  328. }
  329. if (nextSeedToTest >= nPossibleSeeds) {
  330. //
  331. // Unusable seeds have pushed us past the end of the read. Go back around the outer loop so we wrap properly.
  332. //
  333. continue;
  334. }
  335. Seed seed(read->getData() + nextSeedToTest, seedLen);
  336. unsigned nHits; // Number of times this seed hits in the genome
  337. const unsigned *hits; // The actual hits (of size nHits)
  338. unsigned nRCHits; // Number of hits for the seed's reverse complement
  339. const unsigned *rcHits; // The actual reverse complement hits
  340. unsigned minSeedLoc = (minLocation < readLen ? 0 : minLocation - readLen);
  341. unsigned maxSeedLoc = (maxLocation > 0xFFFFFFFF - readLen ? 0xFFFFFFFF : maxLocation + readLen);
  342. genomeIndex->lookupSeed(seed, minSeedLoc, maxSeedLoc, &nHits, &hits, &nRCHits, &rcHits);
  343. nHashTableLookups++;
  344. lookupsThisRun++;
  345. #ifdef TRACE_ALIGNER
  346. printf("Looked up seed %llx: hits=%u, rchits=%u\n", seed.getBases(), nHits, nRCHits);
  347. printf("Hits:");
  348. for (int i = 0; i < nHits; i++)
  349. printf(" %u", hits[i]);
  350. printf("\n");
  351. printf("RC hits:");
  352. for (int i = 0; i < nRCHits; i++)
  353. printf(" %u", rcHits[i]);
  354. printf("\n");
  355. #endif
  356. SetSeedUsed(nextSeedToTest);
  357. bool appliedEitherSeed = false;
  358. //
  359. // Look for the seed in the forward direction, unless we were asked to search only the RC.
  360. //
  361. if (searchRadius == 0 || searchRC == false) {
  362. if (nHits > maxHitsToConsider && !explorePopularSeeds) {
  363. //
  364. // This seed is matching too many places. Just pretend we never looked and keep going.
  365. //
  366. nHitsIgnoredBecauseOfTooHighPopularity++;
  367. popularSeedsSkipped++;
  368. } else {
  369. //
  370. // Update the candidates list with any hits from this seed. If lowest possible score of any unseen location is
  371. // more than best_score + confDiff then we know that if this location is newly seen then its location won't ever be a
  372. // winner, and we can ignore it.
  373. //
  374. for (unsigned i = 0 ; i < min(nHits, maxHitsToConsider); i++) {
  375. //
  376. // Find the genome location where the beginning of the read would hit, given a match on this seed.
  377. //
  378. unsigned genomeLocationOfThisHit = hits[i] - nextSeedToTest;
  379. if (genomeLocationOfThisHit < minLocation || genomeLocationOfThisHit > maxLocation)
  380. continue;
  381. Candidate *candidate;
  382. HashTableElement *hashTableElement;
  383. findCandidate(genomeLocationOfThisHit,false,&candidate,&hashTableElement);
  384. if (NULL != hashTableElement) {
  385. incrementWeight(hashTableElement);
  386. } else if (lowestPossibleScoreOfAnyUnseenLocation <= scoreLimit) {
  387. allocateNewCandidate(genomeLocationOfThisHit,false, lowestPossibleScoreOfAnyUnseenLocation, &candidate, &hashTableElement);
  388. }
  389. }
  390. nSeedsApplied++;
  391. appliedEitherSeed = true;
  392. }
  393. }
  394. //
  395. // Look for the seed in the RC direction, unless we were asked to search only forwards.
  396. //
  397. if (searchRadius == 0 || searchRC == true) {
  398. if (nRCHits > maxHitsToConsider && !explorePopularSeeds) {
  399. //
  400. // This seed is matching too many places. Just pretend we never looked and keep going.
  401. //
  402. nHitsIgnoredBecauseOfTooHighPopularity++;
  403. popularSeedsSkipped++;
  404. } else {
  405. //
  406. // The RC seed is at offset ReadSize - SeedSize - seed offset in the RC seed.
  407. //
  408. // To see why, imagine that you had a read that looked like 0123456 (where the digits
  409. // represented some particular bases, and digit' is the base's complement). Then the
  410. // RC of that read is 6'5'4'3'2'1'. So, when we look up the hits for the seed at
  411. // offset 0 in the forward read (i.e. 012 assuming a seed size of 3) then the index
  412. // will also return the results for the seed's reverse complement, i.e., 3'2'1'.
  413. // This happens as the last seed in the RC read.
  414. //
  415. unsigned rcOffset = readLen - seedLen - nextSeedToTest;
  416. //
  417. // And now update the candidates list with any hits from this seed. If lowest possible score of any unseen RC location is
  418. // more than best_score + confDiff then we know that if this location is newly seen then its location won't ever be a
  419. // winner, and we can ignore it.
  420. //
  421. for (unsigned i = 0 ; i < min(nRCHits, maxHitsToConsider); i++) {
  422. //
  423. // Find the genome location where the beginning of the read would hit, given a match on this seed.
  424. //
  425. unsigned genomeLocationOfThisHit = rcHits[i] - rcOffset;
  426. if (genomeLocationOfThisHit < minLocation || genomeLocationOfThisHit > maxLocation)
  427. continue;
  428. Candidate *candidate;
  429. HashTableElement *hashTableElement;
  430. findCandidate(genomeLocationOfThisHit,true,&candidate,&hashTableElement);
  431. if (NULL != hashTableElement) {
  432. incrementWeight(hashTableElement);
  433. } else if (lowestPossibleRCScoreOfAnyUnseenLocation <= scoreLimit) {
  434. allocateNewCandidate(genomeLocationOfThisHit, true, lowestPossibleRCScoreOfAnyUnseenLocation, &candidate, &hashTableElement);
  435. }
  436. }
  437. nRCSeedsApplied++;
  438. appliedEitherSeed = true;
  439. }
  440. }
  441. //
  442. // Move us along.
  443. //
  444. nextSeedToTest += seedLen;
  445. if (appliedEitherSeed) {
  446. //
  447. // And finally, try scoring.
  448. //
  449. if (score( false,
  450. read,
  451. rcRead,
  452. &finalResult,
  453. finalScore,
  454. genomeLocation,
  455. hitIsRC,
  456. nSeedsApplied,
  457. nRCSeedsApplied,
  458. mostSeedsContainingAnyParticularBase,
  459. mostRCSeedsContainingAnyParticularBase,
  460. lowestPossibleScoreOfAnyUnseenLocation,
  461. lowestPossibleRCScoreOfAnyUnseenLocation,
  462. candidates,
  463. bestScore,
  464. bestScoreGenomeLocation,
  465. secondBestScore,
  466. secondBestScoreGenomeOffset,
  467. secondBestScoreIsRC,
  468. scoreLimit,
  469. lvScores,
  470. lvScoresAfterBestFound,
  471. maxHitsToGet)) {
  472. #if MAINTAIN_HISTOGRAMS
  473. lvHistogram->addToCount(lvScores,lvScores);
  474. lookupHistogram->addToCount(lookupsThisRun,lookupsThisRun);
  475. if (MultipleHits == finalResult) {
  476. lvHistogramForMulti->addToCount(lvScores,lvScores);
  477. } else if (SingleHit == finalResult) {
  478. lvCountWhenBestFound->addToCount(lvScores-lvScoresAfterBestFound);
  479. }
  480. #endif // MAINTAIN_HISTOGRAMS
  481. fillHitsFound(maxHitsToGet, multiHitsFound,
  482. multiHitLocations, multiHitRCs, multiHitScores);
  483. return finalResult;
  484. }
  485. }
  486. }
  487. //
  488. // Do the best with what we've got.
  489. //
  490. #ifdef TRACE_ALIGNER
  491. printf("Calling score with force=true because we ran out of seeds\n");
  492. #endif
  493. score( true,
  494. read,
  495. rcRead,
  496. &finalResult,
  497. finalScore,
  498. genomeLocation,
  499. hitIsRC,
  500. nSeedsApplied,
  501. nRCSeedsApplied,
  502. mostSeedsContainingAnyParticularBase,
  503. mostRCSeedsContainingAnyParticularBase,
  504. lowestPossibleScoreOfAnyUnseenLocation,
  505. lowestPossibleRCScoreOfAnyUnseenLocation,
  506. candidates,
  507. bestScore,
  508. bestScoreGenomeLocation,
  509. secondBestScore,
  510. secondBestScoreGenomeOffset,
  511. secondBestScoreIsRC,
  512. scoreLimit,
  513. lvScores,
  514. lvScoresAfterBestFound,
  515. maxHitsToGet);
  516. #if MAINTAIN_HISTOGRAMS
  517. lvHistogram->addToCount(lvScores,lvScores);
  518. lookupHistogram->addToCount(lookupsThisRun,lookupsThisRun);
  519. if (MultipleHits == finalResult) {
  520. lvHistogramForMulti->addToCount(lvScores,lvScores);
  521. } else if (SingleHit == finalResult) {
  522. lvCountWhenBestFound->addToCount(lvScores-lvScoresAfterBestFound);
  523. }
  524. #endif // MAINTAIN_HISTOGRAMS
  525. fillHitsFound(maxHitsToGet, multiHitsFound,
  526. multiHitLocations, multiHitRCs, multiHitScores);
  527. return finalResult;
  528. }
  529. bool
  530. BaseAligner::score(
  531. bool forceResult,
  532. Read *read,
  533. Read *rcRead,
  534. AlignmentResult *result,
  535. int *finalScore,
  536. unsigned *singleHitGenomeLocation,
  537. bool *hitIsRC,
  538. unsigned nSeedsApplied,
  539. unsigned nRCSeedsApplied,
  540. unsigned mostSeedsContainingAnyParticularBase,
  541. unsigned mostRCSeedsContainingAnyParticularBase,
  542. unsigned &lowestPossibleScoreOfAnyUnseenLocation,
  543. unsigned &lowestPossibleRCScoreOfAnyUnseenLocation,
  544. Candidate *candidates,
  545. unsigned &bestScore,
  546. unsigned &bestScoreGenomeLocation,
  547. unsigned &secondBestScore,
  548. unsigned &secondBestScoreGenomeLocation,
  549. bool &secondBestScoreIsRC,
  550. unsigned &scoreLimit,
  551. unsigned &lvScores,
  552. unsigned &lvScoresAfterBestFound,
  553. unsigned maxHitsToGet)
  554. /*++
  555. Routine Description:
  556. Make progress in scoring a possibly partial alignment. This is a private method of the BaseAligner class that's used
  557. only by AlignRead.
  558. It does a number of things. First, it computes the lowest possible score of any unseen location. This is useful
  559. because once we have a scored hit that's more than confDiff better than all unseen locations, there's no need to
  560. lookup more of them, we can just score what we've got and be sure that the answer is right (unless errors have
  561. pushed the read to be closer to a wrong location than to the correct one, in which case it's hopeless).
  562. It then decides whether it should score a location, and if so what one to score. It chooses the unscored
  563. location that's got the highest weight (i.e., appeared in the most hash table lookups), since that's most
  564. likely to be the winner. If there are multiple candidates with the same best weight, it breaks the tie using
  565. the best possible score for the candidates (which is determined when they're first hit). Remaining ties are
  566. broken arbitrarily.
  567. It merges indels with scored candidates. If there's an insertion or deletion in the read, then we'll get very
  568. close but unequal results out of the hash table lookup for parts of the read on opposite sides of the
  569. insertion or deletion. This throws out the one with the worse score.
  570. It then figures out if we have a definitive answer, and says what that is.
  571. Arguments:
  572. forceResult - should we generate an answer even if it's not definitive?
  573. read - the read we're aligning
  574. rcRead - the reverse complement of read
  575. result - returns the result if we reach one
  576. singleHitGenomeLocation - returns the location in the genome if we return a single hit
  577. hitIsRC - if we return a single hit, indicates whether it's on the reverse complement
  578. nSeedsApplied - how may seeds have we looked up and not ignored in this alignment
  579. nRCSeedsApplied - how may reverse complement seeds have we looked up and not ignored in this alignment
  580. mostSeedsContainingAnyParticularBase - what's the largest number of seeds we've used that contain any single base from the read?
  581. mostRCSeedsContainingAnyParticularBase - same thing for the reverse complement read
  582. lowestPossibleScoreOfAnyUnseenLocation - in/out the best score that any genome location that we haven't hit could possinly have
  583. lowestPossibleRCScoreOfAnyUnseenLocation- same thing for the reverse complement read
  584. candidates - in/out the array of candidates that have hit and possibly been scored
  585. bestScore - in/out the best score we've seen so far (recall that score is string distance, so best is least).
  586. bestScoreGenomeLocation - in/out where in the genome was the best score
  587. secondBestScore - in/out what's the second best score we've seen
  588. scoreLimit - in/out maximum edit distance we are need to consider for further scores (bigger ones won't help)
  589. lvScores - in/out how many calls to LandauVishkin have we made (instrumentation)
  590. lvScoresAfterBestFound - in/out how many calls to LandauVishkin have we made after finding the best score (instrumentation)
  591. popularSeedsSkipped - how many overly popular seeds we skipped during this alignment
  592. maxHitsToGet - whether the user wants to get multiple non-confident hits in the MultiHit case
  593. Return Value:
  594. true iff we've reached a result. When called with forceResult, we'll always return true.
  595. --*/
  596. {
  597. #ifdef TRACE_ALIGNER
  598. printf("score() called with force=%d nsa=%d nrcsa=%d best=%u bestloc=%u 2nd=%u\n",
  599. forceResult, nSeedsApplied, nRCSeedsApplied, bestScore, bestScoreGenomeLocation, secondBestScore);
  600. printf("Read:\n ");
  601. for (int i = 0; i < read->getDataLength(); i++) printf("%c", read->getData()[i]);
  602. printf("\n");
  603. //printf("Candidates:\n");
  604. //for (int i = 0; i < nCandidates; i++) {
  605. // Candidate* c = candidates + i;
  606. // printf(" loc=%u rc=%d weight=%u minps=%u scored=%d score=%u r=%u-%u\n",
  607. // c->genomeLocation, c->isRC, c->weight, c->minPossibleScore, c->scored,
  608. // c->score, c->minRange, c->maxRange);
  609. //}
  610. //printf("\n\n");
  611. #endif
  612. if (0 == mostSeedsContainingAnyParticularBase && 0 == mostRCSeedsContainingAnyParticularBase) {
  613. //
  614. // The only way we can get here is if we've tried all of the seeds that we're willing
  615. // to try and every one of them generated too many hits to process. Declare
  616. // a multi hit and give up.
  617. //
  618. _ASSERT(forceResult);
  619. *result = MultipleHits;
  620. return true;
  621. }
  622. //
  623. // Recompute lowestPossibleScore.
  624. //
  625. if (0 != mostSeedsContainingAnyParticularBase) {
  626. lowestPossibleScoreOfAnyUnseenLocation =
  627. __max(lowestPossibleScoreOfAnyUnseenLocation,
  628. nSeedsApplied / mostSeedsContainingAnyParticularBase);
  629. }
  630. if (0 != mostRCSeedsContainingAnyParticularBase) {
  631. lowestPossibleRCScoreOfAnyUnseenLocation =
  632. __max(lowestPossibleRCScoreOfAnyUnseenLocation,
  633. nRCSeedsApplied / mostRCSeedsContainingAnyParticularBase);
  634. }
  635. #ifdef TRACE_ALIGNER
  636. printf("Lowest possible scores for unseen locations: %d (fwd), %d (RC)\n",
  637. lowestPossibleScoreOfAnyUnseenLocation,
  638. lowestPossibleRCScoreOfAnyUnseenLocation);
  639. #endif
  640. // Return early if we haven't tried enough seeds
  641. //if (__max(nSeedsApplied, nRCSeedsApplied) < 4 && !forceResult) {
  642. // return false;
  643. //}
  644. unsigned weightListToCheck = highestUsedWeightList;
  645. do {
  646. HashTableElement *elementToScore;
  647. _ASSERT(weightListToCheck <= maxSeedsToUse);
  648. //
  649. // Grab the next element to score, and score it.
  650. //
  651. while (weightListToCheck > 0 && weightLists[weightListToCheck].weightNext == &weightLists[weightListToCheck]) {
  652. weightListToCheck--;
  653. highestUsedWeightList = weightListToCheck;
  654. }
  655. _ASSERT(weightListToCheck <= maxSeedsToUse);
  656. if (__min(lowestPossibleScoreOfAnyUnseenLocation,lowestPossibleRCScoreOfAnyUnseenLocation) > scoreLimit || forceResult) {
  657. if (weightListToCheck == 0) {
  658. //
  659. // We've scored all live candidates and excluded all non-candidates. We have our
  660. // answer.
  661. //
  662. int realConfDiff = confDiff + (popularSeedsSkipped >= adaptiveConfDiffThreshold ? 1 : 0);
  663. if (NULL != finalScore) {
  664. *finalScore = bestScore;
  665. }
  666. if (bestScore + realConfDiff <= secondBestScore && bestScore <= maxK) {
  667. if (popularSeedsSkipped == 0 && !forceResult) {
  668. *result = CertainHit;
  669. } else {
  670. *result = SingleHit;
  671. }
  672. *singleHitGenomeLocation = bestScoreGenomeLocation;
  673. return true;
  674. } else if (bestScore > maxK) {
  675. // If none of our seeds was below the popularity threshold, report this as MultipleHits; otherwise,
  676. // report it as NotFound
  677. *result = (nSeedsApplied == 0 && nRCSeedsApplied == 0) ? MultipleHits : NotFound;
  678. return true;
  679. } else {
  680. *result = MultipleHits;
  681. return true;
  682. }
  683. }
  684. //
  685. // Nothing that we haven't already looked up can possibly be the answer. Score what we've got and exit.
  686. //
  687. forceResult = true;
  688. } else if (weightListToCheck == 0) {
  689. if (forceResult) {
  690. if (NULL != finalScore) {
  691. *finalScore = bestScore;
  692. }
  693. int realConfDiff = confDiff + (popularSeedsSkipped >= adaptiveConfDiffThreshold ? 1 : 0);
  694. if (bestScore + realConfDiff <= secondBestScore && bestScore <= maxK) {
  695. *result = SingleHit;
  696. *singleHitGenomeLocation = bestScoreGenomeLocation;
  697. return true;
  698. } else if (bestScore > maxK) {
  699. // If none of our seeds was below the popularity threshold, report this as MultipleHits; otherwise,
  700. // report it as NotFound
  701. *result = (nSeedsApplied == 0 && nRCSeedsApplied == 0) ? MultipleHits : NotFound;
  702. return true;
  703. } else {
  704. *result = MultipleHits;
  705. return true;
  706. }
  707. } else {
  708. //
  709. // No candidates, look for more.
  710. //
  711. return false;
  712. }
  713. }
  714. elementToScore = weightLists[weightListToCheck].weightNext;
  715. _ASSERT(!elementToScore->allExtantCandidatesScored);
  716. _ASSERT(elementToScore->candidatesUsed != 0);
  717. _ASSERT(elementToScore != &weightLists[weightListToCheck]);
  718. if (elementToScore->lowestPossibleScore <= scoreLimit) {
  719. unsigned long candidateIndexToScore;
  720. _uint64 candidatesMask = elementToScore->candidatesUsed;
  721. while (_BitScanForward64(&candidateIndexToScore,candidatesMask)) {
  722. candidatesMask &= ~(_uint64)1 << candidateIndexToScore;
  723. Candidate *candidateToScore = &elementToScore->candidates[candidateIndexToScore];
  724. if (candidateToScore->scoredInEpoch == hashTableEpoch) {
  725. //
  726. // This can happen if we find a candidate, score it, and then hit on an indel
  727. // near it.
  728. //
  729. continue;
  730. }
  731. unsigned genomeLocation = elementToScore->baseGenomeLocation + candidateIndexToScore;
  732. unsigned score = -1;
  733. if (elementToScore->isRC) {
  734. const char *data = genome->getSubstring(genomeLocation, rcRead->getDataLength() + MAX_K);
  735. if (data != NULL) {
  736. #ifdef USE_NEW_DISTANCE
  737. score = bsd->compute(data, rcRead->getData(), rcRead->getDataLength(), scoreLimit);
  738. #else
  739. _uint64 cacheKey = 0;
  740. if (readId != -1) {
  741. cacheKey = ((_uint64) readId) << 33 | ((_uint64) elementToScore->isRC) << 32 | genomeLocation;
  742. }
  743. score = landauVishkin->computeEditDistance(
  744. data, rcRead->getDataLength() + MAX_K,
  745. rcRead->getData(), rcRead->getDataLength(),
  746. scoreLimit, cacheKey);
  747. #endif
  748. }
  749. #ifdef TRACE_ALIGNER
  750. printf("Computing distance at %u (RC) with limit %d: %d\n", genomeLocation, scoreLimit, score);
  751. #endif
  752. } else {
  753. const char *data = genome->getSubstring(genomeLocation, read->getDataLength() + MAX_K);
  754. if (data != NULL) {
  755. #ifdef USE_NEW_DISTANCE
  756. score = bsd->compute(data, read->getData(), read->getDataLength(), scoreLimit);
  757. #else
  758. _uint64 cacheKey = 0;
  759. if (readId != -1) {
  760. cacheKey = ((_uint64) readId) << 33 | ((_uint64) elementToScore->isRC) << 32 | genomeLocation;
  761. }
  762. score = landauVishkin->computeEditDistance(
  763. data, read->getDataLength() + MAX_K,
  764. read->getData(), read->getDataLength(),
  765. scoreLimit, cacheKey);
  766. #endif
  767. }
  768. #ifdef TRACE_ALIGNER
  769. printf("Computing distance at %u (fwd) with limit %d: %d\n", genomeLocation, scoreLimit, score);
  770. #endif
  771. }
  772. if (maxHitsToGet > 0 && score != -1 && hitCount[score] < maxHitsToGet) {
  773. // Remember the location of this hit because we don't have enough at this distance
  774. hitLocations[score][hitCount[score]] = genomeLocation;
  775. hitRCs[score][hitCount[score]] = elementToScore->isRC;
  776. hitCount[score]++;
  777. }
  778. candidateToScore->score = score;
  779. candidateToScore->scoredInEpoch = hashTableEpoch;
  780. elementToScore->bestScore = __min(elementToScore->bestScore, score);
  781. nLocationsScored++;
  782. lvScores++;
  783. lvScoresAfterBestFound++;
  784. // off until we fix the insert/delete problem _ASSERT(candidates[candidateToScore].score >= candidates[candidateToScore].minPossibleScore); // Else we messed up minPossibleScore (or LV or something)
  785. if (bestScore > score) {
  786. if ((secondBestScore == UnusedScoreValue || !(secondBestScoreGenomeLocation + maxK > genomeLocation && secondBestScoreGenomeLocation < genomeLocation + maxK)) &&
  787. (bestScore == UnusedScoreValue || !(bestScoreGenomeLocation + maxK > genomeLocation && bestScoreGenomeLocation < genomeLocation + maxK)))
  788. {
  789. secondBestScore = bestScore;
  790. secondBestScoreGenomeLocation = bestScoreGenomeLocation;
  791. secondBestScoreIsRC = *hitIsRC;
  792. }
  793. //
  794. // We have a new best score.
  795. //
  796. bestScore = score;
  797. bestScoreGenomeLocation = genomeLocation;
  798. *singleHitGenomeLocation = bestScoreGenomeLocation;
  799. if (NULL != finalScore) {
  800. *finalScore = bestScore;
  801. }
  802. *hitIsRC = elementToScore->isRC;
  803. lvScoresAfterBestFound = 0;
  804. } else if (secondBestScore > score &&
  805. (secondBestScore == UnusedScoreValue || !(secondBestScoreGenomeLocation + maxK > genomeLocation && secondBestScoreGenomeLocation < genomeLocation + maxK)) &&
  806. !(bestScoreGenomeLocation + maxK > genomeLocation && bestScoreGenomeLocation < genomeLocation + maxK)) {
  807. //
  808. // A new second best.
  809. //
  810. secondBestScore = score;
  811. secondBestScoreGenomeLocation = genomeLocation;
  812. secondBestScoreIsRC = elementToScore->isRC;
  813. }
  814. if (stopOnFirstHit && bestScore <= maxK) {
  815. // The user just wanted to find reads that match the database within some distance, but doesn't
  816. // care about the best alignment. Stop now but mark the result as MultipleHits because we're not
  817. // confident that it's the best one.
  818. *result = MultipleHits;
  819. return true;
  820. }
  821. // Update scoreLimit since we may have improved bestScore or secondBestScore
  822. if (maxHitsToGet > 0) {
  823. scoreLimit = min(bestScore, maxK) + 3;
  824. } else {
  825. scoreLimit = min(bestScore, maxK) + confDiff - 1 + (popularSeedsSkipped >= adaptiveConfDiffThreshold ? 1 : 0);
  826. if (bestScore != UnusedScoreValue && secondBestScore < bestScore + confDiff) {
  827. // Since secondBestScore already means that our best location so far won't be a SingleHit,
  828. // we really just care about finding better locations than that. However, still check for
  829. // scores up to bestScore - 1 since those might become a new secondBestScore.
  830. // _ASSERT(bestScore >= 1); // Otherwise we would've returned MultipleHits
  831. scoreLimit = bestScore - 1;
  832. }
  833. }
  834. } // While candidates exist in the element
  835. } // If the element could possibly affect the result
  836. //
  837. // Remove the element from the weight list.
  838. //
  839. elementToScore->allExtantCandidatesScored = true;
  840. elementToScore->weightNext->weightPrev = elementToScore->weightPrev;
  841. elementToScore->weightPrev->weightNext = elementToScore->weightNext;
  842. elementToScore->weightNext = elementToScore->weightPrev = elementToScore;
  843. if (lvScores >= lvCutoff) {
  844. //
  845. // Don't score any more, just go with what we've got now.
  846. //
  847. if (bestScore > maxK) {
  848. // If none of our seeds was below the popularity threshold, report this as MultipleHits; otherwise,
  849. // report it as NotFound
  850. *result = (nSeedsApplied == 0 && nRCSeedsApplied == 0) ? MultipleHits : NotFound;
  851. return true;
  852. }
  853. int realConfDiff = confDiff + (popularSeedsSkipped >= adaptiveConfDiffThreshold ? 1 : 0);
  854. if (bestScore + realConfDiff <= secondBestScore && bestScore <= maxK) {
  855. *result = SingleHit;
  856. *singleHitGenomeLocation = bestScoreGenomeLocation;
  857. if (NULL != finalScore) {
  858. *finalScore = bestScore;
  859. }
  860. return true;
  861. }
  862. *result = MultipleHits;
  863. return true;
  864. }
  865. } while (forceResult);
  866. return false;
  867. }
  868. void
  869. BaseAligner::fillHitsFound(
  870. unsigned maxHitsToGet,
  871. int *multiHitsFound,
  872. unsigned *multiHitLocations,
  873. bool *multiHitRCs,
  874. int *multiHitScores)
  875. /*++
  876. Routine Description:
  877. Return up to maxHitsToGet best hits found for the user in the given array and count parameters, as long
  878. as maxHitsToGet >= 1. We also need the popularSeedsSkipped parameter to set confDiff adaptively.
  879. --*/
  880. {
  881. if (maxHitsToGet > 0) {
  882. unsigned realConfDiff = confDiff + (popularSeedsSkipped > adaptiveConfDiffThreshold ? 1 : 0);
  883. *multiHitsFound = 0;
  884. int firstDist = 0; // Distance of the best hit
  885. while (firstDist < MAX_K && hitCount[firstDist] == 0) {
  886. firstDist++;
  887. }
  888. for (int dist = firstDist; dist < min(firstDist + 4, MAX_K); dist++) {
  889. for (unsigned i = 0; i < hitCount[dist]; i++) {
  890. multiHitLocations[*multiHitsFound] = hitLocations[dist][i];
  891. multiHitRCs[*multiHitsFound] = hitRCs[dist][i];
  892. multiHitScores[*multiHitsFound] = dist;
  893. *multiHitsFound += 1;
  894. if (*multiHitsFound == maxHitsToGet) {
  895. return;
  896. }
  897. }
  898. }
  899. }
  900. }
  901. void
  902. BaseAligner::findCandidate(
  903. unsigned genomeLocation,
  904. bool isRC,
  905. Candidate **candidate,
  906. HashTableElement **hashTableElement)
  907. /*++
  908. Routine Description:
  909. Find a candidate in the hash table. If it doesn't exist, create, initialize and insert it.
  910. Arguments:
  911. genomeLocation - the location of the candidate we'd like to look up
  912. candidate - The candidate that was found or created
  913. hashTableElement - the hashTableElement for the candidate that was found or created.
  914. --*/
  915. {
  916. HashTableAnchor *hashTable = candidateHashTable[isRC];
  917. unsigned lowOrderGenomeLocation = genomeLocation % (2 * maxMergeDist);
  918. unsigned highOrderGenomeLocation = genomeLocation - lowOrderGenomeLocation;
  919. unsigned hashTableIndex = hash(highOrderGenomeLocation) % candidateHashTablesSize;
  920. HashTableAnchor *anchor = &hashTable[hashTableIndex];
  921. if (anchor->epoch != hashTableEpoch) {
  922. //
  923. // It's empty.
  924. //
  925. *hashTableElement = NULL;
  926. *candidate = NULL;
  927. return;
  928. }
  929. HashTableElement *lookedUpElement = anchor->element;
  930. while (NULL != lookedUpElement && lookedUpElement->baseGenomeLocation != highOrderGenomeLocation) {
  931. lookedUpElement = lookedUpElement->next;
  932. }
  933. if (NULL != lookedUpElement) {
  934. *hashTableElement = lookedUpElement;
  935. *candidate = &lookedUpElement->candidates[lowOrderGenomeLocation];
  936. _uint64 bitForThisCandidate = (_uint64)1 << lowOrderGenomeLocation;
  937. lookedUpElement->allExtantCandidatesScored = lookedUpElement->allExtantCandidatesScored && (lookedUpElement->candidatesUsed & bitForThisCandidate);
  938. lookedUpElement->candidatesUsed |= bitForThisCandidate;
  939. } else {
  940. *hashTableElement = NULL;
  941. *candidate = NULL;
  942. }
  943. }
  944. bool doAlignerPrefetch = true;
  945. void
  946. BaseAligner::allocateNewCandidate(unsigned genomeLocation, bool isRC, unsigned lowestPossibleScore, Candidate **candidate, HashTableElement **hashTableElement)
  947. /*++
  948. Routine Description:
  949. Arguments:
  950. Return Value:
  951. --*/
  952. {
  953. HashTableAnchor *hashTable = candidateHashTable[isRC];
  954. unsigned lowOrderGenomeLocation = genomeLocation % (2 * maxMergeDist);
  955. unsigned highOrderGenomeLocation = genomeLocation - lowOrderGenomeLocation;
  956. unsigned hashTableIndex = hash(highOrderGenomeLocation) % candidateHashTablesSize;
  957. HashTableAnchor *anchor = &hashTable[hashTableIndex];
  958. HashTableElement *element;
  959. if (doAlignerPrefetch) {
  960. genomeIndex->prefetchGenomeData(genomeLocation);
  961. }
  962. #if DBG
  963. element = hashTable[hashTableIndex].element;
  964. while (anchor->epoch == hashTableEpoch && NULL != element && element->genomeLocation != highOrderGenomeLocation) {
  965. element = element->next;
  966. }
  967. _ASSERT(NULL == element || anchor->epoch != hashTableEpoch);
  968. #endif // DBG
  969. _ASSERT(nUsedHashTableElements < hashTableElementPoolSize);
  970. element = &hashTableElementPool[nUsedHashTableElements];
  971. nUsedHashTableElements++;
  972. if (anchor->epoch == hashTableEpoch) {
  973. element->next = anchor->element;
  974. } else {
  975. anchor->epoch = hashTableEpoch;
  976. element->next = NULL;
  977. }
  978. anchor->element = element;
  979. element->candidatesUsed = 0;
  980. element->lowestPossibleScore = lowestPossibleScore;
  981. element->isRC = isRC;
  982. element->candidatesUsed = (_uint64)1 << lowOrderGenomeLocation;
  983. element->weight = 1;
  984. element->baseGenomeLocation = highOrderGenomeLocation;
  985. element->bestScore = UnusedScoreValue;
  986. element->allExtantCandidatesScored = false;
  987. //
  988. // And insert it at the end of weight list 1.
  989. //
  990. element->weightNext = &weightLists[1];
  991. element->weightPrev = weightLists[1].weightPrev;
  992. element->weightNext->weightPrev = element;
  993. element->weightPrev->weightNext = element;
  994. *candidate = &element->candidates[lowOrderGenomeLocation];
  995. *hashTableElement = element;
  996. highestUsedWeightList = __max(highestUsedWeightList,(unsigned)1);
  997. }
  998. #if DBG
  999. bool
  1000. BaseAligner::AssertCandidatesAreSorted()
  1001. /*++
  1002. Routine Description:
  1003. Arguments:
  1004. Return Value:
  1005. --*/
  1006. {
  1007. for (int i = 1; i < nCandidates; i++) {
  1008. _ASSERT(candidates[i].genomeLocation <= candidates[i-1].genomeLocation);
  1009. }
  1010. }
  1011. #endif // DBG
  1012. BaseAligner::~BaseAligner()
  1013. /*++
  1014. Routine Description:
  1015. Arguments:
  1016. Return Value:
  1017. --*/
  1018. {
  1019. #if MAINTAIN_HISTOGRAMS
  1020. delete lvHistogram;
  1021. delete lookupHistogram;
  1022. delete lvHistogramForMulti;
  1023. delete lvCountWhenBestFound;
  1024. #endif // MAINTAIN_HISTOGRAMS
  1025. BigDealloc(seedUsedAsAllocated);
  1026. BigDealloc(candidates);
  1027. BigDealloc(rcReadData);
  1028. BigDealloc(hashTableElementPool);
  1029. BigDealloc(candidateHashTable[0]);
  1030. BigDealloc(candidateHashTable[1]);
  1031. BigDealloc(weightLists);
  1032. delete bsd;
  1033. if (ownLandauVishkin) {
  1034. delete landauVishkin;
  1035. }
  1036. }
  1037. void
  1038. BaseAligner::ComputeHitDistribution(
  1039. Read *read,
  1040. unsigned correctGenomeLocation,
  1041. bool correctHitIsRC,
  1042. unsigned *hitCountBySeed,
  1043. unsigned *rcHitCountBySeed,
  1044. unsigned &nSeedsApplied,
  1045. unsigned &nRCSeedsApplied,
  1046. unsigned *hitsContainingCorrectLocation)
  1047. {
  1048. nSeedsApplied = 0;
  1049. nRCSeedsApplied = 0;
  1050. for (unsigned i = 0; i < maxSeedsToUse; i++) {
  1051. hitCountBySeed[i] = 0;
  1052. rcHitCountBySeed[i] = 0;
  1053. hitsContainingCo

Large files files are truncated, but you can click here to view the full file