/SNAPLib/BaseAligner.cpp
C++ | 1474 lines | 927 code | 153 blank | 394 comment | 244 complexity | 056372bd6369dbba03f63fc241ed98b8 MD5 | raw file
Possible License(s): Apache-2.0
Large files files are truncated, but you can click here to view the full file
- /*++
- Module Name:
- BaseAligner.cpp
- Abstract:
- SNAP genome aligner
- Authors:
- Bill Bolosky, August, 2011
- Environment:
- User mode service.
- This class is NOT thread safe. It's the caller's responsibility to ensure that
- at most one thread uses an instance at any time.
- Revision History:
- Adapted from Matei Zaharia's Scala implementation.
- --*/
- #include "stdafx.h"
- #include "BaseAligner.h"
- #include "Compat.h"
- #include "LandauVishkin.h"
- #include "BigAlloc.h"
- using std::min;
- BaseAligner::BaseAligner(
- GenomeIndex *i_genomeIndex,
- unsigned i_confDiff,
- unsigned i_maxHitsToConsider,
- unsigned i_maxK,
- unsigned i_maxReadSize,
- unsigned i_maxSeedsToUse,
- unsigned i_lvCutoff,
- unsigned i_adaptiveConfDiffThreshold,
- LandauVishkin *i_landauVishkin) :
- genomeIndex(i_genomeIndex), confDiff(i_confDiff), maxHitsToConsider(i_maxHitsToConsider), maxK(i_maxK),
- maxReadSize(i_maxReadSize), maxSeedsToUse(i_maxSeedsToUse), lvCutoff(i_lvCutoff), readId(-1),
- adaptiveConfDiffThreshold(i_adaptiveConfDiffThreshold), landauVishkin(i_landauVishkin),
- explorePopularSeeds(false), stopOnFirstHit(false)
- /*++
- Routine Description:
- Constructor for the BaseAligner class. Aligners align reads against an indexed genome.
- Arguments:
- i_genomeIndex - The index against which to do the alignments
- i_confDiff - The string difference between two matches necessary to believe they're really different
- i_maxHitsToConsider - The maximum number of hits to use from a seed lookup. Any lookups that return more
- than this are ignored.
- i_maxK - The largest string difference to consider for any comparison.
- i_maxReadSize - Bound on the number of bases in any read. There's no reason to make it tight, it just affects a little memory allocation.
- i_maxSeedsToUse - The maximum number of seeds to use when aligning any read (not counting ones ignored because they resulted in too many
- hits). Once we've looked up this many seeds, we just score what we've got.
- --*/
- {
- #if MAINTAIN_HISTOGRAMS
- lvHistogram = new Histogram(20, true);
- lookupHistogram = new Histogram(20, true);
- lvHistogramForMulti = new Histogram(20, true);
- lvCountWhenBestFound = new Histogram(maxHitsToConsider*4,false);
- #endif // MAINTAIN_HISTOGRAMS
- nHashTableLookups = 0;
- nLocationsScored = 0;
- nHitsIgnoredBecauseOfTooHighPopularity = 0;
- nReadsIgnoredBecauseOfTooManyNs = 0;
- nIndelsMerged = 0;
- genome = genomeIndex->getGenome();
- seedLen = genomeIndex->getSeedLength();
- bsd = new BoundedStringDistance<>(3);
- if (i_landauVishkin == NULL) {
- landauVishkin = new LandauVishkin;
- ownLandauVishkin = true;
- } else {
- ownLandauVishkin = false;
- }
- unsigned nCandidates = maxHitsToConsider * (maxReadSize - seedLen + 1) * 2; // *2 is for reverse complement
- candidates = (Candidate *)BigAlloc(sizeof(Candidate) * nCandidates);
- for (unsigned i = 0 ; i < nCandidates; i++) {
- candidates[i].init();
- }
- rcReadData = (char *)BigAlloc(sizeof(char) * maxReadSize);
-
- // treat everything but ACTG like N
- for (unsigned i = 0; i < 256; i++) {
- nTable[i] = 1;
- rcTranslationTable[i] = 'N';
- }
- rcTranslationTable['A'] = 'T';
- rcTranslationTable['G'] = 'C';
- rcTranslationTable['C'] = 'G';
- rcTranslationTable['T'] = 'A';
- nTable['A'] = 0;
- nTable['C'] = 0;
- nTable['T'] = 0;
- nTable['G'] = 0;
- seedUsed = (BYTE *)BigAlloc(sizeof(BYTE) * (maxReadSize + 7 + 128) / 8); // +128 to make sure it extends at both before and after at least an _int64
- seedUsedAsAllocated = seedUsed; // Save the pointer for the delete.
- seedUsed += 8; // This moves the pointer up an _int64, so we now have the appropriate before buffer.
- nUsedHashTableElements = 0;
- hashTableElementPoolSize = maxHitsToConsider * maxSeedsToUse * 2 ; // *2 for RC
- /*
- if (maxK*2 > 64) {
- fprintf(stderr,"The HashTableElement struct uses a 64 bit bitvector for used candidates. As a result, you can't have maxK > 32. Not that you'd want to anyway.\n");
- exit(1);
- }
- */
- candidateHashTablesSize = (maxHitsToConsider * maxSeedsToUse * 3)/2; // *1.5 for hash table slack
- hashTableElementPool = (HashTableElement *)BigAlloc(sizeof(HashTableElement) * hashTableElementPoolSize);
- candidateHashTable[0] = (HashTableAnchor *)BigAlloc(sizeof(HashTableAnchor) * candidateHashTablesSize);
- candidateHashTable[1] = (HashTableAnchor *)BigAlloc(sizeof(HashTableAnchor) * candidateHashTablesSize);
- weightLists = (HashTableElement *)BigAlloc(sizeof(HashTableElement) * (maxSeedsToUse + 1));
- for (unsigned i = 0; i < hashTableElementPoolSize; i++) {
- hashTableElementPool[i].init();
- }
- for (unsigned i = 0; i < maxSeedsToUse + 1; i++) {
- weightLists[i].init();
- }
- memset(candidateHashTable[0],0,sizeof(HashTableAnchor) * candidateHashTablesSize);
- memset(candidateHashTable[1],0,sizeof(HashTableAnchor) * candidateHashTablesSize);
- hashTableEpoch = 0;
- }
- AlignmentResult
- BaseAligner::AlignRead(Read *read, unsigned *genomeLocation, bool *hitIsRC, int *finalScore)
- {
- return AlignRead(read, genomeLocation, hitIsRC, finalScore, 0, 0, false, 0, NULL, NULL, NULL, NULL);
- }
- AlignmentResult
- BaseAligner::AlignRead(
- Read *read,
- unsigned *genomeLocation,
- bool *hitIsRC,
- int *finalScore,
- unsigned searchRadius,
- unsigned searchLocation,
- bool searchRC)
- {
- return AlignRead(read, genomeLocation, hitIsRC, finalScore,
- searchRadius, searchLocation, searchRC, 0, NULL, NULL, NULL, NULL);
- }
- AlignmentResult
- BaseAligner::AlignRead(
- Read *read,
- unsigned *genomeLocation,
- bool *hitIsRC,
- int *finalScore,
- unsigned searchRadius,
- unsigned searchLocation,
- bool searchRC,
- int maxHitsToGet,
- int *multiHitsFound,
- unsigned *multiHitLocations,
- bool *multiHitRCs,
- int *multiHitScores)
- /*++
- Routine Description:
- Align a particular read, possibly constraining the search around a given location.
- Arguments:
- read - the read to align
- genomeLocation - if this aligned to a SingleHit, the 0-based offset in the genome that this hit. The aligner treats the entire
- genome as a single string, even though it's really a set of chrosomes. It just makes the code simpler.
- The caller can convert to chromosome+offset by looking up the piece boudaries using the Genome object.
- hitIsRC - the aligner tries to align both the given read and its reverse complement. If we found a SingleHit this
- bool is set to indicate whether that hit was on the reverse complement.
- finalScore - if a single or confident hit, this is the score of the hit (i.e., the LV distance from the read)
- searchRadius - if non-zero, constrain the search to this distance around searchLocation, in orientation searchRC
- searchLocation - location to search around if searchRadius is given
- searchRC - whether to search in reverse complement orientation if searchRadius is given
- maxHitsToGet - if greater than 0, output up to this many hits within confDiff of the best (if any) in multiHitLocations,
- writing their count in multiHitsFound, instead of returning MultipleHits immediately
- multiHitsFound - output parameter for number of alternative hits found if maxHitsToGet is true
- multiHitLocations - output parameter for locations of alternative hits found if maxHitsToGet is true
- Return Value:
- ConfidentHit, SingleHit, MultiHit or NotFound depending on how the alignment went.
- --*/
- {
- *genomeLocation = 0xFFFFFFFF; // Value to return if we don't find a location.
- *hitIsRC = false; // So we deterministically print the read forward in this case.
- if (finalScore != NULL) {
- *finalScore = UnusedScoreValue;
- }
- unsigned lookupsThisRun = 0;
- unsigned lvScores = 0;
- unsigned lvScoresAfterBestFound = 0;
- popularSeedsSkipped = 0;
-
- // Range of genome locations to search in.
- unsigned minLocation = 0;
- unsigned maxLocation = 0xFFFFFFFF;
- if (searchRadius != 0) {
- minLocation = (searchLocation > searchRadius) ? searchLocation - searchRadius : 0;
- maxLocation = (searchLocation < 0xFFFFFFFF - searchRadius) ? searchLocation + searchRadius : 0xFFFFFFFF;
- }
-
- // If asked to return the locations of multiple hits, make sure we get sensible counts and results
- if (maxHitsToGet > 0) {
- memset(hitCount, 0, MAX_K * sizeof(unsigned));
- *multiHitsFound = 0;
- }
- AlignmentResult finalResult;
- //
- // A bitvector for used seeds, indexed on the starting location of the seed within the read.
- //
- if (read->getDataLength() > maxReadSize) {
- fprintf(stderr,"BaseAligner:: got too big read (%d > %d)", read->getDataLength(),maxReadSize);
- exit(1);
- }
- if ((int)read->getDataLength() < seedLen) {
- //
- // Too short to have any seeds, it's hopeless.
- //
- return NotFound;
- }
- //
- // Clear out the seed used array.
- //
- memset(seedUsed, 0, (read->getDataLength() + 7) / 8);
- unsigned readLen = read->getDataLength();
- const char *readData = read->getData();
- unsigned countOfNs = 0;
- for (unsigned i = 0; i < readLen; i++) {
- char baseByte = readData[i];
- rcReadData[readLen - i - 1] = rcTranslationTable[baseByte];
- countOfNs += nTable[baseByte];
- }
- if (countOfNs > maxK) {
- nReadsIgnoredBecauseOfTooManyNs++;
- return NotFound;
- }
- //
- // Block off any seeds that would contain an N.
- //
- if (countOfNs > 0) {
- int minSeedToConsiderNing = 0; // In English, any word can be verbed. Including, apparently, "N."
- for (int i = 0; i < (int) readLen; i++) {
- if (BASE_VALUE[readData[i]] > 3) {
- int limit = __min(i + seedLen - 1, readLen-1);
- for (int j = __max(minSeedToConsiderNing, i - (int) seedLen + 1); j <= limit; j++) {
- SetSeedUsed(j);
- }
- minSeedToConsiderNing = limit+1;
- if (minSeedToConsiderNing >= (int) readLen)
- break;
- }
- }
- }
- Read rcRead[1];
- rcRead->init(NULL, 0, rcReadData, NULL, readLen);
- clearCandidates();
- //
- // Initialize the bases table, which represents which bases we've checked.
- // We have readSize - seeds size + 1 possible seeds.
- //
- unsigned nPossibleSeeds = readLen - seedLen + 1;
- unsigned nextSeedToTest = 0;
- unsigned wrapCount = 0;
- unsigned lowestPossibleScoreOfAnyUnseenLocation = 0;
- unsigned lowestPossibleRCScoreOfAnyUnseenLocation = 0;
- unsigned mostSeedsContainingAnyParticularBase = 1; // Instead of tracking this for real, we're just conservative and use wrapCount+1. It's faster.
- unsigned mostRCSeedsContainingAnyParticularBase = 1;// ditto
- unsigned bestScore = UnusedScoreValue;
- unsigned bestScoreGenomeLocation;
- unsigned secondBestScore = UnusedScoreValue;
- unsigned secondBestScoreGenomeOffset;
- bool secondBestScoreIsRC;
- unsigned nSeedsApplied = 0;
- unsigned nRCSeedsApplied = 0;
- unsigned scoreLimit;
- if (maxHitsToGet > 0) {
- scoreLimit = maxK + 3;
- } else {
- scoreLimit = maxK + confDiff - 1;
- }
- while (nSeedsApplied + nRCSeedsApplied < maxSeedsToUse) {
- //
- // Choose the next seed to use. Choose the first one that isn't used
- //
- if (nextSeedToTest >= nPossibleSeeds) {
- //
- // We're wrapping. We want to space the seeds out as much as possible, so if we had
- // a seed length of 20 we'd want to take 0, 10, 5, 15, 2, 7, 12, 17. To make the computation
- // fast, we use use a table lookup.
- //
- wrapCount++;
- if (wrapCount >= seedLen) {
- //
- // We tried all possible seeds without matching or even getting enough seeds to
- // exceed our seed count. Do the best we can with what we have.
- //
- #ifdef TRACE_ALIGNER
- printf("Calling score with force=true because we wrapped around enough\n");
- #endif
- score(
- true,
- read,
- rcRead,
- &finalResult,
- finalScore,
- genomeLocation,
- hitIsRC,
- nSeedsApplied,
- nRCSeedsApplied,
- mostSeedsContainingAnyParticularBase,
- mostRCSeedsContainingAnyParticularBase,
- lowestPossibleScoreOfAnyUnseenLocation,
- lowestPossibleRCScoreOfAnyUnseenLocation,
- candidates,
- bestScore,
- bestScoreGenomeLocation,
- secondBestScore,
- secondBestScoreGenomeOffset,
- secondBestScoreIsRC,
- scoreLimit,
- lvScores,
- lvScoresAfterBestFound,
- maxHitsToGet);
- #if MAINTAIN_HISTOGRAMS
- lvHistogram->addToCount(lvScores,lvScores);
- lookupHistogram->addToCount(lookupsThisRun,lookupsThisRun);
- if (MultipleHits == finalResult) {
- lvHistogramForMulti->addToCount(lvScores,lvScores);
- } else if (SingleHit == finalResult) {
- lvCountWhenBestFound->addToCount(lvScores-lvScoresAfterBestFound);
- }
- #endif // MAINTAIN_HISTOGRAMS
- fillHitsFound(maxHitsToGet, multiHitsFound,
- multiHitLocations, multiHitRCs, multiHitScores);
- return finalResult;
- }
- nextSeedToTest = getWrappedNextSeedToTest(wrapCount);
- mostSeedsContainingAnyParticularBase = wrapCount + 1;
- mostRCSeedsContainingAnyParticularBase = wrapCount + 1;
- }
- while (nextSeedToTest < nPossibleSeeds && IsSeedUsed(nextSeedToTest)) {
- //
- // This seed is already used. Try the next one.
- //
- nextSeedToTest++;
- }
- if (nextSeedToTest >= nPossibleSeeds) {
- //
- // Unusable seeds have pushed us past the end of the read. Go back around the outer loop so we wrap properly.
- //
- continue;
- }
- Seed seed(read->getData() + nextSeedToTest, seedLen);
- unsigned nHits; // Number of times this seed hits in the genome
- const unsigned *hits; // The actual hits (of size nHits)
- unsigned nRCHits; // Number of hits for the seed's reverse complement
- const unsigned *rcHits; // The actual reverse complement hits
-
- unsigned minSeedLoc = (minLocation < readLen ? 0 : minLocation - readLen);
- unsigned maxSeedLoc = (maxLocation > 0xFFFFFFFF - readLen ? 0xFFFFFFFF : maxLocation + readLen);
- genomeIndex->lookupSeed(seed, minSeedLoc, maxSeedLoc, &nHits, &hits, &nRCHits, &rcHits);
- nHashTableLookups++;
- lookupsThisRun++;
- #ifdef TRACE_ALIGNER
- printf("Looked up seed %llx: hits=%u, rchits=%u\n", seed.getBases(), nHits, nRCHits);
- printf("Hits:");
- for (int i = 0; i < nHits; i++)
- printf(" %u", hits[i]);
- printf("\n");
- printf("RC hits:");
- for (int i = 0; i < nRCHits; i++)
- printf(" %u", rcHits[i]);
- printf("\n");
- #endif
- SetSeedUsed(nextSeedToTest);
- bool appliedEitherSeed = false;
- //
- // Look for the seed in the forward direction, unless we were asked to search only the RC.
- //
- if (searchRadius == 0 || searchRC == false) {
- if (nHits > maxHitsToConsider && !explorePopularSeeds) {
- //
- // This seed is matching too many places. Just pretend we never looked and keep going.
- //
- nHitsIgnoredBecauseOfTooHighPopularity++;
- popularSeedsSkipped++;
- } else {
- //
- // Update the candidates list with any hits from this seed. If lowest possible score of any unseen location is
- // more than best_score + confDiff then we know that if this location is newly seen then its location won't ever be a
- // winner, and we can ignore it.
- //
- for (unsigned i = 0 ; i < min(nHits, maxHitsToConsider); i++) {
- //
- // Find the genome location where the beginning of the read would hit, given a match on this seed.
- //
- unsigned genomeLocationOfThisHit = hits[i] - nextSeedToTest;
- if (genomeLocationOfThisHit < minLocation || genomeLocationOfThisHit > maxLocation)
- continue;
-
- Candidate *candidate;
- HashTableElement *hashTableElement;
- findCandidate(genomeLocationOfThisHit,false,&candidate,&hashTableElement);
- if (NULL != hashTableElement) {
- incrementWeight(hashTableElement);
- } else if (lowestPossibleScoreOfAnyUnseenLocation <= scoreLimit) {
- allocateNewCandidate(genomeLocationOfThisHit,false, lowestPossibleScoreOfAnyUnseenLocation, &candidate, &hashTableElement);
- }
- }
- nSeedsApplied++;
- appliedEitherSeed = true;
- }
- }
- //
- // Look for the seed in the RC direction, unless we were asked to search only forwards.
- //
- if (searchRadius == 0 || searchRC == true) {
- if (nRCHits > maxHitsToConsider && !explorePopularSeeds) {
- //
- // This seed is matching too many places. Just pretend we never looked and keep going.
- //
- nHitsIgnoredBecauseOfTooHighPopularity++;
- popularSeedsSkipped++;
- } else {
- //
- // The RC seed is at offset ReadSize - SeedSize - seed offset in the RC seed.
- //
- // To see why, imagine that you had a read that looked like 0123456 (where the digits
- // represented some particular bases, and digit' is the base's complement). Then the
- // RC of that read is 6'5'4'3'2'1'. So, when we look up the hits for the seed at
- // offset 0 in the forward read (i.e. 012 assuming a seed size of 3) then the index
- // will also return the results for the seed's reverse complement, i.e., 3'2'1'.
- // This happens as the last seed in the RC read.
- //
- unsigned rcOffset = readLen - seedLen - nextSeedToTest;
-
- //
- // And now update the candidates list with any hits from this seed. If lowest possible score of any unseen RC location is
- // more than best_score + confDiff then we know that if this location is newly seen then its location won't ever be a
- // winner, and we can ignore it.
- //
- for (unsigned i = 0 ; i < min(nRCHits, maxHitsToConsider); i++) {
- //
- // Find the genome location where the beginning of the read would hit, given a match on this seed.
- //
- unsigned genomeLocationOfThisHit = rcHits[i] - rcOffset;
- if (genomeLocationOfThisHit < minLocation || genomeLocationOfThisHit > maxLocation)
- continue;
-
- Candidate *candidate;
- HashTableElement *hashTableElement;
- findCandidate(genomeLocationOfThisHit,true,&candidate,&hashTableElement);
- if (NULL != hashTableElement) {
- incrementWeight(hashTableElement);
- } else if (lowestPossibleRCScoreOfAnyUnseenLocation <= scoreLimit) {
- allocateNewCandidate(genomeLocationOfThisHit, true, lowestPossibleRCScoreOfAnyUnseenLocation, &candidate, &hashTableElement);
- }
- }
- nRCSeedsApplied++;
- appliedEitherSeed = true;
- }
- }
- //
- // Move us along.
- //
- nextSeedToTest += seedLen;
- if (appliedEitherSeed) {
- //
- // And finally, try scoring.
- //
- if (score( false,
- read,
- rcRead,
- &finalResult,
- finalScore,
- genomeLocation,
- hitIsRC,
- nSeedsApplied,
- nRCSeedsApplied,
- mostSeedsContainingAnyParticularBase,
- mostRCSeedsContainingAnyParticularBase,
- lowestPossibleScoreOfAnyUnseenLocation,
- lowestPossibleRCScoreOfAnyUnseenLocation,
- candidates,
- bestScore,
- bestScoreGenomeLocation,
- secondBestScore,
- secondBestScoreGenomeOffset,
- secondBestScoreIsRC,
- scoreLimit,
- lvScores,
- lvScoresAfterBestFound,
- maxHitsToGet)) {
- #if MAINTAIN_HISTOGRAMS
- lvHistogram->addToCount(lvScores,lvScores);
- lookupHistogram->addToCount(lookupsThisRun,lookupsThisRun);
- if (MultipleHits == finalResult) {
- lvHistogramForMulti->addToCount(lvScores,lvScores);
- } else if (SingleHit == finalResult) {
- lvCountWhenBestFound->addToCount(lvScores-lvScoresAfterBestFound);
- }
- #endif // MAINTAIN_HISTOGRAMS
- fillHitsFound(maxHitsToGet, multiHitsFound,
- multiHitLocations, multiHitRCs, multiHitScores);
- return finalResult;
- }
- }
- }
- //
- // Do the best with what we've got.
- //
- #ifdef TRACE_ALIGNER
- printf("Calling score with force=true because we ran out of seeds\n");
- #endif
- score( true,
- read,
- rcRead,
- &finalResult,
- finalScore,
- genomeLocation,
- hitIsRC,
- nSeedsApplied,
- nRCSeedsApplied,
- mostSeedsContainingAnyParticularBase,
- mostRCSeedsContainingAnyParticularBase,
- lowestPossibleScoreOfAnyUnseenLocation,
- lowestPossibleRCScoreOfAnyUnseenLocation,
- candidates,
- bestScore,
- bestScoreGenomeLocation,
- secondBestScore,
- secondBestScoreGenomeOffset,
- secondBestScoreIsRC,
- scoreLimit,
- lvScores,
- lvScoresAfterBestFound,
- maxHitsToGet);
- #if MAINTAIN_HISTOGRAMS
- lvHistogram->addToCount(lvScores,lvScores);
- lookupHistogram->addToCount(lookupsThisRun,lookupsThisRun);
- if (MultipleHits == finalResult) {
- lvHistogramForMulti->addToCount(lvScores,lvScores);
- } else if (SingleHit == finalResult) {
- lvCountWhenBestFound->addToCount(lvScores-lvScoresAfterBestFound);
- }
- #endif // MAINTAIN_HISTOGRAMS
-
- fillHitsFound(maxHitsToGet, multiHitsFound,
- multiHitLocations, multiHitRCs, multiHitScores);
- return finalResult;
- }
- bool
- BaseAligner::score(
- bool forceResult,
- Read *read,
- Read *rcRead,
- AlignmentResult *result,
- int *finalScore,
- unsigned *singleHitGenomeLocation,
- bool *hitIsRC,
- unsigned nSeedsApplied,
- unsigned nRCSeedsApplied,
- unsigned mostSeedsContainingAnyParticularBase,
- unsigned mostRCSeedsContainingAnyParticularBase,
- unsigned &lowestPossibleScoreOfAnyUnseenLocation,
- unsigned &lowestPossibleRCScoreOfAnyUnseenLocation,
- Candidate *candidates,
- unsigned &bestScore,
- unsigned &bestScoreGenomeLocation,
- unsigned &secondBestScore,
- unsigned &secondBestScoreGenomeLocation,
- bool &secondBestScoreIsRC,
- unsigned &scoreLimit,
- unsigned &lvScores,
- unsigned &lvScoresAfterBestFound,
- unsigned maxHitsToGet)
- /*++
- Routine Description:
- Make progress in scoring a possibly partial alignment. This is a private method of the BaseAligner class that's used
- only by AlignRead.
- It does a number of things. First, it computes the lowest possible score of any unseen location. This is useful
- because once we have a scored hit that's more than confDiff better than all unseen locations, there's no need to
- lookup more of them, we can just score what we've got and be sure that the answer is right (unless errors have
- pushed the read to be closer to a wrong location than to the correct one, in which case it's hopeless).
- It then decides whether it should score a location, and if so what one to score. It chooses the unscored
- location that's got the highest weight (i.e., appeared in the most hash table lookups), since that's most
- likely to be the winner. If there are multiple candidates with the same best weight, it breaks the tie using
- the best possible score for the candidates (which is determined when they're first hit). Remaining ties are
- broken arbitrarily.
- It merges indels with scored candidates. If there's an insertion or deletion in the read, then we'll get very
- close but unequal results out of the hash table lookup for parts of the read on opposite sides of the
- insertion or deletion. This throws out the one with the worse score.
- It then figures out if we have a definitive answer, and says what that is.
- Arguments:
- forceResult - should we generate an answer even if it's not definitive?
- read - the read we're aligning
- rcRead - the reverse complement of read
- result - returns the result if we reach one
- singleHitGenomeLocation - returns the location in the genome if we return a single hit
- hitIsRC - if we return a single hit, indicates whether it's on the reverse complement
- nSeedsApplied - how may seeds have we looked up and not ignored in this alignment
- nRCSeedsApplied - how may reverse complement seeds have we looked up and not ignored in this alignment
- mostSeedsContainingAnyParticularBase - what's the largest number of seeds we've used that contain any single base from the read?
- mostRCSeedsContainingAnyParticularBase - same thing for the reverse complement read
- lowestPossibleScoreOfAnyUnseenLocation - in/out the best score that any genome location that we haven't hit could possinly have
- lowestPossibleRCScoreOfAnyUnseenLocation- same thing for the reverse complement read
- candidates - in/out the array of candidates that have hit and possibly been scored
- bestScore - in/out the best score we've seen so far (recall that score is string distance, so best is least).
- bestScoreGenomeLocation - in/out where in the genome was the best score
- secondBestScore - in/out what's the second best score we've seen
- scoreLimit - in/out maximum edit distance we are need to consider for further scores (bigger ones won't help)
- lvScores - in/out how many calls to LandauVishkin have we made (instrumentation)
- lvScoresAfterBestFound - in/out how many calls to LandauVishkin have we made after finding the best score (instrumentation)
- popularSeedsSkipped - how many overly popular seeds we skipped during this alignment
- maxHitsToGet - whether the user wants to get multiple non-confident hits in the MultiHit case
- Return Value:
- true iff we've reached a result. When called with forceResult, we'll always return true.
- --*/
- {
- #ifdef TRACE_ALIGNER
- printf("score() called with force=%d nsa=%d nrcsa=%d best=%u bestloc=%u 2nd=%u\n",
- forceResult, nSeedsApplied, nRCSeedsApplied, bestScore, bestScoreGenomeLocation, secondBestScore);
- printf("Read:\n ");
- for (int i = 0; i < read->getDataLength(); i++) printf("%c", read->getData()[i]);
- printf("\n");
- //printf("Candidates:\n");
- //for (int i = 0; i < nCandidates; i++) {
- // Candidate* c = candidates + i;
- // printf(" loc=%u rc=%d weight=%u minps=%u scored=%d score=%u r=%u-%u\n",
- // c->genomeLocation, c->isRC, c->weight, c->minPossibleScore, c->scored,
- // c->score, c->minRange, c->maxRange);
- //}
- //printf("\n\n");
- #endif
- if (0 == mostSeedsContainingAnyParticularBase && 0 == mostRCSeedsContainingAnyParticularBase) {
- //
- // The only way we can get here is if we've tried all of the seeds that we're willing
- // to try and every one of them generated too many hits to process. Declare
- // a multi hit and give up.
- //
- _ASSERT(forceResult);
- *result = MultipleHits;
- return true;
- }
- //
- // Recompute lowestPossibleScore.
- //
- if (0 != mostSeedsContainingAnyParticularBase) {
- lowestPossibleScoreOfAnyUnseenLocation =
- __max(lowestPossibleScoreOfAnyUnseenLocation,
- nSeedsApplied / mostSeedsContainingAnyParticularBase);
- }
- if (0 != mostRCSeedsContainingAnyParticularBase) {
- lowestPossibleRCScoreOfAnyUnseenLocation =
- __max(lowestPossibleRCScoreOfAnyUnseenLocation,
- nRCSeedsApplied / mostRCSeedsContainingAnyParticularBase);
- }
- #ifdef TRACE_ALIGNER
- printf("Lowest possible scores for unseen locations: %d (fwd), %d (RC)\n",
- lowestPossibleScoreOfAnyUnseenLocation,
- lowestPossibleRCScoreOfAnyUnseenLocation);
- #endif
- // Return early if we haven't tried enough seeds
- //if (__max(nSeedsApplied, nRCSeedsApplied) < 4 && !forceResult) {
- // return false;
- //}
- unsigned weightListToCheck = highestUsedWeightList;
- do {
- HashTableElement *elementToScore;
- _ASSERT(weightListToCheck <= maxSeedsToUse);
- //
- // Grab the next element to score, and score it.
- //
- while (weightListToCheck > 0 && weightLists[weightListToCheck].weightNext == &weightLists[weightListToCheck]) {
- weightListToCheck--;
- highestUsedWeightList = weightListToCheck;
- }
- _ASSERT(weightListToCheck <= maxSeedsToUse);
- if (__min(lowestPossibleScoreOfAnyUnseenLocation,lowestPossibleRCScoreOfAnyUnseenLocation) > scoreLimit || forceResult) {
- if (weightListToCheck == 0) {
- //
- // We've scored all live candidates and excluded all non-candidates. We have our
- // answer.
- //
- int realConfDiff = confDiff + (popularSeedsSkipped >= adaptiveConfDiffThreshold ? 1 : 0);
- if (NULL != finalScore) {
- *finalScore = bestScore;
- }
- if (bestScore + realConfDiff <= secondBestScore && bestScore <= maxK) {
- if (popularSeedsSkipped == 0 && !forceResult) {
- *result = CertainHit;
- } else {
- *result = SingleHit;
- }
- *singleHitGenomeLocation = bestScoreGenomeLocation;
- return true;
- } else if (bestScore > maxK) {
- // If none of our seeds was below the popularity threshold, report this as MultipleHits; otherwise,
- // report it as NotFound
- *result = (nSeedsApplied == 0 && nRCSeedsApplied == 0) ? MultipleHits : NotFound;
- return true;
- } else {
- *result = MultipleHits;
- return true;
- }
- }
- //
- // Nothing that we haven't already looked up can possibly be the answer. Score what we've got and exit.
- //
- forceResult = true;
- } else if (weightListToCheck == 0) {
- if (forceResult) {
- if (NULL != finalScore) {
- *finalScore = bestScore;
- }
- int realConfDiff = confDiff + (popularSeedsSkipped >= adaptiveConfDiffThreshold ? 1 : 0);
- if (bestScore + realConfDiff <= secondBestScore && bestScore <= maxK) {
- *result = SingleHit;
- *singleHitGenomeLocation = bestScoreGenomeLocation;
- return true;
- } else if (bestScore > maxK) {
- // If none of our seeds was below the popularity threshold, report this as MultipleHits; otherwise,
- // report it as NotFound
- *result = (nSeedsApplied == 0 && nRCSeedsApplied == 0) ? MultipleHits : NotFound;
- return true;
- } else {
- *result = MultipleHits;
- return true;
- }
- } else {
- //
- // No candidates, look for more.
- //
- return false;
- }
- }
- elementToScore = weightLists[weightListToCheck].weightNext;
- _ASSERT(!elementToScore->allExtantCandidatesScored);
- _ASSERT(elementToScore->candidatesUsed != 0);
- _ASSERT(elementToScore != &weightLists[weightListToCheck]);
- if (elementToScore->lowestPossibleScore <= scoreLimit) {
- unsigned long candidateIndexToScore;
- _uint64 candidatesMask = elementToScore->candidatesUsed;
- while (_BitScanForward64(&candidateIndexToScore,candidatesMask)) {
- candidatesMask &= ~(_uint64)1 << candidateIndexToScore;
- Candidate *candidateToScore = &elementToScore->candidates[candidateIndexToScore];
- if (candidateToScore->scoredInEpoch == hashTableEpoch) {
- //
- // This can happen if we find a candidate, score it, and then hit on an indel
- // near it.
- //
- continue;
- }
-
- unsigned genomeLocation = elementToScore->baseGenomeLocation + candidateIndexToScore;
-
- unsigned score = -1;
- if (elementToScore->isRC) {
- const char *data = genome->getSubstring(genomeLocation, rcRead->getDataLength() + MAX_K);
- if (data != NULL) {
- #ifdef USE_NEW_DISTANCE
- score = bsd->compute(data, rcRead->getData(), rcRead->getDataLength(), scoreLimit);
- #else
- _uint64 cacheKey = 0;
- if (readId != -1) {
- cacheKey = ((_uint64) readId) << 33 | ((_uint64) elementToScore->isRC) << 32 | genomeLocation;
- }
- score = landauVishkin->computeEditDistance(
- data, rcRead->getDataLength() + MAX_K,
- rcRead->getData(), rcRead->getDataLength(),
- scoreLimit, cacheKey);
- #endif
- }
- #ifdef TRACE_ALIGNER
- printf("Computing distance at %u (RC) with limit %d: %d\n", genomeLocation, scoreLimit, score);
- #endif
- } else {
- const char *data = genome->getSubstring(genomeLocation, read->getDataLength() + MAX_K);
- if (data != NULL) {
- #ifdef USE_NEW_DISTANCE
- score = bsd->compute(data, read->getData(), read->getDataLength(), scoreLimit);
- #else
- _uint64 cacheKey = 0;
- if (readId != -1) {
- cacheKey = ((_uint64) readId) << 33 | ((_uint64) elementToScore->isRC) << 32 | genomeLocation;
- }
- score = landauVishkin->computeEditDistance(
- data, read->getDataLength() + MAX_K,
- read->getData(), read->getDataLength(),
- scoreLimit, cacheKey);
- #endif
- }
- #ifdef TRACE_ALIGNER
- printf("Computing distance at %u (fwd) with limit %d: %d\n", genomeLocation, scoreLimit, score);
- #endif
- }
-
- if (maxHitsToGet > 0 && score != -1 && hitCount[score] < maxHitsToGet) {
- // Remember the location of this hit because we don't have enough at this distance
- hitLocations[score][hitCount[score]] = genomeLocation;
- hitRCs[score][hitCount[score]] = elementToScore->isRC;
- hitCount[score]++;
- }
-
- candidateToScore->score = score;
- candidateToScore->scoredInEpoch = hashTableEpoch;
- elementToScore->bestScore = __min(elementToScore->bestScore, score);
-
- nLocationsScored++;
- lvScores++;
- lvScoresAfterBestFound++;
-
- // off until we fix the insert/delete problem _ASSERT(candidates[candidateToScore].score >= candidates[candidateToScore].minPossibleScore); // Else we messed up minPossibleScore (or LV or something)
-
- if (bestScore > score) {
- if ((secondBestScore == UnusedScoreValue || !(secondBestScoreGenomeLocation + maxK > genomeLocation && secondBestScoreGenomeLocation < genomeLocation + maxK)) &&
- (bestScore == UnusedScoreValue || !(bestScoreGenomeLocation + maxK > genomeLocation && bestScoreGenomeLocation < genomeLocation + maxK)))
- {
- secondBestScore = bestScore;
- secondBestScoreGenomeLocation = bestScoreGenomeLocation;
- secondBestScoreIsRC = *hitIsRC;
- }
-
- //
- // We have a new best score.
- //
- bestScore = score;
- bestScoreGenomeLocation = genomeLocation;
- *singleHitGenomeLocation = bestScoreGenomeLocation;
- if (NULL != finalScore) {
- *finalScore = bestScore;
- }
- *hitIsRC = elementToScore->isRC;
-
- lvScoresAfterBestFound = 0;
-
- } else if (secondBestScore > score &&
- (secondBestScore == UnusedScoreValue || !(secondBestScoreGenomeLocation + maxK > genomeLocation && secondBestScoreGenomeLocation < genomeLocation + maxK)) &&
- !(bestScoreGenomeLocation + maxK > genomeLocation && bestScoreGenomeLocation < genomeLocation + maxK)) {
- //
- // A new second best.
- //
- secondBestScore = score;
- secondBestScoreGenomeLocation = genomeLocation;
- secondBestScoreIsRC = elementToScore->isRC;
- }
- if (stopOnFirstHit && bestScore <= maxK) {
- // The user just wanted to find reads that match the database within some distance, but doesn't
- // care about the best alignment. Stop now but mark the result as MultipleHits because we're not
- // confident that it's the best one.
- *result = MultipleHits;
- return true;
- }
- // Update scoreLimit since we may have improved bestScore or secondBestScore
- if (maxHitsToGet > 0) {
- scoreLimit = min(bestScore, maxK) + 3;
- } else {
- scoreLimit = min(bestScore, maxK) + confDiff - 1 + (popularSeedsSkipped >= adaptiveConfDiffThreshold ? 1 : 0);
- if (bestScore != UnusedScoreValue && secondBestScore < bestScore + confDiff) {
- // Since secondBestScore already means that our best location so far won't be a SingleHit,
- // we really just care about finding better locations than that. However, still check for
- // scores up to bestScore - 1 since those might become a new secondBestScore.
- // _ASSERT(bestScore >= 1); // Otherwise we would've returned MultipleHits
- scoreLimit = bestScore - 1;
- }
- }
- } // While candidates exist in the element
- } // If the element could possibly affect the result
- //
- // Remove the element from the weight list.
- //
- elementToScore->allExtantCandidatesScored = true;
- elementToScore->weightNext->weightPrev = elementToScore->weightPrev;
- elementToScore->weightPrev->weightNext = elementToScore->weightNext;
- elementToScore->weightNext = elementToScore->weightPrev = elementToScore;
-
- if (lvScores >= lvCutoff) {
- //
- // Don't score any more, just go with what we've got now.
- //
- if (bestScore > maxK) {
- // If none of our seeds was below the popularity threshold, report this as MultipleHits; otherwise,
- // report it as NotFound
- *result = (nSeedsApplied == 0 && nRCSeedsApplied == 0) ? MultipleHits : NotFound;
- return true;
- }
- int realConfDiff = confDiff + (popularSeedsSkipped >= adaptiveConfDiffThreshold ? 1 : 0);
- if (bestScore + realConfDiff <= secondBestScore && bestScore <= maxK) {
- *result = SingleHit;
- *singleHitGenomeLocation = bestScoreGenomeLocation;
- if (NULL != finalScore) {
- *finalScore = bestScore;
- }
- return true;
- }
- *result = MultipleHits;
- return true;
- }
- } while (forceResult);
- return false;
- }
- void
- BaseAligner::fillHitsFound(
- unsigned maxHitsToGet,
- int *multiHitsFound,
- unsigned *multiHitLocations,
- bool *multiHitRCs,
- int *multiHitScores)
- /*++
- Routine Description:
- Return up to maxHitsToGet best hits found for the user in the given array and count parameters, as long
- as maxHitsToGet >= 1. We also need the popularSeedsSkipped parameter to set confDiff adaptively.
- --*/
- {
- if (maxHitsToGet > 0) {
- unsigned realConfDiff = confDiff + (popularSeedsSkipped > adaptiveConfDiffThreshold ? 1 : 0);
- *multiHitsFound = 0;
- int firstDist = 0; // Distance of the best hit
- while (firstDist < MAX_K && hitCount[firstDist] == 0) {
- firstDist++;
- }
- for (int dist = firstDist; dist < min(firstDist + 4, MAX_K); dist++) {
- for (unsigned i = 0; i < hitCount[dist]; i++) {
- multiHitLocations[*multiHitsFound] = hitLocations[dist][i];
- multiHitRCs[*multiHitsFound] = hitRCs[dist][i];
- multiHitScores[*multiHitsFound] = dist;
- *multiHitsFound += 1;
- if (*multiHitsFound == maxHitsToGet) {
- return;
- }
- }
- }
- }
- }
- void
- BaseAligner::findCandidate(
- unsigned genomeLocation,
- bool isRC,
- Candidate **candidate,
- HashTableElement **hashTableElement)
- /*++
- Routine Description:
- Find a candidate in the hash table. If it doesn't exist, create, initialize and insert it.
- Arguments:
- genomeLocation - the location of the candidate we'd like to look up
- candidate - The candidate that was found or created
- hashTableElement - the hashTableElement for the candidate that was found or created.
- --*/
- {
- HashTableAnchor *hashTable = candidateHashTable[isRC];
- unsigned lowOrderGenomeLocation = genomeLocation % (2 * maxMergeDist);
- unsigned highOrderGenomeLocation = genomeLocation - lowOrderGenomeLocation;
- unsigned hashTableIndex = hash(highOrderGenomeLocation) % candidateHashTablesSize;
- HashTableAnchor *anchor = &hashTable[hashTableIndex];
- if (anchor->epoch != hashTableEpoch) {
- //
- // It's empty.
- //
- *hashTableElement = NULL;
- *candidate = NULL;
- return;
- }
- HashTableElement *lookedUpElement = anchor->element;
- while (NULL != lookedUpElement && lookedUpElement->baseGenomeLocation != highOrderGenomeLocation) {
- lookedUpElement = lookedUpElement->next;
- }
- if (NULL != lookedUpElement) {
- *hashTableElement = lookedUpElement;
- *candidate = &lookedUpElement->candidates[lowOrderGenomeLocation];
- _uint64 bitForThisCandidate = (_uint64)1 << lowOrderGenomeLocation;
- lookedUpElement->allExtantCandidatesScored = lookedUpElement->allExtantCandidatesScored && (lookedUpElement->candidatesUsed & bitForThisCandidate);
- lookedUpElement->candidatesUsed |= bitForThisCandidate;
- } else {
- *hashTableElement = NULL;
- *candidate = NULL;
- }
- }
- bool doAlignerPrefetch = true;
- void
- BaseAligner::allocateNewCandidate(unsigned genomeLocation, bool isRC, unsigned lowestPossibleScore, Candidate **candidate, HashTableElement **hashTableElement)
- /*++
- Routine Description:
- Arguments:
- Return Value:
- --*/
- {
- HashTableAnchor *hashTable = candidateHashTable[isRC];
- unsigned lowOrderGenomeLocation = genomeLocation % (2 * maxMergeDist);
- unsigned highOrderGenomeLocation = genomeLocation - lowOrderGenomeLocation;
- unsigned hashTableIndex = hash(highOrderGenomeLocation) % candidateHashTablesSize;
- HashTableAnchor *anchor = &hashTable[hashTableIndex];
- HashTableElement *element;
- if (doAlignerPrefetch) {
- genomeIndex->prefetchGenomeData(genomeLocation);
- }
- #if DBG
- element = hashTable[hashTableIndex].element;
- while (anchor->epoch == hashTableEpoch && NULL != element && element->genomeLocation != highOrderGenomeLocation) {
- element = element->next;
- }
- _ASSERT(NULL == element || anchor->epoch != hashTableEpoch);
- #endif // DBG
- _ASSERT(nUsedHashTableElements < hashTableElementPoolSize);
- element = &hashTableElementPool[nUsedHashTableElements];
- nUsedHashTableElements++;
- if (anchor->epoch == hashTableEpoch) {
- element->next = anchor->element;
- } else {
- anchor->epoch = hashTableEpoch;
- element->next = NULL;
- }
- anchor->element = element;
- element->candidatesUsed = 0;
- element->lowestPossibleScore = lowestPossibleScore;
- element->isRC = isRC;
- element->candidatesUsed = (_uint64)1 << lowOrderGenomeLocation;
- element->weight = 1;
- element->baseGenomeLocation = highOrderGenomeLocation;
- element->bestScore = UnusedScoreValue;
- element->allExtantCandidatesScored = false;
- //
- // And insert it at the end of weight list 1.
- //
- element->weightNext = &weightLists[1];
- element->weightPrev = weightLists[1].weightPrev;
- element->weightNext->weightPrev = element;
- element->weightPrev->weightNext = element;
- *candidate = &element->candidates[lowOrderGenomeLocation];
- *hashTableElement = element;
- highestUsedWeightList = __max(highestUsedWeightList,(unsigned)1);
- }
- #if DBG
- bool
- BaseAligner::AssertCandidatesAreSorted()
- /*++
- Routine Description:
- Arguments:
- Return Value:
- --*/
- {
- for (int i = 1; i < nCandidates; i++) {
- _ASSERT(candidates[i].genomeLocation <= candidates[i-1].genomeLocation);
- }
- }
- #endif // DBG
- BaseAligner::~BaseAligner()
- /*++
- Routine Description:
- Arguments:
- Return Value:
- --*/
- {
- #if MAINTAIN_HISTOGRAMS
- delete lvHistogram;
- delete lookupHistogram;
- delete lvHistogramForMulti;
- delete lvCountWhenBestFound;
- #endif // MAINTAIN_HISTOGRAMS
- BigDealloc(seedUsedAsAllocated);
-
- BigDealloc(candidates);
- BigDealloc(rcReadData);
- BigDealloc(hashTableElementPool);
- BigDealloc(candidateHashTable[0]);
- BigDealloc(candidateHashTable[1]);
- BigDealloc(weightLists);
- delete bsd;
- if (ownLandauVishkin) {
- delete landauVishkin;
- }
- }
- void
- BaseAligner::ComputeHitDistribution(
- Read *read,
- unsigned correctGenomeLocation,
- bool correctHitIsRC,
- unsigned *hitCountBySeed,
- unsigned *rcHitCountBySeed,
- unsigned &nSeedsApplied,
- unsigned &nRCSeedsApplied,
- unsigned *hitsContainingCorrectLocation)
- {
- nSeedsApplied = 0;
- nRCSeedsApplied = 0;
- for (unsigned i = 0; i < maxSeedsToUse; i++) {
- hitCountBySeed[i] = 0;
- rcHitCountBySeed[i] = 0;
- hitsContainingCo…
Large files files are truncated, but you can click here to view the full file