/Source/Framework/Bio/WordMatch.cs

# · C# · 537 lines · 312 code · 66 blank · 159 comment · 66 complexity · cac23742251ba5ccd25ad791a7510aea MD5 · raw file

  1. // *********************************************************
  2. //
  3. // Copyright (c) Microsoft. All rights reserved.
  4. // This code is licensed under the Apache License, Version 2.0.
  5. // THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
  6. // ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
  7. // IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
  8. // PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
  9. //
  10. // *********************************************************
  11. using System;
  12. using System.Collections.Generic;
  13. using Bio.Algorithms.Kmer;
  14. using System.Linq;
  15. namespace Bio
  16. {
  /// <summary>
  /// WordMatch stores a region of similarity between two sequences: a run of
  /// <see cref="Length"/> consecutive positions that starts at
  /// <see cref="Sequence1Start"/> in the first sequence and at
  /// <see cref="Sequence2Start"/> in the second sequence.
  /// </summary>
  /// <remarks>
  /// Value equality (<see cref="Equals(WordMatch)"/>, <see cref="Equals(object)"/> and
  /// <see cref="GetHashCode"/>) is based on Length, Sequence1Start and Sequence2Start.
  /// The == and != operators intentionally remain reference comparisons.
  /// Instances are mutable, so do not change a match while it is stored in a
  /// hash-based collection.
  /// </remarks>
  public class WordMatch : IComparable, IComparable<WordMatch>, IEquatable<WordMatch>
  {
      /// <summary>
      /// Initializes a new instance of the WordMatch class.
      /// </summary>
      /// <param name="length">Length of the match.</param>
      /// <param name="sequence1Start">Start index of the match in the first sequence.</param>
      /// <param name="sequence2Start">Start index of the match in the second sequence.</param>
      public WordMatch(int length, int sequence1Start, int sequence2Start)
      {
          Length = length;
          Sequence1Start = sequence1Start;
          Sequence2Start = sequence2Start;
      }

      /// <summary>
      /// Gets or sets the length of the match.
      /// </summary>
      public int Length { get; set; }

      /// <summary>
      /// Gets or sets the start index of the match in the first sequence.
      /// </summary>
      public int Sequence1Start { get; set; }

      /// <summary>
      /// Gets or sets the start index of the match in the second sequence.
      /// </summary>
      public int Sequence2Start { get; set; }

      /// <summary>
      /// Gets or sets a value indicating whether this word match has been
      /// discarded (true) or should still be considered (false).
      /// </summary>
      public bool Deleted { get; set; }

      /// <summary>
      /// Given a list of matches, reduce it to the minimal set of best
      /// non-overlapping matches.
      /// </summary>
      /// <remarks>
      /// The supplied list is sorted in place, and the matches it holds are modified:
      /// overlapped matches are flagged as Deleted and partially overlapping matches
      /// may be trimmed (see WordDeadZone).
      /// </remarks>
      /// <param name="completeList">List of matches to reduce to non-overlapping set.</param>
      /// <param name="wordLength">Wordlength entered by the user.</param>
      /// <returns>Minimal set of best non-overlapping matches.</returns>
      /// <exception cref="ArgumentNullException">completeList is null.</exception>
      public static List<WordMatch> GetMinimalList(List<WordMatch> completeList, int wordLength)
      {
          if (completeList == null)
          {
              throw new ArgumentNullException("completeList");
          }

          // CompareTo puts the longest matches first, so longer matches get the
          // chance to knock out shorter overlapping ones.
          completeList.Sort();

          foreach (WordMatch keeper in completeList)
          {
              if (keeper.Deleted)
              {
                  continue;
              }

              // First and last positions covered by the surviving match.
              int deadx1 = keeper.Sequence1Start;
              int deady1 = keeper.Sequence2Start;
              int deadx2 = keeper.Sequence1Start + keeper.Length - 1;
              int deady2 = keeper.Sequence2Start + keeper.Length - 1;

              foreach (WordMatch candidate in completeList)
              {
                  // Identity check: only the match itself is skipped, a distinct
                  // match with identical coordinates is still tested.
                  if (object.ReferenceEquals(keeper, candidate) || candidate.Deleted)
                  {
                      continue;
                  }

                  // Discard the candidate if it lies in the dead zone of the keeper.
                  if (WordDeadZone(candidate, deadx1, deady1, deadx2, deady2, wordLength))
                  {
                      candidate.Deleted = true;
                  }
              }
          }

          return completeList.Where(match => !match.Deleted).ToList();
      }

      /// <summary>
      /// Creates the list of all matches between the k-mers of the first sequence
      /// and the second sequence. Consecutive word hits on the same diagonal are
      /// merged into a single, longer match.
      /// </summary>
      /// <remarks>
      /// The returned list is sorted with <see cref="CompareTo(WordMatch)"/>
      /// (longest match first) and every returned match has Deleted set to false.
      /// </remarks>
      /// <param name="kmerList">List of kmer's.</param>
      /// <param name="seq2">Second sequence.</param>
      /// <param name="wordLength">Wordlength entered by the user</param>
      /// <returns>List of all the matches.</returns>
      /// <exception cref="ArgumentNullException">kmerList or seq2 is null.</exception>
      public static List<WordMatch> BuildMatchTable(KmersOfSequence kmerList, ISequence seq2, int wordLength)
      {
          if (kmerList == null)
          {
              throw new ArgumentNullException("kmerList");
          }

          if (seq2 == null)
          {
              throw new ArgumentNullException("seq2");
          }

          List<WordMatch> matches = new List<WordMatch>();
          int ilast = (int)seq2.Count - wordLength;
          if (ilast < 0)
          {
              // The second sequence is shorter than one word: nothing can match.
              return matches;
          }

          // Materialize both inputs once instead of re-enumerating the second
          // sequence and re-scanning every k-mer for each window.
          char[] seq2Symbols = seq2.Select(a => (char)a).ToArray();
          Dictionary<string, IList<long>> kmerLookup = BuildKmerLookup(kmerList);

          for (int i = 0; i <= ilast; i++)
          {
              // Same window as Skip(i).Take(wordLength): clamped to what is available.
              int available = Math.Max(0, Math.Min(wordLength, seq2Symbols.Length - i));
              string word = available > 0 ? new string(seq2Symbols, i, available) : string.Empty;

              IList<long> positions;
              if (!kmerLookup.TryGetValue(word, out positions) || positions == null)
              {
                  continue;
              }

              foreach (long position in positions)
              {
                  int knew = (int)position;
                  bool matched = false;

                  foreach (WordMatch curmatch in matches)
                  {
                      if (curmatch.Deleted)
                      {
                          continue;
                      }

                      // Start of the word that would extend this match by one symbol.
                      int kcur = curmatch.Sequence1Start + curmatch.Length - wordLength + 1;
                      int kcur2 = curmatch.Sequence2Start + curmatch.Length - wordLength + 1;

                      // When we test, we may have already incremented
                      // one of the matches - so test old and new kcur2
                      if (kcur2 != i && kcur2 != i + 1)
                      {
                          // This match can no longer be extended; stop considering it.
                          curmatch.Deleted = true;
                          continue;
                      }

                      if (kcur == knew && kcur2 == i)
                      {
                          curmatch.Length++;
                          matched = true;
                      }
                  }

                  if (!matched)
                  {
                      // No existing match was extended: start a new one.
                      matches.Add(new WordMatch(wordLength, knew, i));
                  }
              }
          }

          matches.Sort();

          // Deleted was only used as a "no longer extendable" marker above.
          foreach (WordMatch match in matches)
          {
              match.Deleted = false;
          }

          return matches;
      }

      #region IComparable Members

      /// <summary>
      /// CompareTo method is used while sorting WordMatch objects.
      /// </summary>
      /// <param name="obj">WordMatch object</param>
      /// <returns>
      /// A positive value if obj is null; otherwise the result of
      /// <see cref="CompareTo(WordMatch)"/>.
      /// </returns>
      /// <exception cref="ArgumentException">obj is not a WordMatch.</exception>
      public int CompareTo(object obj)
      {
          if (obj == null)
          {
              // By convention every instance compares greater than null.
              return 1;
          }

          WordMatch other = obj as WordMatch;
          if (object.ReferenceEquals(other, null))
          {
              throw new ArgumentException("Object must be of type WordMatch.", "obj");
          }

          return CompareTo(other);
      }

      #endregion

      #region IComparable<WordMatch> Members

      /// <summary>
      /// Compares two sequence matches so the result can be used in sorting.
      /// Longer matches sort first. If the lengths are equal the matches are
      /// ordered by sequence1 start position, and if those are equal as well,
      /// by sequence2 start position.
      /// </summary>
      /// <param name="other">WordMatch object</param>
      /// <returns>
      /// Zero if the objects have the same length, sequence1Start and sequence2Start;
      /// a negative value if this instance sorts before other;
      /// a positive value if this instance sorts after other or other is null.
      /// </returns>
      public int CompareTo(WordMatch other)
      {
          if (object.ReferenceEquals(other, null))
          {
              return 1;
          }

          // int.CompareTo instead of subtraction: a difference can overflow and
          // flip the sign for extreme values.
          int result = other.Length.CompareTo(Length);
          if (result != 0)
          {
              return result;
          }

          result = Sequence1Start.CompareTo(other.Sequence1Start);
          if (result != 0)
          {
              return result;
          }

          return Sequence2Start.CompareTo(other.Sequence2Start);
      }

      #endregion

      #region IEquatable<WordMatch> Members

      /// <summary>
      /// Checks if another WordMatch object is equal to the current
      /// object.
      /// </summary>
      /// <param name="other">WordMatch object to be compared.</param>
      /// <returns>
      /// true if other is not null and has the same length, sequence1Start and
      /// sequence2Start; otherwise false.
      /// </returns>
      public bool Equals(WordMatch other)
      {
          if (object.ReferenceEquals(other, null))
          {
              return false;
          }

          return Length == other.Length
              && Sequence1Start == other.Sequence1Start
              && Sequence2Start == other.Sequence2Start;
      }

      #endregion

      /// <summary>
      /// Determines whether a match falls in the region overlapped by the match
      /// starting at position (deadx1, deady1) and ending at position
      /// (deadx2, deady2) (the 'dead zone'). A match that only partly overlaps is
      /// trimmed in place; it is reported as dead if what remains is shorter
      /// than wordLength.
      /// </summary>
      /// <param name="wordMatch">Word Match object which holds the similarity of the two sequences. May be modified.</param>
      /// <param name="deadx1">starting x-position of the region for which overlapped has to be checked.</param>
      /// <param name="deady1">starting y-position of the region for which overlapped has to be checked.</param>
      /// <param name="deadx2">ending x-position of the region for which overlapped has to be checked.</param>
      /// <param name="deady2">ending y-position of the region for which overlapped has to be checked.</param>
      /// <param name="wordLength">Wordlength entered by the user</param>
      /// <returns>
      /// true: if the wordMatch is in the overlapped region (or became shorter
      /// than wordLength after trimming), else false.
      /// </returns>
      private static bool WordDeadZone(WordMatch wordMatch, int deadx1, int deady1, int deadx2, int deady2, int wordLength)
      {
          int startx = wordMatch.Sequence1Start;
          int starty = wordMatch.Sequence2Start;
          int endx = startx + wordMatch.Length - 1;
          int endy = starty + wordMatch.Length - 1;

          // Is it in the top right live zone?
          if (startx > deadx2 && starty > deady2)
          {
              return false;
          }

          // Is it in the bottom left live zone?
          if (endx < deadx1 && endy < deady1)
          {
              return false;
          }

          // Is it in the top left dead zone?
          if (starty >= deady1 && endx <= deadx2)
          {
              return true;
          }

          // Is it in the bottom right dead zone?
          if (endy <= deady2 && startx >= deadx1)
          {
              return true;
          }

          // Diagonal offsets decide which edge of the dead zone the match crosses.
          int matchDiagonal = startx - starty;
          int deadDiagonal = deadx1 - deady1;

          if (endy < deady2)
          {
              if (matchDiagonal < deadDiagonal)
              {
                  // Crosses deady1: keep only the part before it.
                  wordMatch.Length = deady1 - starty;
              }
              else if (matchDiagonal > deadDiagonal)
              {
                  // Crosses deadx1: keep only the part before it.
                  wordMatch.Length = deadx1 - startx;
              }
          }
          else if (starty > deady1)
          {
              if (matchDiagonal < deadDiagonal)
              {
                  // Crosses deadx2: keep only the part after it.
                  wordMatch.Length = endx - deadx2;
                  wordMatch.Sequence1Start = deadx2 + 1;
                  wordMatch.Sequence2Start += deadx2 - startx + 1;
              }
              else if (matchDiagonal > deadDiagonal)
              {
                  // Crosses deady2: keep only the part after it.
                  wordMatch.Length = endy - deady2;
                  wordMatch.Sequence1Start += deady2 - starty + 1;
                  wordMatch.Sequence2Start = deady2 + 1;
              }
          }

          // A trimmed match shorter than one word is no longer useful.
          return wordMatch.Length < wordLength;
      }

      /// <summary>
      /// Builds a lookup from the text of each k-mer to the positions recorded
      /// for that k-mer, so a word can be resolved without scanning every k-mer.
      /// </summary>
      /// <param name="kmerList">List of IKmer.</param>
      /// <returns>Dictionary keyed by k-mer text; the value is the k-mer's position list.</returns>
      private static Dictionary<string, IList<long>> BuildKmerLookup(KmersOfSequence kmerList)
      {
          Dictionary<string, IList<long>> lookup = new Dictionary<string, IList<long>>();
          foreach (KmersOfSequence.KmerPositions kmer in kmerList.Kmers)
          {
              string kmerString = new string(kmerList.KmerToSequence(kmer).Select(a => (char)a).ToArray());

              // First occurrence wins, matching a linear search that stops at the first hit.
              if (!lookup.ContainsKey(kmerString))
              {
                  lookup.Add(kmerString, kmer.Positions);
              }
          }

          return lookup;
      }

      /// <summary>
      /// Returns a hash code that is consistent with <see cref="Equals(WordMatch)"/>.
      /// </summary>
      /// <remarks>
      /// The value depends on mutable state (Length, Sequence1Start, Sequence2Start).
      /// </remarks>
      /// <returns>hash code</returns>
      public override int GetHashCode()
      {
          unchecked
          {
              int hash = 17;
              hash = (hash * 31) + Length;
              hash = (hash * 31) + Sequence1Start;
              hash = (hash * 31) + Sequence2Start;
              return hash;
          }
      }

      /// <summary>
      /// Overrides the equal method. Two matches are equal when they are of the
      /// same type and have the same length, sequence1Start and sequence2Start.
      /// </summary>
      /// <param name="obj">Object to be checked</param>
      /// <returns>Is equals</returns>
      public override bool Equals(object obj)
      {
          if (obj == null || GetType() != obj.GetType())
          {
              return false;
          }

          return Equals((WordMatch)obj);
      }

      /// <summary>
      /// Override equal operator. This is a reference comparison.
      /// </summary>
      /// <param name="leftHandSideObject">LHS object</param>
      /// <param name="rightHandSideObject">RHS object</param>
      /// <returns>true if LHS and RHS are the same instance (or both null).</returns>
      public static bool operator ==(WordMatch leftHandSideObject, WordMatch rightHandSideObject)
      {
          return object.ReferenceEquals(leftHandSideObject, rightHandSideObject);
      }

      /// <summary>
      /// Override not equal operator. This is a reference comparison.
      /// </summary>
      /// <param name="leftHandSideObject">LHS object</param>
      /// <param name="rightHandSideObject">RHS object</param>
      /// <returns>true if LHS and RHS are not the same instance.</returns>
      public static bool operator !=(WordMatch leftHandSideObject, WordMatch rightHandSideObject)
      {
          return !(leftHandSideObject == rightHandSideObject);
      }

      /// <summary>
      /// Override less than operator
      /// </summary>
      /// <param name="leftHandSideObject">LHS object</param>
      /// <param name="rightHandSideObject">RHS object</param>
      /// <returns>true if both operands are non-null and LHS sorts before RHS.</returns>
      public static bool operator <(WordMatch leftHandSideObject, WordMatch rightHandSideObject)
      {
          if (object.ReferenceEquals(leftHandSideObject, null) || object.ReferenceEquals(rightHandSideObject, null))
          {
              return false;
          }

          return leftHandSideObject.CompareTo(rightHandSideObject) < 0;
      }

      /// <summary>
      /// Override greater than operator
      /// </summary>
      /// <param name="leftHandSideObject">LHS object</param>
      /// <param name="rightHandSideObject">RHS object</param>
      /// <returns>true if both operands are non-null and LHS sorts after RHS.</returns>
      public static bool operator >(WordMatch leftHandSideObject, WordMatch rightHandSideObject)
      {
          if (object.ReferenceEquals(leftHandSideObject, null) || object.ReferenceEquals(rightHandSideObject, null))
          {
              return false;
          }

          return leftHandSideObject.CompareTo(rightHandSideObject) > 0;
      }
  }
  471. }