PageRenderTime 53ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/source/library/Interlace/Differencing/SequenceMatcher.cs

https://bitbucket.org/VahidN/interlace
C# | 396 lines | 225 code | 57 blank | 114 comment | 47 complexity | a82c06dab9c1f3c3d245070fe54b0a31 MD5 | raw file
  1. #region Using Directives and Copyright Notice
  2. // Copyright (c) 2007-2010, Computer Consultancy Pty Ltd
  3. // All rights reserved.
  4. //
  5. // Redistribution and use in source and binary forms, with or without
  6. // modification, are permitted provided that the following conditions are met:
  7. // * Redistributions of source code must retain the above copyright
  8. // notice, this list of conditions and the following disclaimer.
  9. // * Redistributions in binary form must reproduce the above copyright
  10. // notice, this list of conditions and the following disclaimer in the
  11. // documentation and/or other materials provided with the distribution.
  12. // * Neither the name of the Computer Consultancy Pty Ltd nor the
  13. // names of its contributors may be used to endorse or promote products
  14. // derived from this software without specific prior written permission.
  15. //
  16. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. // ARE DISCLAIMED. IN NO EVENT SHALL COMPUTER CONSULTANCY PTY LTD BE LIABLE
  20. // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24. // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25. // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  26. // DAMAGE.
  27. using System;
  28. using System.Collections.Generic;
  29. using System.Text;
  30. #endregion
  31. // This class is a conversion of portions of the Python 2.4 difflib module.
  32. //
  33. // PSF LICENSE AGREEMENT FOR PYTHON 2.4
  34. // ------------------------------------
  35. //
  36. // 1. This LICENSE AGREEMENT is between the Python Software Foundation
  37. // ("PSF"), and the Individual or Organization ("Licensee") accessing and
  38. // otherwise using Python 2.4 software in source or binary form and its
  39. // associated documentation.
  40. //
  41. // 2. Subject to the terms and conditions of this License Agreement, PSF
  42. // hereby grants Licensee a nonexclusive, royalty-free, world-wide
  43. // license to reproduce, analyze, test, perform and/or display publicly,
  44. // prepare derivative works, distribute, and otherwise use Python 2.4
  45. // alone or in any derivative version, provided, however, that PSF's
  46. // License Agreement and PSF's notice of copyright, i.e., "Copyright (c)
  47. // 2001, 2002, 2003, 2004 Python Software Foundation; All Rights Reserved"
  48. // are retained in Python 2.4 alone or in any derivative version prepared
  49. // by Licensee.
  50. //
  51. // 3. In the event Licensee prepares a derivative work that is based on
  52. // or incorporates Python 2.4 or any part thereof, and wants to make
  53. // the derivative work available to others as provided herein, then
  54. // Licensee hereby agrees to include in any such work a brief summary of
  55. // the changes made to Python 2.4.
  56. //
  57. // 4. PSF is making Python 2.4 available to Licensee on an "AS IS"
  58. // basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
  59. // IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND
  60. // DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
  61. // FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 2.4 WILL NOT
  62. // INFRINGE ANY THIRD PARTY RIGHTS.
  63. //
  64. // 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
  65. // 2.4 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
  66. // A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 2.4,
  67. // OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
  68. //
  69. // 6. This License Agreement will automatically terminate upon a material
  70. // breach of its terms and conditions.
  71. //
  72. // 7. Nothing in this License Agreement shall be deemed to create any
  73. // relationship of agency, partnership, or joint venture between PSF and
  74. // Licensee. This License Agreement does not grant permission to use PSF
  75. // trademarks or trade name in a trademark sense to endorse or promote
  76. // products or services of Licensee, or any third party.
  77. //
  78. // 8. By copying, installing or otherwise using Python 2.4, Licensee
  79. // agrees to be bound by the terms and conditions of this License
  80. // Agreement.
  81. namespace Interlace.Differencing
  82. {
  /// <summary>
  /// Predicate supplied to <see cref="SequenceMatcher{T}"/> to classify sequence
  /// elements. Elements of the right sequence for which it returns true are
  /// recorded as junk and left out of the index used to find matches.
  /// </summary>
  /// <param name="element">The sequence element to classify.</param>
  /// <returns>True if the element should be treated as junk.</returns>
  83. public delegate bool IsJunk<T>(T element);
  84. /// <summary>
  85. /// SequenceMatcher is a flexible class for comparing pairs of sequences of
  86. /// any type, so long as the sequence elements are hashable.
  87. /// </summary>
  88. public class SequenceMatcher<T>
  89. {
  // Predicate consulted while indexing the right sequence.
  private IsJunk<T> _isJunk;

  // The two sequences being compared.
  private IList<T> _left;
  private IList<T> _right;

  // Cached results; reset to null whenever either sequence is replaced.
  private List<MatchingBlock> _matchingBlocks;
  private List<OpCode> _opCodes;

  // Index of the right sequence: element -> positions at which it occurs.
  // Junk and "popular" elements are not present in this index.
  private Dictionary<T, List<int>> _rightToIndices;

  // Elements dropped from the index because they occur too often.
  private Dictionary<T, bool> _popularElements;

  // Elements of the right sequence that the junk predicate flagged.
  private Dictionary<T, bool> _rightJunk;
  /// <summary>
  /// Creates a matcher that treats no element as junk.
  /// </summary>
  public SequenceMatcher()
      : this(IsNeverJunk)
  {
  }

  /// <summary>
  /// Creates a matcher that uses the given predicate to classify junk elements.
  /// </summary>
  /// <param name="isJunk">
  /// Predicate returning true for elements to be treated as junk. A null value
  /// is accepted and means that no element is junk.
  /// </param>
  public SequenceMatcher(IsJunk<T> isJunk)
  {
      // A null predicate would otherwise be stored as-is and only fail later,
      // with a NullReferenceException, the first time it is invoked while the
      // right sequence is being indexed.
      if (isJunk == null)
      {
          isJunk = IsNeverJunk;
      }

      _isJunk = isJunk;
  }

  /// <summary>
  /// Default junk predicate: rejects nothing.
  /// </summary>
  /// <param name="element">The element to classify (ignored).</param>
  /// <returns>Always false.</returns>
  private static bool IsNeverJunk(T element)
  {
      return false;
  }
  /// <summary>
  /// Gets or sets the left sequence. Assigning a different list instance
  /// discards the cached matching blocks and operation codes; assigning the
  /// same instance again leaves them untouched.
  /// </summary>
  public IList<T> Left
  {
      get
      {
          return _left;
      }
      set
      {
          if (!object.ReferenceEquals(_left, value))
          {
              _left = value;

              // Results computed for the previous sequence no longer apply.
              _opCodes = null;
              _matchingBlocks = null;
          }
      }
  }
  /// <summary>
  /// Gets or sets the right sequence. Assigning a different list instance
  /// discards the cached matching blocks and operation codes and rebuilds the
  /// lookup tables for the right sequence; assigning the same instance again
  /// does nothing.
  /// </summary>
  public IList<T> Right
  {
      get
      {
          return _right;
      }
      set
      {
          if (!object.ReferenceEquals(_right, value))
          {
              _right = value;

              // Results computed for the previous sequence no longer apply.
              _opCodes = null;
              _matchingBlocks = null;

              PrepareRightSideTables();
          }
      }
  }
  /// <summary>
  /// Rebuilds the lookup tables derived from the right sequence: the
  /// element-to-positions index, the set of junk elements and the set of
  /// "popular" elements. When the right sequence is null the tables are
  /// simply left empty.
  /// </summary>
  void PrepareRightSideTables()
  {
      // Popularity only applies to right sequences of at least this many elements.
      const int PopularityMinimumLength = 200;

      // An element is popular once its recorded positions, multiplied by this
      // factor, exceed the length of the right sequence (i.e. more than 1%).
      const int PopularityFactor = 100;

      _rightToIndices = new Dictionary<T, List<int>>();
      _popularElements = new Dictionary<T, bool>();
      _rightJunk = new Dictionary<T, bool>();

      // The Right setter accepts null (the MatchingBlocks getter reports it
      // later), so there is nothing to index in that case.
      if (_right == null)
      {
          return;
      }

      int count = _right.Count;

      for (int i = 0; i < count; i++)
      {
          T element = _right[i];
          List<int> indices;

          if (_rightToIndices.TryGetValue(element, out indices))
          {
              if (count >= PopularityMinimumLength && indices.Count * PopularityFactor > count)
              {
                  // Too frequent to be useful for matching: stop tracking it.
                  // Positions seen after this point may be recorded again, but
                  // the whole entry is purged once the scan is finished.
                  _popularElements[element] = true;
                  indices.Clear();
              }
              else
              {
                  indices.Add(i);
              }
          }
          else if (!_rightJunk.ContainsKey(element))
          {
              // First time this element is seen: classify it exactly once.
              if (_isJunk(element))
              {
                  _rightJunk[element] = true;
              }
              else
              {
                  indices = new List<int>();
                  indices.Add(i);
                  _rightToIndices[element] = indices;
              }
          }
      }

      // Purge leftover indices for popular elements:
      foreach (T element in _popularElements.Keys)
      {
          _rightToIndices.Remove(element);
      }
  }
  /// <summary>
  /// Find longest matching block in two subsequences of the left and right sequences.
  /// </summary>
  /// <param name="leftLow">The index of the first element in the left sequence.</param>
  /// <param name="leftHigh">The index of the first element after the last element in the left sequence.</param>
  /// <param name="rightLow">The index of the first element in the right sequence.</param>
  /// <param name="rightHigh">The index of the first element after the last element in the right sequence.</param>
  /// <returns>
  /// A block holding the start of the match in each sequence and its length.
  /// When nothing matches, the block starts at (leftLow, rightLow) with length zero.
  /// </returns>
  private MatchingBlock FindLongestMatch(int leftLow, int leftHigh, int rightLow, int rightHigh)
  {
      // Compare elements with the same comparer the lookup dictionaries use by
      // default, so extending a match agrees with how the match was found
      // (and value types are not boxed).
      EqualityComparer<T> comparer = EqualityComparer<T>.Default;

      int bestLeftLow = leftLow;
      int bestRightLow = rightLow;
      int bestSize = 0;

      // While left index i is being processed, longestLengths[j] is the length
      // of the longest indexed match ending with _left[i - 1] and _right[j].
      Dictionary<int, int> longestLengths = new Dictionary<int, int>();

      for (int i = leftLow; i < leftHigh; i++)
      {
          Dictionary<int, int> newLongestLengths = new Dictionary<int, int>();
          List<int> rightIndices;

          if (_rightToIndices.TryGetValue(_left[i], out rightIndices))
          {
              // Positions were appended in increasing order, so the scan can
              // stop at the first one past the end of the right range.
              foreach (int j in rightIndices)
              {
                  // _left[i] matches _right[j]:
                  if (j < rightLow) continue;
                  if (j >= rightHigh) break;

                  int newSize;
                  if (!longestLengths.TryGetValue(j - 1, out newSize)) newSize = 0;
                  newSize++;

                  newLongestLengths[j] = newSize;

                  if (newSize > bestSize)
                  {
                      bestLeftLow = i - newSize + 1;
                      bestRightLow = j - newSize + 1;
                      bestSize = newSize;
                  }
              }
          }

          longestLengths = newLongestLengths;
      }

      // Extend the best match by non-junk elements on each end. "Popular"
      // non-junk elements are not in _rightToIndices, which greatly speeds up
      // the loop above, but it also means the best match so far contains
      // neither junk nor popular non-junk elements.
      while (bestLeftLow > leftLow && bestRightLow > rightLow &&
          !_rightJunk.ContainsKey(_right[bestRightLow - 1]) &&
          comparer.Equals(_left[bestLeftLow - 1], _right[bestRightLow - 1]))
      {
          bestLeftLow--;
          bestRightLow--;
          bestSize++;
      }

      while (bestLeftLow + bestSize < leftHigh && bestRightLow + bestSize < rightHigh &&
          !_rightJunk.ContainsKey(_right[bestRightLow + bestSize]) &&
          comparer.Equals(_left[bestLeftLow + bestSize], _right[bestRightLow + bestSize]))
      {
          bestSize++;
      }

      // Now that we have a wholly interesting match (albeit possibly
      // empty!), we may as well suck up the matching junk on each
      // side of it too. Can't think of a good reason not to, and it
      // saves post-processing the (possibly considerable) expense of
      // figuring out what to do with it. In the case of an empty
      // interesting match, this is clearly the right thing to do,
      // because no other kind of match is possible in the regions.
      while (bestLeftLow > leftLow && bestRightLow > rightLow &&
          _rightJunk.ContainsKey(_right[bestRightLow - 1]) &&
          comparer.Equals(_left[bestLeftLow - 1], _right[bestRightLow - 1]))
      {
          bestLeftLow--;
          bestRightLow--;
          bestSize++;
      }

      while (bestLeftLow + bestSize < leftHigh && bestRightLow + bestSize < rightHigh &&
          _rightJunk.ContainsKey(_right[bestRightLow + bestSize]) &&
          comparer.Equals(_left[bestLeftLow + bestSize], _right[bestRightLow + bestSize]))
      {
          bestSize++;
      }

      return new MatchingBlock(bestLeftLow, bestRightLow, bestSize);
  }
  /// <summary>
  /// Finds the longest match in the given ranges, then repeats the search in
  /// the unmatched ranges to its left and to its right, appending every
  /// non-empty block found to <paramref name="into"/> in increasing order of
  /// position.
  /// </summary>
  /// <param name="leftLow">The index of the first element in the left range.</param>
  /// <param name="leftHigh">The index of the first element after the left range.</param>
  /// <param name="rightLow">The index of the first element in the right range.</param>
  /// <param name="rightHigh">The index of the first element after the right range.</param>
  /// <param name="into">The list that receives the blocks.</param>
  private void FindAndDivideIntoMatchingBlocks(int leftLow, int leftHigh, int rightLow, int rightHigh, List<MatchingBlock> into)
  {
      // An explicit work stack is used instead of recursion: with long
      // sequences and many small matches the nesting can get deep enough to
      // overflow the call stack, which cannot be caught in .NET.
      Stack<int[]> pending = new Stack<int[]>();
      pending.Push(new int[] { leftLow, leftHigh, rightLow, rightHigh });

      List<MatchingBlock> found = new List<MatchingBlock>();

      while (pending.Count > 0)
      {
          int[] range = pending.Pop();
          int lowLeft = range[0];
          int highLeft = range[1];
          int lowRight = range[2];
          int highRight = range[3];

          MatchingBlock block = FindLongestMatch(lowLeft, highLeft, lowRight, highRight);

          if (block.length == 0) continue;

          found.Add(block);

          // Unmatched region before the block, if both sides have one.
          if (lowLeft < block.leftStart && lowRight < block.rightStart)
          {
              pending.Push(new int[] { lowLeft, block.leftStart, lowRight, block.rightStart });
          }

          // Unmatched region after the block, if both sides have one.
          int leftEnd = block.leftStart + block.length;
          int rightEnd = block.rightStart + block.length;

          if (leftEnd < highLeft && rightEnd < highRight)
          {
              pending.Push(new int[] { leftEnd, highLeft, rightEnd, highRight });
          }
      }

      // The blocks cover disjoint, non-empty ranges, so ordering them by their
      // left start reproduces the left-to-right order a recursive in-order
      // walk would have produced.
      found.Sort(delegate(MatchingBlock x, MatchingBlock y)
      {
          return x.leftStart.CompareTo(y.leftStart);
      });

      into.AddRange(found);
  }
  /// <summary>
  /// Return list of triples describing matching subsequences.
  /// </summary>
  /// <value>
  /// The matching blocks, followed by a final zero-length block positioned at
  /// the end of both sequences. The list is computed on first access and
  /// cached until the left or right sequence is replaced.
  /// </value>
  /// <exception cref="InvalidOperationException">
  /// The left or right sequence has not been set to a non-null sequence.
  /// </exception>
  public List<MatchingBlock> MatchingBlocks
  {
      get
      {
          if (_left == null || _right == null)
          {
              throw new InvalidOperationException("Both the left and right sequences must be set " +
                  "to non-null sequences.");
          }

          if (_matchingBlocks == null)
          {
              // Build into a local list and publish it only once complete, so
              // that an exception thrown part-way through does not leave a
              // partial result cached for later callers.
              List<MatchingBlock> blocks = new List<MatchingBlock>();

              FindAndDivideIntoMatchingBlocks(0, _left.Count, 0, _right.Count, blocks);

              // Terminating sentinel: an empty block at the end of both sequences.
              blocks.Add(new MatchingBlock(_left.Count, _right.Count, 0));

              _matchingBlocks = blocks;
          }

          return _matchingBlocks;
      }
  }
  /// <summary>
  /// Returns a list of operations giving instructions to transform the left
  /// sequence into the right sequence.
  /// </summary>
  /// <value>
  /// A list of operation codes giving the instructions. The list is computed
  /// on first access and cached until the left or right sequence is replaced.
  /// </value>
  /// <exception cref="InvalidOperationException">
  /// The left or right sequence has not been set to a non-null sequence.
  /// </exception>
  public List<OpCode> OpCodes
  {
      get
      {
          if (_opCodes == null)
          {
              // Fetch the blocks before touching the cache: MatchingBlocks
              // throws when a sequence is unset, and the cache must not be
              // left holding an empty or partial list in that case.
              List<MatchingBlock> blocks = MatchingBlocks;
              List<OpCode> codes = new List<OpCode>();

              int leftPosition = 0;
              int rightPosition = 0;

              foreach (MatchingBlock block in blocks)
              {
                  // Invariant: the codes emitted so far turn the left sequence
                  // up to leftPosition into the right sequence up to
                  // rightPosition. Emit a code for the gap (if any) between
                  // those positions and the start of the next matching block,
                  // then the block itself, then move both positions past it.
                  bool leftGap = leftPosition < block.leftStart;
                  bool rightGap = rightPosition < block.rightStart;

                  OpCodeOperation operation = OpCodeOperation.None;

                  if (leftGap && rightGap)
                  {
                      operation = OpCodeOperation.Replace;
                  }
                  else if (leftGap)
                  {
                      operation = OpCodeOperation.Delete;
                  }
                  else if (rightGap)
                  {
                      operation = OpCodeOperation.Insert;
                  }

                  if (operation != OpCodeOperation.None)
                  {
                      codes.Add(new OpCode(operation, leftPosition, block.leftStart,
                          rightPosition, block.rightStart));
                  }

                  leftPosition = block.leftStart + block.length;
                  rightPosition = block.rightStart + block.length;

                  // The final sentinel block has length zero and yields no code.
                  if (block.length > 0)
                  {
                      codes.Add(new OpCode(OpCodeOperation.Equal, block.leftStart, leftPosition,
                          block.rightStart, rightPosition));
                  }
              }

              _opCodes = codes;
          }

          return _opCodes;
      }
  }
  338. }
  339. }