/Src/Protsyk.Sources/Data Structures/SuffixTree/SuffixTreeUkkonenCubic.cs

https://github.com/PetroProtsyk/Sources · C# · 411 lines · 336 code · 54 blank · 21 comment · 44 complexity · 408e654df76acd47b671139846071d8c MD5 · raw file

  1. using System;
  2. using System.Linq;
  3. using System.Collections.Generic;
  4. using System.Text;
  5. namespace Protsyk.DataStructures
  6. {
  7. /// <summary>
  8. /// Ukkonen O(n^3) algorithm.
  9. /// As described in the book by D. Gusfield, Algorithms on Strings, Trees and Sequences
  10. /// in the Section 6.1
  11. /// </summary>
  12. public class SuffixTreeUkkonenCubic : SuffixTree
  13. {
  14. #region Fields
  15. private readonly Node root;
  16. private readonly string text;
  17. #endregion
  18. #region Constructor
  /// <summary>
  /// Creates the suffix tree for <paramref name="inputText"/>.
  /// </summary>
  /// <param name="inputText">
  /// Text to index. The termination character is appended to it and the
  /// resulting string is what the tree is built over.
  /// </param>
  public SuffixTreeUkkonenCubic(string inputText)
  {
      var terminated = inputText + TerminationCharacter;

      text = terminated;
      root = Build(terminated);
  }
  24. #endregion
  25. #region Api
  /// <summary>
  /// Tells whether <paramref name="substring"/> has at least one match in the tree.
  /// </summary>
  /// <param name="substring">Text to look for.</param>
  /// <returns><c>true</c> when <see cref="Match"/> produces at least one position.</returns>
  public override bool IsMatch(string substring) => Match(substring).Any();
  /// <summary>
  /// Lazily enumerates the <c>pos</c> value of every leaf found below the point
  /// where <paramref name="substring"/> ends in the tree.
  /// </summary>
  /// <param name="substring">Text to look for; a null or empty value selects the whole tree.</param>
  /// <returns>Leaf positions, in the order the explicit stack visits them; empty when nothing matches.</returns>
  public override IEnumerable<int> Match(string substring)
  {
      var (found, parent, _, _, edgeIndex) = Navigate(root, 0, substring);
      if (found)
      {
          // A non-negative edge index means the match ended on (or at the end of) that edge,
          // so the subtree of interest hangs off the corresponding child.
          var subtreeRoot = edgeIndex >= 0 ? parent.children[edgeIndex] : parent;

          var pending = new Stack<Node>();
          pending.Push(subtreeRoot);
          while (pending.Count != 0)
          {
              var visited = pending.Pop();
              if (!IsInternal(visited))
              {
                  yield return visited.pos;
                  continue;
              }

              for (var i = 0; i < visited.children.Count; i++)
              {
                  pending.Push(visited.children[i]);
              }
          }
      }
  }
  62. #endregion
  63. #region Methods
  /// <summary>
  /// Walks down from <paramref name="parent"/>, consuming the characters of
  /// <paramref name="substring"/> starting at index <paramref name="from"/>.
  /// </summary>
  /// <param name="parent">Node the walk starts at.</param>
  /// <param name="from">Index in <paramref name="substring"/> of the first character to consume.</param>
  /// <param name="substring">Text to follow; null or empty succeeds immediately.</param>
  /// <returns>
  /// In order: whether the whole substring was consumed; the node whose outgoing edge was
  /// examined last; the index in <paramref name="substring"/> at which the walk stopped;
  /// the number of characters matched on that last edge; and the index of that edge in the
  /// node's children (-1 when no edge was taken).
  /// </returns>
  private ValueTuple<bool, Node, int, int, int> Navigate(Node parent, int from, string substring)
  {
      if (string.IsNullOrEmpty(substring))
      {
          // Nothing to follow: the walk trivially succeeds at the starting node.
          return (true, parent, from, 0, -1);
      }

      var current = parent;
      var consumed = from;
      for (; ; )
      {
          var edgeIndex = FindChild(substring, current, consumed);
          if (edgeIndex < 0)
          {
              // No outgoing edge begins with the next character.
              return (false, current, consumed, 0, -1);
          }

          var edge = current.children[edgeIndex];
          var matched = 0;
          while (edge.start + matched < edge.end
                 && consumed < substring.Length
                 && text[edge.start + matched] == substring[consumed])
          {
              ++matched;
              ++consumed;
          }

          if (consumed == substring.Length)
          {
              return (true, current, consumed, matched, edgeIndex);
          }

          // Characters remain. The walk can only continue when the edge label was fully
          // matched and the node below it has further edges to try.
          var edgeExhausted = edge.start + matched == edge.end;
          if (!edgeExhausted || !IsInternal(edge))
          {
              return (false, current, consumed, matched, edgeIndex);
          }

          current = edge;
      }
  }
  /// <summary>
  /// Finds the child of <paramref name="node"/> whose edge label begins with
  /// the character <c>substring[k]</c>.
  /// </summary>
  /// <param name="substring">Text being matched.</param>
  /// <param name="node">Node whose children are scanned in order.</param>
  /// <param name="k">Index in <paramref name="substring"/> of the character to look for.</param>
  /// <returns>Index of the first matching child, or -1 when there is none.</returns>
  private int FindChild(string substring, Node node, int k)
  {
      var index = 0;
      foreach (var candidate in node.children)
      {
          if (substring[k] == text[candidate.start])
          {
              return index;
          }

          index++;
      }

      return -1;
  }
  /// <summary>
  /// A node counts as internal when it has at least one child.
  /// </summary>
  private static bool IsInternal(Node node) => node.children.Count != 0;
  123. #endregion
  124. #region Construction
  /// <summary>
  /// Builds the tree for <paramref name="text"/> with a fresh <see cref="UkkonenBuilder"/>.
  /// </summary>
  /// <returns>Root of the constructed tree.</returns>
  private Node Build(string text) => new UkkonenBuilder(text).Build();
  /// <summary>
  /// Builds the tree phase by phase: for every index i of the text, each suffix
  /// text[j..i) already in the tree (j &lt; i) is extended by the character text[i].
  /// </summary>
  private class UkkonenBuilder
  {
      private readonly string text;
      private readonly Node root;

      /// <param name="text">Text the tree is built over.</param>
      public UkkonenBuilder(string text)
      {
          this.text = text;
          this.root = new Node();
      }

      /// <summary>Runs every phase and returns the root of the resulting tree.</summary>
      /// <exception cref="InvalidOperationException">
      /// An extension could not follow a path that earlier phases should have created.
      /// </exception>
      public Node Build()
      {
          for (int i = 0; i < text.Length; ++i)
          {
              // Phase i+1
              for (int j = 0; j < i; ++j)
              {
                  // Extension j
                  Extend(j, i);
              }

              // The last character of the text is never inserted as a suffix of its own.
              if (i < text.Length - 1)
              {
                  // Extend the empty suffix by putting the next character into the tree.
                  ConstructT(i);
              }
          }

          return root;
      }

      /// <summary>
      /// Adds a one-character leaf for text[t] under the root unless the root
      /// already has an edge starting with that character.
      /// </summary>
      private void ConstructT(int t)
      {
          var childIndex = FindChild(root.children, text[t]);
          if (childIndex >= 0)
          {
              return;
          }

          var newNode = new Node
          {
              start = t,
              end = t + 1,
              pos = t
          };
          root.children.Add(newNode);
      }

      /// <summary>
      /// Returns the index of the first node in <paramref name="children"/> whose edge
      /// label starts with <paramref name="c"/>, or -1 when there is none.
      /// </summary>
      private int FindChild(IList<Node> children, char c)
      {
          for (int i = 0; i < children.Count; ++i)
          {
              if (text[children[i].start] == c)
              {
                  return i;
              }
          }

          return -1;
      }

      /// <summary>
      /// Follows the path spelling text[from..to) from the root and then makes sure the
      /// path continues with text[to], applying rule 1, 2 or 3 as appropriate.
      /// </summary>
      /// <param name="from">Start index of the suffix being extended.</param>
      /// <param name="to">Index of the character being appended; must be greater than <paramref name="from"/> for any work to happen.</param>
      /// <exception cref="InvalidOperationException">
      /// The path text[from..to) is not present in the tree.
      /// </exception>
      private void Extend(int from, int to)
      {
          // Navigate to the end of the existing path
          var node = root;
          var k = from;
          while (k < to)
          {
              var childIndex = FindChild(node.children, text[k]);
              if (childIndex < 0)
              {
                  throw Inconsistent($"no edge starts with the character at index {k}", from, to);
              }

              var child = node.children[childIndex];
              var m = 0;
              while (child.start + m < child.end &&
                     k < to &&
                     text[child.start + m] == text[k])
              {
                  m++;
                  k++;
              }

              if (k == to)
              {
                  if (child.start + m == child.end)
                  {
                      // The path ends exactly at a node.
                      if (IsInternal(child))
                      {
                          if (FindChild(child.children, text[k]) >= 0)
                          {
                              ApplyRule3();
                          }
                          else
                          {
                              // No continuation with text[to] yet: hang a new leaf off the node.
                              child.children.Add(new Node
                              {
                                  start = k,
                                  end = to + 1,
                                  pos = from
                              });
                          }
                      }
                      else
                      {
                          ApplyRule1(child, to + 1);
                      }
                  }
                  else
                  {
                      // The path ends in the middle of an edge label.
                      if (text[child.start + m] == text[k])
                      {
                          ApplyRule3();
                      }
                      else
                      {
                          ApplyRule2(node, childIndex, m, k, from, to + 1);
                      }
                  }
              }
              else if (child.start + m == child.end)
              {
                  if (IsInternal(child))
                  {
                      node = child;
                  }
                  else
                  {
                      throw Inconsistent($"the path reaches a leaf before index {k} is matched", from, to);
                  }
              }
              else
              {
                  throw Inconsistent($"the edge label diverges from the text at index {k}", from, to);
              }
          }
      }

      /// <summary>
      /// Creates the exception thrown when the path an extension must follow is missing.
      /// </summary>
      private static InvalidOperationException Inconsistent(string reason, int from, int to)
      {
          return new InvalidOperationException(
              $"Suffix tree is inconsistent while extending suffix {from} with the character at index {to}: {reason}.");
      }

      private void ApplyRule1(Node leaf, int newEnd)
      {
          // Rule 1. Path ends at a leaf. Extend label
          leaf.end = newEnd;
      }

      /// <summary>
      /// Rule 2. Splits the edge to child <paramref name="childIndex"/> of
      /// <paramref name="parent"/> after <paramref name="m"/> characters and adds a new
      /// leaf labelled text[k..to) with position <paramref name="pos"/>.
      /// </summary>
      private void ApplyRule2(Node parent, int childIndex, int m, int k, int pos, int to)
      {
          var child = parent.children[childIndex];

          // 1) replace child with internal node
          var newParent = new Node
          {
              start = child.start,
              end = child.start + m,
          };
          parent.children[childIndex] = newParent;

          // 2) adjust start position of the child and add it to the new internal node as a child
          child.start += m;
          newParent.children.Add(child);

          // 3) add the rest of the suffix as a new child
          newParent.children.Add(new Node
          {
              start = k,
              end = to,
              pos = pos
          });
      }

      private void ApplyRule3()
      {
          // Rule 3. Suffix is already in the tree
          // Do nothing
      }
  }
  292. #endregion
  293. #region Visualization
  /// <summary>
  /// Renders the tree as a Graphviz DOT digraph. Leaves are labelled with their
  /// <c>pos</c>; internal nodes have an empty label; each edge is labelled with the
  /// slice of the text it covers.
  /// </summary>
  /// <returns>The DOT source text.</returns>
  public override string ToDotNotation()
  {
      var dotText = new StringBuilder();
      dotText.AppendLine("digraph g {");
      dotText.AppendLine("node[shape = circle];");

      var labels = new Dictionary<Node, int>();

      // Nodes
      foreach (var node in Visit())
      {
          int index = GetLabelIndex(labels, node);
          if (!IsInternal(node))
          {
              dotText.AppendLine($"node{index} [label=\"{node.pos}\"]");
              continue;
          }

          dotText.AppendLine($"node{index} [label=\"\"]");

          // Edges are emitted ordered by the first character of their label.
          foreach (var child in node.children.OrderBy(c => text[c.start]))
          {
              int childIndex = GetLabelIndex(labels, child);

              // The label comes straight from the indexed text, so it must be escaped
              // before being placed inside a DOT quoted string.
              var edgeLabel = EscapeDotLabel(text.Substring(child.start, child.end - child.start));
              dotText.AppendLine($"node{index} -> node{childIndex} [label=\"{edgeLabel}\"]");
          }
      }

      dotText.AppendLine("}");
      return dotText.ToString();
  }

  /// <summary>
  /// Escapes backslashes and double quotes so that <paramref name="label"/> can be
  /// embedded in a double-quoted DOT label without ending the string early or being
  /// read as an escape sequence.
  /// </summary>
  private static string EscapeDotLabel(string label)
  {
      // Backslashes first, otherwise the ones introduced for quotes would be doubled.
      return label.Replace("\\", "\\\\").Replace("\"", "\\\"");
  }
  /// <summary>
  /// Returns the number assigned to <paramref name="node"/> in <paramref name="labels"/>,
  /// assigning the next free number (current count plus one) on first sight.
  /// </summary>
  private static int GetLabelIndex(Dictionary<Node, int> labels, Node node)
  {
      if (labels.TryGetValue(node, out var known))
      {
          return known;
      }

      var assigned = labels.Count + 1;
      labels.Add(node, assigned);
      return assigned;
  }
  /// <summary>
  /// Lazily enumerates every node reachable from the root using an explicit stack.
  /// A node's children are pushed before the node itself is yielded.
  /// </summary>
  private IEnumerable<Node> Visit()
  {
      var pending = new Stack<Node>();
      pending.Push(root);

      while (pending.Count != 0)
      {
          var node = pending.Pop();
          for (var i = 0; i < node.children.Count; i++)
          {
              pending.Push(node.children[i]);
          }

          yield return node;
      }
  }
  344. #endregion
  345. #region Types
  /// <summary>
  /// Tree node. The edge leading into the node is labelled with the characters of the
  /// text from <see cref="start"/> up to, but not including, <see cref="end"/>.
  /// </summary>
  private class Node
  {
      /// <summary>Index in the text of the first character of the incoming edge label.</summary>
      public int start;

      /// <summary>Index in the text just past the last character of the incoming edge label.</summary>
      public int end;

      /// <summary>Leaf only: start index of the suffix the leaf was created for.</summary>
      public int pos;

      /// <summary>Internal only: child nodes; a node with no children is treated as a leaf.</summary>
      public readonly IList<Node> children = new List<Node>();
  }
  355. #endregion
  356. }
  357. }