PageRenderTime 63ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/Bio/triefind.py

http://github.com/biopython/biopython
Python | 102 lines | 46 code | 12 blank | 44 comment | 14 complexity | 6e3eb85b2b2e42e47d0b481289f49188 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. # This code is part of the Biopython distribution and governed by its
  2. # license. Please see the LICENSE file that should have been included
  3. # as part of this package.
  4. #
  5. """
Given a trie, find all occurrences of a word in the trie in a string.

Like searching a string for a substring, except that the substring is
any word in a trie.

Functions:
match        Find the longest key in a trie matching the beginning of the string.
match_all    Find all keys in a trie matching the beginning of the string.
find         Find keys in a trie matching anywhere in a string.
find_words   Find keys in a trie matching whole words in a string.
  14. """
  15. import string
  16. import re
  17. def match(string, trie):
  18. """match(string, trie) -> longest key or None
  19. Find the longest key in the trie that matches the beginning of the
  20. string.
  21. """
  22. longest = None
  23. for i in range(len(string)):
  24. substr = string[:i + 1]
  25. if not trie.has_prefix(substr):
  26. break
  27. if substr in trie:
  28. longest = substr
  29. return longest
  30. def match_all(string, trie):
  31. """match_all(string, trie) -> list of keys
  32. Find all the keys in the trie that matches the beginning of the
  33. string.
  34. """
  35. matches = []
  36. for i in range(len(string)):
  37. substr = string[:i + 1]
  38. if not trie.has_prefix(substr):
  39. break
  40. if substr in trie:
  41. matches.append(substr)
  42. return matches
  43. def find(string, trie):
  44. """find(string, trie) -> list of tuples (key, start, end)
  45. Find all the keys in the trie that match anywhere in the string.
  46. """
  47. results = []
  48. start = 0 # index to start the search
  49. while start < len(string):
  50. # Look for a match.
  51. keys = match_all(string[start:], trie)
  52. for key in keys:
  53. results.append((key, start, start + len(key)))
  54. start += 1
  55. return results
  56. DEFAULT_BOUNDARY_CHARS = string.punctuation + string.whitespace
  57. def find_words(string, trie):
  58. """find_words(string, trie) -> list of tuples (key, start, end)
  59. Find all the keys in the trie that match full words in the string.
  60. Word boundaries are defined as any punctuation or whitespace.
  61. """
  62. _boundary_re = re.compile(r"[%s]+" % re.escape(DEFAULT_BOUNDARY_CHARS))
  63. results = []
  64. start = 0 # index of word boundary
  65. while start < len(string):
  66. # Look for a match.
  67. keys = match_all(string[start:], trie)
  68. for key in keys:
  69. l = len(key)
  70. # Make sure it ends at a boundary.
  71. if start + l == len(string) or \
  72. _boundary_re.match(string[start + l]):
  73. results.append((key, start, start + l))
  74. # Move forward to the next boundary.
  75. m = _boundary_re.search(string, start)
  76. if m is None:
  77. break
  78. start = m.end()
  79. return results