PageRenderTime 51ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/compat/haystack/utils.py

https://bitbucket.org/resplin/byteflow
Python | 151 lines | 105 code | 29 blank | 17 comment | 21 complexity | a5565ac33de127ddee7e6bfb8aa223c3 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. import re
  2. from django.utils.html import strip_tags
  3. try:
  4. set
  5. except NameError:
  6. from sets import Set as set
  7. IDENTIFIER_REGEX = re.compile('^[\w\d_]+\.[\w\d_]+\.\d+$')
  8. def get_identifier(obj_or_string):
  9. """
  10. Get an unique identifier for the object or a string representing the
  11. object.
  12. If not overridden, uses <app_label>.<object_name>.<pk>.
  13. """
  14. if isinstance(obj_or_string, basestring):
  15. if not IDENTIFIER_REGEX.match(obj_or_string):
  16. raise AttributeError("Provided string '%s' is not a valid identifier." % obj_or_string)
  17. return obj_or_string
  18. return u"%s.%s.%s" % (obj_or_string._meta.app_label, obj_or_string._meta.module_name, obj_or_string._get_pk_val())
  19. class Highlighter(object):
  20. css_class = 'highlighted'
  21. html_tag = 'span'
  22. max_length = 200
  23. text_block = ''
  24. def __init__(self, query, **kwargs):
  25. self.query = query
  26. if 'max_length' in kwargs:
  27. self.max_length = int(kwargs['max_length'])
  28. if 'html_tag' in kwargs:
  29. self.html_tag = kwargs['html_tag']
  30. if 'css_class' in kwargs:
  31. self.css_class = kwargs['css_class']
  32. self.query_words = set([word.lower() for word in self.query.split() if not word.startswith('-')])
  33. def highlight(self, text_block):
  34. self.text_block = strip_tags(text_block)
  35. highlight_locations = self.find_highlightable_words()
  36. start_offset, end_offset = self.find_window(highlight_locations)
  37. return self.render_html(highlight_locations, start_offset, end_offset)
  38. def find_highlightable_words(self):
  39. # Use a set so we only do this once per unique word.
  40. word_positions = {}
  41. # Pre-compute the length.
  42. end_offset = len(self.text_block)
  43. lower_text_block = self.text_block.lower()
  44. for word in self.query_words:
  45. if not word in word_positions:
  46. word_positions[word] = []
  47. start_offset = 0
  48. while start_offset < end_offset:
  49. next_offset = lower_text_block.find(word, start_offset, end_offset)
  50. # If we get a -1 out of find, it wasn't found. Bomb out and
  51. # start the next word.
  52. if next_offset == -1:
  53. break
  54. word_positions[word].append(next_offset)
  55. start_offset = next_offset + len(word)
  56. return word_positions
  57. def find_window(self, highlight_locations):
  58. best_start = 0
  59. best_end = self.max_length
  60. # First, make sure we have words.
  61. if not len(highlight_locations):
  62. return (best_start, best_end)
  63. words_found = []
  64. # Next, make sure we found any words at all.
  65. for word, offset_list in highlight_locations.items():
  66. if len(offset_list):
  67. # Add all of the locations to the list.
  68. words_found.extend(offset_list)
  69. if not len(words_found):
  70. return (best_start, best_end)
  71. if len(words_found) == 1:
  72. return (words_found[0], words_found[0] + self.max_length)
  73. # Sort the list so it's in ascending order.
  74. words_found = sorted(words_found)
  75. # We now have a denormalized list of all positions were a word was
  76. # found. We'll iterate through and find the densest window we can by
  77. # counting the number of found offsets (-1 to fit in the window).
  78. highest_density = 0
  79. if words_found[:-1][0] > self.max_length:
  80. best_start = words_found[:-1][0]
  81. best_end = best_start + self.max_length
  82. for count, start in enumerate(words_found[:-1]):
  83. current_density = 1
  84. for end in words_found[count + 1:]:
  85. if end - start < self.max_length:
  86. current_density += 1
  87. else:
  88. current_density = 0
  89. # Only replace if we have a bigger (not equal density) so we
  90. # give deference to windows earlier in the document.
  91. if current_density > highest_density:
  92. best_start = start
  93. best_end = start + self.max_length
  94. highest_density = current_density
  95. return (best_start, best_end)
  96. def render_html(self, highlight_locations=None, start_offset=None, end_offset=None):
  97. # Start by chopping the block down to the proper window.
  98. highlighted_chunk = self.text_block[start_offset:end_offset]
  99. for word in self.query_words:
  100. word_re = re.compile("(%s)" % word, re.I)
  101. if self.css_class:
  102. highlighted_chunk = re.sub(word_re, r'<%s class="%s">\1</%s>' % (self.html_tag, self.css_class, self.html_tag), highlighted_chunk)
  103. else:
  104. highlighted_chunk = re.sub(word_re, r'<%s>\1</%s>' % (self.html_tag, self.html_tag), highlighted_chunk)
  105. if start_offset > 0:
  106. highlighted_chunk = '...%s' % highlighted_chunk
  107. if end_offset < len(self.text_block):
  108. highlighted_chunk = '%s...' % highlighted_chunk
  109. return highlighted_chunk