/haystack/utils/__init__.py

https://github.com/oyiptong/django-haystack · Python · 190 lines · 110 code · 52 blank · 28 comment · 33 complexity · 149b2ee868eb08c2f74a8269e36be2e4 MD5 · raw file

  1. import re
  2. from django.utils.html import strip_tags
  3. try:
  4. set
  5. except NameError:
  6. from sets import Set as set
  7. IDENTIFIER_REGEX = re.compile('^[\w\d_]+\.[\w\d_]+\.\d+$')
  8. def get_identifier(obj_or_string):
  9. """
  10. Get an unique identifier for the object or a string representing the
  11. object.
  12. If not overridden, uses <app_label>.<object_name>.<pk>.
  13. """
  14. if isinstance(obj_or_string, basestring):
  15. if not IDENTIFIER_REGEX.match(obj_or_string):
  16. raise AttributeError("Provided string '%s' is not a valid identifier." % obj_or_string)
  17. return obj_or_string
  18. return u"%s.%s.%s" % (obj_or_string._meta.app_label, obj_or_string._meta.module_name, obj_or_string._get_pk_val())
  19. def get_facet_field_name(fieldname):
  20. if fieldname in ['id', 'django_id', 'django_ct']:
  21. return fieldname
  22. return "%s_exact" % fieldname
  23. class Highlighter(object):
  24. css_class = 'highlighted'
  25. html_tag = 'span'
  26. max_length = 200
  27. text_block = ''
  28. def __init__(self, query, **kwargs):
  29. self.query = query
  30. if 'max_length' in kwargs:
  31. self.max_length = int(kwargs['max_length'])
  32. if 'html_tag' in kwargs:
  33. self.html_tag = kwargs['html_tag']
  34. if 'css_class' in kwargs:
  35. self.css_class = kwargs['css_class']
  36. self.query_words = set([word.lower() for word in self.query.split() if not word.startswith('-')])
  37. def highlight(self, text_block):
  38. self.text_block = strip_tags(text_block)
  39. highlight_locations = self.find_highlightable_words()
  40. start_offset, end_offset = self.find_window(highlight_locations)
  41. return self.render_html(highlight_locations, start_offset, end_offset)
  42. def find_highlightable_words(self):
  43. # Use a set so we only do this once per unique word.
  44. word_positions = {}
  45. # Pre-compute the length.
  46. end_offset = len(self.text_block)
  47. lower_text_block = self.text_block.lower()
  48. for word in self.query_words:
  49. if not word in word_positions:
  50. word_positions[word] = []
  51. start_offset = 0
  52. while start_offset < end_offset:
  53. next_offset = lower_text_block.find(word, start_offset, end_offset)
  54. # If we get a -1 out of find, it wasn't found. Bomb out and
  55. # start the next word.
  56. if next_offset == -1:
  57. break
  58. word_positions[word].append(next_offset)
  59. start_offset = next_offset + len(word)
  60. return word_positions
  61. def find_window(self, highlight_locations):
  62. best_start = 0
  63. best_end = self.max_length
  64. # First, make sure we have words.
  65. if not len(highlight_locations):
  66. return (best_start, best_end)
  67. words_found = []
  68. # Next, make sure we found any words at all.
  69. for word, offset_list in highlight_locations.items():
  70. if len(offset_list):
  71. # Add all of the locations to the list.
  72. words_found.extend(offset_list)
  73. if not len(words_found):
  74. return (best_start, best_end)
  75. if len(words_found) == 1:
  76. return (words_found[0], words_found[0] + self.max_length)
  77. # Sort the list so it's in ascending order.
  78. words_found = sorted(words_found)
  79. # We now have a denormalized list of all positions were a word was
  80. # found. We'll iterate through and find the densest window we can by
  81. # counting the number of found offsets (-1 to fit in the window).
  82. highest_density = 0
  83. if words_found[:-1][0] > self.max_length:
  84. best_start = words_found[:-1][0]
  85. best_end = best_start + self.max_length
  86. for count, start in enumerate(words_found[:-1]):
  87. current_density = 1
  88. for end in words_found[count + 1:]:
  89. if end - start < self.max_length:
  90. current_density += 1
  91. else:
  92. current_density = 0
  93. # Only replace if we have a bigger (not equal density) so we
  94. # give deference to windows earlier in the document.
  95. if current_density > highest_density:
  96. best_start = start
  97. best_end = start + self.max_length
  98. highest_density = current_density
  99. return (best_start, best_end)
  100. def render_html(self, highlight_locations=None, start_offset=None, end_offset=None):
  101. # Start by chopping the block down to the proper window.
  102. text = self.text_block[start_offset:end_offset]
  103. # Invert highlight_locations to a location -> term list
  104. term_list = []
  105. for term, locations in highlight_locations.items():
  106. term_list += [(loc - start_offset, term) for loc in locations]
  107. loc_to_term = sorted(term_list)
  108. # Prepare the highlight template
  109. if self.css_class:
  110. hl_start = '<%s class="%s">' % (self.html_tag, self.css_class)
  111. else:
  112. hl_start = '<%s>' % (self.html_tag)
  113. hl_end = '</%s>' % self.html_tag
  114. highlight_length = len(hl_start + hl_end)
  115. # Copy the part from the start of the string to the first match,
  116. # and there replace the match with a highlighted version.
  117. highlighted_chunk = ""
  118. matched_so_far = 0
  119. prev = 0
  120. prev_str = ""
  121. for cur, cur_str in loc_to_term:
  122. # This can be in a different case than cur_str
  123. actual_term = text[cur:cur + len(cur_str)]
  124. # Handle incorrect highlight_locations by first checking for the term
  125. if actual_term.lower() == cur_str:
  126. highlighted_chunk += text[prev + len(prev_str):cur] + hl_start + actual_term + hl_end
  127. prev = cur
  128. prev_str = cur_str
  129. # Keep track of how far we've copied so far, for the last step
  130. matched_so_far = cur + len(actual_term)
  131. # Don't forget the chunk after the last term
  132. highlighted_chunk += text[matched_so_far:]
  133. if start_offset > 0:
  134. highlighted_chunk = '...%s' % highlighted_chunk
  135. if end_offset < len(self.text_block):
  136. highlighted_chunk = '%s...' % highlighted_chunk
  137. return highlighted_chunk