PageRenderTime 25ms CodeModel.GetById 14ms RepoModel.GetById 1ms app.codeStats 0ms

/helpers/hadoop/wikipedia/lighttag/lighttag.py

https://github.com/champ1/twittomatic
Python | 142 lines | 95 code | 39 blank | 8 comment | 30 complexity | 376719219f9107507962e32765915360 MD5 | raw file
  1. #!/usr/bin/env python
  2. # encoding=utf8
  3. import os
  4. import sys
  5. from utils import profiled
  6. from itertools import izip_longest
  7. from disambiguate import Disambiguator
  8. import plyvel
  9. import marisa_trie
  10. def extract_words(text):
  11. words = []
  12. startpos = 0
  13. text = text.lower()
  14. for pos, ch in enumerate(text.lower()):
  15. if ch == ' ' or ch == '\t' or ch == '\n' or ch == '\r':
  16. if pos != startpos:
  17. words.append((text[startpos:pos], (startpos, pos)))
  18. startpos = pos + 1
  19. if pos != startpos:
  20. words.append((text[startpos:pos + 1], (startpos, pos + 1)))
  21. print words
  22. return words
  23. class LightTag(object):
  24. def __init__(self, filetrie="anchors.marisa", filestop="stop.txt"):
  25. self.trie = marisa_trie.Trie()
  26. self.disambig = Disambiguator()
  27. self.stopwords = set()
  28. with open(filestop, 'r') as stopfile:
  29. self.stopwords = set(filter(lambda x: x and x[0] != '#', map(lambda x: x.rstrip(), stopfile.readlines())))
  30. with open(filetrie, 'r') as inputfile:
  31. self.trie.read(inputfile)
  32. print "Loaded %d anchors" % len(self.trie)
  33. def annotate(self, text):
  34. with profiled("Disambiguated in %s"):
  35. matches = self.match(text)
  36. spots = []
  37. indices = {}
  38. for (start, stop), anchors in matches.items():
  39. if anchors[0] in self.stopwords:
  40. print "Ignoring stopword", anchors[0], anchors
  41. continue
  42. spots.append(anchors[0])
  43. indices[anchors[0]] = (start, stop)
  44. ret = {}
  45. results = self.disambig.disambiguate(spots)
  46. for spot in results:
  47. ret[spot] = results[spot]
  48. ret[spot]['indices'] = indices[spot]
  49. start, stop = indices[spot]
  50. ret[spot]['spot'] = text[start:stop]
  51. return [v for(k, v) in ret.items()]
  52. def match(self, text, context=5, threshold=0.8):
  53. anchors = {}
  54. with profiled("Annotated in %s"):
  55. start = 0
  56. text_words = extract_words(text)
  57. while start < len(text_words):
  58. stop = min(start + context, len(text_words))
  59. while stop >= start:
  60. words = map(lambda x: x[0], text_words[start:stop])
  61. target = u' '.join(words)
  62. # Only consider strings which are at least 3 characters long
  63. if len(target) >= 3:
  64. begin = text_words[start][1][0]
  65. end = text_words[stop - 1][1][1]
  66. #assert target == text[begin:end].lower()
  67. wiki_titles = self.trie.keys(target)
  68. wiki_titles = [title for title in self.filter_wiki_titles(words, wiki_titles, threshold=threshold)]
  69. if wiki_titles:
  70. anchors[(begin, end)] = wiki_titles
  71. stop = 0
  72. stop -= 1
  73. start += 1
  74. return anchors
  75. def filter_wiki_titles(self, words, wiki_titles, threshold=0.8):
  76. """
  77. Filter a list of wiki titles based on similarity
  78. """
  79. #print "Filtering", words, wiki_titles
  80. for title in wiki_titles:
  81. similarities = 0
  82. for counter, (word1, word2) in enumerate(izip_longest(words, title.split())):
  83. if not word1 or not word2:
  84. continue
  85. if word1 == word2:
  86. similarities += 1
  87. sim = similarities / float(counter + 1)
  88. if sim > threshold:
  89. yield title
  90. if sim == 1.0:
  91. raise StopIteration
  92. if __name__ == "__main__":
  93. from optparse import OptionParser
  94. parser = OptionParser("%s [options]" % sys.argv[0])
  95. parser.add_option("-t", "--trie", default="anchors.marisa",
  96. help="Marisa trie file containing anchors information")
  97. (options, args) = parser.parse_args()
  98. app = LightTag(options.trie)
  99. app.annotate(u"Uomini e donne è un programma televisivo di merda")
  100. app.annotate(u"Il 10 novembre 1938 Fermi venne insignito del premio nobel per i suoi studi nel settore della fisica nucleare")
  101. app.annotate(u"Il settore del mio disco fisso e' andato. Ho perso tutti i miei fottuti dati. Dovevo fare un backup cazzo!")
  102. app.annotate(u"Certo che il berlusconi sembra proprio un nano da giardino. Dopo il governo monti penso che andremo giu di bound")
  103. app.annotate(u"Non vedo l'ora di vedere l'ultimo film di quentin tarantino")