/tagger.py

https://github.com/BankFacil/tagger
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (C) 2011 by Alessandro Presta

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE
'''
======
tagger
======

Module for extracting tags from text documents.

Copyright (C) 2011 by Alessandro Presta

Configuration
=============

Dependencies:
python2.7, stemming, nltk (optional), lxml (optional), tkinter (optional)

You can install the stemming package with::

    $ easy_install stemming

Usage
=====

Tagging a text document from Python::

    import pickle
    import tagger

    weights = pickle.load(open('data/dict.pkl', 'rb')) # or your own dictionary

    myreader = tagger.Reader() # or your own reader class
    mystemmer = tagger.Stemmer() # or your own stemmer class
    myrater = tagger.Rater(weights) # or your own... (you get the idea)

    mytagger = tagger.Tagger(myreader, mystemmer, myrater)

    best_3_tags = mytagger(text_string, 3)

Running the module as a script::

    $ ./tagger.py <text document(s) to tag>

Example::

    $ ./tagger.py tests/*
    Loading dictionary...
    Tags for tests/bbc1.txt :
    ['bin laden', 'obama', 'pakistan', 'killed', 'raid']
    Tags for tests/bbc2.txt :
    ['jo yeates', 'bristol', 'vincent tabak', 'murder', 'strangled']
    Tags for tests/bbc3.txt :
    ['snp', 'party', 'election', 'scottish', 'labour']
    Tags for tests/guardian1.txt :
    ['bin laden', 'al-qaida', 'killed', 'pakistan', 'al-fawwaz']
    Tags for tests/guardian2.txt :
    ['clegg', 'tory', 'lib dem', 'party', 'coalition']
    Tags for tests/post1.txt :
    ['sony', 'stolen', 'playstation network', 'hacker attack', 'lawsuit']
    Tags for tests/wikipedia1.txt :
    ['universe', 'anthropic principle', 'observed', 'cosmological', 'theory']
    Tags for tests/wikipedia2.txt :
    ['beetroot', 'beet', 'betaine', 'blood pressure', 'dietary nitrate']
    Tags for tests/wikipedia3.txt :
    ['the lounge lizards', 'jazz', 'john lurie', 'musical', 'albums']
'''
from __future__ import division

import collections
import re
import nltk
import unicodedata

class Tag:
    '''
    General class for tags (small units of text)
    '''

    def __init__(self, string, stem=None, rating=1.0, proper=False,
                 terminal=False):
        '''
        @param string:   the actual representation of the tag
        @param stem:     the internal (usually stemmed) representation;
                         tags with the same stem are regarded as equal
        @param rating:   a measure of the tag's relevance in the interval [0,1]
        @param proper:   whether the tag is a proper noun
        @param terminal: set to True if the tag is at the end of a phrase
                         (or anyway it cannot be logically merged with the
                         following one)

        @returns: a new L{Tag} object
        '''

        self.string = string
        self.stem = stem or string
        self.rating = rating
        self.proper = proper
        self.terminal = terminal

    def __eq__(self, other):
        return self.stem == other.stem

    def __repr__(self):
        return repr(self.string)

    def __lt__(self, other):
        return self.rating > other.rating

    def __hash__(self):
        return hash(self.stem)

class MultiTag(Tag):
    '''
    Class for aggregates of tags (usually next to each other in the document)
    '''

    def __init__(self, tail, head=None):
        '''
        @param tail: the L{Tag} object to add to the first part (head)
        @param head: the (possibly absent) L{MultiTag} to be extended

        @returns: a new L{MultiTag} object
        '''

        if not head:
            Tag.__init__(self, tail.string, tail.stem, tail.rating,
                         tail.proper, tail.terminal)
            self.size = 1
            self.subratings = [self.rating]
        else:
            self.string = ' '.join([head.string, tail.string])
            self.stem = ' '.join([head.stem, tail.stem])
            self.size = head.size + 1
            self.proper = (head.proper and tail.proper)
            self.terminal = tail.terminal
            self.subratings = head.subratings + [tail.rating]
            self.rating = self.combined_rating()

    def combined_rating(self):
        '''
        Method that computes the multitag's rating from the ratings of unit
        subtags

        (the default implementation uses the geometric mean - with a special
        treatment for proper nouns - but this method can be overridden)

        @returns: the rating of the multitag
        '''

        # by default, the rating of a multitag is the geometric mean of its
        # unit subtags' ratings
        product = reduce(lambda x, y: x * y, self.subratings, 1.0)
        root = self.size

        # but proper nouns shouldn't be penalized by stopwords
        if product == 0.0 and self.proper:
            nonzero = [r for r in self.subratings if r > 0.0]
            if len(nonzero) == 0:
                return 0.0
            product = reduce(lambda x, y: x * y, nonzero, 1.0)
            root = len(nonzero)

        return product ** (1.0 / root)
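
    # A quick illustration (added for clarity, not in the original source):
    # for subratings [0.5, 0.2] the combined rating is the geometric mean
    # (0.5 * 0.2) ** (1.0 / 2) ~= 0.32; if the multitag is a proper noun and
    # one subrating is 0.0 (a stopword), the mean is taken over the nonzero
    # subratings only, so the stopword does not zero out the whole phrase.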

class Reader:
    '''
    Class for parsing a string of text to obtain tags

    (it just turns the string to lowercase and splits it according to
    whitespaces and punctuation, identifying proper nouns and terminal words;
    different rules and formats other than plain text could be used)
    '''

    match_apostrophes = re.compile(r'`|’', re.UNICODE)
    match_paragraphs = re.compile(r'[\.\?!\t\n\r\f\v]+', re.UNICODE)
    match_phrases = re.compile(r'[,;:\(\)\[\]\{\}<>]+', re.UNICODE)
    match_words = re.compile(r'[\w\-\'_/&]+', re.UNICODE)

    def __call__(self, text):
        '''
        @param text: the string of text to be tagged

        @returns: a list of tags respecting the order in the text
        '''

        text = self.preprocess(text)

        # split by full stops, newlines, question marks...
        paragraphs = self.match_paragraphs.split(text)

        tags = []

        for par in paragraphs:
            # split by commas, colons, parentheses...
            phrases = self.match_phrases.split(par)

            if len(phrases) > 0:
                # first phrase of a paragraph
                words = self.match_words.findall(phrases[0])
                if len(words) > 1:
                    tags.append(Tag(words[0].lower()))
                    for w in words[1:-1]:
                        tags.append(Tag(w.lower(), proper=w[0].isupper()))
                    tags.append(Tag(words[-1].lower(),
                                    proper=words[-1][0].isupper(),
                                    terminal=True))
                elif len(words) == 1:
                    tags.append(Tag(words[0].lower(), terminal=True))

            # following phrases
            for phr in phrases[1:]:
                words = self.match_words.findall(phr)
                if len(words) > 1:
                    for w in words[:-1]:
                        tags.append(Tag(w.lower(), proper=w[0].isupper()))
                if len(words) > 0:
                    tags.append(Tag(words[-1].lower(),
                                    proper=words[-1][0].isupper(),
                                    terminal=True))

        return tags
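
    # A sketch of the behaviour (added for clarity, not in the original
    # source); the exact Tag flags depend on capitalization and punctuation:
    #
    #   >>> Reader()('Hello there, World!')
    #   ['hello', 'there', 'world']
    #
    # 'there' closes the first phrase, so it is marked terminal; 'World' is
    # flagged as a proper-noun candidate because of its capital initial,
    # while the paragraph-initial 'Hello' is not.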

    def preprocess(self, text):
        '''
        @param text: a string containing the text document to perform any
                     required transformation before splitting

        @returns: the processed text
        '''

        text = self.match_apostrophes.sub('\'', text)

        return text

class Stemmer:
    '''
    Class for extracting the stem of a word

    (by default it uses a simple open-source implementation of Porter's
    algorithm; this can be improved a lot, so experimenting with different
    ones is advisable; nltk.stem provides different algorithms for many
    languages)
    '''

    match_contractions = re.compile(r'(\w+)\'(m|re|d|ve|s|ll|t)?', re.UNICODE)
    match_hyphens = re.compile(r'\b[\-_]\b', re.UNICODE)

    def __init__(self, stemmer=None):
        '''
        @param stemmer: an object or module with a 'stem' method (defaults to
                        stemming.porter2)

        @returns: a new L{Stemmer} object
        '''

        if not stemmer:
            from stemming import porter2
            stemmer = porter2
        self.stemmer = stemmer

    def __call__(self, tag):
        '''
        @param tag: the tag to be stemmed

        @returns: the stemmed tag
        '''

        string = self.preprocess(tag.string)
        tag.stem = self.stemmer.stem(string)

        return tag

    def preprocess(self, string):
        '''
        @param string: a string to be treated before passing it to the stemmer

        @returns: the processed string
        '''

        # delete hyphens and underscores
        string = self.match_hyphens.sub('', string)

        # get rid of contractions and possessive forms
        match = self.match_contractions.match(string)
        if match:
            string = match.group(1)

        return string
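
    # Hypothetical inputs illustrating the preprocessing (added for clarity,
    # not in the original source):
    #
    #   >>> Stemmer().preprocess("editor's")
    #   'editor'
    #   >>> Stemmer().preprocess('e-mail')
    #   'email'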

class Rater:
    '''
    Class for estimating the relevance of tags

    (the default implementation uses TF (term frequency) multiplied by
    weight, but any other reasonable measure is fine; a quite rudimentary
    heuristic tries to discard redundant tags)
    '''

    numbers = re.compile(r'\d+$|[\d\s\/\-]+$|\d+\s*.*', re.UNICODE)
    hyphens = re.compile(r'-\s.*|.*\s\-|-', re.UNICODE)

    def __init__(self, weights, blacklist=(), synonyms=(), multitag_size=3):
        '''
        @param weights:       a dictionary of weights normalized in the
                              interval [0,1]
        @param blacklist:     a collection of stems to be discarded from the
                              results
        @param synonyms:      a collection of synonym stems (stored but not
                              used by this class; synonym handling happens in
                              the script section below)
        @param multitag_size: maximum size of tags formed by multiple unit
                              tags

        @returns: a new L{Rater} object
        '''

        self.weights = weights
        self.blacklist = blacklist
        self.synonyms = synonyms
        self.multitag_size = multitag_size

    def __call__(self, tags):
        '''
        @param tags: a list of (preferably stemmed) tags

        @returns: a list of unique (multi)tags sorted by relevance
        '''

        self.rate_tags(tags)
        multitags = self.create_multitags(tags)

        # keep most frequent version of each tag
        clusters = collections.defaultdict(collections.Counter)
        proper = collections.defaultdict(int)
        ratings = collections.defaultdict(float)

        for t in multitags:
            clusters[t][t.string] += 1
            if t.proper:
                proper[t] += 1
            ratings[t] = max(ratings[t], t.rating)

        term_count = collections.Counter(multitags)

        for t, cnt in term_count.iteritems():
            t.string = clusters[t].most_common(1)[0][0]
            proper_freq = proper[t] / cnt
            if proper_freq >= 0.5:
                t.proper = True
            t.rating = ratings[t]

        # purge duplicates, one-character tags and stopwords
        unique_tags = set(t for t in term_count
                          if self.verify_valid_tag(t))

        # purge blacklisted stems
        for t in term_count:
            if t.stem in self.blacklist:
                unique_tags.discard(t)

        # remove redundant tags
        for t, cnt in term_count.iteritems():
            words = t.stem.split()
            for l in xrange(1, len(words)):
                for i in xrange(len(words) - l + 1):
                    s = Tag(' '.join(words[i:i + l]))
                    relative_freq = cnt / term_count[s]
                    if ((relative_freq == 1.0 and t.proper) or
                        (relative_freq >= 0.5 and t.rating > 0.0)):
                        unique_tags.discard(s)
                    else:
                        unique_tags.discard(t)

        return sorted(unique_tags)
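
    # An illustration of the pruning heuristic (added for clarity, not in
    # the original source): if the proper noun 'bin laden' appears exactly
    # as often as 'bin' and as 'laden' on their own, the unit tags are
    # discarded as redundant; conversely, a multitag much rarer than its
    # components is discarded in favour of the components themselves.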

    def verify_valid_tag(self, t):
        '''
        @param t: the tag to check

        @returns: True if the tag is longer than one character, has a
                  positive rating and is neither a bare number nor a
                  dangling hyphen
        '''

        step1 = len(t.string) > 1 and t.rating > 0.0
        step2 = (not self.numbers.match(t.string) and
                 not self.hyphens.match(t.string))
        return step1 and step2

    def rate_tags_by_count(self, tags):
        '''
        @param tags: a list of tags to be assigned a rating

        (count-based variant added for the BKF use case: the rating is the
        raw term count, except that stems with a non-positive weight are
        rated 0)
        '''

        term_count = collections.Counter(tags)

        for t in tags:
            if self.weights.get(t.stem, 1.0) <= 0.0:
                t.rating = 0
            else:
                t.rating = term_count[t] #/ len(tags) * self.weights.get(t.stem, 1.0)

    def rate_tags(self, tags):
        '''
        @param tags: a list of tags to be assigned a rating
        '''

        term_count = collections.Counter(tags)

        for t in tags:
            # rating of a single tag is term frequency * weight
            t.rating = term_count[t] / len(tags) * self.weights.get(t.stem, 1.0)
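
    # Worked example (added for clarity, not in the original source): in a
    # document of 100 tags, a stem occurring 4 times with dictionary weight
    # 0.5 is rated 4 / 100 * 0.5 = 0.02; stems missing from the weights
    # dictionary default to weight 1.0.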

    def create_multitags(self, tags):
        '''
        @param tags: a list of tags (respecting the order in the text)

        @returns: a list of multitags
        '''

        multitags = []

        for i in xrange(len(tags)):
            t = MultiTag(tags[i])
            multitags.append(t)
            for j in xrange(1, self.multitag_size):
                if t.terminal or i + j >= len(tags):
                    break
                else:
                    t = MultiTag(tags[i + j], t)
                    multitags.append(t)

        return multitags
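
    # An illustration (added for clarity, not in the original source): with
    # multitag_size=3 and the tag sequence [a, b, c] containing no terminal
    # tags before c, the candidates are [a, 'a b', 'a b c', b, 'b c', c];
    # a terminal tag stops the window, so phrase boundaries are not crossed.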

class MyPtStemmer(Stemmer):
    '''
    Stemmer for Portuguese text, based on NLTK's RSLP stemmer
    (it skips the English-specific contraction/hyphen preprocessing)
    '''

    def __init__(self):
        Stemmer.__init__(self, nltk.stem.RSLPStemmer())

    def preprocess(self, string):
        return string

class Tagger:
    '''
    Master class for tagging text documents

    (this is a simple interface that should allow convenient experimentation
    by using different classes as building blocks)
    '''

    def __init__(self, reader, stemmer, rater):
        '''
        @param reader: a L{Reader} object
        @param stemmer: a L{Stemmer} object
        @param rater: a L{Rater} object

        @returns: a new L{Tagger} object
        '''

        self.reader = reader
        self.stemmer = stemmer
        self.rater = rater

    def __call__(self, text, tags_number=5):
        '''
        @param text: the string of text to be tagged
        @param tags_number: number of best tags to be returned

        @returns: a list of (hopefully) relevant tags
        '''

        tags = self.reader(text)
        tags = map(self.stemmer, tags)
        tags = self.rater(tags)

        return tags[:tags_number]
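
    # Minimal end-to-end sketch (added for clarity, not in the original
    # source; it assumes a pickled weights dictionary such as data/dict.pkl):
    #
    #   >>> import pickle
    #   >>> weights = pickle.load(open('data/dict.pkl', 'rb'))
    #   >>> mytagger = Tagger(Reader(), Stemmer(), Rater(weights))
    #   >>> mytagger(some_text_string, 5)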

if __name__ == '__main__':

    import glob
    import pickle
    import sys
    import getopt
    import codecs

    from unidecode import unidecode

    dict_file = 'dict'
    n_tags = 5
    # document filenames are expected to end in _<numeric id>
    id_regexp = re.compile(r'.*_(\d*)', re.UNICODE)
    all_tags = {}
    all_ids = []
    stem_synonyms = {}

    if len(sys.argv) < 2:
        print 'No arguments given, running tests: '
        documents = glob.glob('tests/*')
    else:
        options = getopt.getopt(sys.argv[1:], 'd:n:')
        if len(options[0]) > 0:
            dict_file = options[0][0][1]
        if len(options[0]) > 1:
            n_tags = int(options[0][1][1])
        documents = options[1]

    from extras import SimpleReader

    stemmer = MyPtStemmer()
    reader = SimpleReader()

    with codecs.open('data/blacklist.txt', 'r', 'utf-8') as bl:
        blacklist = reader(bl.read())
        blacklist = [w.stem for w in map(stemmer, blacklist)]

    with codecs.open('data/synonyms.txt', 'r', 'utf-8') as sn:
        synonyms = reader(sn.read())
        synonyms = [w.stem for w in map(stemmer, synonyms)]

    print 'Loading dictionary... '
    weights = pickle.load(open('data/%s.pkl' % dict_file, 'rb'))

    tagger = Tagger(Reader(), stemmer, Rater(weights, blacklist, synonyms))

    for doc in documents:
        with codecs.open(doc, 'r', 'utf-8') as file:
            file_contents = file.read()

        tags = tagger(file_contents, n_tags)
        article_id = int(id_regexp.match(doc).group(1))
        print 'Tags for ', article_id, ':'
        all_ids.append(article_id)

        # handle synonyms: group the accepted spellings of each stem and
        # record which articles each stem was assigned to
        for tag in tags:
            stem = unidecode(tag.stem)
            if not all_tags.get(stem):
                all_tags[stem] = set()
            all_tags[stem].add(article_id)
            if not stem_synonyms.get(stem):
                stem_synonyms[stem] = []
            tag_string = unidecode(tag.string)
            if tag_string not in stem_synonyms[stem]:
                stem_synonyms[stem].append(tag_string)

        print tags

    from matrix_builder import print_normalized_sparse_matrix
    from matrix_builder import print_synonyms
    from link_matrix import print_normalized_link_matrix
    from link_matrix import print_math_prog
    from tag_exporter import print_tags
    from tag_exporter import print_whitelist_tags

    # keep the most widely used stems (as many as there are documents)
    top_tags = dict(sorted(all_tags.items(),
                           key=lambda y: -1 * len(y[1]))[0:len(all_ids)])

    print_normalized_sparse_matrix(top_tags, sorted(all_ids), stem_synonyms)
    print_synonyms(stem_synonyms)
    print_math_prog(top_tags, sorted(all_ids))
    print_tags(top_tags)
    print_whitelist_tags(all_tags)