/extras.py

http://github.com/apresta/tagger · Python

# Copyright (C) 2011 by Alessandro Presta

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

from tagger import *

class UnicodeReader(Reader):
    '''
    Reader subclass that converts Unicode strings to a close ASCII
    representation
    '''

    def __call__(self, text):
        import unicodedata

        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
        return Reader.__call__(self, text)
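
# Usage sketch (not part of the original module; the sample string is made
# up, and Reader is assumed to need no constructor arguments): accented
# characters are folded to their closest ASCII form before tokenization.
def _demo_unicode_reader():
    reader = UnicodeReader()
    # u'caf\xe9' comes out as the tag string 'cafe'
    return reader(u'na\xefve caf\xe9 r\xe9sum\xe9')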

class HTMLReader(UnicodeReader):
    '''
    Reader subclass that can parse HTML code from the input
    '''

    def __call__(self, html):
        import lxml.html

        text = lxml.html.fromstring(html).text_content()
        if isinstance(text, unicode):
            return UnicodeReader.__call__(self, text)
        else:
            return Reader.__call__(self, text)
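
# Usage sketch for HTMLReader (hypothetical markup): lxml strips the tags,
# and when text_content() returns a unicode object the text takes the
# UnicodeReader path above.
def _demo_html_reader():
    reader = HTMLReader()
    return reader('<html><body><p>Hello <b>world</b>!</p></body></html>')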

class SimpleReader(Reader):
    '''
    Reader subclass that doesn't perform any advanced analysis of the text
    '''

    def __call__(self, text):
        text = text.lower()
        text = self.preprocess(text)
        words = self.match_words.findall(text)
        tags = [Tag(w) for w in words]
        return tags
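
# Usage sketch for SimpleReader: only the lowercasing, the preprocess()
# hook and the match_words regex (all inherited from Reader) are applied,
# so each matched word simply becomes a Tag.
def _demo_simple_reader():
    reader = SimpleReader()
    return reader('The quick brown fox jumps over the lazy dog')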

class FastStemmer(Stemmer):
    '''
    Stemmer subclass that uses a much faster, but less correct algorithm
    '''

    def __init__(self):
        from stemming import porter

        Stemmer.__init__(self, porter)
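
# Usage sketch for FastStemmer, relying on Stemmer.__call__ returning a
# tag with a 'stem' attribute, as exploited by build_dict_from_nltk below.
def _demo_fast_stemmer():
    stemmer = FastStemmer()
    # the pure-Python Porter implementation reduces 'running' to 'run'
    return stemmer(Tag('running')).stem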

class NaiveRater(Rater):
    '''
    Rater subclass that just ranks single-word tags by their frequency and
    weight
    '''

    def __call__(self, tags):
        self.rate_tags(tags)
        # we still get rid of one-character tags and stopwords
        unique_tags = set(t for t in tags
                          if len(t.string) > 1 and t.rating > 0.0)
        return sorted(unique_tags)
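
# Usage sketch for NaiveRater; 'weights' stands for the pickled IDF
# dictionary and is assumed to be what Rater's constructor takes as its
# first argument (the actual signature lives in tagger.py, not here).
def _demo_naive_rater(weights):
    rater = NaiveRater(weights)
    tags = SimpleReader()('the cat sat on the mat with another cat')
    return rater(tags)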

def build_dict_from_nltk(output_file, corpus=None, stopwords=None,
                         stemmer=Stemmer(), measure='IDF', verbose=False):
    '''
    @param output_file: the name of the file where the dictionary should be
                        saved
    @param corpus:      the NLTK corpus to use (defaults to
                        nltk.corpus.reuters)
    @param stopwords:   a list of (not stemmed) stopwords (defaults to
                        nltk.corpus.reuters.words('stopwords'))
    @param stemmer:     the L{Stemmer} object to be used
    @param measure:     the measure used to compute the weights ('IDF'
                        i.e. 'inverse document frequency' or 'ICF' i.e.
                        'inverse collection frequency'; defaults to 'IDF')
    @param verbose:     whether information on the progress should be
                        printed on screen
    '''
    from build_dict import build_dict

    import nltk
    import pickle

    if not (corpus and stopwords):
        nltk.download('reuters')

    corpus = corpus or nltk.corpus.reuters
    stopwords = stopwords or nltk.corpus.reuters.words('stopwords')

    corpus_list = []

    if verbose: print 'Processing corpus...'
    for file in corpus.fileids():
        doc = [stemmer(Tag(w.lower())).stem for w in corpus.words(file)
               if w[0].isalpha()]
        corpus_list.append(doc)

    if verbose: print 'Processing stopwords...'
    stopwords = [stemmer(Tag(w.lower())).stem for w in stopwords]

    if verbose: print 'Building dictionary... '
    dictionary = build_dict(corpus_list, stopwords, measure)

    with open(output_file, 'wb') as out:
        pickle.dump(dictionary, out, -1)
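
# Typical invocation (hypothetical output path): build an IDF dictionary
# from the Reuters corpus and pickle it for later use by a Rater.
if __name__ == '__main__':
    build_dict_from_nltk('dict.pkl', measure='IDF', verbose=True)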