PageRenderTime 22ms CodeModel.GetById 12ms app.highlight 7ms RepoModel.GetById 1ms app.codeStats 0ms

/extras.py

http://github.com/apresta/tagger
Python | 151 lines | 79 code | 15 blank | 57 comment | 1 complexity | 2e66f86f09b6bcfff8d4c47d7e306637 MD5 | raw file
# Copyright (C) 2011 by Alessandro Presta

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE


 22from tagger import *
 23
 24
 25class UnicodeReader(Reader):
 26    '''
 27    Reader subclass that converts Unicode strings to a close ASCII
 28    representation
 29    '''
 30
 31    def __call__(self, text):
 32        import unicodedata
 33        
 34        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
 35        return Reader.__call__(self, text)
 36
 37
 38class HTMLReader(UnicodeReader):
 39    '''
 40    Reader subclass that can parse HTML code from the input
 41    '''
 42
 43    def __call__(self, html):
 44        import lxml.html
 45
 46        text = lxml.html.fromstring(html).text_content()
 47        if isinstance(text, unicode):
 48            return UnicodeReader.__call__(self, text)
 49        else:
 50            return Reader.__call__(self, text)
 51
 52    
 53class SimpleReader(Reader):
 54    '''
 55    Reader subclass that doesn't perform any advanced analysis of the text
 56    '''
 57    
 58    def __call__(self, text):
 59        text = text.lower()
 60        text = self.preprocess(text)
 61        words = self.match_words.findall(text)
 62        tags = [Tag(w) for w in words]
 63        return tags
 64
 65
class FastStemmer(Stemmer):
    '''
    Stemmer subclass that uses a much faster, but less correct algorithm
    '''

    def __init__(self):
        # Third-party "stemming" package: its Porter implementation is
        # quicker than the default algorithm at some cost in accuracy.
        from stemming import porter

        # Hand the porter module to the base class as the stemming backend
        Stemmer.__init__(self, porter)
 75
 76
 77class NaiveRater(Rater):
 78    '''
 79    Rater subclass that jusk ranks single-word tags by their frequency and
 80    weight
 81    '''
 82
 83    def __call__(self, tags):
 84        self.rate_tags(tags)
 85        # we still get rid of one-character tags and stopwords
 86        unique_tags = set(t for t in tags
 87                          if len(t.string) > 1 and t.rating > 0.0)
 88        return sorted(unique_tags)
 89    
 90        
 91def build_dict_from_nltk(output_file, corpus=None, stopwords=None,
 92                         stemmer=Stemmer(), measure='IDF', verbose=False):
 93    '''
 94    @param output_file: the name of the file where the dictionary should be
 95                        saved
 96    @param corpus:      the NLTK corpus to use (defaults to nltk.corpus.reuters)
 97    @param stopwords:   a list of (not stemmed) stopwords (defaults to
 98                        nltk.corpus.reuters.words('stopwords'))
 99    @param stemmer:     the L{Stemmer} object to be used
100    @param measure:     the measure used to compute the weights ('IDF'
101                        i.e. 'inverse document frequency' or 'ICF' i.e.
102                        'inverse collection frequency'; defaults to 'IDF')
103    @param verbose:     whether information on the progress should be printed
104                        on screen
105    '''
106    
107    from build_dict import build_dict
108    import nltk
109    import pickle
110
111    if not (corpus and stopwords):
112        nltk.download('reuters')
113        
114    corpus = corpus or nltk.corpus.reuters
115    stopwords = stopwords or nltk.corpus.reuters.words('stopwords')
116
117    corpus_list = []
118    
119    if verbose: print 'Processing corpus...'
120    for file in corpus.fileids():
121        doc = [stemmer(Tag(w.lower())).stem for w in corpus.words(file)
122               if w[0].isalpha()]
123        corpus_list.append(doc)
124
125    if verbose: print 'Processing stopwords...'
126    stopwords = [stemmer(Tag(w.lower())).stem for w in stopwords]
127
128    if verbose: print 'Building dictionary... '
129    dictionary = build_dict(corpus_list, stopwords, measure)
130    with open(output_file, 'wb') as out:
131        pickle.dump(dictionary, out, -1) 
132
133
134
135    
136
137    
138
139     
140
141    
142
143        
144
145    
146
147
148
149
150
151