PageRenderTime 28ms CodeModel.GetById 16ms app.highlight 10ms RepoModel.GetById 1ms app.codeStats 0ms

/build_dict.py

http://github.com/apresta/tagger
Python | 137 lines | 104 code | 9 blank | 24 comment | 0 complexity | 0b08c4f560bc29d7729d3867cc17c60c MD5 | raw file
  1#!/usr/bin/env python
  2
  3# Copyright (C) 2011 by Alessandro Presta
  4
  5# Permission is hereby granted, free of charge, to any person obtaining a copy
  6# of this software and associated documentation files (the "Software"), to deal
  7# in the Software without restriction, including without limitation the rights
  8# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9# copies of the Software, and to permit persons to whom the Software is
 10# furnished to do so, subject to the following conditions:
 11
 12# The above copyright notice and this permission notice shall be included in
 13# all copies or substantial portions of the Software.
 14
 15# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 18# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 21# THE SOFTWARE
 22
 23
 24'''
 25Usage: build_dict.py -o <output file> -s <stopwords file> <list of files>
 26'''
 27
 28from __future__ import division
 29
 30from tagger import Stemmer
 31from extras import SimpleReader
 32
 33def build_dict(corpus, stopwords=None, measure='IDF'):
 34    '''
 35    @param corpus:    a list of documents, represented as lists of (stemmed)
 36                      words
 37    @param stopwords: the list of (stemmed) words that should have zero weight
 38    @param measure:   the measure used to compute the weights ('IDF'
 39                      i.e. 'inverse document frequency' or 'ICF' i.e.
 40                      'inverse collection frequency'; defaults to 'IDF')
 41
 42    @returns: a dictionary of weights in the interval [0,1]
 43    '''
 44
 45    import collections
 46    import math
 47
 48    dictionary = {}
 49
 50    if measure == 'ICF':
 51        words = [w for doc in corpus for w in doc]
 52        
 53        term_count = collections.Counter(words)
 54        total_count = len(words)
 55        scale = math.log(total_count)
 56    
 57        for w, cnt in term_count.iteritems():
 58            dictionary[w] = math.log(total_count / (cnt + 1)) / scale
 59
 60    elif measure == 'IDF':
 61        corpus_size = len(corpus)
 62        scale = math.log(corpus_size)
 63
 64        term_count = collections.defaultdict(int)
 65
 66        for doc in corpus:
 67            words = set(doc)
 68            for w in words:
 69                term_count[w] += 1
 70
 71        for w, cnt in term_count.iteritems():
 72            dictionary[w] = math.log(corpus_size / (cnt + 1)) / scale
 73            
 74    if stopwords:
 75        for w in stopwords:
 76            dictionary[w] = 0.0
 77    
 78    return dictionary
 79
 80
 81def build_dict_from_files(output_file, corpus_files, stopwords_file=None,
 82                          reader=SimpleReader(), stemmer=Stemmer(),
 83                          measure='IDF', verbose=False):
 84    '''
 85    @param output_file:    the name of the file where the dictionary should be
 86                           saved
 87    @param corpus_files:   a list of files with words to process
 88    @param stopwords_file: a file containing a list of stopwords
 89    @param reader:         the L{Reader} object to be used
 90    @param stemmer:        the L{Stemmer} object to be used
 91    @param measure:        the measure used to compute the weights ('IDF'
 92                           i.e. 'inverse document frequency' or 'ICF' i.e.
 93                           'inverse collection frequency'; defaults to 'IDF')
 94    @param verbose:        whether information on the progress should be
 95                           printed on screen
 96    '''
 97
 98    import pickle
 99
100    if verbose: print 'Processing corpus...'
101    corpus = []
102    for filename in corpus_files:
103        with open(filename, 'r') as doc:
104            corpus.append(reader(doc.read()))
105    corpus = [[w.stem for w in map(stemmer, doc)] for doc in corpus]
106
107    stopwords = None
108    if stopwords_file:
109        if verbose: print 'Processing stopwords...'
110        with open(stopwords_file, 'r') as sw:
111            stopwords = reader(sw.read())
112        stopwords = [w.stem for w in map(stemmer, stopwords)]
113
114    if verbose: print 'Building dictionary... '
115    dictionary = build_dict(corpus, stopwords, measure)
116    with open(output_file, 'wb') as out:
117        pickle.dump(dictionary, out, -1) 
118    
119
def parse_args(argv):
    '''
    Parse the command-line arguments of the script.

    Options are looked up by flag, so '-o' and '-s' can be given in any
    order, and '-s' can be omitted.

    @param argv: the list of command-line arguments, without the program name

    @returns: a tuple (output file, stopwords file or None, list of corpus
              files)

    @raise getopt.GetoptError: if an option is not recognized or lacks its
                               argument, if '-o' is missing or if no corpus
                               file is given
    '''

    import getopt

    opts, corpus_files = getopt.getopt(argv, 'o:s:')
    options = dict(opts)

    if '-o' not in options:
        raise getopt.GetoptError('option -o is required', 'o')
    if not corpus_files:
        raise getopt.GetoptError('no corpus files given')

    return options['-o'], options.get('-s'), corpus_files


if __name__ == '__main__':

    import getopt
    import sys

    try:
        output_file, stopwords_file, corpus = parse_args(sys.argv[1:])
    except getopt.GetoptError as err:
        # only command-line errors lead to the usage message; anything else
        # (e.g. KeyboardInterrupt) is left to propagate
        sys.stderr.write('%s\n' % err)
        print(__doc__)
        sys.exit(1)

    build_dict_from_files(output_file, corpus, stopwords_file, verbose=True)
135    
136               
137