/build_dict.py
#!/usr/bin/env python

# Copyright (C) 2011 by Alessandro Presta

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE


'''
Usage: build_dict.py -o <output file> -s <stopwords file> <list of files>
'''

from __future__ import division

from tagger import Stemmer
from extras import SimpleReader


def build_dict(corpus, stopwords=None, measure='IDF'):
    '''
    @param corpus:    a list of documents, represented as lists of (stemmed)
                      words
    @param stopwords: the list of (stemmed) words that should have zero weight
    @param measure:   the measure used to compute the weights ('IDF'
                      i.e. 'inverse document frequency' or 'ICF' i.e.
                      'inverse collection frequency'; defaults to 'IDF')

    @returns: a dictionary of weights in the interval [0,1]
    '''

    import collections
    import math

    dictionary = {}

    if measure == 'ICF':
        words = [w for doc in corpus for w in doc]

        term_count = collections.Counter(words)
        total_count = len(words)
        scale = math.log(total_count)

        for w, cnt in term_count.iteritems():
            dictionary[w] = math.log(total_count / (cnt + 1)) / scale

    elif measure == 'IDF':
        corpus_size = len(corpus)
        scale = math.log(corpus_size)

        term_count = collections.defaultdict(int)

        for doc in corpus:
            words = set(doc)
            for w in words:
                term_count[w] += 1

        for w, cnt in term_count.iteritems():
            dictionary[w] = math.log(corpus_size / (cnt + 1)) / scale

    if stopwords:
        for w in stopwords:
            dictionary[w] = 0.0

    return dictionary


def build_dict_from_files(output_file, corpus_files, stopwords_file=None,
                          reader=SimpleReader(), stemmer=Stemmer(),
                          measure='IDF', verbose=False):
    '''
    @param output_file:    the name of the file where the dictionary should be
                           saved
    @param corpus_files:   a list of files with words to process
    @param stopwords_file: a file containing a list of stopwords
    @param reader:         the L{Reader} object to be used
    @param stemmer:        the L{Stemmer} object to be used
    @param measure:        the measure used to compute the weights ('IDF'
                           i.e. 'inverse document frequency' or 'ICF' i.e.
                           'inverse collection frequency'; defaults to 'IDF')
    @param verbose:        whether information on the progress should be
                           printed on screen
    '''

    import pickle

    if verbose: print 'Processing corpus...'
    corpus = []
    for filename in corpus_files:
        with open(filename, 'r') as doc:
            corpus.append(reader(doc.read()))
    corpus = [[w.stem for w in map(stemmer, doc)] for doc in corpus]

    stopwords = None
    if stopwords_file:
        if verbose: print 'Processing stopwords...'
        with open(stopwords_file, 'r') as sw:
            stopwords = reader(sw.read())
        stopwords = [w.stem for w in map(stemmer, stopwords)]

    if verbose: print 'Building dictionary... '
    dictionary = build_dict(corpus, stopwords, measure)
    with open(output_file, 'wb') as out:
        pickle.dump(dictionary, out, -1)


if __name__ == '__main__':

    import getopt
    import sys

    try:
        options = getopt.getopt(sys.argv[1:], 'o:s:')
        output_file = options[0][0][1]
        stopwords_file = options[0][1][1]
        corpus = options[1]
    except:
        print __doc__
        exit(1)

    build_dict_from_files(output_file, corpus, stopwords_file, verbose=True)
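The snippet below is a minimal usage sketch, not part of the original file: it calls build_dict() directly with a made-up toy corpus of already-stemmed word lists (the form its docstring expects), bypassing the Stemmer/Reader pipeline, and assumes this module is importable as build_dict with the tagger package on the path. All words and values are illustrative.

    # Illustrative only: toy corpus and expected weights are assumptions,
    # not taken from the original project.
    from build_dict import build_dict

    # Three tiny "documents", each a list of (stemmed) words.
    corpus = [['cat', 'sat', 'mat'],
              ['cat', 'dog'],
              ['dog', 'bark']]

    weights = build_dict(corpus, stopwords=['sat'], measure='IDF')

    # 'cat' and 'dog' appear in 2 of the 3 documents, so their IDF weight is
    # log(3 / (2 + 1)) / log(3) = 0.0; words appearing in a single document
    # get log(3 / 2) / log(3) ~= 0.37; the stopword 'sat' is forced to 0.0.
    for w, weight in sorted(weights.items()):
        print '%s: %.2f' % (w, weight)

When run as a script (per the module docstring, something like ./build_dict.py -o dict.pkl -s stopwords.txt doc1.txt doc2.txt), note that the getopt handling reads the parsed options positionally, so -o must be given before -s; the result is pickled to the output file and can be loaded later with pickle.load().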