build_dict.py - This Python script builds a dictionary of w…

/build_dict.py

http://github.com/apresta/tagger · Python · 137 lines · 72 code · 34 blank · 31 comment · 23 complexity · 0b08c4f560bc29d7729d3867cc17c60c MD5 · raw file


#!/usr/bin/env python

# Copyright (C) 2011 by Alessandro Presta

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE


'''
Usage: build_dict.py -o <output file> -s <stopwords file> <list of files>
'''

from __future__ import division

from tagger import Stemmer
from extras import SimpleReader

def build_dict(corpus, stopwords=None, measure='IDF'):
    '''
    @param corpus:    a list of documents, represented as lists of (stemmed)
                      words
    @param stopwords: the list of (stemmed) words that should have zero weight
    @param measure:   the measure used to compute the weights ('IDF'
                      i.e. 'inverse document frequency' or 'ICF' i.e.
                      'inverse collection frequency'; defaults to 'IDF')

    @returns: a dictionary of weights in the interval [0,1]
    '''

    import collections
    import math

    dictionary = {}

    if measure == 'ICF':
        words = [w for doc in corpus for w in doc]
        
        term_count = collections.Counter(words)
        total_count = len(words)
        scale = math.log(total_count)
    
        for w, cnt in term_count.iteritems():
            dictionary[w] = math.log(total_count / (cnt + 1)) / scale

    elif measure == 'IDF':
        corpus_size = len(corpus)
        scale = math.log(corpus_size)

        term_count = collections.defaultdict(int)

        for doc in corpus:
            words = set(doc)
            for w in words:
                term_count[w] += 1

        for w, cnt in term_count.iteritems():
            dictionary[w] = math.log(corpus_size / (cnt + 1)) / scale
            
    if stopwords:
        for w in stopwords:
            dictionary[w] = 0.0
    
    return dictionary


def build_dict_from_files(output_file, corpus_files, stopwords_file=None,
                          reader=SimpleReader(), stemmer=Stemmer(),
                          measure='IDF', verbose=False):
    '''
    @param output_file:    the name of the file where the dictionary should be
                           saved
    @param corpus_files:   a list of files with words to process
    @param stopwords_file: a file containing a list of stopwords
    @param reader:         the L{Reader} object to be used
    @param stemmer:        the L{Stemmer} object to be used
    @param measure:        the measure used to compute the weights ('IDF'
                           i.e. 'inverse document frequency' or 'ICF' i.e.
                           'inverse collection frequency'; defaults to 'IDF')
    @param verbose:        whether information on the progress should be
                           printed on screen
    '''

    import pickle

    if verbose: print 'Processing corpus...'
    corpus = []
    for filename in corpus_files:
        with open(filename, 'r') as doc:
            corpus.append(reader(doc.read()))
    corpus = [[w.stem for w in map(stemmer, doc)] for doc in corpus]

    stopwords = None
    if stopwords_file:
        if verbose: print 'Processing stopwords...'
        with open(stopwords_file, 'r') as sw:
            stopwords = reader(sw.read())
        stopwords = [w.stem for w in map(stemmer, stopwords)]

    if verbose: print 'Building dictionary... '
    dictionary = build_dict(corpus, stopwords, measure)
    with open(output_file, 'wb') as out:
        pickle.dump(dictionary, out, -1) 
    

if __name__ == '__main__':

    import getopt
    import sys
    
    try:
        options = getopt.getopt(sys.argv[1:], 'o:s:')
        output_file = options[0][0][1]
        stopwords_file = options[0][1][1]
        corpus = options[1]
    except:
        print __doc__
        exit(1)

    build_dict_from_files(output_file, corpus, stopwords_file, verbose=True)

Summary ✨

This Python script builds a dictionary of word weights from a list of documents, using either inverse document frequency (IDF) or inverse collection frequency (ICF) as the weight measure. It takes input files for the corpus and stopwords, and outputs a dictionary file in pickle format. The script can be run with command-line options to specify output and stopword files.

Alerts (7)

Complexity hotspot; lines 50 to 51 (total complexity: 3)
50 51
'def' Ensure functions have docstrings for documentation
81
Complexity hotspot; lines 108 to 110 (total complexity: 3)
108 109 110
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
130