build_dict.py

http://github.com/apresta/tagger · Python · 137 lines · 72 code · 34 blank · 31 comment · 23 complexity · 0b08c4f560bc29d7729d3867cc17c60c MD5 · raw file

  1. #!/usr/bin/env python
  2. # Copyright (C) 2011 by Alessandro Presta
  3. # Permission is hereby granted, free of charge, to any person obtaining a copy
  4. # of this software and associated documentation files (the "Software"), to deal
  5. # in the Software without restriction, including without limitation the rights
  6. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. # copies of the Software, and to permit persons to whom the Software is
  8. # furnished to do so, subject to the following conditions:
  9. # The above copyright notice and this permission notice shall be included in
  10. # all copies or substantial portions of the Software.
  11. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  12. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  13. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  14. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  15. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  16. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  17. # THE SOFTWARE
  18. '''
  19. Usage: build_dict.py -o <output file> -s <stopwords file> <list of files>
  20. '''
  21. from __future__ import division
  22. from tagger import Stemmer
  23. from extras import SimpleReader
  24. def build_dict(corpus, stopwords=None, measure='IDF'):
  25. '''
  26. @param corpus: a list of documents, represented as lists of (stemmed)
  27. words
  28. @param stopwords: the list of (stemmed) words that should have zero weight
  29. @param measure: the measure used to compute the weights ('IDF'
  30. i.e. 'inverse document frequency' or 'ICF' i.e.
  31. 'inverse collection frequency'; defaults to 'IDF')
  32. @returns: a dictionary of weights in the interval [0,1]
  33. '''
  34. import collections
  35. import math
  36. dictionary = {}
  37. if measure == 'ICF':
  38. words = [w for doc in corpus for w in doc]
  39. term_count = collections.Counter(words)
  40. total_count = len(words)
  41. scale = math.log(total_count)
  42. for w, cnt in term_count.iteritems():
  43. dictionary[w] = math.log(total_count / (cnt + 1)) / scale
  44. elif measure == 'IDF':
  45. corpus_size = len(corpus)
  46. scale = math.log(corpus_size)
  47. term_count = collections.defaultdict(int)
  48. for doc in corpus:
  49. words = set(doc)
  50. for w in words:
  51. term_count[w] += 1
  52. for w, cnt in term_count.iteritems():
  53. dictionary[w] = math.log(corpus_size / (cnt + 1)) / scale
  54. if stopwords:
  55. for w in stopwords:
  56. dictionary[w] = 0.0
  57. return dictionary
  58. def build_dict_from_files(output_file, corpus_files, stopwords_file=None,
  59. reader=SimpleReader(), stemmer=Stemmer(),
  60. measure='IDF', verbose=False):
  61. '''
  62. @param output_file: the name of the file where the dictionary should be
  63. saved
  64. @param corpus_files: a list of files with words to process
  65. @param stopwords_file: a file containing a list of stopwords
  66. @param reader: the L{Reader} object to be used
  67. @param stemmer: the L{Stemmer} object to be used
  68. @param measure: the measure used to compute the weights ('IDF'
  69. i.e. 'inverse document frequency' or 'ICF' i.e.
  70. 'inverse collection frequency'; defaults to 'IDF')
  71. @param verbose: whether information on the progress should be
  72. printed on screen
  73. '''
  74. import pickle
  75. if verbose: print 'Processing corpus...'
  76. corpus = []
  77. for filename in corpus_files:
  78. with open(filename, 'r') as doc:
  79. corpus.append(reader(doc.read()))
  80. corpus = [[w.stem for w in map(stemmer, doc)] for doc in corpus]
  81. stopwords = None
  82. if stopwords_file:
  83. if verbose: print 'Processing stopwords...'
  84. with open(stopwords_file, 'r') as sw:
  85. stopwords = reader(sw.read())
  86. stopwords = [w.stem for w in map(stemmer, stopwords)]
  87. if verbose: print 'Building dictionary... '
  88. dictionary = build_dict(corpus, stopwords, measure)
  89. with open(output_file, 'wb') as out:
  90. pickle.dump(dictionary, out, -1)
  91. if __name__ == '__main__':
  92. import getopt
  93. import sys
  94. try:
  95. options = getopt.getopt(sys.argv[1:], 'o:s:')
  96. output_file = options[0][0][1]
  97. stopwords_file = options[0][1][1]
  98. corpus = options[1]
  99. except:
  100. print __doc__
  101. exit(1)
  102. build_dict_from_files(output_file, corpus, stopwords_file, verbose=True)