PageRenderTime 49ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/nltk/corpus/reader/lin.py

https://github.com/BrucePHill/nltk
Python | 156 lines | 132 code | 7 blank | 17 comment | 6 complexity | 5ab19119c04bc552b7b559b172ae6fca MD5 | raw file
Possible License(s): Apache-2.0
  1. # Natural Language Toolkit: Lin's Thesaurus
  2. #
  3. # Copyright (C) 2001-2013 NLTK Project
  4. # Author: Dan Blanchard <dblanchard@ets.org>
  5. # URL: <http://www.nltk.org/>
  6. # For license information, see LICENSE.txt
  7. from __future__ import print_function
  8. import re
  9. from collections import defaultdict
  10. from functools import reduce
  11. from nltk.corpus.reader import CorpusReader
  12. class LinThesaurusCorpusReader(CorpusReader):
  13. """ Wrapper for the LISP-formatted thesauruses distributed by Dekang Lin. """
  14. # Compiled regular expression for extracting the key from the first line of each
  15. # thesaurus entry
  16. _key_re = re.compile(r'\("?([^"]+)"? \(desc [0-9.]+\).+')
  17. @staticmethod
  18. def __defaultdict_factory():
  19. ''' Factory for creating defaultdict of defaultdict(dict)s '''
  20. return defaultdict(dict)
  21. def __init__(self, root, badscore=0.0):
  22. '''
  23. Initialize the thesaurus.
  24. :param root: root directory containing thesaurus LISP files
  25. :type root: C{string}
  26. :param badscore: the score to give to words which do not appear in each other's sets of synonyms
  27. :type badscore: C{float}
  28. '''
  29. super(LinThesaurusCorpusReader, self).__init__(root, r'sim[A-Z]\.lsp')
  30. self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory)
  31. self._badscore = badscore
  32. for path, encoding, fileid in self.abspaths(include_encoding=True, include_fileid=True):
  33. with open(path) as lin_file:
  34. first = True
  35. for line in lin_file:
  36. line = line.strip()
  37. # Start of entry
  38. if first:
  39. key = LinThesaurusCorpusReader._key_re.sub(r'\1', line)
  40. first = False
  41. # End of entry
  42. elif line == '))':
  43. first = True
  44. # Lines with pairs of ngrams and scores
  45. else:
  46. split_line = line.split('\t')
  47. if len(split_line) == 2:
  48. ngram, score = split_line
  49. self._thesaurus[fileid][key][ngram.strip('"')] = float(score)
  50. def similarity(self, ngram1, ngram2, fileid=None):
  51. '''
  52. Returns the similarity score for two ngrams.
  53. :param ngram1: first ngram to compare
  54. :type ngram1: C{string}
  55. :param ngram2: second ngram to compare
  56. :type ngram2: C{string}
  57. :param fileid: thesaurus fileid to search in. If None, search all fileids.
  58. :type fileid: C{string}
  59. :return: If fileid is specified, just the score for the two ngrams; otherwise,
  60. list of tuples of fileids and scores.
  61. '''
  62. # Entries don't contain themselves, so make sure similarity between item and itself is 1.0
  63. if ngram1 == ngram2:
  64. if fileid:
  65. return 1.0
  66. else:
  67. return [(fid, 1.0) for fid in self._fileids]
  68. else:
  69. if fileid:
  70. return self._thesaurus[fileid][ngram1][ngram2] if ngram2 in self._thesaurus[fileid][ngram1] else self._badscore
  71. else:
  72. return [(fid, (self._thesaurus[fid][ngram1][ngram2] if ngram2 in self._thesaurus[fid][ngram1]
  73. else self._badscore)) for fid in self._fileids]
  74. def scored_synonyms(self, ngram, fileid=None):
  75. '''
  76. Returns a list of scored synonyms (tuples of synonyms and scores) for the current ngram
  77. :param ngram: ngram to lookup
  78. :type ngram: C{string}
  79. :param fileid: thesaurus fileid to search in. If None, search all fileids.
  80. :type fileid: C{string}
  81. :return: If fileid is specified, list of tuples of scores and synonyms; otherwise,
  82. list of tuples of fileids and lists, where inner lists consist of tuples of
  83. scores and synonyms.
  84. '''
  85. if fileid:
  86. return self._thesaurus[fileid][ngram].items()
  87. else:
  88. return [(fileid, self._thesaurus[fileid][ngram].items()) for fileid in self._fileids]
  89. def synonyms(self, ngram, fileid=None):
  90. '''
  91. Returns a list of synonyms for the current ngram.
  92. :param ngram: ngram to lookup
  93. :type ngram: C{string}
  94. :param fileid: thesaurus fileid to search in. If None, search all fileids.
  95. :type fileid: C{string}
  96. :return: If fileid is specified, list of synonyms; otherwise, list of tuples of fileids and
  97. lists, where inner lists contain synonyms.
  98. '''
  99. if fileid:
  100. return self._thesaurus[fileid][ngram].keys()
  101. else:
  102. return [(fileid, self._thesaurus[fileid][ngram].keys()) for fileid in self._fileids]
  103. def __contains__(self, ngram):
  104. '''
  105. Determines whether or not the given ngram is in the thesaurus.
  106. :param ngram: ngram to lookup
  107. :type ngram: C{string}
  108. :return: whether the given ngram is in the thesaurus.
  109. '''
  110. return reduce(lambda accum, fileid: accum or (ngram in self._thesaurus[fileid]), self._fileids, False)
  111. ######################################################################
  112. # Demo
  113. ######################################################################
  114. def demo():
  115. from nltk.corpus import lin_thesaurus as thes
  116. word1 = "business"
  117. word2 = "enterprise"
  118. print("Getting synonyms for " + word1)
  119. print(thes.synonyms(word1))
  120. print("Getting scored synonyms for " + word1)
  121. print(thes.synonyms(word1))
  122. print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
  123. print(thes.synonyms(word1, fileid="simN.lsp"))
  124. print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
  125. print(thes.synonyms(word1, fileid="simN.lsp"))
  126. print("Similarity score for %s and %s:" % (word1, word2))
  127. print(thes.similarity(word1, word2))
  128. if __name__ == '__main__':
  129. demo()