lin.py - Natural Language Toolkit: Lin's Thesaurus Copyrigh…

/nltk/corpus/reader/lin.py

https://github.com/BrucePHill/nltk · Python · 156 lines · 132 code · 7 blank · 17 comment · 6 complexity · 5ab19119c04bc552b7b559b172ae6fca MD5 · raw file

# Natural Language Toolkit: Lin's Thesaurus
#
# Copyright (C) 2001-2013 NLTK Project
# Author: Dan Blanchard <dblanchard@ets.org>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.txt
from __future__ import print_function

import re
from collections import defaultdict
from functools import reduce

from nltk.corpus.reader import CorpusReader


class LinThesaurusCorpusReader(CorpusReader):
    """ Wrapper for the LISP-formatted thesauruses distributed by Dekang Lin. """

    # Compiled regular expression for extracting the key from the first line of each
    # thesaurus entry
    _key_re = re.compile(r'\("?([^"]+)"? \(desc [0-9.]+\).+')

    @staticmethod
    def __defaultdict_factory():
        ''' Factory for creating defaultdict of defaultdict(dict)s '''
        return defaultdict(dict)

    def __init__(self, root, badscore=0.0):
        '''
        Initialize the thesaurus by parsing every file matching ``sim[A-Z]\\.lsp``
        under the given root.

        :param root: root directory containing thesaurus LISP files
        :type root: C{string}
        :param badscore: the score to give to words which do not appear in each other's sets of synonyms
        :type badscore: C{float}
        '''

        super(LinThesaurusCorpusReader, self).__init__(root, r'sim[A-Z]\.lsp')
        # Nested mapping: fileid -> entry key -> synonym ngram -> score
        self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory)
        self._badscore = badscore
        # NOTE(review): the encoding returned by abspaths() is unpacked but never
        # passed to open(), so files are read with open()'s default settings --
        # confirm whether that is intended.
        for path, encoding, fileid in self.abspaths(include_encoding=True, include_fileid=True):
            with open(path) as lin_file:
                # True while the next line read is the header line of an entry
                first = True
                for line in lin_file:
                    line = line.strip()
                    # Start of entry
                    if first:
                        key = LinThesaurusCorpusReader._key_re.sub(r'\1', line)
                        first = False
                    # End of entry
                    elif line == '))':
                        first = True
                    # Lines with pairs of ngrams and scores
                    else:
                        split_line = line.split('\t')
                        # Lines that are not exactly "ngram<TAB>score" are skipped
                        if len(split_line) == 2:
                            ngram, score = split_line
                            self._thesaurus[fileid][key][ngram.strip('"')] = float(score)

    def _lookup(self, fileid, ngram):
        '''
        Fetch the synonym-to-score mapping stored for an ngram in one thesaurus file.

        Uses ``get`` rather than indexing so that a lookup for an unknown fileid or
        ngram does not insert an empty entry into the underlying defaultdicts
        (which would otherwise make ``__contains__`` report the ngram as present).

        :param fileid: thesaurus fileid to search in
        :type fileid: C{string}
        :param ngram: ngram to lookup
        :type ngram: C{string}
        :return: the stored mapping of synonyms to scores, or an empty dict if
                 there is no entry for the ngram in that fileid.
        '''
        return self._thesaurus.get(fileid, {}).get(ngram, {})

    def similarity(self, ngram1, ngram2, fileid=None):
        '''
        Returns the similarity score for two ngrams.

        The score is read from the entry for ``ngram1``; if ``ngram2`` is not
        listed there, the ``badscore`` given at construction time is used.

        :param ngram1: first ngram to compare
        :type ngram1: C{string}
        :param ngram2: second ngram to compare
        :type ngram2: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, just the score for the two ngrams; otherwise,
                 list of tuples of fileids and scores.
        '''
        # Entries don't contain themselves, so make sure similarity between item and itself is 1.0
        if ngram1 == ngram2:
            if fileid:
                return 1.0
            return [(fid, 1.0) for fid in self._fileids]

        if fileid:
            return self._lookup(fileid, ngram1).get(ngram2, self._badscore)
        return [(fid, self._lookup(fid, ngram1).get(ngram2, self._badscore))
                for fid in self._fileids]

    def scored_synonyms(self, ngram, fileid=None):
        '''
        Returns the scored synonyms (tuples of synonyms and scores) for the current ngram

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, the (synonym, score) items of the entry; otherwise,
                 list of tuples of fileids and such item collections. An ngram with no
                 entry yields an empty collection.
        '''
        if fileid:
            return self._lookup(fileid, ngram).items()
        # A distinct loop name avoids rebinding the ``fileid`` parameter on Python 2
        return [(fid, self._lookup(fid, ngram).items()) for fid in self._fileids]

    def synonyms(self, ngram, fileid=None):
        '''
        Returns the synonyms for the current ngram.

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, the synonyms of the entry; otherwise, list of
                 tuples of fileids and synonym collections. An ngram with no entry
                 yields an empty collection.
        '''
        if fileid:
            return self._lookup(fileid, ngram).keys()
        # A distinct loop name avoids rebinding the ``fileid`` parameter on Python 2
        return [(fid, self._lookup(fid, ngram).keys()) for fid in self._fileids]

    def __contains__(self, ngram):
        '''
        Determines whether or not the given ngram is in the thesaurus.

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :return: whether the given ngram has an entry in any of the thesaurus files.
        '''
        # ``get`` keeps the membership test from inserting empty per-file mappings
        return any(ngram in self._thesaurus.get(fid, {}) for fid in self._fileids)


######################################################################
# Demo
######################################################################

def demo():
    ''' Print sample synonym, scored-synonym and similarity lookups from the Lin thesaurus. '''
    from nltk.corpus import lin_thesaurus as thes

    word1 = "business"
    word2 = "enterprise"
    print("Getting synonyms for " + word1)
    print(thes.synonyms(word1))

    # The scored variant must call scored_synonyms(), not synonyms()
    print("Getting scored synonyms for " + word1)
    print(thes.scored_synonyms(word1))

    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))

    # Previously an exact repeat of the section above; show the scored form instead
    print("Getting scored synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.scored_synonyms(word1, fileid="simN.lsp"))

    print("Similarity score for %s and %s:" % (word1, word2))
    print(thes.similarity(word1, word2))


# Run the demo only when this file is executed directly as a script.
if __name__ == '__main__':
    demo()
Tech Fingerprint

Alerts (11)

'def' Ensure functions have docstrings for documentation
134
'print(' Use logging module for better control and configurability
139 140 142 143 145 146 148 149 151 152