PageRenderTime 62ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/nltk_contrib/nltk_contrib/concord.py

http://nltk.googlecode.com/
Python | 867 lines | 798 code | 8 blank | 61 comment | 6 complexity | efb97f390ef98829dee21abc5726d8bf MD5 | raw file
Possible License(s): Apache-2.0, AGPL-1.0
  1. # Natural Language Toolkit: Concordance System
  2. #
  3. # Copyright (C) 2005 University of Melbourne
  4. # Author: Peter Spiller
  5. # URL: <http://www.nltk.org/>
  6. # For license information, see LICENSE.TXT
import re, string
from collections import Counter
from math import *

from nltk.corpus import brown
from nltk.probability import *
  11. class SentencesIndex(object):
  12. """Class implementing an index of a collection of sentences.
  13. Given a list of sentences, where each sentence is a list of words,
  14. this class generates an index of the list. Each word should be a (word, POS
  15. tag) pair. The index is stored as a dictionary, with the hashable items as
  16. keys and a list of (sentence number, word number) tuples as values. This
  17. class also generates a list of sentence lengths.
  18. """
  19. def __init__(self, sentences):
  20. """ Constructor. Takes the list of sentences to index.
  21. @type sentences: list
  22. @param sentences: List of sentences to index. Sentences should be
  23. lists of (string, string) pairs.
  24. """
  25. sentenceCount = 0
  26. self.index = {}
  27. self.lengths = []
  28. # for each sentence:
  29. for sentence in sentences:
  30. # add the sentences length to the list of sentence lengths
  31. self.lengths.append(len(sentence))
  32. wordCount = 0
  33. for word in sentence:
  34. self.index[word] = self.index.get(word, []) + [(sentenceCount, wordCount)]
  35. wordCount += 1
  36. sentenceCount += 1
  37. def getIndex(self):
  38. """ Returns the index dictionary.
  39. @rtype: dictionary
  40. @returns: The dictionary containing the index.
  41. """
  42. return self.index
  43. def getSentenceLengths(self):
  44. """ Returns the list of sentence lengths.
  45. Element 0 is the length of the first sentence, element 1 the second,
  46. etc.
  47. @rtype: list
  48. @returns: List of lengths of sentences.
  49. """
  50. return self.lengths
  51. class IndexConcordance(object):
  52. """ Class that generates concordances from a list of sentences.
  53. Uses an index for efficiency. If a SentencesIndex object is provided,
  54. it will be used, otherwise one will be constructed from the list of
  55. sentences. When generating a concordance, the supplied regular expression
  56. is used to filter the list of words in the index. Any that match are looked
  57. up in the index, and their lists of (sentence number, word number) pairs are
  58. used to extract the correct amount of context from the sentences.
  59. Although this class also allows regular expressions to be specified for the
  60. left and right context, they are not used on the index. If only left/right
  61. regexps are provided, the class will essentially generate a concordance for
  62. every word in the corpus, then filter it with the regexps. This will not be
  63. very efficient and requires very large amounts of memory.
  64. @cvar SORT_WORD: Constant for sorting by target word.
  65. @cvar SORT_POS: Constant for sorting by target word's POS tag.
  66. @cvar SORT_NUM: Constant for sorting by sentence number.
  67. @cvar SORT_RIGHT_CONTEXT: Constant for sorting by the first word of the
  68. right context.
  69. """
  70. # constants for different types of sort
  71. SORT_WORD = 0
  72. SORT_POS = 1
  73. SORT_NUM = 2
  74. SORT_RIGHT_CONTEXT = 3
  75. def __init__(self, sentences, index=None):
  76. """ Constructor.
  77. Arguments:
  78. @type sentences: list
  79. @param sentences: List of sentences to create a concordance for.
  80. Sentences should be lists of (string, string) pairs.
  81. @type index: SentencesIndex
  82. @param index: SentencesIndex object to use as an index. If this is
  83. not provided, one will be generated.
  84. """
  85. self.sentences = sentences
  86. self.index = index
  87. # generate an index if one wasn't provided
  88. if self.index == None:
  89. self.index = SentencesIndex(self.sentences)
  90. def formatted(self, leftRegexp=None, middleRegexp=".*", rightRegexp=None,
  91. leftContextLength=3, rightContextLength=3, contextInSentences=False,
  92. contextChars=50, maxKeyLength=0, showWord=True,
  93. sort=0, showPOS=True, flipWordAndPOS=False, verbose=False):
  94. """Generates and displays keyword-in-context formatted concordance data.
  95. This is a convenience method that combines raw() and display()'s
  96. options. Unless you need raw output, this is probably the most useful
  97. method.
  98. @type leftRegexp: string
  99. @param leftRegexp: Regular expression applied to the left context
  100. to filter output. Defaults to None.
  101. @type middleRegexp: string
  102. @param middleRegexp: Regular expression applied to target word to
  103. filter output. Defaults to ".*" (ie everything).
  104. @type rightRegexp: string
  105. @param rightRegexp: Regular expression applied to the right context
  106. to filter output. Defaults to None.
  107. @type leftContextLength: number
  108. @param leftContextLength: Length of left context. Defaults to 3.
  109. @type rightContextLength: number
  110. @param rightContextLength: Length of right context. Defaults to 3.
  111. @type contextInSentences: number
  112. @param contextInSentences: Determines whether the context lengths
  113. arguments are in words or sentences. If false, the context lengths
  114. are in words - a rightContextLength argument of 2 results in two
  115. words of right context. If true, a rightContextLength argument of 2
  116. results in a right context consisting of the portion of the target
  117. word's sentence to the right of the target, plus the two sentences
  118. to the right of that sentence. Defaults to False.
  119. @type contextChars number
  120. @param contextChars: Amount of context to show. If set to less than
  121. 0, does not limit amount of context shown
  122. (may look ugly). Defaults to 55.
  123. @type maxKeyLength: number
  124. @param maxKeyLength: Max number of characters to show for the
  125. target word. If 0 or less, this value is
  126. calculated so as to fully show all target
  127. words. Defaults to 0.
  128. @type showWord: boolean
  129. @param showWord: Whether to show words. Defaults to True.
  130. @type sort: integer
  131. @param sort: Should be set to one the provided SORT constants. If
  132. SORT_WORD, the output is sorted on the target word. If SORT_POS, the
  133. output is sorted on the target word's POS tag. If SORT_NUM, the
  134. output is sorted by sentence number. If SORT_RIGHT_CONTEXT, the
  135. output is sorted on the first word of the right context. Defaults to
  136. SORT_WORD.
  137. @type showPOS: boolean
  138. @param showPOS: Whether to show POS tags. Defaults to True.
  139. @type flipWordAndPOS: boolean
  140. @param flipWordAndPOS: If true, displays POS tags first instead of
  141. words (ie prints 'cc/and' instead of 'and/cc'). Defaults to False.
  142. @type verbose: boolean
  143. @param verbose: Displays some extra status information. Defaults
  144. to False.
  145. """
  146. self.format(self.raw(leftRegexp, middleRegexp, rightRegexp, leftContextLength,
  147. rightContextLength, contextInSentences, sort, verbose), contextChars,
  148. maxKeyLength, showWord, showPOS, flipWordAndPOS, verbose)
  149. def raw(self, leftRegexp=None, middleRegexp=".*", rightRegexp=None,
  150. leftContextLength=3, rightContextLength=3, contextInSentences=False,
  151. sort=0, verbose=False):
  152. """ Generates and returns raw concordance data.
  153. Regular expressions supplied are evaluated over the appropriate part of
  154. each line of the concordance. For the purposes of evaluating the regexps,
  155. the lists of (word, POS tag) tuples are flattened into a space-separated
  156. list of word/POS tokens (ie the word followed by '/' followed by the POS
  157. tag). A regexp like '^must/.*' matches the word 'must' with any POS tag,
  158. while one like '.*/nn$' matches any word with a POS tag of 'nn'. All
  159. regexps are evaluated over lowercase versions of the text.
  160. @type leftRegexp: string
  161. @param leftRegexp: Regular expression applied to the left context
  162. to filter output. Defaults to None.
  163. @type middleRegexp: string
  164. @param middleRegexp: Regular expression applied to target word to
  165. filter output. Defaults to ".*" (ie everything).
  166. @type rightRegexp: string
  167. @param rightRegexp: Regular expression applied to the right context
  168. to filter output. Defaults to None.
  169. @type leftContextLength: number
  170. @param leftContextLength: Length of left context. Defaults to 3.
  171. @type rightContextLength: number
  172. @param rightContextLength: Length of right context. Defaults to 3.
  173. @type contextInSentences: number
  174. @param contextInSentences: Determines whether the context lengths
  175. arguments are in words or sentences. If false, the context lengths
  176. are in words - a rightContextLength argument of 2 results in two
  177. words of right context. If true, a rightContextLength argument of 2
  178. results in a right context consisting of the portion of the target
  179. word's sentence to the right of the target, plus the two sentences
  180. to the right of that sentence. Defaults to False.
  181. @type sort: integer
  182. @param sort: Should be set to one the provided SORT constants. If
  183. SORT_WORD, the output is sorted on the target word. If SORT_POS, the
  184. output is sorted on the target word's POS tag. If SORT_NUM, the
  185. output is sorted by sentence number. If SORT_RIGHT_CONTEXT, the
  186. output is sorted on the first word of the right context. Defaults to
  187. SORT_WORD.
  188. @type verbose: boolean
  189. @param verbose: Displays some extra status information. Defaults
  190. to False.
  191. @rtype: list
  192. @return: Raw concordance ouput. Returned as a list of
  193. ([left context], target word, [right context], target word
  194. sentence number) tuples.
  195. """
  196. # compile the middle regexp.
  197. reg = re.compile(middleRegexp)
  198. if verbose:
  199. print "Matching the following target words:"
  200. wordLocs = []
  201. # get list of (sentence, word) pairs to get context for
  202. for item in self.index.getIndex().iteritems():
  203. if reg.match("/".join([item[0][0].lower(), item[0][1]])):
  204. if verbose:
  205. print "/".join(item[0])
  206. wordLocs.append(item[1])
  207. print ""
  208. items = []
  209. # if context lengths are specified in words:
  210. if contextInSentences == False:
  211. # for each list of (sentence, word offset in sentence) pairs:
  212. for wordList in wordLocs:
  213. # for each (sentence, word offset in sentence) pair:
  214. for sentenceNum, offset in wordList:
  215. # set pointers to the left- and rightmost sentences to be
  216. # looked at to the sentence the target word is in
  217. leftCorpusIndex = sentenceNum
  218. rightCorpusIndex = sentenceNum
  219. # number of words to include in the left context is
  220. # initially everything in the sentence up to the target
  221. leftLength = offset
  222. # number of words to include in the left context is
  223. # initially everything in the sentence after the target
  224. rightLength = self.index.getSentenceLengths()[sentenceNum] - offset - 1
  225. # while the length of the left context is less than what we
  226. # need, keep decreasing the left corpus index (ie adding
  227. # sentences to the left context).
  228. while leftLength < leftContextLength:
  229. leftCorpusIndex -= 1
  230. # if the new corpus index would fall off the end of the
  231. # list, stop at 0
  232. if(leftCorpusIndex < 0):
  233. leftCorpusIndex = 0
  234. break
  235. # adjust length and offset
  236. leftLength += self.index.getSentenceLengths()[leftCorpusIndex]
  237. offset += self.index.getSentenceLengths()[leftCorpusIndex]
  238. # while the length of the right context is less than what we
  239. # need, keep increasing the right corpus index (ie adding
  240. # sentences to the right context).
  241. while rightLength < rightContextLength:
  242. rightCorpusIndex += 1
  243. try:
  244. rightLength += self.index.getSentenceLengths()[rightCorpusIndex]
  245. # if the new corpus index falls off the end of the list,
  246. # stop at the end
  247. except IndexError:
  248. rightCorpusIndex -= 1
  249. break
  250. # grab all sentences from the left to right corpus indices,
  251. # then flatten them into a single list of words
  252. sents = self.sentences[leftCorpusIndex:rightCorpusIndex+1]
  253. words = []
  254. for sentence in sents:
  255. for word in sentence:
  256. words.append(word)
  257. # select the appropriate sections of context from the list
  258. # of words
  259. left = words[offset-leftContextLength:offset]
  260. target = words[offset]
  261. right = words[offset+1:offset+1+rightContextLength]
  262. items.append((left, target, right, sentenceNum))
  263. # if context lengths are specified in sentences:
  264. else:
  265. # for each list of (sentence, word offset in sentence) pairs:
  266. for wordList in wordLocs:
  267. # for each list of (sentence, word offset in sentence) pairs:
  268. for sentenceNum, offset in wordList:
  269. # set pointers to the left- and rightmost sentences to be
  270. # looked at to the sentence the target word is in
  271. leftCorpusIndex = sentenceNum
  272. rightCorpusIndex = sentenceNum
  273. # number of words to include in the left context is
  274. # initially everything in the sentence up to the target
  275. leftLength = offset
  276. # number of words to include in the left context is
  277. # initially everything in the sentence after the target
  278. rightLength = self.index.getSentenceLengths()[sentenceNum] - offset - 1
  279. # keep track of the number of sentences included in the
  280. # left/right context
  281. leftSents = 0;
  282. rightSents = 0;
  283. # while we don't have enough sentences in the left context,
  284. # keep decreasing the left corpus index
  285. while leftSents < leftContextLength:
  286. leftCorpusIndex -= 1
  287. # if the new corpus index would fall off the end of the
  288. # list, stop at 0
  289. if(leftCorpusIndex < 0):
  290. leftCorpusIndex = 0
  291. break
  292. leftLength += self.index.getSentenceLengths()[leftCorpusIndex]
  293. offset += self.index.getSentenceLengths()[leftCorpusIndex]
  294. leftSents += 1
  295. # while we don't have enough sentences in the right context,
  296. # keep increasing the right corpus index
  297. while rightSents < rightContextLength:
  298. rightCorpusIndex += 1
  299. try:
  300. rightLength += self.index.getSentenceLengths()[rightCorpusIndex]
  301. rightSents += 1
  302. # if the new corpus index falls off the end of the list,
  303. # stop at the end
  304. except IndexError:
  305. rightCorpusIndex -= 1
  306. break
  307. # grab all sentences from the left to right corpus indices,
  308. # then flatten them into a single list of words
  309. sents = self.sentences[leftCorpusIndex:rightCorpusIndex+1]
  310. words = []
  311. for sentence in sents:
  312. for word in sentence:
  313. words.append(word)
  314. # select the appropriate sections of context from the list
  315. # of words
  316. left = words[0:offset]
  317. target = words[offset]
  318. right = words[offset+1:]
  319. items.append((left, target, right, sentenceNum))
  320. if verbose:
  321. print "Found %d matches for target word..." % len(items)
  322. # sort the concordance
  323. if sort == self.SORT_WORD:
  324. if verbose:
  325. print "Sorting by target word..."
  326. items.sort(key=lambda i:i[1][0].lower())
  327. elif sort == self.SORT_POS:
  328. if verbose:
  329. print "Sorting by target word POS tag..."
  330. items.sort(key=lambda i:i[1][1].lower())
  331. elif sort == self.SORT_NUM:
  332. if verbose:
  333. print "Sorting by sentence number..."
  334. items.sort(key=lambda i:i[3])
  335. elif sort == self.SORT_RIGHT_CONTEXT:
  336. if verbose:
  337. print "Sorting by first word of right context..."
  338. items.sort(key=lambda i:i[2][0][0])
  339. # if any regular expressions have been given for the context, filter
  340. # the concordance using them
  341. filtered = []
  342. filterBool = False
  343. if leftRegexp != None or rightRegexp != None:
  344. filterBool = True
  345. if filterBool:
  346. leftRe=None
  347. rightRe=None
  348. if leftRegexp != None:
  349. if verbose:
  350. print "Filtering on left context..."
  351. leftRe = re.compile(leftRegexp)
  352. if rightRegexp != None:
  353. if verbose:
  354. print "Filtering on right context..."
  355. rightRe = re.compile(rightRegexp)
  356. for item in items:
  357. if self._matches(item, leftRe, rightRe):
  358. filtered.append(item)
  359. if filterBool:
  360. source = filtered
  361. else:
  362. source = items
  363. return source
  364. def format(self, source, contextChars=55, maxKeyLength=0, showWord=True,
  365. showPOS=True, flipWordAndPOS=False, verbose=False):
  366. """Formats raw concordance output produced by raw().
  367. Displays a concordance in keyword-in-context style format.
  368. @type source: list
  369. @param source: Raw concordance output to format. Expects a list of
  370. ([left context], target word, [right context], target
  371. word sentence number) tuples.
  372. @type contextChars number
  373. @param contextChars: Amount of context to show. If set to less than
  374. 0, does not limit amount of context shown (may look ugly). Defaults to 55.
  375. @type maxKeyLength: number
  376. @param maxKeyLength: Max number of characters to show for the
  377. target word. If 0 or less, this value is
  378. calculated so as to fully show all target
  379. words. Defaults to 0.
  380. @type showWord: boolean
  381. @param showWord: Whether to show words. Defaults to True.
  382. @type showPOS: boolean
  383. @param showPOS: Whether to show POS tags. Defaults to True.
  384. @type flipWordAndPOS: boolean
  385. @param flipWordAndPOS: If true, displays POS tags first instead of
  386. words (ie prints 'cc/and' instead of 'and/cc'). Defaults to False.
  387. @type verbose: boolean
  388. @param verbose: Displays some extra status information. Defaults
  389. to False.
  390. """
  391. # flatten lists of tokens into strings
  392. lines = []
  393. maxMiddleLength = -1
  394. # generate intermediate list of string tuples
  395. for line in source:
  396. # flatten left context tokens into a single string, joining words
  397. # and their POS tag with a '/' (if both are shown).
  398. left = ""
  399. for item in line[0]:
  400. if item[0] == "" and item[1] == "":
  401. left = ""
  402. elif showWord and (not showPOS):
  403. left += item[0] + " "
  404. elif (not showWord) and showPOS:
  405. left += item[1] + " "
  406. elif flipWordAndPOS:
  407. left += item[1] + "/" + item[0] + " "
  408. else:
  409. left += "/".join(item) + " "
  410. # flatten target word into a single string, joining the word and
  411. # its POS tag with a '/' (if both are shown).
  412. if showWord and (not showPOS):
  413. middle = line[1][0]
  414. elif (not showWord) and showPOS:
  415. middle = line[1][1]
  416. elif flipWordAndPOS:
  417. middle = line[1][1] + "/" + line[1][0] + " "
  418. else:
  419. middle = "/".join(line[1])
  420. if len(middle) > maxMiddleLength:
  421. maxMiddleLength = len(middle)
  422. # flatten right context tokens into a single string, joining words
  423. # and their POS tag with a '/' (if both are shown).
  424. right = ""
  425. for item in line[2]:
  426. if item[0] == "" and item[1] == "":
  427. right = ""
  428. elif showWord and (not showPOS):
  429. right += item[0] + " "
  430. elif (not showWord) and showPOS:
  431. right += item[1] + " "
  432. elif flipWordAndPOS:
  433. right += item[1] + "/" + item[0] + " "
  434. else:
  435. right += "/".join(item) + " "
  436. num = line[3]
  437. lines.append((middle, left, right, num))
  438. # crop and justify strings to generate KWIC-format output
  439. count = 0
  440. for middle, left, right, num in lines:
  441. # calculate amount of left padding needed
  442. leftPaddingLength = contextChars - len(left)
  443. if leftPaddingLength < 0:
  444. leftPaddingLength = 0
  445. if len(left) > contextChars and contextChars > -1:
  446. left = left[-contextChars:]
  447. left = " "*leftPaddingLength + left
  448. if contextChars > -1:
  449. right = right[0:contextChars]
  450. # add sentence numbers
  451. left = str(num) + ": " + left[len(str(num))+2 : ]
  452. # calculate amount of middle padding needed
  453. if maxKeyLength > 0:
  454. maxMiddleLength = maxKeyLength
  455. lPad = int(ceil(max(maxMiddleLength - len(middle), 0) / 2.0))
  456. rPad = int(floor(max(maxMiddleLength - len(middle), 0) / 2.0))
  457. middle = " "*lPad + middle + " "*rPad
  458. print left + "| " + middle + " | " + right + " "
  459. count += 1
  460. if verbose:
  461. print "\n" + repr(count) + " lines"
  462. def _matches(self, item, leftRe, rightRe):
  463. """ Private method that runs the given regexps over a raw concordance
  464. item and returns whether they match it.
  465. """
  466. left = item[0]
  467. right = item[2]
  468. # flatten left and right contexts
  469. leftString = ""
  470. for token in left:
  471. leftString += "/".join(token) + " "
  472. rightString = ""
  473. for token in right:
  474. rightString += "/".join(token) + " "
  475. # see if regexps match
  476. ok = True
  477. if leftRe != None and leftRe.match(leftString) == None:
  478. ok = False
  479. if rightRe != None and rightRe.match(rightString) == None:
  480. ok = False
  481. if ok:
  482. return True
  483. else:
  484. return False
  485. class Aggregator(object):
  486. """ Class for aggregating and summarising corpus concordance data.
  487. This class allows one or more sets of concordance data to be summarised and
  488. displayed. This is useful for corpus linguistic tasks like counting the
  489. number of occurences of a particular word and its different POS tags in a
  490. given corpus, or comparing these frequencies across different corpora. It
  491. creates a FreqDist for each set of concordance data, counting how often each
  492. unique entry appears in it.
  493. An example of how to use this class to show the frequency of the five most
  494. common digrams of the form "must/md X/Y" in the Brown Corpus sections a
  495. and g::
  496. concA = IndexConcordance(brown.tagged_sents('a'))
  497. rawA = concA.raw(middleRegexp="^must/md$", leftContextLength=0, rightContextLength=1)
  498. concG = IndexConcordance(brown.tagged_sents('g'))
  499. rawG = concG.raw(middleRegexp="^must/md$", leftContextLength=0, rightContextLength=1)
  500. agg = Aggregator()
  501. agg.add(rawA, "Brown Corpus A")
  502. agg.add(rawG, "Brown Corpus G")
  503. agg.formatted(showFirstX=5)
  504. Output:
  505. Brown Corpus A
  506. ------------------------------
  507. must/md be/be 17
  508. must/md have/hv 5
  509. must/md not/* 3
  510. must/md play/vb 2
  511. must/md ''/'' 1
  512. Brown Corpus G
  513. ------------------------------
  514. must/md be/be 38
  515. must/md have/hv 21
  516. must/md ,/, 6
  517. must/md not/* 5
  518. must/md always/rb 3
  519. """
  520. # text for 'other' row in output tables
  521. _OTHER_TEXT = "<OTHER>"
  522. # text for 'total' row in output tables
  523. _TOTAL_TEXT = "<TOTAL>"
  524. def __init__(self, inputList=None):
  525. """ Constructor.
  526. @type inputList: list
  527. @param inputList: List of (raw concordance data, name) tuples to be
  528. entered into the aggregator. Defaults to None.
  529. """
  530. self._outputSets = []
  531. if inputList != None:
  532. for (item, n) in inputList:
  533. self.add(item, name=n)
  534. def add(self, raw, name):
  535. """ Adds the given set of raw concordance output to the aggregator.
  536. @type raw: list
  537. @param raw: Raw concordance data (produced by IndexConcordance.raw()).
  538. Expects a list of ([left context], target word,
  539. [right context], target word sentence number) tuples.
  540. @type name: string
  541. @param name: Name to associate with the set of data.
  542. """
  543. self._outputSets.append((raw, name));
  544. def remove(self, name):
  545. """ Removes all sets of raw concordance output with the given name.
  546. @type name: string
  547. @param name: Name of data set to remove.
  548. """
  549. for item in self._outputSets:
  550. if item[1] == name:
  551. self._outputSets.remove(item)
  552. def formatted(self, useWord=True, usePOS=True, normalise=False,
  553. threshold=-1, showFirstX=-1, decimalPlaces=4,
  554. countOther=False, showTotal=False):
  555. """ Displays formatted concordance summary information.
  556. This is a convenience method that combines raw() and display()'s
  557. options. Unless you need raw output, this is probably the most useful
  558. method.
  559. @type useWord: boolean
  560. @param useWord: Include the words in the count. Defaults to True.
  561. @type usePOS: boolean
  562. @param usePOS: Include the POS tags in the count. Defaults to
  563. False.
  564. @type normalise: boolean
  565. @param normalise: If true, normalises the frequencies for each set
  566. of concordance output by dividing each key's frequency by the total
  567. number of samples in that concordances's FreqDist. Allows easier
  568. comparison of results between data sets. Care must be taken when
  569. combining this option with the threshold option, as any threshold
  570. of 1 or more will prevent any output being displayed. Defaults to
  571. False.
  572. @type threshold: number
  573. @param threshold: Frequency display threshold. Results below this
  574. frequency will not be displayed. If less than 0, everything will be
  575. displayed. Defaults to -1.
  576. @type showFirstX: number
  577. @param showFirstX: Only show this many results, starting with the
  578. most frequent. If less than 0, everything will be displayed.
  579. Defaults to -1.
  580. @type decimalPlaces: integer
  581. @param decimalPlaces: Number of decimal places of accuracy to
  582. display. Used when displaying non-integers with the normalise
  583. option. Defaults to 4.
  584. @type countOther: boolean
  585. @param countOther: If true, any samples not shown (due to their
  586. frequency being below the given thershold or because they were
  587. after the number of results specified by the showFirstX argument)
  588. will be combined into one sample. This sample's frequency is the
  589. sum of all unshown sample's frequencies. Defaults to False.
  590. @type showTotal: boolean
  591. @param showTotal: If true, prints the sum of all frequencies (of
  592. the entire FreqDist, not just of the samples displayed.) Defaults
  593. to False.
  594. """
  595. output, maxKeyLength = self.raw(useWord, usePOS)
  596. self.format(output, maxKeyLength, threshold, showFirstX,
  597. decimalPlaces, normalise, countOther, showTotal)
  598. def raw(self, useWord=True, usePOS=True):
  599. """ Generates raw summary information.
  600. Creates a FreqDist for each set of concordance output and uses it to
  601. count the frequency of each line in it. The concordance output is
  602. flattened from lists of tokens to strings, as lists cannot be hashed.
  603. The list of FreqDists is returned, as well as the length of the longest
  604. string (used for formatted display).
  605. @type useWord: boolean
  606. @param useWord: Include the words in the count. Defaults to True.
  607. @type usePOS: boolean
  608. @param usePOS: Include the POS tags in the count. Defaults to
  609. False.
  610. @rtype: list, number
  611. @returns: A list of (FreqDist, name) pairs, and the length of the
  612. longest key in all the FreqDists.
  613. """
  614. output = []
  615. maxKeyLength = 0
  616. # for each set of raw concordance data:
  617. for (rawConcOutput, name) in self._outputSets:
  618. # initialise a FreqDist
  619. dist = FreqDist()
  620. # for each item in the raw concordance output:
  621. for (left, middle, right, num) in rawConcOutput:
  622. # flatten the lists of tokens so they can be hashed in
  623. # the FreqDist
  624. leftList = []
  625. for word in left:
  626. if usePOS == False and useWord == True:
  627. leftList.append(word[0].lower())
  628. elif usePOS == True and useWord == False:
  629. leftList.append(word[1].lower())
  630. else:
  631. leftList.append(word[0].lower() + "/" + word[1].lower())
  632. try:
  633. if usePOS == False and useWord == True:
  634. midString = middle[0].lower()
  635. elif usePOS == True and useWord == False:
  636. midString = middle[1].lower()
  637. else:
  638. midString = middle[0].lower() + "/" + middle[1].lower()
  639. except IndexError:
  640. midString = ""
  641. rightList = []
  642. for word in right:
  643. if usePOS == False and useWord == True:
  644. rightList.append(word[0].lower())
  645. elif usePOS == True and useWord == False:
  646. rightList.append(word[1].lower())
  647. else:
  648. rightList.append(word[0].lower() + "/" + word[1].lower())
  649. # join the tokens together to form a key string
  650. key = string.join(leftList) + " " + midString + " " + string.join(rightList)
  651. # keep track of the longest key length
  652. if len(key) > maxKeyLength:
  653. maxKeyLength = len(key)
  654. # increment the FreqDist's count for this key
  655. dist.inc(key)
  656. # add this FreqDist and name to the output
  657. output.append((dist, name))
  658. # return the output and maximum key length
  659. return output, maxKeyLength
  660. def format(self, output, maxKeyLength=20, threshold=-1, showFirstX=-1,
  661. decimalPlaces=4, normalise=False, countOther=False,
  662. showTotal=False):
  663. """ Displays concordance summary information.
  664. Formats and displays information produced by raw().
  665. @type output: list
  666. @param output: List of (FreqDist, name) pairs (as produced by raw()).
  667. @type maxKeyLength: number
  668. @param maxKeyLength: Length of longest key. Defaults to 20.
  669. @type normalise: boolean
  670. @param normalise: If true, normalises the frequencies for each set
  671. of concordance output by dividing each key's frequency by the total
  672. number of samples in that concordances's FreqDist. Allows easier
  673. comparison of results between data sets. Care must be taken when
  674. combining this option with the threshold option, as any threshold
  675. of 1 or more will prevent any output being displayed. Defaults to
  676. False.
  677. @type threshold: number
  678. @param threshold: Frequency display threshold. Results below this
  679. frequency will not be displayed. If less than 0, everything will be
  680. displayed. Defaults to -1.
  681. @type showFirstX: number
  682. @param showFirstX: Only show this many results, starting with the
  683. most frequent. If less than 0, everything will be displayed.
  684. Defaults to -1.
  685. @type decimalPlaces: integer
  686. @param decimalPlaces: Number of decimal places of accuracy to
  687. display. Used when displaying non-integers with the normalise
  688. option. Defaults to 4.
  689. @type countOther: boolean
  690. @param countOther: If true, any samples not shown (due to their
  691. frequency being below the given thershold or because they were
  692. after the number of results specified by the showFirstX argument)
  693. will be combined into one sample. This sample's frequency is the
  694. sum of all unshown sample's frequencies. Defaults to False.
  695. @type showTotal: boolean
  696. @param showTotal: If true, prints the sum of all frequencies (of
  697. the entire FreqDist, not just of the samples displayed.) Defaults
  698. to False.
  699. """
  700. # for each FreqDist:
  701. for (dist, name) in output:
  702. x = 0
  703. other = 0
  704. total = 0
  705. print name
  706. print "-"*(maxKeyLength + 7)
  707. # for each key:
  708. for key in dist.keys():
  709. # keep track of how many samples shown, if using the showFirstX
  710. # option
  711. #if showFirstX > 0 and x >= showFirstX:
  712. # break
  713. # get and format the sample's frequency
  714. if normalise:
  715. count = 1.0 * dist[key] / dist.N()
  716. countString = str(count)[0:decimalPlaces + 2]
  717. else:
  718. count = dist[key]
  719. countString = str(count)
  720. total += count
  721. # if the count is less than the threshold value, or we've
  722. # already shown X samples, add this sample's frequency to the
  723. # 'other' bin
  724. if count < threshold or (showFirstX > 0 and x >= showFirstX):
  725. other += count
  726. else:
  727. print key + " "*(maxKeyLength - len(key) + 1) + countString
  728. x += 1
  729. if countOther:
  730. if normalise:
  731. count = 1.0 * other
  732. countString = str(count)[0:decimalPlaces + 2]
  733. else:
  734. count = other
  735. countString = str(count)
  736. print self._OTHER_TEXT + " "*(maxKeyLength - len(self._OTHER_TEXT) + 1) + countString
  737. if showTotal:
  738. if normalise:
  739. count = 1.0 * total
  740. countString = str(count)[0:decimalPlaces + 2]
  741. else:
  742. count = total
  743. countString = str(count)
  744. print self._TOTAL_TEXT + " "*(maxKeyLength - len(self._TOTAL_TEXT) + 1) + countString
  745. print ""
  746. def demo():
  747. """
  748. Demonstrates how to use IndexConcordance and Aggregator.
  749. """
  750. print "Reading Brown Corpus into memory..."
  751. corpus = brown.tagged_sents('a')
  752. print "Generating index..."
  753. ic = IndexConcordance(corpus)
  754. print "Showing all occurences of 'plasma' in the Brown Corpus..."
  755. ic.formatted(middleRegexp="^plasma/.*", verbose=True)
  756. print "Investigating the collocates of 'deal' and derivatives..."
  757. agg = Aggregator()
  758. agg.add(ic.raw(middleRegexp="^deal", leftContextLength=1, rightContextLength=0,
  759. leftRegexp="^(\w|\s|/)*$"), "Brown Corpus 'deal' left collocates")
  760. agg.add(ic.raw(middleRegexp="^deal", leftContextLength=0, rightContextLength=1,
  761. rightRegexp="^(\w|\s|/)*$"), "Brown Corpus 'deal' right collocates")
  762. agg.formatted(showFirstX=5, usePOS=False)
  763. if __name__ == '__main__':
  764. demo()