/tagger.py

https://github.com/BankFacil/tagger
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (C) 2011 by Alessandro Presta

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE
'''
======
tagger
======

Module for extracting tags from text documents.

Copyright (C) 2011 by Alessandro Presta

Configuration
=============

Dependencies:
python2.7, stemming, nltk (optional), lxml (optional), tkinter (optional)

You can install the stemming package with::

    $ easy_install stemming

Usage
=====

Tagging a text document from Python::

    import pickle
    import tagger

    weights = pickle.load(open('data/dict.pkl', 'rb')) # or your own dictionary

    myreader = tagger.Reader() # or your own reader class
    mystemmer = tagger.Stemmer() # or your own stemmer class
    myrater = tagger.Rater(weights) # or your own... (you get the idea)

    mytagger = tagger.Tagger(myreader, mystemmer, myrater)

    best_3_tags = mytagger(text_string, 3)

Running the module as a script::

    $ ./tagger.py <text document(s) to tag>

Example::

    $ ./tagger.py tests/*
    Loading dictionary...
    Tags for tests/bbc1.txt :
    ['bin laden', 'obama', 'pakistan', 'killed', 'raid']
    Tags for tests/bbc2.txt :
    ['jo yeates', 'bristol', 'vincent tabak', 'murder', 'strangled']
    Tags for tests/bbc3.txt :
    ['snp', 'party', 'election', 'scottish', 'labour']
    Tags for tests/guardian1.txt :
    ['bin laden', 'al-qaida', 'killed', 'pakistan', 'al-fawwaz']
    Tags for tests/guardian2.txt :
    ['clegg', 'tory', 'lib dem', 'party', 'coalition']
    Tags for tests/post1.txt :
    ['sony', 'stolen', 'playstation network', 'hacker attack', 'lawsuit']
    Tags for tests/wikipedia1.txt :
    ['universe', 'anthropic principle', 'observed', 'cosmological', 'theory']
    Tags for tests/wikipedia2.txt :
    ['beetroot', 'beet', 'betaine', 'blood pressure', 'dietary nitrate']
    Tags for tests/wikipedia3.txt :
    ['the lounge lizards', 'jazz', 'john lurie', 'musical', 'albums']
'''
from __future__ import division

import collections
import re
import nltk
import unicodedata

class Tag:
    '''
    General class for tags (small units of text)
    '''

    def __init__(self, string, stem=None, rating=1.0, proper=False,
                 terminal=False):
        '''
        @param string:   the actual representation of the tag
        @param stem:     the internal (usually stemmed) representation;
                         tags with the same stem are regarded as equal
        @param rating:   a measure of the tag's relevance in the interval [0,1]
        @param proper:   whether the tag is a proper noun
        @param terminal: set to True if the tag is at the end of a phrase
                         (or anyway it cannot be logically merged with the
                         following one)

        @returns: a new L{Tag} object
        '''

        self.string = string
        self.stem = stem or string
        self.rating = rating
        self.proper = proper
        self.terminal = terminal

    def __eq__(self, other):
        return self.stem == other.stem

    def __repr__(self):
        return repr(self.string)

    def __lt__(self, other):
        return self.rating > other.rating

    def __hash__(self):
        return hash(self.stem)

class MultiTag(Tag):
    '''
    Class for aggregates of tags (usually next to each other in the document)
    '''

    def __init__(self, tail, head=None):
        '''
        @param tail: the L{Tag} object to add to the first part (head)
        @param head: the (possibly absent) L{MultiTag} to be extended

        @returns: a new L{MultiTag} object
        '''

        if not head:
            Tag.__init__(self, tail.string, tail.stem, tail.rating,
                         tail.proper, tail.terminal)
            self.size = 1
            self.subratings = [self.rating]
        else:
            self.string = ' '.join([head.string, tail.string])
            self.stem = ' '.join([head.stem, tail.stem])
            self.size = head.size + 1
            self.proper = (head.proper and tail.proper)
            self.terminal = tail.terminal
            self.subratings = head.subratings + [tail.rating]
            self.rating = self.combined_rating()

    def combined_rating(self):
        '''
        Method that computes the multitag's rating from the ratings of unit
        subtags

        (the default implementation uses the geometric mean - with a special
        treatment for proper nouns - but this method can be overridden)

        @returns: the rating of the multitag
        '''

        # by default, the rating of a multitag is the geometric mean of its
        # unit subtags' ratings
        product = reduce(lambda x, y: x * y, self.subratings, 1.0)
        root = self.size

        # but proper nouns shouldn't be penalized by stopwords
        if product == 0.0 and self.proper:
            nonzero = [r for r in self.subratings if r > 0.0]
            if len(nonzero) == 0:
                return 0.0
            product = reduce(lambda x, y: x * y, nonzero, 1.0)
            root = len(nonzero)

        return product ** (1.0 / root)
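
    # A quick illustration (added for clarity, not in the original source):
    # for subratings [0.5, 0.2] the combined rating is the geometric mean
    # (0.5 * 0.2) ** (1.0 / 2) ~= 0.32; if the multitag is a proper noun and
    # one subrating is 0.0 (a stopword), the mean is taken over the nonzero
    # subratings only, so the stopword does not zero out the whole phrase.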

class Reader:
    '''
    Class for parsing a string of text to obtain tags

    (it just turns the string to lowercase and splits it according to
    whitespaces and punctuation, identifying proper nouns and terminal words;
    different rules and formats other than plain text could be used)
    '''

    match_apostrophes = re.compile(r'`|’', re.UNICODE)
    match_paragraphs = re.compile(r'[\.\?!\t\n\r\f\v]+', re.UNICODE)
    match_phrases = re.compile(r'[,;:\(\)\[\]\{\}<>]+', re.UNICODE)
    match_words = re.compile(r'[\w\-\'_/&]+', re.UNICODE)

    def __call__(self, text):
        '''
        @param text: the string of text to be tagged

        @returns: a list of tags respecting the order in the text
        '''

        text = self.preprocess(text)

        # split by full stops, newlines, question marks...
        paragraphs = self.match_paragraphs.split(text)

        tags = []

        for par in paragraphs:
            # split by commas, colons, parentheses...
            phrases = self.match_phrases.split(par)

            if len(phrases) > 0:
                # first phrase of a paragraph
                words = self.match_words.findall(phrases[0])
                if len(words) > 1:
                    tags.append(Tag(words[0].lower()))
                    for w in words[1:-1]:
                        tags.append(Tag(w.lower(), proper=w[0].isupper()))
                    tags.append(Tag(words[-1].lower(),
                                    proper=words[-1][0].isupper(),
                                    terminal=True))
                elif len(words) == 1:
                    tags.append(Tag(words[0].lower(), terminal=True))

            # following phrases
            for phr in phrases[1:]:
                words = self.match_words.findall(phr)
                if len(words) > 1:
                    for w in words[:-1]:
                        tags.append(Tag(w.lower(), proper=w[0].isupper()))
                if len(words) > 0:
                    tags.append(Tag(words[-1].lower(),
                                    proper=words[-1][0].isupper(),
                                    terminal=True))

        return tags
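
    # A sketch of the behaviour (added for clarity, not in the original
    # source); the exact Tag flags depend on capitalization and punctuation:
    #
    #   >>> Reader()('Hello there, World!')
    #   ['hello', 'there', 'world']
    #
    # 'there' closes the first phrase, so it is marked terminal; 'World' is
    # flagged as a proper-noun candidate because of its capital initial,
    # while the paragraph-initial 'Hello' is not.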

    def preprocess(self, text):
        '''
        @param text: a string containing the text document to perform any
                     required transformation before splitting

        @returns: the processed text
        '''

        text = self.match_apostrophes.sub('\'', text)

        return text

class Stemmer:
    '''
    Class for extracting the stem of a word

    (by default it uses a simple open-source implementation of Porter's
    algorithm; this can be improved a lot, so experimenting with different
    ones is advisable; nltk.stem provides different algorithms for many
    languages)
    '''

    match_contractions = re.compile(r'(\w+)\'(m|re|d|ve|s|ll|t)?', re.UNICODE)
    match_hyphens = re.compile(r'\b[\-_]\b', re.UNICODE)

    def __init__(self, stemmer=None):
        '''
        @param stemmer: an object or module with a 'stem' method (defaults to
                        stemming.porter2)

        @returns: a new L{Stemmer} object
        '''

        if not stemmer:
            from stemming import porter2
            stemmer = porter2
        self.stemmer = stemmer

    def __call__(self, tag):
        '''
        @param tag: the tag to be stemmed

        @returns: the stemmed tag
        '''

        string = self.preprocess(tag.string)
        tag.stem = self.stemmer.stem(string)

        return tag

    def preprocess(self, string):
        '''
        @param string: a string to be treated before passing it to the stemmer

        @returns: the processed string
        '''

        # delete hyphens and underscores
        string = self.match_hyphens.sub('', string)

        # get rid of contractions and possessive forms
        match = self.match_contractions.match(string)
        if match:
            string = match.group(1)

        return string
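
    # Hypothetical inputs illustrating the preprocessing (added for clarity,
    # not in the original source):
    #
    #   >>> Stemmer().preprocess("editor's")
    #   'editor'
    #   >>> Stemmer().preprocess('e-mail')
    #   'email'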

class Rater:
    '''
    Class for estimating the relevance of tags

    (the default implementation uses TF (term frequency) multiplied by
    weight, but any other reasonable measure is fine; a quite rudimentary
    heuristic tries to discard redundant tags)
    '''

    numbers = re.compile(r'\d+$|[\d\s\/\-]+$|\d+\s*.*', re.UNICODE)
    hyphens = re.compile(r'-\s.*|.*\s\-|-', re.UNICODE)

    def __init__(self, weights, blacklist=(), synonyms=(), multitag_size=3):
        '''
        @param weights:       a dictionary of weights normalized in the
                              interval [0,1]
        @param blacklist:     a collection of stems to be discarded from the
                              results
        @param synonyms:      a collection of synonym stems (stored but not
                              used by this class; synonym handling happens in
                              the script section below)
        @param multitag_size: maximum size of tags formed by multiple unit
                              tags

        @returns: a new L{Rater} object
        '''

        self.weights = weights
        self.blacklist = blacklist
        self.synonyms = synonyms
        self.multitag_size = multitag_size

    def __call__(self, tags):
        '''
        @param tags: a list of (preferably stemmed) tags

        @returns: a list of unique (multi)tags sorted by relevance
        '''

        self.rate_tags(tags)
        multitags = self.create_multitags(tags)

        # keep most frequent version of each tag
        clusters = collections.defaultdict(collections.Counter)
        proper = collections.defaultdict(int)
        ratings = collections.defaultdict(float)

        for t in multitags:
            clusters[t][t.string] += 1
            if t.proper:
                proper[t] += 1
            ratings[t] = max(ratings[t], t.rating)

        term_count = collections.Counter(multitags)

        for t, cnt in term_count.iteritems():
            t.string = clusters[t].most_common(1)[0][0]
            proper_freq = proper[t] / cnt
            if proper_freq >= 0.5:
                t.proper = True
            t.rating = ratings[t]

        # purge duplicates, one-character tags and stopwords
        unique_tags = set(t for t in term_count
                          if self.verify_valid_tag(t))

        # purge blacklisted stems
        for t in term_count:
            if t.stem in self.blacklist:
                unique_tags.discard(t)

        # remove redundant tags
        for t, cnt in term_count.iteritems():
            words = t.stem.split()
            for l in xrange(1, len(words)):
                for i in xrange(len(words) - l + 1):
                    s = Tag(' '.join(words[i:i + l]))
                    relative_freq = cnt / term_count[s]
                    if ((relative_freq == 1.0 and t.proper) or
                        (relative_freq >= 0.5 and t.rating > 0.0)):
                        unique_tags.discard(s)
                    else:
                        unique_tags.discard(t)

        return sorted(unique_tags)
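
    # An illustration of the pruning heuristic (added for clarity, not in
    # the original source): if the proper noun 'bin laden' appears exactly
    # as often as 'bin' and as 'laden' on their own, the unit tags are
    # discarded as redundant; conversely, a multitag much rarer than its
    # components is discarded in favour of the components themselves.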

    def verify_valid_tag(self, t):
        '''
        @param t: the tag to check

        @returns: True if the tag is longer than one character, has a
                  positive rating and is neither a bare number nor a
                  dangling hyphen
        '''

        step1 = len(t.string) > 1 and t.rating > 0.0
        step2 = (not self.numbers.match(t.string) and
                 not self.hyphens.match(t.string))
        return step1 and step2

    def rate_tags_by_count(self, tags):
        '''
        @param tags: a list of tags to be assigned a rating

        (count-based variant added for the BKF use case: the rating is the
        raw term count, except that stems with a non-positive weight are
        rated 0)
        '''

        term_count = collections.Counter(tags)

        for t in tags:
            if self.weights.get(t.stem, 1.0) <= 0.0:
                t.rating = 0
            else:
                t.rating = term_count[t] #/ len(tags) * self.weights.get(t.stem, 1.0)

    def rate_tags(self, tags):
        '''
        @param tags: a list of tags to be assigned a rating
        '''

        term_count = collections.Counter(tags)

        for t in tags:
            # rating of a single tag is term frequency * weight
            t.rating = term_count[t] / len(tags) * self.weights.get(t.stem, 1.0)
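
    # Worked example (added for clarity, not in the original source): in a
    # document of 100 tags, a stem occurring 4 times with dictionary weight
    # 0.5 is rated 4 / 100 * 0.5 = 0.02; stems missing from the weights
    # dictionary default to weight 1.0.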

    def create_multitags(self, tags):
        '''
        @param tags: a list of tags (respecting the order in the text)

        @returns: a list of multitags
        '''

        multitags = []

        for i in xrange(len(tags)):
            t = MultiTag(tags[i])
            multitags.append(t)
            for j in xrange(1, self.multitag_size):
                if t.terminal or i + j >= len(tags):
                    break
                else:
                    t = MultiTag(tags[i + j], t)
                    multitags.append(t)

        return multitags
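
    # An illustration (added for clarity, not in the original source): with
    # multitag_size=3 and the tag sequence [a, b, c] containing no terminal
    # tags before c, the candidates are [a, 'a b', 'a b c', b, 'b c', c];
    # a terminal tag stops the window, so phrase boundaries are not crossed.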

class MyPtStemmer(Stemmer):
    '''
    Stemmer for Portuguese text, based on NLTK's RSLP stemmer
    (it skips the English-specific contraction/hyphen preprocessing)
    '''

    def __init__(self):
        Stemmer.__init__(self, nltk.stem.RSLPStemmer())

    def preprocess(self, string):
        return string

class Tagger:
    '''
    Master class for tagging text documents

    (this is a simple interface that should allow convenient experimentation
    by using different classes as building blocks)
    '''

    def __init__(self, reader, stemmer, rater):
        '''
        @param reader: a L{Reader} object
        @param stemmer: a L{Stemmer} object
        @param rater: a L{Rater} object

        @returns: a new L{Tagger} object
        '''

        self.reader = reader
        self.stemmer = stemmer
        self.rater = rater

    def __call__(self, text, tags_number=5):
        '''
        @param text: the string of text to be tagged
        @param tags_number: number of best tags to be returned

        @returns: a list of (hopefully) relevant tags
        '''

        tags = self.reader(text)
        tags = map(self.stemmer, tags)
        tags = self.rater(tags)

        return tags[:tags_number]
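
    # Minimal end-to-end sketch (added for clarity, not in the original
    # source; it assumes a pickled weights dictionary such as data/dict.pkl):
    #
    #   >>> import pickle
    #   >>> weights = pickle.load(open('data/dict.pkl', 'rb'))
    #   >>> mytagger = Tagger(Reader(), Stemmer(), Rater(weights))
    #   >>> mytagger(some_text_string, 5)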

if __name__ == '__main__':

    import glob
    import pickle
    import sys
    import getopt
    import codecs

    from unidecode import unidecode

    dict_file = 'dict'
    n_tags = 5
    # document filenames are expected to end in _<numeric id>
    id_regexp = re.compile(r'.*_(\d*)', re.UNICODE)
    all_tags = {}
    all_ids = []
    stem_synonyms = {}

    if len(sys.argv) < 2:
        print 'No arguments given, running tests: '
        documents = glob.glob('tests/*')
    else:
        options = getopt.getopt(sys.argv[1:], 'd:n:')
        if len(options[0]) > 0:
            dict_file = options[0][0][1]
        if len(options[0]) > 1:
            n_tags = int(options[0][1][1])
        documents = options[1]

    from extras import SimpleReader

    stemmer = MyPtStemmer()
    reader = SimpleReader()

    with codecs.open('data/blacklist.txt', 'r', 'utf-8') as bl:
        blacklist = reader(bl.read())
        blacklist = [w.stem for w in map(stemmer, blacklist)]

    with codecs.open('data/synonyms.txt', 'r', 'utf-8') as sn:
        synonyms = reader(sn.read())
        synonyms = [w.stem for w in map(stemmer, synonyms)]

    print 'Loading dictionary... '
    weights = pickle.load(open('data/%s.pkl' % dict_file, 'rb'))

    tagger = Tagger(Reader(), stemmer, Rater(weights, blacklist, synonyms))

    for doc in documents:
        with codecs.open(doc, 'r', 'utf-8') as file:
            file_contents = file.read()

        tags = tagger(file_contents, n_tags)
        article_id = int(id_regexp.match(doc).group(1))
        print 'Tags for ', article_id, ':'
        all_ids.append(article_id)

        # handle synonyms: group the accepted spellings of each stem and
        # record which articles each stem was assigned to
        for tag in tags:
            stem = unidecode(tag.stem)
            if not all_tags.get(stem):
                all_tags[stem] = set()
            all_tags[stem].add(article_id)
            if not stem_synonyms.get(stem):
                stem_synonyms[stem] = []
            tag_string = unidecode(tag.string)
            if tag_string not in stem_synonyms[stem]:
                stem_synonyms[stem].append(tag_string)

        print tags

    from matrix_builder import print_normalized_sparse_matrix
    from matrix_builder import print_synonyms
    from link_matrix import print_normalized_link_matrix
    from link_matrix import print_math_prog
    from tag_exporter import print_tags
    from tag_exporter import print_whitelist_tags

    # keep the most widely used stems (as many as there are documents)
    top_tags = dict(sorted(all_tags.items(),
                           key=lambda y: -1 * len(y[1]))[0:len(all_ids)])

    print_normalized_sparse_matrix(top_tags, sorted(all_ids), stem_synonyms)
    print_synonyms(stem_synonyms)
    print_math_prog(top_tags, sorted(all_ids))
    print_tags(top_tags)
    print_whitelist_tags(all_tags)