construct.py | searchcode

/helpers/hadoop/wikipedia/lighttag/construct.py

https://github.com/champ1/twittomatic
Python | 35 lines | 22 code | 11 blank | 2 comment | 7 complexity | b82503e189510df004e04eddb184c472 MD5 | raw file


# encoding=utf8

import sys
import gzip
import json

from utils import profiled

def extract_trie(filename='anchors/anchors.gz',
                 outfilename='anchors.trie', stopafter=sys.maxint):

    # Create a complete trie
    trie = datrie.Trie([chr(x) for x in range(1, 255)])

    with gzip.open(filename, 'r') as inputfile:
        skipped = 0

        for count, line in enumerate(inputfile):
            anchor = json.loads(line.strip())
            label = anchor['anchor'].lower()

            if not len(label.split()) <= 3 or \
               not len(label) >= 3:
                skipped += 1
                continue

            trie[label] = anchor['pages']

            if count >= stopafter:
                break

            if count % 10000 == 0:
                print "Anchors: Loaded %d, Skipped %d" % (count, skipped)

    trie.save(outfilename)