PageRenderTime 58ms CodeModel.GetById 30ms RepoModel.GetById 0ms app.codeStats 0ms

/helpers/hadoop/wikipedia/lighttag/construct.py

https://github.com/champ1/twittomatic
Python | 35 lines | 22 code | 11 blank | 2 comment | 7 complexity | b82503e189510df004e04eddb184c472 MD5 | raw file
  1. # encoding=utf8
  2. import sys
  3. import gzip
  4. import json
  5. from utils import profiled
  6. def extract_trie(filename='anchors/anchors.gz',
  7. outfilename='anchors.trie', stopafter=sys.maxint):
  8. # Create a complete trie
  9. trie = datrie.Trie([chr(x) for x in range(1, 255)])
  10. with gzip.open(filename, 'r') as inputfile:
  11. skipped = 0
  12. for count, line in enumerate(inputfile):
  13. anchor = json.loads(line.strip())
  14. label = anchor['anchor'].lower()
  15. if not len(label.split()) <= 3 or \
  16. not len(label) >= 3:
  17. skipped += 1
  18. continue
  19. trie[label] = anchor['pages']
  20. if count >= stopafter:
  21. break
  22. if count % 10000 == 0:
  23. print "Anchors: Loaded %d, Skipped %d" % (count, skipped)
  24. trie.save(outfilename)