/helpers/hadoop/wikipedia/lighttag/utils.py

https://github.com/champ1/twittomatic · Python · 33 lines · 27 code · 6 blank · 0 comment · 6 complexity · 07db4d7ff424ace33513792170751a58 MD5 · raw file

  1. import gzip
  2. import json
  3. import datetime
  4. from contextlib import contextmanager
  5. @contextmanager
  6. def profiled(str):
  7. start = datetime.datetime.now()
  8. yield start
  9. diff = datetime.datetime.now() - start
  10. print(str % diff)
  11. def iterate_anchors(anchorfile):
  12. with gzip.open(anchorfile, 'r') as inputfile:
  13. for line in inputfile:
  14. anchor = json.loads(line.strip())
  15. label = anchor['anchor']
  16. pages = anchor['pages']
  17. yield label.lower(), pages
  18. def iterate_mappings(titlesfile):
  19. with gzip.open(titlesfile, 'r') as inputfile:
  20. for line in inputfile:
  21. page = json.loads(line.strip())
  22. yield page['id'], page['name'], page['title'], page['length']
  23. def iterate_templates(templatefile):
  24. with gzip.open(templatefile, 'r') as inputfile:
  25. for line in inputfile:
  26. page = json.loads(line.strip())
  27. yield page['id'], page['templates']