PageRenderTime 51ms CodeModel.GetById 23ms RepoModel.GetById 1ms app.codeStats 0ms

/helpers/hadoop/wikipedia/lighttag/utils.py

https://github.com/champ1/twittomatic
Python | 33 lines | 27 code | 6 blank | 0 comment | 6 complexity | 07db4d7ff424ace33513792170751a58 MD5 | raw file
  1. import gzip
  2. import json
  3. import datetime
  4. from contextlib import contextmanager
  5. @contextmanager
  6. def profiled(str):
  7. start = datetime.datetime.now()
  8. yield start
  9. diff = datetime.datetime.now() - start
  10. print(str % diff)
  11. def iterate_anchors(anchorfile):
  12. with gzip.open(anchorfile, 'r') as inputfile:
  13. for line in inputfile:
  14. anchor = json.loads(line.strip())
  15. label = anchor['anchor']
  16. pages = anchor['pages']
  17. yield label.lower(), pages
  18. def iterate_mappings(titlesfile):
  19. with gzip.open(titlesfile, 'r') as inputfile:
  20. for line in inputfile:
  21. page = json.loads(line.strip())
  22. yield page['id'], page['name'], page['title'], page['length']
  23. def iterate_templates(templatefile):
  24. with gzip.open(templatefile, 'r') as inputfile:
  25. for line in inputfile:
  26. page = json.loads(line.strip())
  27. yield page['id'], page['templates']