PageRenderTime 42ms CodeModel.GetById 11ms RepoModel.GetById 0ms app.codeStats 0ms

/helpers/hadoop/wikipedia/anchors/extract-lp.py

https://github.com/champ1/twittomatic
Python | 32 lines | 25 code | 7 blank | 0 comment | 8 complexity | cad3ef2d8315e0fe6ac8b6604b62c65a MD5 | raw file
  1. import sys
  2. def all_anchors(anchors):
  3. with open(anchors, 'r') as inputfile:
  4. for line in inputfile:
  5. anchorname = line.rstrip('\n')[len("anchor:"):]
  6. yield anchorname
  7. ianchor = all_anchors(sys.argv[1])
  8. current = ianchor.next()
  9. for line in sys.stdin:
  10. anchor, lp = line.rstrip('\n').rsplit('\t', 1)
  11. lp = int(lp)
  12. finished = False
  13. try:
  14. while not finished:
  15. ret = cmp(current, anchor)
  16. if ret < 0:
  17. current = ianchor.next()
  18. continue
  19. elif ret == 0:
  20. print "%s\t%s" % (anchor, lp)
  21. current = ianchor.next()
  22. continue
  23. finished = True
  24. except StopIteration:
  25. break