PageRenderTime 46ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/helpers/graphs/hashtag-entity/extract.py

https://github.com/champ1/twittomatic
Python | 36 lines | 23 code | 8 blank | 5 comment | 7 complexity | 1a197745dedbdb21c8151271b0d6956f MD5 | raw file
  1. """
  2. Script that reads an annotation file and outputs a tab-separated stream.
  3. Use it in conjunction with sort -k1 -u.
  4. """
  5. import sys
  6. import json
  7. import gzip
  8. def read(inputfile):
  9. with gzip.open(inputfile, 'r') as input:
  10. for line in input:
  11. obj = json.loads(line.strip())
  12. hts = obj['hts']
  13. annotations = obj['annotations']
  14. for ht in sorted(hts):
  15. for annotation in sorted(annotations):
  16. line = "%s\t%s\t%s\t%s" % (ht, annotation[0], annotation[1], annotation[2])
  17. print line.encode('utf8')
  18. if __name__ == "__main__":
  19. from optparse import OptionParser
  20. parser = OptionParser(description="Read the annotation file and outputs TSV")
  21. parser.add_option("-i", "--input", dest="inputfile",
  22. help="Annotation file in json.gz format")
  23. (options, args) = parser.parse_args()
  24. if options.inputfile:
  25. read(options.inputfile)
  26. else:
  27. parser.print_help()