
/helpers/graphs/hashtag-entity/render.py

https://github.com/champ1/twittomatic
# encoding=utf8
"""
Simple script that takes the TSV annotation file as input and generates the graphml file
"""

import gzip
from collections import defaultdict


class Renderer(object):
    def __init__(self, options):
        self.inputfile = options.inputfile
        self.outputfile = options.outputfile
        self.skip_single = options.skip_single

        if options.blacklist:
            self.blacklist = self.load_blacklist(options.blacklist)
        else:
            self.blacklist = set()

    def load_blacklist(self, blacklistfile):
        # One blacklisted Wikipedia page title per line.
        titles = set()
        with open(blacklistfile, 'r') as handle:
            for title in handle:
                titles.add(title.strip())
        return titles

    def iterate(self):
        # Yield (hashtag, pages) groups; the input is expected to be sorted by hashtag.
        with gzip.open(self.inputfile, 'rt') as inputfile:  # text mode so lines are str
            prevhashtag = None
            pages = []

            for line in inputfile:
                try:
                    hashtag, wid, rho, title = line.strip().split('\t', 3)
                except ValueError:
                    # The title column is optional.
                    hashtag, wid, rho = line.strip().split('\t', 2)
                    title = ''

                hashtag = "#" + hashtag

                if prevhashtag == hashtag:
                    pages.append((int(wid), float(rho), title))
                else:
                    if prevhashtag:
                        yield prevhashtag, pages
                    prevhashtag = hashtag
                    pages = [(int(wid), float(rho), title)]

            # Flush the last group.
            if prevhashtag:
                yield prevhashtag, pages

    def run(self):
        with gzip.open(self.outputfile, 'wt') as outputfile:
            for count, (hashtag, pages) in enumerate(self.iterate()):
                if self.skip_single and len(pages) <= 1:
                    continue

                counters = defaultdict(list)
                mappings = {}

                # Collect the rho scores per Wikipedia page, skipping blacklisted titles.
                for wid, rho, title in filter(lambda x: x[2] not in self.blacklist, pages):
                    counters[wid].append(rho)
                    mappings[wid] = title

                for wid, rhos in sorted(counters.items()):
                    rhos.sort()
                    # The Wikipedia id is shifted by 100000000, presumably to keep
                    # page node ids disjoint from the hashtag ids.
                    line = "%d\t%d\t%s\t%s\t%s\n" % (count + 1, wid + 100000000,
                                                     ':'.join(map(str, rhos)),
                                                     hashtag,
                                                     mappings[wid])
                    outputfile.write(line)


if __name__ == "__main__":
    from optparse import OptionParser

    parser = OptionParser(description="Read the TSV annotation file and generate the final graph")
    parser.add_option("-i", "--input", dest="inputfile",
                      help="Annotation file in tsv.gz format")
    parser.add_option("-o", "--output", dest="outputfile",
                      help="Output file")
    parser.add_option("-s", "--skip-single", dest="skip_single", action="store_true",
                      help="Skip edges with just one annotation")
    parser.add_option("-b", "--blacklist", dest="blacklist",
                      help="Specify a blacklist file containing Wikipedia pages to be ignored")

    (options, args) = parser.parse_args()

    if options.inputfile and options.outputfile:
        app = Renderer(options)
        app.run()
    else:
        parser.print_help()
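
For reference, a minimal sketch of driving the script end to end. The file names (annotations.tsv.gz, graph.tsv.gz), the sample rows, and the Wikipedia ids are hypothetical; the only assumptions taken from the code above are that the input rows follow the hashtag<TAB>wid<TAB>rho<TAB>title layout, sorted by hashtag, and that the file shown here is importable as render.py.

    # demo.py -- hypothetical driver for render.py (names and data made up for illustration)
    import gzip
    from render import Renderer

    class Options(object):
        inputfile = "annotations.tsv.gz"   # hypothetical input path
        outputfile = "graph.tsv.gz"        # hypothetical output path
        skip_single = True                 # drop hashtags annotated with a single page
        blacklist = None                   # no blacklist file

    # Write a tiny, made-up annotation file: hashtag, Wikipedia id, rho, title,
    # already sorted by hashtag as iterate() expects.
    rows = [
        ("python", 23862, 0.91, "Python (programming language)"),
        ("python", 23862, 0.87, "Python (programming language)"),
        ("python", 18942, 0.42, "Monty Python"),
    ]
    with gzip.open(Options.inputfile, "wt") as handle:
        for hashtag, wid, rho, title in rows:
            handle.write("%s\t%d\t%.2f\t%s\n" % (hashtag, wid, rho, title))

    Renderer(Options()).run()

    # graph.tsv.gz now holds one tab-separated edge line per (hashtag, page) pair, e.g.
    # 1    100023862    0.87:0.91    #python    Python (programming language)

Equivalently, the same run corresponds to invoking the script with -i, -o and -s on the command line, as described by the OptionParser help.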