/helpers/graphs/hashtag-entity/render-graphml.py

https://github.com/champ1/twittomatic · Python · 148 lines · 113 code · 32 blank · 3 comment · 15 complexity · 8a50b35c35da9ad8f68dc326a7e550e8 MD5 · raw file

  1. """
  2. Simple script that outputs a GraphML file
  3. """
  4. import gzip
  5. from xml.sax.saxutils import XMLGenerator
  6. from xml.sax.xmlreader import AttributesNSImpl
  7. class XMLWriter(object):
  8. def __init__(self, output):
  9. self.output = XMLGenerator(output, "utf-8")
  10. self.level = 0
  11. def start_element(self, name, attrs, nochar=False):
  12. self.output.characters(' ' * self.level)
  13. self.output.startElementNS((None, name), name, self.attrs(attrs))
  14. if not nochar:
  15. self.output.characters('\n')
  16. self.level += 1
  17. def end_element(self, name, nochar=False):
  18. self.level -= 1
  19. if not nochar:
  20. self.output.characters(' ' * self.level)
  21. self.output.endElementNS((None, name), name)
  22. self.output.characters('\n')
  23. def start_document(self, keys):
  24. self.start_element('graphml', {
  25. "xmlns": u"http://graphml.graphdrawing.org/xmlns",
  26. "xmlns:xsi": u"http://www.w3.org/2001/XMLSchema-instance",
  27. "xsi:schemaLocation": u"http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd",
  28. })
  29. for kid, kfor, kname, ktype in keys:
  30. self.start_element('key', {
  31. "id": kname,
  32. "for": kfor,
  33. "attr.name": kname,
  34. "attr.type": ktype,
  35. })
  36. self.end_element('key')
  37. self.start_element('graph', {
  38. 'id': 'G',
  39. 'edgedefault': 'undirected',
  40. })
  41. def end_document(self):
  42. self.end_element('graph')
  43. self.output.endElementNS((None, u'graphml'), u'graphml')
  44. self.output.endDocument()
  45. def attrs(self, attributes):
  46. return AttributesNSImpl(dict(((None, k), v) for (k,v) in attributes.iteritems()), {})
  47. class GraphMLRenderer(object):
  48. def __init__(self, options):
  49. self.inputfile = options.inputfile
  50. self.outputfile = options.outputfile
  51. self.ht_nodes = options.ht_nodes
  52. self.wiki_nodes = options.wiki_nodes
  53. def run(self):
  54. with gzip.open(self.outputfile, 'w') as output:
  55. xml = XMLWriter(output)
  56. xml.start_document([
  57. ('weight', 'edge', 'weight', 'string'),
  58. ('name', 'node', 'name', 'string'),
  59. ('title', 'node', 'title', 'string'),
  60. ])
  61. hashtags = self.add_nodes(xml, self.ht_nodes, 'name')
  62. pages = self.add_nodes(xml, self.wiki_nodes, 'title')
  63. edges = self.add_edges(xml)
  64. xml.end_document()
  65. print "%d hashtags, %d wikipedia pages, %d edges" % (hashtags, pages, edges)
  66. def add_edges(self, xml, lastid=200000000):
  67. with gzip.open(self.inputfile, 'r') as inputfile:
  68. for count, line in enumerate(inputfile):
  69. try:
  70. ht_id, wiki_id, rhos, ht_name, wiki_name = line.strip().split('\t', 4)
  71. except:
  72. ht_id, wiki_id, rhos, ht_name = line.strip().split('\t', 3)
  73. wiki_name = ''
  74. xml.start_element('edge', {
  75. "id": str(lastid),
  76. "source": ht_id,
  77. "target": wiki_id,
  78. "label": "linked",
  79. })
  80. xml.start_element('data', {'key': 'weight'}, True)
  81. xml.output.characters(rhos)
  82. xml.end_element('data', True)
  83. xml.end_element('edge')
  84. lastid += 1
  85. return count + 1
  86. def add_nodes(self, xml, filename, attribute):
  87. with open(filename, 'r') as inputfile:
  88. for count, line in enumerate(inputfile):
  89. try:
  90. node_id, node_name = line.strip().split('\t', 1)
  91. except:
  92. node_id = line.strip()
  93. node_name = ''
  94. xml.start_element('node', {'id': node_id})
  95. xml.start_element('data', {'key': attribute}, True)
  96. xml.output.characters(node_name)
  97. xml.end_element('data', True)
  98. xml.end_element('node')
  99. return count + 1
  100. if __name__ == "__main__":
  101. from optparse import OptionParser
  102. parser = OptionParser(description="Render the HE graph in GraphML format")
  103. parser.add_option("-i", "--input", dest="inputfile",
  104. help="Graph file in tsv.gz format")
  105. parser.add_option("-o", "--output", dest="outputfile",
  106. help="Output file")
  107. parser.add_option("--ht-nodes", dest="ht_nodes",
  108. help="Hashtag nodes")
  109. parser.add_option("--wiki-nodes", dest="wiki_nodes",
  110. help="Wikipedia nodes")
  111. (options, args) = parser.parse_args()
  112. if options.inputfile and options.outputfile:
  113. app = GraphMLRenderer(options)
  114. app.run()
  115. else:
  116. parser.print_help()