render.py | searchcode

/helpers/graphs/hashtag-entity/render.py

https://github.com/champ1/twittomatic · Python · 99 lines · 72 code · 23 blank · 4 comment · 21 complexity · 60bd14afd8ff845928f4e5677f4fc972 MD5 · raw file


# encoding=utf8

"""
Simple script that takes the annotated TSV file (gzip-compressed) as input and
generates the hashtag-entity graph file, written as gzip-compressed
tab-separated rows.
"""

import gzip
from collections import defaultdict

class Renderer(object):
    """Convert a gzip-compressed TSV annotation file into hashtag/page rows.

    Each input line is ``hashtag<TAB>wid<TAB>rho[<TAB>title]``.  Consecutive
    lines that share a hashtag are grouped together, and for every group one
    output row per distinct ``wid`` is written to a gzip-compressed,
    tab-separated output file.

    Both gzip files are handled as UTF-8 text.
    """

    # Offset added to every page id when it is written out.
    # NOTE(review): presumably keeps page ids disjoint from the hashtag
    # counter written in the first column -- confirm with the consumer.
    WID_OFFSET = 100000000

    def __init__(self, options):
        """Store the run configuration.

        ``options`` must expose ``inputfile``, ``outputfile``,
        ``skip_single`` and ``blacklist`` attributes (as produced by the
        option parser at the bottom of this file).  When ``blacklist`` is
        set it is a path that is loaded through :meth:`load_blacklist`.
        """
        self.inputfile = options.inputfile
        self.outputfile = options.outputfile
        self.skip_single = options.skip_single

        if options.blacklist:
            self.blacklist = self.load_blacklist(options.blacklist)
        else:
            self.blacklist = set()

    def load_blacklist(self, inputfile):
        """Return the set of page titles listed in the file ``inputfile``.

        The file holds one title per line; surrounding whitespace is
        stripped.  Blank lines are ignored so that a stray empty line does
        not blacklist every page that has no title.
        """
        titles = set()

        # Open the path that was passed in, not the annotation file.
        with open(inputfile, 'r', encoding='utf-8') as handle:
            for raw in handle:
                title = raw.strip()
                if title:
                    titles.add(title)

        return titles

    def iterate(self):
        """Yield ``(hashtag, pages)`` for each run of lines sharing a hashtag.

        ``hashtag`` is the first column prefixed with ``#``; ``pages`` is a
        list of ``(wid, rho, title)`` tuples with ``wid`` converted to
        ``int`` and ``rho`` to ``float``.  ``title`` is ``''`` when the line
        has no fourth column.  Only *adjacent* lines are grouped, so the
        same hashtag is yielded more than once if its lines are not
        contiguous in the input.

        Raises:
            ValueError: if a non-blank line has fewer than three columns or
                a non-numeric ``wid``/``rho``.
        """
        # Text mode: the lines are split on '\t' and concatenated with str.
        with gzip.open(self.inputfile, 'rt', encoding='utf-8') as inputfile:
            prevhashtag = None
            pages = []

            for line in inputfile:
                stripped = line.strip()
                if not stripped:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue

                fields = stripped.split('\t', 3)
                if len(fields) == 4:
                    hashtag, wid, rho, title = fields
                elif len(fields) == 3:
                    # The title column is optional.
                    hashtag, wid, rho = fields
                    title = ''
                else:
                    raise ValueError("Malformed annotation line: %r" % line)

                hashtag = "#" + hashtag
                page = (int(wid), float(rho), title)

                if prevhashtag == hashtag:
                    pages.append(page)
                else:
                    if prevhashtag is not None:
                        yield prevhashtag, pages

                    prevhashtag = hashtag
                    pages = [page]

            # Flush the last group.
            if prevhashtag is not None:
                yield prevhashtag, pages

        # The generator simply ends here; an explicit ``raise StopIteration``
        # would be turned into a RuntimeError by PEP 479.

    def run(self):
        """Write one output row per (hashtag group, page id) pair.

        Row layout (tab-separated, newline-terminated)::

            group number, wid + WID_OFFSET, sorted rhos joined by ':',
            hashtag, title

        The group number is the 1-based position of the group in
        :meth:`iterate`, counting skipped groups as well.  Groups with at
        most one page are skipped when ``skip_single`` is set (checked
        before the blacklist is applied); pages whose title is in the
        blacklist are dropped.
        """
        with gzip.open(self.outputfile, 'wt', encoding='utf-8') as outputfile:
            for count, (hashtag, pages) in enumerate(self.iterate()):
                if self.skip_single and len(pages) <= 1:
                    continue

                counters = defaultdict(list)   # wid -> list of rho values
                mappings = {}                  # wid -> title

                for wid, rho, title in pages:
                    if title in self.blacklist:
                        continue
                    counters[wid].append(rho)
                    mappings[wid] = title

                for wid, rhos in sorted(counters.items()):
                    rhos.sort()

                    # Use the title recorded for *this* wid; the loop
                    # variable ``title`` above only holds the last page seen.
                    line = "%d\t%d\t%s\t%s\t%s\n" % (count + 1,
                                                     wid + self.WID_OFFSET,
                                                     ':'.join(map(str, rhos)),
                                                     hashtag,
                                                     mappings[wid])
                    outputfile.write(line)

if __name__ == "__main__":
    from optparse import OptionParser

    # Command line interface: -i and -o are both needed to do any work,
    # -s and -b tune which rows end up in the output.
    cli = OptionParser(
        description="Read the annotation the TSV annotation file and generate the final graph"
    )
    cli.add_option(
        "-i", "--input",
        dest="inputfile",
        help="Annotation file in tsv.gz format",
    )
    cli.add_option(
        "-o", "--output",
        dest="outputfile",
        help="Output file",
    )
    cli.add_option(
        "-s", "--skip-single",
        dest="skip_single",
        action="store_true",
        help="Skip edges with just one annotation",
    )
    cli.add_option(
        "-b", "--blacklist",
        dest="blacklist",
        help="Specify a blacklist file containing Wikipedia pages to be ignored",
    )

    options, args = cli.parse_args()

    # Without both an input and an output path, just show the usage text.
    if not (options.inputfile and options.outputfile):
        cli.print_help()
    else:
        Renderer(options).run()

Tech Fingerprint

Standard Library: IO & Files

Alerts (9)

'def' Ensure functions have docstrings for documentation
21 30 58
'gzip.open(' Potential decompression bomb vulnerability in Python code if input is untrusted; ensure to limit the number of bytes read.
31 59
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
38
Complexity hotspot; lines 59 to 61 (total complexity: 4)
59 60 61