twittomatic /helpers/hadoop/wikipedia/lighttag/utils.py

Language Python Lines 34
MD5 Hash 07db4d7ff424ace33513792170751a58
Repository https://github.com/champ1/twittomatic.git View Raw File View Project SPDX
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import gzip
import json
import datetime
from contextlib import contextmanager

@contextmanager
def profiled(str):
    """Time the body of a ``with`` block and print the elapsed duration.

    ``str`` is a %-style template; after the body finishes it is formatted
    with the elapsed ``datetime.timedelta`` via the ``%`` operator and the
    result is printed. The start ``datetime`` is yielded to the ``with``
    target.

    Nothing is printed when the body raises: the exception propagates out
    of the ``yield`` before the report line is reached.
    """
    # NOTE: the parameter name shadows the builtin ``str``; it is kept
    # because it is part of the public signature.
    began = datetime.datetime.now()
    yield began
    print(str % (datetime.datetime.now() - began))

def iterate_anchors(anchorfile):
    """Lazily yield ``(lower-cased anchor, pages)`` pairs from a gzip file.

    ``anchorfile`` is opened with ``gzip.open``; every line is stripped and
    decoded as one JSON object that must provide the keys ``'anchor'`` and
    ``'pages'``. The anchor value is lower-cased before being yielded; the
    pages value is passed through unchanged.
    """
    with gzip.open(anchorfile, 'r') as stream:
        for raw in stream:
            record = json.loads(raw.strip())
            # Look up both keys before lower-casing so a missing key is
            # reported ahead of any problem with the anchor value itself.
            text, targets = record['anchor'], record['pages']
            yield text.lower(), targets

def iterate_mappings(titlesfile):
    """Lazily yield ``(id, name, title, length)`` tuples from a gzip file.

    ``titlesfile`` is opened with ``gzip.open``; every line is stripped and
    decoded as one JSON object, from which the four keys ``'id'``,
    ``'name'``, ``'title'`` and ``'length'`` are read in that order.
    """
    wanted = ('id', 'name', 'title', 'length')
    with gzip.open(titlesfile, 'r') as stream:
        for raw in stream:
            record = json.loads(raw.strip())
            yield tuple(record[key] for key in wanted)

def iterate_templates(templatefile):
    """Lazily yield ``(id, templates)`` pairs from a gzip file.

    ``templatefile`` is opened with ``gzip.open``; every line is stripped
    and decoded as one JSON object that must provide the keys ``'id'`` and
    ``'templates'``. Both values are yielded unchanged.
    """
    with gzip.open(templatefile, 'r') as stream:
        # The generator expression decodes one line at a time, so the file
        # is still consumed lazily.
        for entry in (json.loads(raw.strip()) for raw in stream):
            yield entry['id'], entry['templates']
Back to Top