extract_metadata_from_enron_json.py - Source: <http://docs.…

/scripts/enron/extract_metadata_from_enron_json.py

https://gitlab.com/f7o/newsleak · Python · 91 lines · 55 code · 11 blank · 25 comment · 8 complexity · 49adcbf7c8745f09bb3b0a3b2ba26ace MD5 · raw file

# -*- coding: utf-8 -*-
"""\
This module converts the Enron mail files in JSON (given a directory where .json files exist) to a CSV file format which will be imported into
a PSQL database.
It generates two files for the Document and Metadata tables.
Note: some email data do not contain a date value. In this case, an artificial date, which is the
current date, is added. If this is inappropriate and causes wrong information, comment out that
fallback and instead skip to the next email with the `continue` keyword where the missing date is handled.

"""
import sys
import json
import datetime
import csv
import glob
import codecs
import cStringIO
from cablemap.core import cables_from_source
from cablemap.core.utils import titlefy, cables_from_csv

# Source: <http://docs.python.org/library/csv.html>
class UnicodeWriter:
    """
    CSV writer that takes rows of text cells and writes them to the
    file-like object "f", encoded in the given encoding.

    Every row is first rendered as UTF-8 CSV into an in-memory buffer,
    then decoded and re-encoded into the target encoding before it is
    handed to "f". Extra keyword arguments are passed to csv.writer.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Final destination of the encoded CSV data
        self.stream = f
        # Intermediate buffer the csv module renders each row into
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        # The csv module works on byte strings, so hand it UTF-8 cells
        utf8_cells = []
        for cell in row:
            utf8_cells.append(cell.encode("utf-8"))
        self.writer.writerow(utf8_cells)
        # Take the rendered UTF-8 line back out of the buffer ...
        rendered = self.queue.getvalue().decode("utf-8")
        # ... convert it to the target encoding and pass it on
        self.stream.write(self.encoder.encode(rendered))
        # Reset the buffer so the next row starts from scratch
        self.queue.truncate(0)

    def writerows(self, rows):
        # Rows are written one by one, in iteration order
        for record in rows:
            self.writerow(record)


def _document_date(enron):
    """\
    Returns the "date" value of the e-mail dict `enron`, or the current local time formatted as
    ``YYYY-MM-DD HH:MM:SS`` when the date is missing, null or not a text value.
    """
    date = enron.get("date")
    # The CSV writer calls .encode() on every cell, so only text values can be written as-is
    if hasattr(date, "encode"):
        return date
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")


def _write_email(enron, writer_doc, writer_meta):
    """\
    Writes one parsed e-mail (`enron`, a dict) as a single row to `writer_doc` and as a series of
    ``(id, key, value, type)`` rows to `writer_meta`.

    Raises a KeyError if one of the required keys ("id", "body", "subject", "timezone",
    "recipients", "sender" and their nested fields) is missing.
    """
    doc_id = str(enron["id"])

    # Body of the email document and the date: store them as Document
    writer_doc.writerow((doc_id, enron["body"], _document_date(enron)))

    # Single element meta
    writer_meta.writerow((doc_id, "Subject", enron["subject"], "Text"))
    writer_meta.writerow((doc_id, "Timezone", enron["timezone"], "Text"))

    # Nested meta - recipients
    for recipient in enron["recipients"]:
        writer_meta.writerow((doc_id, "Recipients.name", recipient["name"], "Text"))
        writer_meta.writerow((doc_id, "Recipients.email", recipient["email"], "Text"))
        writer_meta.writerow((doc_id, "Recipients.order", str(recipient["order"]), "Number"))
        writer_meta.writerow((doc_id, "Recipients.type", recipient["type"], "Text"))
        writer_meta.writerow((doc_id, "Recipients.id", str(recipient["id"]), "Number"))

    # Sender metadata
    sender = enron["sender"]
    writer_meta.writerow((doc_id, "sender.id", str(sender["id"]), "Number"))
    writer_meta.writerow((doc_id, "sender.email", sender["email"], "Text"))
    writer_meta.writerow((doc_id, "sender.name", sender["name"], "Text"))


def generate_csv(dir_name, document_out, metadata_out):
    """\
    Walks through the given `dir_name` which contains json files (with extension .json) and generates
    the CSV files `document_out` and `metadata_out`, which are to be imported to the Document and
    Metadata tables.

    Every non-blank line of an input file is parsed as one JSON object describing an e-mail.
    `document_out` receives one ``(id, body, date)`` row per e-mail; `metadata_out` receives
    ``(id, key, value, type)`` rows for the subject, the timezone, each recipient and the sender.
    An e-mail without a usable date gets the current local time as an artificial date.

    All files are closed when the function returns or raises. A line that is not valid JSON
    raises ValueError, and an e-mail lacking a required key raises KeyError.
    """
    csv_options = dict(delimiter=',', quotechar='"', escapechar='\\', quoting=csv.QUOTE_ALL)
    # Context managers make sure the outputs are flushed and closed even if a bad record aborts the run
    with open(metadata_out, 'wb') as meta_file:
        with open(document_out, 'wb') as doc_file:
            writer_meta = UnicodeWriter(meta_file, **csv_options)
            writer_doc = UnicodeWriter(doc_file, **csv_options)
            for filename in glob.glob(dir_name + '/*.json'):
                # Stream the file line by line instead of loading it completely into memory
                with open(filename, 'r') as json_file:
                    for line in json_file:
                        # Tolerate blank lines, e.g. a trailing newline at the end of the file
                        if not line.strip():
                            continue
                        _write_email(json.loads(line), writer_doc, writer_meta)
                # Reported only after the file has actually been processed
                print(filename + " done")

if __name__ == '__main__':
    # Command line: <json_dir> <document_csv> <metadata_csv>
    if len(sys.argv) < 4:
        # Exit with a usage hint instead of an IndexError traceback
        sys.exit("usage: %s <json_dir> <document_csv> <metadata_csv>" % sys.argv[0])
    generate_csv(sys.argv[1], sys.argv[2], sys.argv[3])