/scripts/enron/extract_metadata_from_enron_json.py

https://gitlab.com/f7o/newsleak · Python · 91 lines · 55 code · 11 blank · 25 comment · 8 complexity · 49adcbf7c8745f09bb3b0a3b2ba26ace MD5 · raw file

  1. # -*- coding: utf-8 -*-
"""\
This module converts the Enron mail files in JSON (given a directory containing .json files) to a CSV format which can be imported into
a PSQL database.
It generates two files, one for the Document table and one for the Metadata table.
Note: some emails do not contain a date value. In this case an artificial date, namely the
current date, is added. If this is inappropriate because it introduces wrong information, comment out that line and
instead skip to the next email, using the `continue` keyword in the block that handles the missing date.
"""
  10. import sys
  11. import json
  12. import datetime
  13. import csv
  14. import glob
  15. import codecs
  16. import cStringIO
  17. from cablemap.core import cables_from_source
  18. from cablemap.core.utils import titlefy, cables_from_csv
  19. # Source: <http://docs.python.org/library/csv.html>
  20. class UnicodeWriter:
  21. """
  22. A CSV writer which will write rows to CSV file "f",
  23. which is encoded in the given encoding.
  24. """
  25. def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
  26. # Redirect output to a queue
  27. self.queue = cStringIO.StringIO()
  28. self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
  29. self.stream = f
  30. self.encoder = codecs.getincrementalencoder(encoding)()
  31. def writerow(self, row):
  32. self.writer.writerow([s.encode("utf-8") for s in row])
  33. # Fetch UTF-8 output from the queue ...
  34. data = self.queue.getvalue()
  35. data = data.decode("utf-8")
  36. # ... and reencode it into the target encoding
  37. data = self.encoder.encode(data)
  38. # write to the target stream
  39. self.stream.write(data)
  40. # empty queue
  41. self.queue.truncate(0)
  42. def writerows(self, rows):
  43. for row in rows:
  44. self.writerow(row)
  45. def generate_csv(dir_name, document_out, metadata_out):
  46. """\
  47. Walks through the given `dir_name` which contains json files (with extension .json) and generates the CSV file `document_out` and
  48. metadata_out files which are to be imported to the Document and Metadata tables.
  49. """
  50. writer_meta = UnicodeWriter(open(metadata_out, 'wb'), delimiter=',', quotechar='"', escapechar='\\', quoting = csv.QUOTE_ALL)
  51. writer_doc = UnicodeWriter(open(document_out, 'wb'), delimiter=',', quotechar='"', escapechar='\\', quoting = csv.QUOTE_ALL)
  52. for filename in glob.glob(dir_name+'/*.json'):
  53. print(filename + " done")
  54. for line in tuple(open(filename, 'r')):
  55. enron = json.loads(line)
  56. id = enron["id"]
  57. # Body of the emial document and the date : store them as Document
  58. try:
  59. writer_doc.writerow((str(id), enron["body"], enron["date"]))
  60. except Exception, e: # date is missed - shall we add artificial date or ignore everything??
  61. now = datetime.datetime.now()
  62. writer_doc.writerow((str(id), enron["body"], now.strftime("%Y-%m-%d %H:%M:%S")))
  63. # Single element meta
  64. writer_meta.writerow((str(id), "Subject", enron["subject"], "Text"))
  65. writer_meta.writerow((str(id), "Timezone", enron["timezone"], "Text"))
  66. # Nested meta - recipients
  67. for r in enron["recipients"]:
  68. writer_meta.writerow((str(id), "Recipients.name", r["name"], "Text"))
  69. writer_meta.writerow((str(id), "Recipients.email", r["email"], "Text"))
  70. writer_meta.writerow((str(id), "Recipients.order", str(r["order"]), "Number"))
  71. writer_meta.writerow((str(id), "Recipients.type", r["type"], "Text"))
  72. writer_meta.writerow((str(id), "Recipients.id", str(r["id"]), "Number"))
  73. # sender metadats
  74. s = enron["sender"]
  75. writer_meta.writerow((str(id), "sender.id", str(s["id"]), "Number"))
  76. writer_meta.writerow((str(id), "sender.email", s["email"], "Text"))
  77. writer_meta.writerow((str(id), "sender.name", s["name"], "Text"))
  78. if __name__ == '__main__':
  79. generate_csv(sys.argv[1], sys.argv[2], sys.argv[3]))