/eml_to_csv.py
Python | 230 lines | 222 code | 6 blank | 2 comment | 6 complexity | 2098880ffe15da77c69cf07d90ad5df2 MD5 | raw file
- import sys
- import os
- import re
-
- from operator import itemgetter
- from email.utils import parsedate_tz, mktime_tz
-
- # return fileList[] with all files inside 'dir' matching 'regEx'
def g_file_list(dir, regEx):
    """Collect the paths of every file below ``dir`` whose name matches ``regEx``.

    The directory tree is walked recursively with ``os.walk``.  The pattern
    is applied with ``re.search`` to the bare file name, not to the full
    path.  If ``dir`` is not an existing directory the process is terminated
    through ``sys.exit`` with an explanatory message.
    """
    if not os.path.isdir(dir):
        sys.exit(dir + " invalid directory")

    pattern = re.compile(regEx)

    # Walk order is preserved: directories in os.walk order, files in the
    # order os.walk lists them.
    return [
        os.path.join(parent, fname)
        for parent, _subdirs, fnames in os.walk(dir)
        for fname in fnames
        if pattern.search(fname)
    ]
-
def g_header_field(fieldName, header):
    """Return the value of header field ``fieldName`` in ``header``, or "".

    The lookup is case-insensitive and anchored at the start of a line; the
    field name must be followed by a colon and a single space.  A value may
    be folded over several lines: it ends at the first newline that is
    followed by a non-whitespace character, or at the end of ``header``.
    Line breaks are removed from the value and each tab or run of two or
    more spaces is collapsed to a single space.
    """
    # re.escape lets names such as "reply-to" be used safely.  The value may
    # end at "\n<non-space>" (next field) or at the very end of the header,
    # so the last field of a header block is found as well.
    pattern = re.compile(
        r"^" + re.escape(fieldName) + r": (.*?)(?:\n\S|\Z)",
        re.DOTALL | re.MULTILINE | re.IGNORECASE,
    )
    match = pattern.search(header)
    if not match:
        return ""

    field = match.group(1)

    # trim out all the repeat whitespace and also newlines
    field = re.sub(r"[\n\r\f\v]", "", field)
    field = re.sub(r"(\t| {2,})", " ", field)
    return field
-
def g_header(text):
    """Return the header part of a message: everything before the first blank line.

    If ``text`` contains no blank line ("\\n\\n") the whole text is treated
    as header and returned unchanged.
    """
    # partition() returns the full string when the separator is missing,
    # unlike find(), whose -1 would chop off the last character when sliced.
    header, _separator, _body = text.partition("\n\n")
    return header
-
def g_raw_contacts(header):
    """Return the raw contact strings found in the To, From, Cc and Bcc fields.

    The four field values are joined with commas, passed through
    ``g_clean_contacts`` and split on commas again.  Blank entries are
    dropped, so a header without any address yields an empty list.
    """
    # to do: reply-to addresses <abc@def.com> need to take precedence
    # over from addresses, except sometimes the name will only be
    # in the from address
    fields = [g_header_field(fieldName, header) for fieldName in ("to", "from", "cc", "bcc")]
    cleaned = g_clean_contacts(",".join(fields))

    # "".split(",") is [""], which would otherwise surface as a contact
    # with an empty address.
    return [rawContact for rawContact in cleaned.split(",") if rawContact.strip()]
-
def g_clean_contacts(contacts):
    """Normalise a comma-separated contact string so it can be split on ",".

    * the "<Undisclosed-Recipient:;>" placeholder is removed;
    * every comma inside a double-quoted display name is replaced with the
      temporary character "\\xc2" (``g_name`` turns it back into a comma);
    * runs of commas are collapsed and leading/trailing commas stripped.
    """
    contacts = contacts.replace("<Undisclosed-Recipient:;>", "")

    # Match each quoted string as a whole (left to right) and protect all of
    # its commas.  "<" is excluded so an unbalanced quote cannot swallow a
    # following "<address>" part.
    contacts = re.sub(
        r'"[^"<]*"',
        lambda quoted: quoted.group(0).replace(",", "\xc2"),
        contacts,
    )

    # remove empty contacts
    contacts = re.sub(r",{2,}", ",", contacts).strip(",")
    return contacts
-
def g_date(header):
    """Parse the Date field of ``header`` with ``email.utils.parsedate_tz``.

    Returns whatever ``parsedate_tz`` returns for the field value: a
    10-tuple, or None when the value cannot be parsed.
    """
    return parsedate_tz(g_header_field("date", header))
-
def g_subject(header):
    """Return the Subject field of ``header`` ("" when it is absent)."""
    return g_header_field("subject", header)
-
def g_name(rawContact):
    """Return the display name of a raw contact, or "" if it has no "<".

    The name is the text before the first "<".  Temporary "\\xc2"
    placeholders are turned back into commas, then surrounding spaces,
    quotes, apostrophes and commas are stripped.
    """
    before, bracket, _address = rawContact.partition("<")
    if not bracket:
        return ""

    # restore comma :/
    return before.replace("\xc2", ",").strip(" \"',")
-
def g_email(rawContact):
    """Return the e-mail address contained in a raw contact string.

    When the contact contains "<", the address is everything from the first
    "<" onwards; otherwise the whole string is used.  Surrounding angle
    brackets and spaces are stripped from the result.
    """
    start = rawContact.find("<")
    address = rawContact if start < 0 else rawContact[start:]
    return address.strip("<> ")
-
def write_contacts(file, contacts):
    """Write ``contacts`` to ``file`` as CSV, one ``write`` call per row.

    A fixed header row is written first.  Each contact is a dict; its
    'name' is split with ``g_name_parts``, its 'email' is written as is and
    the notes column is produced by ``g_notes``.
    """
    header_row = g_csv("First name", "Middle name", "Last name", "Name", "E-mail address", "Notes")
    file.write(header_row)

    for entry in contacts:
        first, middle, last, display = g_name_parts(entry['name'])
        file.write(g_csv(first, middle, last, display, entry['email'], g_notes(entry)))
-
def g_csv(*seq):
    """Format the given strings as one CSV line terminated by "\\n".

    Every value is wrapped in double quotes and embedded double quotes are
    doubled.
    """
    quoted = []
    for value in seq:
        quoted.append('"' + value.replace('"', '""') + '"')
    return ",".join(quoted) + "\n"
-
def g_name_parts(name):
    """Split a display name into ``(first, middle, last, display)``.

    Rules, in order:

    * an empty or whitespace-only name gives four empty strings;
    * more than one comma: only ``display`` is filled (cleaned name);
    * exactly one comma ("Last, First"): the two halves are swapped;
    * more than three words: only ``display`` is filled;
    * a middle word of "and", "of" or "customer" (any case): only
      ``display`` is filled;
    * otherwise one word is the first name, two words are first and last,
      three words are first, middle and last.  ``display`` is
      "first last".

    Every returned part is passed through ``g_clean_name``.  A 4-tuple is
    returned on every path.
    """
    # ugly function should be refactored, so on :)
    # also names in form: "Bob & Jill Smith" -> "Bob & Jill","Smith","Bob & Jill Smith"
    # might be nice
    if not name:
        return "", "", "", ""

    comma_count = name.count(",")
    if comma_count > 1:
        return "", "", "", g_clean_name(name)

    if comma_count == 1:
        # swap first & last
        last_part, _comma, first_part = name.partition(",")
        name = first_part.strip() + " " + last_part

    # Split on whitespace so leading, trailing or doubled spaces can neither
    # break the parse nor end up inside one of the parts.
    words = name.split()
    if not words:
        return "", "", "", ""
    name = " ".join(words)

    if len(words) > 3:
        return "", "", "", g_clean_name(name)

    first = words[0]
    middle = words[1] if len(words) == 3 else ""
    last = words[-1] if len(words) > 1 else ""

    # These middle words indicate the name is not "First Middle Last".
    if middle.lower() in ("and", "of", "customer"):
        return "", "", "", g_clean_name(name)

    first = g_clean_name(first)
    middle = g_clean_name(middle)
    last = g_clean_name(last)
    display = (first + " " + last).strip()
    return first, middle, last, display
-
def g_clean_name(name):
    """Return ``name`` title-cased when its capitalisation looks unintended.

    Names of at most two characters are returned untouched.  Longer names
    are title-cased when they equal their own upper-case form or when their
    first character equals its own lower-case form; all other names are
    returned unchanged.
    """
    if len(name) > 2:
        equals_upper = name == name.upper()
        starts_lower = name[0] == name[0].lower()
        if equals_upper or starts_lower:
            return name.title()
    return name
-
def g_notes(contact):
    """Build the tab-separated notes text for a contact dict.

    Reads the 'date', 'subject' and 'folder' keys.  The date is shown as
    the string form of its first three items; when the date is None (the
    message had no parseable Date header) "unknown" is shown instead.  The
    subject part is left out when the subject is empty.
    """
    date = contact['date']
    # date is None for messages without a parseable Date header; slicing it
    # would raise TypeError.
    date_text = str(date[:3]) if date else "unknown"
    parts = ["Date: " + date_text]

    subject = contact['subject']
    if subject:
        parts.append("Subject: " + subject)

    parts.append("Folder: " + contact['folder'])
    return "\t".join(parts)
-
def g_folder(filePath, root):
    """Return the folder of ``filePath`` relative to ``root``.

    A file located directly in ``root`` yields "".  The platform's own path
    rules are used, so this works with either path separator and with a
    ``root`` that has a trailing separator.
    """
    folder = os.path.dirname(filePath) or os.curdir
    relative = os.path.relpath(folder, root)
    # relpath reports "same directory" as ".", callers expect "".
    return "" if relative == os.curdir else relative
-
def g_date_sorted_fl(emlFolder):
    """Return the .eml files below ``emlFolder`` ordered newest first.

    Each item is a dict ``{'file': path, 'date': parsed_date}`` where
    ``parsed_date`` is the result of ``g_date`` for that file's header
    (None when the Date header is missing or unparseable).  Files are
    ordered by the timestamp ``mktime_tz`` computes from the date, so time
    zone offsets are taken into account; files without a usable date are
    placed last.
    """
    entries = []
    for path in g_file_list(emlFolder, r"\.eml$"):
        # Close each file promptly; a mailbox can hold thousands of them.
        with open(path, "r") as handle:
            text = handle.read()
        entries.append({'file': path, 'date': g_date(g_header(text))})

    def sort_key(entry):
        date = entry['date']
        if not date:
            return float("-inf")
        try:
            return mktime_tz(date)
        except (OverflowError, ValueError):
            # Date parsed but cannot be converted to a timestamp.
            return float("-inf")

    entries.sort(key=sort_key, reverse=True)
    return entries
-
def eml_to_csv(emlFolder, csvFile):
    """Export the contacts found in the .eml files below ``emlFolder`` to ``csvFile``.

    Files are processed in the order returned by ``g_date_sorted_fl``.  Each
    address (compared case-insensitively) is recorded once, with the date,
    subject and folder of the first message it is seen in; entries without
    an address are skipped.  The result is written with ``write_contacts``.
    """
    seen = set()
    contacts = []
    for entry in g_date_sorted_fl(emlFolder):
        path = entry['file']
        with open(path, "r") as handle:
            header = g_header(handle.read())

        # Per-message values: computed once, shared by all its contacts.
        date = entry['date']
        subject = g_subject(header)
        folder = g_folder(path, emlFolder)

        for rawContact in g_raw_contacts(header):
            email = g_email(rawContact)
            key = email.lower()
            if not email or key in seen:
                continue
            seen.add(key)

            contacts.append({
                'name': g_name(rawContact),
                'email': email,
                'date': date,
                'subject': subject,
                'folder': folder,
            })

    # The with-block closes the CSV file even if writing fails part-way.
    with open(csvFile, "w") as out:
        write_contacts(out, contacts)
-
# Command-line entry point: eml_to_csv.py <eml folder> <output csv>
if __name__ == "__main__":
    # Fail with a usage hint instead of an IndexError when arguments are missing.
    if len(sys.argv) < 3:
        sys.exit("usage: eml_to_csv.py <eml folder> <output csv>")
    print("Begin!")
    eml_to_csv(sys.argv[1], sys.argv[2])
    print("Finished.")
-
- #eml_to_csv(emlFolder="C:\Temp\Mail", csvFile="contacts.csv")