eml_to_csv.py - This Python script converts email files (.eml) to CSV format.

/eml_to_csv.py

https://bitbucket.org/chrisgalpin/eml-to-csv-windows-live-mail · Python · 230 lines · 164 code · 51 blank · 15 comment · 27 complexity · 2098880ffe15da77c69cf07d90ad5df2 MD5 · raw file


import sys

import os

import re



from operator import itemgetter

from email.utils import parsedate_tz, mktime_tz



# return fileList[] with all files inside 'dir' matching 'regEx'

def g_file_list(dir, regEx):
    """Collect the paths of all files under 'dir' whose name matches 'regEx'.

    The directory tree is walked recursively with os.walk and the pattern is
    applied with re.search to the bare file name (not the full path).
    Exits the program via sys.exit when 'dir' is not a directory.
    """
    if not os.path.isdir(dir):
        sys.exit(dir + " invalid directory")

    matcher = re.compile(regEx)
    return [
        os.path.join(parent, filename)
        for parent, _subdirs, filenames in os.walk(dir)
        for filename in filenames
        if matcher.search(filename)
    ]



def g_header_field(fieldName, header):
    """Return the value of header field 'fieldName', or "" when it is absent.

    The field name is matched case-insensitively at the start of a line and
    must be followed by a colon and a single space. Folded (multi-line)
    values are joined into one line: newlines are removed and tabs or runs
    of spaces are collapsed to a single space.
    """
    # A value ends at the next line that starts with a non-whitespace
    # character (continuation lines start with whitespace) or, for the final
    # field of the header, at the end of the text. The name is escaped so
    # names containing '-' (e.g. "reply-to") are matched literally.
    pattern = re.compile(
        r"^" + re.escape(fieldName) + r": (.*?)(?:\n\S|\Z)",
        re.DOTALL | re.MULTILINE | re.IGNORECASE)
    match = pattern.search(header)
    if not match:
        return ""

    # trim out all the repeat whitespace and also newlines
    field = re.sub(r"[\n\r\f\v]", "", match.group(1))
    return re.sub(r"(\t| {2,})", " ", field)



def g_header(text):
    """Return the header part of a message: everything before the first blank line.

    When 'text' contains no blank line the whole text is treated as header.
    """
    headerEnd = text.find("\n\n")
    if headerEnd < 0:
        # find() returned -1; slicing with it would silently drop the last
        # character of the text.
        return text
    return text[:headerEnd]



def g_raw_contacts(header):
    """Return the raw contact strings found in the To, From, Cc and Bcc fields.

    The four field values are joined with commas, cleaned by
    g_clean_contacts and split on commas again. Blank entries are dropped,
    so a header without any address yields an empty list.
    """
    # to do: reply-to addresses <abc@def.com> need to take precedence
    # over from addresses, except sometimes the name will only be
    # in the from address
    fields = [g_header_field(name, header) for name in ("to", "from", "cc", "bcc")]
    cleaned = g_clean_contacts(",".join(fields))

    # "".split(",") returns [""], which would otherwise surface as a contact
    # with an empty name and an empty address.
    return [contact for contact in cleaned.split(",") if contact.strip()]



def g_clean_contacts(contacts):
    """Normalise a comma-separated contact string before it is split.

    Removes the undisclosed-recipient placeholder, masks a comma that sits
    inside a quoted display name with a temporary character, collapses runs
    of commas into one and strips commas from both ends.
    """
    without_placeholder = contacts.replace("<Undisclosed-Recipient:;>", "")

    # replace quoted commas with a temporary character: "\xc2"
    # (g_name turns it back into a comma)
    masked = re.sub(r'("[^"<]*),([^"<]*")', "\\1\xc2\\2", without_placeholder)

    # remove empty contacts
    collapsed = re.sub(r",{2,}", ",", masked)
    return collapsed.strip(",")



def g_date(header):
    """Parse the Date field of 'header' with email.utils.parsedate_tz.

    Returns whatever parsedate_tz returns for the field value.
    """
    return parsedate_tz(g_header_field("date", header))



def g_subject(header):
    """Return the value of the Subject field of 'header'."""
    return g_header_field("subject", header)



def g_name(rawContact):
    """Return the display name that precedes the '<' of a raw contact.

    Returns "" when the contact has no '<'. Surrounding spaces, quotes and
    commas are stripped from the name.
    """
    before, bracket, _rest = rawContact.partition("<")
    if not bracket:
        # no "<address>" part, so there is no separate display name
        return ""

    # restore comma :/  (g_clean_contacts masked it with a temporary character)
    restored = before.replace("\xc2", ",")
    return restored.strip(" \"',")



def g_email(rawContact):
    """Return the e-mail address part of a raw contact.

    When the contact contains '<', everything from the first '<' onwards is
    used; otherwise the whole string is. Angle brackets and spaces are then
    stripped from both ends.
    """
    start = rawContact.find("<")
    candidate = rawContact if start < 0 else rawContact[start:]
    return candidate.strip("<> ")



def write_contacts(file, contacts):
    """Write a CSV header row followed by one row per contact to 'file'.

    'file' only needs a write() method. Each contact is a dict; its 'name'
    is split with g_name_parts, its 'email' is written as is and the notes
    column is built by g_notes.
    """
    file.write(g_csv("First name", "Middle name", "Last name", "Name", "E-mail address", "Notes"))

    for entry in contacts:
        first, middle, last, display = g_name_parts(entry['name'])
        address = entry['email']
        notes = g_notes(entry)
        file.write(g_csv(first, middle, last, display, address, notes))



def g_csv(*seq):
    """Format the given strings as one CSV line terminated by a newline.

    Every value is wrapped in double quotes and embedded double quotes are
    doubled.
    """
    quoted = []
    for value in seq:
        escaped = value.replace('"', '""')
        quoted.append('"' + escaped + '"')
    return ",".join(quoted) + "\n"



def g_name_parts(name):
    """Split a display name into first, middle, last and display name.

    Returns four strings. Names that cannot be split sensibly (more than
    one comma, more than three space-separated words, or a middle word of
    "and", "of" or "customer") come back with empty first/middle/last and
    only the cleaned display name. A name with exactly one comma is treated
    as "Last, First" and swapped before splitting.
    """
    # ugly function should be refactored, so on :)
    # also names in form: "Bob & Jill Smith" -> "Bob & Jill","Smith","Bob & Jill Smith"
    #   might be nice
    if not name:
        return "", "", "", ""

    commas = name.count(",")
    if commas > 1:
        return "", "", "", g_clean_name(name)

    if commas == 1:
        # swap first & last
        family, _comma, given = name.partition(",")
        name = given.strip() + " " + family

    # more than two spaces means more than three words
    if name.count(" ") > 2:
        return "", "", "", g_clean_name(name)

    parsed = re.match(r"(?P<first>[^ ]+)( ((?P<middle>[^ ]+) )?(?P<last>.+))?", name)

    middle = parsed.group('middle') or ""
    if middle.lower() in ("and", "of", "customer"):
        return "", "", "", g_clean_name(name)

    last = parsed.group('last') or ""

    first, middle, last = [
        g_clean_name(part) for part in (parsed.group('first'), middle, last)
    ]
    display = " ".join((first, last)).strip()
    return [first, middle, last, display]



def g_clean_name(name):
    """Title-case a name that is all upper case or starts with a lower-case character.

    Names of one or two characters, and names that already look mixed-case,
    are returned unchanged.
    """
    if len(name) <= 2:
        return name

    all_upper = name.upper() == name
    starts_lower = name[0].lower() == name[0]
    if all_upper or starts_lower:
        return name.title()
    return name



def g_notes(contact):
    """Build the tab-separated notes text for a contact.

    Reads contact['date'] (a parsed date tuple, or None when the message
    had no usable Date field), contact['subject'] and contact['folder'].
    The subject part is left out when the subject is empty.
    """
    date = contact['date']
    # Only year, month and day are shown. A missing date must not abort the
    # whole export, so it is reported as "unknown" instead of being sliced.
    parts = ["Date: " + (str(date[:3]) if date else "unknown")]

    subject = contact['subject']
    if subject:
        parts.append("Subject: " + subject)

    parts.append("Folder: " + contact['folder'])
    return "\t".join(parts)



def g_folder(filePath, root):
    """Return the folder of 'filePath' relative to 'root'.

    Returns "" when the file sits directly in 'root'. The platform's own
    path separator is used, and a trailing separator on 'root' is accepted.
    """
    folder = os.path.dirname(filePath) or os.curdir
    # relpath only strips 'root' as a leading path prefix; a plain string
    # replace would also remove it wherever else it occurs in the path.
    relative = os.path.relpath(folder, root or os.curdir)
    return "" if relative == os.curdir else relative



def g_date_sorted_fl(emlFolder):
    """Return the .eml files under 'emlFolder', newest first.

    Each entry is a dict with the file path under 'file' and the parsed
    Date header (as returned by g_date, possibly None) under 'date'.
    Files without a usable date are placed after all dated files.
    """
    entries = []
    for path in g_file_list(emlFolder, r"\.eml$"):
        # 'with' closes the handle straight away; a large mail store would
        # otherwise keep many files open until garbage collection.
        with open(path, "r") as handle:
            text = handle.read()
        entries.append({'file': path, 'date': g_date(g_header(text))})

    def sort_key(entry):
        date = entry['date']
        if not date:
            # The empty tuple sorts below every real date, so with
            # reverse=True undated files end up last instead of raising
            # on a None-versus-tuple comparison.
            return ()
        # The timezone slot may be None; normalise it so two dates that tie
        # on every other field can still be compared.
        return tuple(date[:9]) + (date[9] or 0,)

    entries.sort(key=sort_key, reverse=True)
    return entries



def eml_to_csv(emlFolder, csvFile):
    """Export the contacts found in the .eml files under 'emlFolder' to 'csvFile'.

    Files are processed in the order given by g_date_sorted_fl (newest
    first). Each e-mail address is exported once, compared
    case-insensitively, with the name, date, subject and folder of the first
    message it is seen in. Entries without an address are skipped.
    """
    seen = set()
    contacts = []
    for entry in g_date_sorted_fl(emlFolder):
        path = entry['file']
        with open(path, "r") as handle:
            header = g_header(handle.read())

        date = g_date(header)
        subject = g_subject(header)
        # the folder depends only on the file, so compute it once per message
        folder = g_folder(path, emlFolder)

        for rawContact in g_raw_contacts(header):
            email = g_email(rawContact)
            key = email.lower()
            # an entry with no address is useless in a contact list
            if not email or key in seen:
                continue
            seen.add(key)

            contacts.append({
                'name': g_name(rawContact),
                'email': email,
                'date': date,
                'subject': subject,
                'folder': folder,
            })

    # 'with' guarantees the CSV is flushed and closed even if writing fails
    with open(csvFile, "w") as out:
        write_contacts(out, contacts)



# Command-line entry point: eml_to_csv.py <eml folder> <output csv file>
if __name__ == "__main__":
    # Fail with a usage message instead of a bare IndexError when an
    # argument is missing; extra arguments are still ignored.
    if len(sys.argv) < 3:
        sys.exit("usage: " + sys.argv[0] + " <eml folder> <output csv file>")

    print("Begin!")
    eml_to_csv(sys.argv[1], sys.argv[2])
    print("Finished.")



#eml_to_csv(emlFolder="C:\Temp\Mail", csvFile="contacts.csv")

Summary ✨

This Python script converts email files (.eml) to CSV format, extracting contact information such as name and email address together with the date, subject, and folder of the message each contact was found in. It recursively reads the .eml files in a specified directory, parses only the header of each email (the To, From, Cc, Bcc, Date and Subject fields; the body is not examined), and keeps each email address once. The extracted data is then written to a CSV file for further analysis or processing.

Tech Fingerprint

Alerts (14)

'def' Ensure functions have docstrings for documentation
9 25 39 44 56 66 71 75 86 96 108
Complexity hotspot; lines 17 to 19 (total complexity: 3)
17 18 19