PageRenderTime 1824ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/eml_to_csv.py

https://bitbucket.org/chrisgalpin/eml-to-csv-windows-live-mail
Python | 230 lines | 222 code | 6 blank | 2 comment | 6 complexity | 2098880ffe15da77c69cf07d90ad5df2 MD5 | raw file
  1. import sys
  2. import os
  3. import re
  4. from operator import itemgetter
  5. from email.utils import parsedate_tz, mktime_tz
  6. # return fileList[] with all files inside 'dir' matching 'regEx'
  7. def g_file_list(dir, regEx):
  8. if not os.path.isdir(dir):
  9. sys.exit(dir + " invalid directory")
  10. cRegEx = re.compile(regEx)
  11. fileList = []
  12. # loop on all files and select files matching 'regEx'
  13. for root, folders, files in os.walk(dir):
  14. for name in files:
  15. if cRegEx.search(name):
  16. path = os.path.join(root, name)
  17. fileList.append(path)
  18. return fileList
  19. def g_header_field(fieldName, header):
  20. regEx = re.compile(r"^" + fieldName + r": (?P<" + fieldName + r">.*?)\n\S", re.DOTALL|re.MULTILINE|re.IGNORECASE)
  21. match = regEx.search(header)
  22. if not match:
  23. return ""
  24. field = match.group(1)
  25. # trim out all the repeat whitespace and also newlines
  26. newline = re.compile(r"[\n\r\f\v]")
  27. field = newline.sub("", field)
  28. field = re.sub(r"(\t| {2,})", " ", field)
  29. return field
  30. def g_header(text):
  31. headerEnd = text.find("\n\n")
  32. header = text[:headerEnd]
  33. return header
  34. def g_raw_contacts(header):
  35. # to do: reply-to addresses <abc@def.com> need to take precedence
  36. # over from addresses, except sometimes the name will only be
  37. # in the from address
  38. to = g_header_field("to", header)
  39. _from = g_header_field("from", header)
  40. cc = g_header_field("cc", header)
  41. bcc = g_header_field("bcc", header)
  42. contacts = ",".join([to, _from, cc, bcc])
  43. return g_clean_contacts(contacts).split(",")
  44. def g_clean_contacts(contacts):
  45. contacts = contacts.replace("<Undisclosed-Recipient:;>", "")
  46. # replace quoted commas with a temporary character: "\xc2"
  47. quotedComma = re.compile(r'("[^"<]*),([^"<]*")')
  48. contacts = quotedComma.sub("\\1\xc2\\2", contacts)
  49. # remove empty contacts
  50. empties = re.compile(r",{2,}")
  51. contacts = empties.sub(",", contacts).strip(",")
  52. return contacts
  53. def g_date(header):
  54. date = g_header_field("date", header)
  55. date = parsedate_tz(date)
  56. return date
  57. def g_subject(header):
  58. subject = g_header_field("subject", header)
  59. return subject
  60. def g_name(rawContact):
  61. emailIdx = rawContact.find("<")
  62. if emailIdx < 0:
  63. return ""
  64. # restore comma :/
  65. name = rawContact[:emailIdx]
  66. name = name.replace("\xc2", ",")
  67. name = name.strip(" \"',")
  68. return name
  69. def g_email(rawContact):
  70. emailBracketIdx = rawContact.find("<")
  71. if emailBracketIdx < 0:
  72. email = rawContact
  73. else:
  74. email = rawContact[emailBracketIdx:]
  75. email = email.strip("<> ")
  76. return email
  77. def write_contacts(file, contacts):
  78. head = g_csv("First name", "Middle name", "Last name", "Name", "E-mail address", "Notes")
  79. file.write(head)
  80. for contact in contacts:
  81. first, middle, last, display = g_name_parts(contact['name'])
  82. email = contact['email']
  83. notes = g_notes(contact)
  84. line = g_csv(first, middle, last, display, email, notes)
  85. file.write(line)
  86. def g_csv(*seq):
  87. return ",".join(['"' + v.replace('"', '""') + '"' for v in seq]) + "\n"
  88. def g_name_parts(name):
  89. # ugly function should be refactored, so on :)
  90. # also names in form: "Bob & Jill Smith" -> "Bob & Jill","Smith","Bob & Jill Smith"
  91. # might be nice
  92. if not name:
  93. return "", "", "", ""
  94. commaCount = name.count(",")
  95. if commaCount > 1:
  96. return "", "", "", g_clean_name(name)
  97. if commaCount == 1:
  98. commaIdx = name.find(",")
  99. # swap first & last
  100. name = name[commaIdx+1:].strip() + " " + name[:commaIdx]
  101. spaceCount = name.count(" ")
  102. wordCount = spaceCount + 1
  103. if wordCount > 3:
  104. return "", "", "", g_clean_name(name)
  105. regEx = re.compile(r"(?P<first>[^ ]+)( ((?P<middle>[^ ]+) )?(?P<last>.+))?")
  106. m = regEx.match(name)
  107. middle = m.group('middle')
  108. if not middle:
  109. middle = ""
  110. if middle.lower() in ("and", "of", "customer"):
  111. return "", "", "", g_clean_name(name)
  112. last = m.group('last')
  113. if not last:
  114. last = ""
  115. first = g_clean_name(m.group('first'))
  116. middle = g_clean_name(middle)
  117. last = g_clean_name(last)
  118. display = (first + " " + last).strip()
  119. return [first, middle, last, display]
  120. def g_clean_name(name):
  121. if len(name) <= 2:
  122. return name
  123. if name.upper() == name:
  124. return name.title()
  125. if name[0].lower() == name[0]:
  126. return name.title()
  127. return name
  128. def g_notes(contact):
  129. date = contact['date']
  130. note = "Date: " + str(date[:3])
  131. subject = contact['subject']
  132. if subject:
  133. note += "\tSubject: " + subject
  134. folder = contact['folder']
  135. note += "\tFolder: " + folder
  136. return note
  137. def g_folder(filePath, root):
  138. lastSlashIdx = filePath.rfind("\\")
  139. return filePath[:lastSlashIdx].replace(root, "")[1:]
  140. def g_date_sorted_fl(emlFolder):
  141. fileList = g_file_list(emlFolder, r"\.eml$")
  142. sortedFileList = []
  143. for file in fileList:
  144. text = open(file, "r").read()
  145. header = g_header(text)
  146. date = g_date(header)
  147. sortedFileList.append({'file':file, 'date':date})
  148. sortedFileList.sort(key=itemgetter('date'), reverse=True)
  149. return sortedFileList
  150. def eml_to_csv(emlFolder, csvFile):
  151. sortedFileList = g_date_sorted_fl(emlFolder)
  152. emails = set()
  153. contacts = []
  154. for L in sortedFileList:
  155. file = L['file']
  156. text = open(file, "r").read()
  157. header = g_header(text)
  158. date = g_date(header)
  159. subject = g_subject(header)
  160. rawContacts = g_raw_contacts(header)
  161. for rawContact in rawContacts:
  162. email = g_email(rawContact)
  163. if (email.lower() in emails):
  164. continue
  165. emails.add(email.lower())
  166. name = g_name(rawContact)
  167. folder = g_folder(file, emlFolder)
  168. contact = {'name':name, 'email':email, 'date':date, 'subject':subject, 'folder':folder}
  169. contacts.append(contact)
  170. f = open(csvFile, "w")
  171. #print contacts
  172. write_contacts(f, contacts)
  173. f.close()
  174. if __name__ == "__main__":
  175. print("Begin!")
  176. import sys
  177. eml_to_csv(sys.argv[1], sys.argv[2])
  178. print("Finished.")
  179. #eml_to_csv(emlFolder="C:\Temp\Mail", csvFile="contacts.csv")