/Lib/email/_parseaddr.py

http://unladen-swallow.googlecode.com/ · Python · 480 lines · 372 code · 45 blank · 63 comment · 81 complexity · 919dd3be394950eaafdc3f8af967f20b MD5 · raw file

  1. # Copyright (C) 2002-2007 Python Software Foundation
  2. # Contact: email-sig@python.org
  3. """Email address parsing code.
  4. Lifted directly from rfc822.py. This should eventually be rewritten.
  5. """
# Names exported by ``from <module> import *``.  Only the date helpers and
# quote() are listed; the address-list classes defined in this file are not.
__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]
  12. import time
  13. SPACE = ' '
  14. EMPTYSTRING = ''
  15. COMMASPACE = ', '
  16. # Parse a date field
  17. _monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
  18. 'aug', 'sep', 'oct', 'nov', 'dec',
  19. 'january', 'february', 'march', 'april', 'may', 'june', 'july',
  20. 'august', 'september', 'october', 'november', 'december']
  21. _daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
  22. # The timezone table does not include the military time zones defined
  23. # in RFC822, other than Z. According to RFC1123, the description in
  24. # RFC822 gets the signs wrong, so we can't rely on any such time
  25. # zones. RFC1123 recommends that numeric timezone indicators be used
  26. # instead of timezone names.
  27. _timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
  28. 'AST': -400, 'ADT': -300, # Atlantic (used in Canada)
  29. 'EST': -500, 'EDT': -400, # Eastern
  30. 'CST': -600, 'CDT': -500, # Central
  31. 'MST': -700, 'MDT': -600, # Mountain
  32. 'PST': -800, 'PDT': -700 # Pacific
  33. }
  34. def parsedate_tz(data):
  35. """Convert a date string to a time tuple.
  36. Accounts for military timezones.
  37. """
  38. data = data.split()
  39. # The FWS after the comma after the day-of-week is optional, so search and
  40. # adjust for this.
  41. if data[0].endswith(',') or data[0].lower() in _daynames:
  42. # There's a dayname here. Skip it
  43. del data[0]
  44. else:
  45. i = data[0].rfind(',')
  46. if i >= 0:
  47. data[0] = data[0][i+1:]
  48. if len(data) == 3: # RFC 850 date, deprecated
  49. stuff = data[0].split('-')
  50. if len(stuff) == 3:
  51. data = stuff + data[1:]
  52. if len(data) == 4:
  53. s = data[3]
  54. i = s.find('+')
  55. if i > 0:
  56. data[3:] = [s[:i], s[i+1:]]
  57. else:
  58. data.append('') # Dummy tz
  59. if len(data) < 5:
  60. return None
  61. data = data[:5]
  62. [dd, mm, yy, tm, tz] = data
  63. mm = mm.lower()
  64. if mm not in _monthnames:
  65. dd, mm = mm, dd.lower()
  66. if mm not in _monthnames:
  67. return None
  68. mm = _monthnames.index(mm) + 1
  69. if mm > 12:
  70. mm -= 12
  71. if dd[-1] == ',':
  72. dd = dd[:-1]
  73. i = yy.find(':')
  74. if i > 0:
  75. yy, tm = tm, yy
  76. if yy[-1] == ',':
  77. yy = yy[:-1]
  78. if not yy[0].isdigit():
  79. yy, tz = tz, yy
  80. if tm[-1] == ',':
  81. tm = tm[:-1]
  82. tm = tm.split(':')
  83. if len(tm) == 2:
  84. [thh, tmm] = tm
  85. tss = '0'
  86. elif len(tm) == 3:
  87. [thh, tmm, tss] = tm
  88. else:
  89. return None
  90. try:
  91. yy = int(yy)
  92. dd = int(dd)
  93. thh = int(thh)
  94. tmm = int(tmm)
  95. tss = int(tss)
  96. except ValueError:
  97. return None
  98. tzoffset = None
  99. tz = tz.upper()
  100. if tz in _timezones:
  101. tzoffset = _timezones[tz]
  102. else:
  103. try:
  104. tzoffset = int(tz)
  105. except ValueError:
  106. pass
  107. # Convert a timezone offset into seconds ; -0500 -> -18000
  108. if tzoffset:
  109. if tzoffset < 0:
  110. tzsign = -1
  111. tzoffset = -tzoffset
  112. else:
  113. tzsign = 1
  114. tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
  115. # Daylight Saving Time flag is set to -1, since DST is unknown.
  116. return yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset
  117. def parsedate(data):
  118. """Convert a time string to a time tuple."""
  119. t = parsedate_tz(data)
  120. if isinstance(t, tuple):
  121. return t[:9]
  122. else:
  123. return t
  124. def mktime_tz(data):
  125. """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp."""
  126. if data[9] is None:
  127. # No zone info, so localtime is better assumption than GMT
  128. return time.mktime(data[:8] + (-1,))
  129. else:
  130. t = time.mktime(data[:8] + (0,))
  131. return t - data[9] - time.timezone
  132. def quote(str):
  133. """Add quotes around a string."""
  134. return str.replace('\\', '\\\\').replace('"', '\\"')
  135. class AddrlistClass:
  136. """Address parser class by Ben Escoto.
  137. To understand what this class does, it helps to have a copy of RFC 2822 in
  138. front of you.
  139. Note: this class interface is deprecated and may be removed in the future.
  140. Use rfc822.AddressList instead.
  141. """
  142. def __init__(self, field):
  143. """Initialize a new instance.
  144. `field' is an unparsed address header field, containing
  145. one or more addresses.
  146. """
  147. self.specials = '()<>@,:;.\"[]'
  148. self.pos = 0
  149. self.LWS = ' \t'
  150. self.CR = '\r\n'
  151. self.FWS = self.LWS + self.CR
  152. self.atomends = self.specials + self.LWS + self.CR
  153. # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
  154. # is obsolete syntax. RFC 2822 requires that we recognize obsolete
  155. # syntax, so allow dots in phrases.
  156. self.phraseends = self.atomends.replace('.', '')
  157. self.field = field
  158. self.commentlist = []
  159. def gotonext(self):
  160. """Parse up to the start of the next address."""
  161. while self.pos < len(self.field):
  162. if self.field[self.pos] in self.LWS + '\n\r':
  163. self.pos += 1
  164. elif self.field[self.pos] == '(':
  165. self.commentlist.append(self.getcomment())
  166. else:
  167. break
  168. def getaddrlist(self):
  169. """Parse all addresses.
  170. Returns a list containing all of the addresses.
  171. """
  172. result = []
  173. while self.pos < len(self.field):
  174. ad = self.getaddress()
  175. if ad:
  176. result += ad
  177. else:
  178. result.append(('', ''))
  179. return result
  180. def getaddress(self):
  181. """Parse the next address."""
  182. self.commentlist = []
  183. self.gotonext()
  184. oldpos = self.pos
  185. oldcl = self.commentlist
  186. plist = self.getphraselist()
  187. self.gotonext()
  188. returnlist = []
  189. if self.pos >= len(self.field):
  190. # Bad email address technically, no domain.
  191. if plist:
  192. returnlist = [(SPACE.join(self.commentlist), plist[0])]
  193. elif self.field[self.pos] in '.@':
  194. # email address is just an addrspec
  195. # this isn't very efficient since we start over
  196. self.pos = oldpos
  197. self.commentlist = oldcl
  198. addrspec = self.getaddrspec()
  199. returnlist = [(SPACE.join(self.commentlist), addrspec)]
  200. elif self.field[self.pos] == ':':
  201. # address is a group
  202. returnlist = []
  203. fieldlen = len(self.field)
  204. self.pos += 1
  205. while self.pos < len(self.field):
  206. self.gotonext()
  207. if self.pos < fieldlen and self.field[self.pos] == ';':
  208. self.pos += 1
  209. break
  210. returnlist = returnlist + self.getaddress()
  211. elif self.field[self.pos] == '<':
  212. # Address is a phrase then a route addr
  213. routeaddr = self.getrouteaddr()
  214. if self.commentlist:
  215. returnlist = [(SPACE.join(plist) + ' (' +
  216. ' '.join(self.commentlist) + ')', routeaddr)]
  217. else:
  218. returnlist = [(SPACE.join(plist), routeaddr)]
  219. else:
  220. if plist:
  221. returnlist = [(SPACE.join(self.commentlist), plist[0])]
  222. elif self.field[self.pos] in self.specials:
  223. self.pos += 1
  224. self.gotonext()
  225. if self.pos < len(self.field) and self.field[self.pos] == ',':
  226. self.pos += 1
  227. return returnlist
  228. def getrouteaddr(self):
  229. """Parse a route address (Return-path value).
  230. This method just skips all the route stuff and returns the addrspec.
  231. """
  232. if self.field[self.pos] != '<':
  233. return
  234. expectroute = False
  235. self.pos += 1
  236. self.gotonext()
  237. adlist = ''
  238. while self.pos < len(self.field):
  239. if expectroute:
  240. self.getdomain()
  241. expectroute = False
  242. elif self.field[self.pos] == '>':
  243. self.pos += 1
  244. break
  245. elif self.field[self.pos] == '@':
  246. self.pos += 1
  247. expectroute = True
  248. elif self.field[self.pos] == ':':
  249. self.pos += 1
  250. else:
  251. adlist = self.getaddrspec()
  252. self.pos += 1
  253. break
  254. self.gotonext()
  255. return adlist
  256. def getaddrspec(self):
  257. """Parse an RFC 2822 addr-spec."""
  258. aslist = []
  259. self.gotonext()
  260. while self.pos < len(self.field):
  261. if self.field[self.pos] == '.':
  262. aslist.append('.')
  263. self.pos += 1
  264. elif self.field[self.pos] == '"':
  265. aslist.append('"%s"' % self.getquote())
  266. elif self.field[self.pos] in self.atomends:
  267. break
  268. else:
  269. aslist.append(self.getatom())
  270. self.gotonext()
  271. if self.pos >= len(self.field) or self.field[self.pos] != '@':
  272. return EMPTYSTRING.join(aslist)
  273. aslist.append('@')
  274. self.pos += 1
  275. self.gotonext()
  276. return EMPTYSTRING.join(aslist) + self.getdomain()
  277. def getdomain(self):
  278. """Get the complete domain name from an address."""
  279. sdlist = []
  280. while self.pos < len(self.field):
  281. if self.field[self.pos] in self.LWS:
  282. self.pos += 1
  283. elif self.field[self.pos] == '(':
  284. self.commentlist.append(self.getcomment())
  285. elif self.field[self.pos] == '[':
  286. sdlist.append(self.getdomainliteral())
  287. elif self.field[self.pos] == '.':
  288. self.pos += 1
  289. sdlist.append('.')
  290. elif self.field[self.pos] in self.atomends:
  291. break
  292. else:
  293. sdlist.append(self.getatom())
  294. return EMPTYSTRING.join(sdlist)
  295. def getdelimited(self, beginchar, endchars, allowcomments=True):
  296. """Parse a header fragment delimited by special characters.
  297. `beginchar' is the start character for the fragment.
  298. If self is not looking at an instance of `beginchar' then
  299. getdelimited returns the empty string.
  300. `endchars' is a sequence of allowable end-delimiting characters.
  301. Parsing stops when one of these is encountered.
  302. If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
  303. within the parsed fragment.
  304. """
  305. if self.field[self.pos] != beginchar:
  306. return ''
  307. slist = ['']
  308. quote = False
  309. self.pos += 1
  310. while self.pos < len(self.field):
  311. if quote:
  312. slist.append(self.field[self.pos])
  313. quote = False
  314. elif self.field[self.pos] in endchars:
  315. self.pos += 1
  316. break
  317. elif allowcomments and self.field[self.pos] == '(':
  318. slist.append(self.getcomment())
  319. continue # have already advanced pos from getcomment
  320. elif self.field[self.pos] == '\\':
  321. quote = True
  322. else:
  323. slist.append(self.field[self.pos])
  324. self.pos += 1
  325. return EMPTYSTRING.join(slist)
  326. def getquote(self):
  327. """Get a quote-delimited fragment from self's field."""
  328. return self.getdelimited('"', '"\r', False)
  329. def getcomment(self):
  330. """Get a parenthesis-delimited fragment from self's field."""
  331. return self.getdelimited('(', ')\r', True)
  332. def getdomainliteral(self):
  333. """Parse an RFC 2822 domain-literal."""
  334. return '[%s]' % self.getdelimited('[', ']\r', False)
  335. def getatom(self, atomends=None):
  336. """Parse an RFC 2822 atom.
  337. Optional atomends specifies a different set of end token delimiters
  338. (the default is to use self.atomends). This is used e.g. in
  339. getphraselist() since phrase endings must not include the `.' (which
  340. is legal in phrases)."""
  341. atomlist = ['']
  342. if atomends is None:
  343. atomends = self.atomends
  344. while self.pos < len(self.field):
  345. if self.field[self.pos] in atomends:
  346. break
  347. else:
  348. atomlist.append(self.field[self.pos])
  349. self.pos += 1
  350. return EMPTYSTRING.join(atomlist)
  351. def getphraselist(self):
  352. """Parse a sequence of RFC 2822 phrases.
  353. A phrase is a sequence of words, which are in turn either RFC 2822
  354. atoms or quoted-strings. Phrases are canonicalized by squeezing all
  355. runs of continuous whitespace into one space.
  356. """
  357. plist = []
  358. while self.pos < len(self.field):
  359. if self.field[self.pos] in self.FWS:
  360. self.pos += 1
  361. elif self.field[self.pos] == '"':
  362. plist.append(self.getquote())
  363. elif self.field[self.pos] == '(':
  364. self.commentlist.append(self.getcomment())
  365. elif self.field[self.pos] in self.phraseends:
  366. break
  367. else:
  368. plist.append(self.getatom(self.phraseends))
  369. return plist
  370. class AddressList(AddrlistClass):
  371. """An AddressList encapsulates a list of parsed RFC 2822 addresses."""
  372. def __init__(self, field):
  373. AddrlistClass.__init__(self, field)
  374. if field:
  375. self.addresslist = self.getaddrlist()
  376. else:
  377. self.addresslist = []
  378. def __len__(self):
  379. return len(self.addresslist)
  380. def __add__(self, other):
  381. # Set union
  382. newaddr = AddressList(None)
  383. newaddr.addresslist = self.addresslist[:]
  384. for x in other.addresslist:
  385. if not x in self.addresslist:
  386. newaddr.addresslist.append(x)
  387. return newaddr
  388. def __iadd__(self, other):
  389. # Set union, in-place
  390. for x in other.addresslist:
  391. if not x in self.addresslist:
  392. self.addresslist.append(x)
  393. return self
  394. def __sub__(self, other):
  395. # Set difference
  396. newaddr = AddressList(None)
  397. for x in self.addresslist:
  398. if not x in other.addresslist:
  399. newaddr.addresslist.append(x)
  400. return newaddr
  401. def __isub__(self, other):
  402. # Set difference, in-place
  403. for x in other.addresslist:
  404. if x in self.addresslist:
  405. self.addresslist.remove(x)
  406. return self
  407. def __getitem__(self, index):
  408. # Make indexing, slices, and 'in' work
  409. return self.addresslist[index]