PageRenderTime 409ms CodeModel.GetById 141ms app.highlight 125ms RepoModel.GetById 137ms app.codeStats 0ms

/Lib/email/_parseaddr.py

http://unladen-swallow.googlecode.com/
Python | 480 lines | 455 code | 10 blank | 15 comment | 10 complexity | 919dd3be394950eaafdc3f8af967f20b MD5 | raw file
  1# Copyright (C) 2002-2007 Python Software Foundation
  2# Contact: email-sig@python.org
  3
  4"""Email address parsing code.
  5
  6Lifted directly from rfc822.py.  This should eventually be rewritten.
  7"""
  8
  9__all__ = [
 10    'mktime_tz',
 11    'parsedate',
 12    'parsedate_tz',
 13    'quote',
 14    ]
 15
 16import time
 17
 18SPACE = ' '
 19EMPTYSTRING = ''
 20COMMASPACE = ', '
 21
 22# Parse a date field
 23_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
 24               'aug', 'sep', 'oct', 'nov', 'dec',
 25               'january', 'february', 'march', 'april', 'may', 'june', 'july',
 26               'august', 'september', 'october', 'november', 'december']
 27
 28_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
 29
 30# The timezone table does not include the military time zones defined
 31# in RFC822, other than Z.  According to RFC1123, the description in
 32# RFC822 gets the signs wrong, so we can't rely on any such time
 33# zones.  RFC1123 recommends that numeric timezone indicators be used
 34# instead of timezone names.
 35
 36_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
 37              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
 38              'EST': -500, 'EDT': -400,  # Eastern
 39              'CST': -600, 'CDT': -500,  # Central
 40              'MST': -700, 'MDT': -600,  # Mountain
 41              'PST': -800, 'PDT': -700   # Pacific
 42              }
 43
 44
 45def parsedate_tz(data):
 46    """Convert a date string to a time tuple.
 47
 48    Accounts for military timezones.
 49    """
 50    data = data.split()
 51    # The FWS after the comma after the day-of-week is optional, so search and
 52    # adjust for this.
 53    if data[0].endswith(',') or data[0].lower() in _daynames:
 54        # There's a dayname here. Skip it
 55        del data[0]
 56    else:
 57        i = data[0].rfind(',')
 58        if i >= 0:
 59            data[0] = data[0][i+1:]
 60    if len(data) == 3: # RFC 850 date, deprecated
 61        stuff = data[0].split('-')
 62        if len(stuff) == 3:
 63            data = stuff + data[1:]
 64    if len(data) == 4:
 65        s = data[3]
 66        i = s.find('+')
 67        if i > 0:
 68            data[3:] = [s[:i], s[i+1:]]
 69        else:
 70            data.append('') # Dummy tz
 71    if len(data) < 5:
 72        return None
 73    data = data[:5]
 74    [dd, mm, yy, tm, tz] = data
 75    mm = mm.lower()
 76    if mm not in _monthnames:
 77        dd, mm = mm, dd.lower()
 78        if mm not in _monthnames:
 79            return None
 80    mm = _monthnames.index(mm) + 1
 81    if mm > 12:
 82        mm -= 12
 83    if dd[-1] == ',':
 84        dd = dd[:-1]
 85    i = yy.find(':')
 86    if i > 0:
 87        yy, tm = tm, yy
 88    if yy[-1] == ',':
 89        yy = yy[:-1]
 90    if not yy[0].isdigit():
 91        yy, tz = tz, yy
 92    if tm[-1] == ',':
 93        tm = tm[:-1]
 94    tm = tm.split(':')
 95    if len(tm) == 2:
 96        [thh, tmm] = tm
 97        tss = '0'
 98    elif len(tm) == 3:
 99        [thh, tmm, tss] = tm
100    else:
101        return None
102    try:
103        yy = int(yy)
104        dd = int(dd)
105        thh = int(thh)
106        tmm = int(tmm)
107        tss = int(tss)
108    except ValueError:
109        return None
110    tzoffset = None
111    tz = tz.upper()
112    if tz in _timezones:
113        tzoffset = _timezones[tz]
114    else:
115        try:
116            tzoffset = int(tz)
117        except ValueError:
118            pass
119    # Convert a timezone offset into seconds ; -0500 -> -18000
120    if tzoffset:
121        if tzoffset < 0:
122            tzsign = -1
123            tzoffset = -tzoffset
124        else:
125            tzsign = 1
126        tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
127    # Daylight Saving Time flag is set to -1, since DST is unknown.
128    return yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset
129
130
def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    This is parsedate_tz() with the trailing timezone offset dropped.  Any
    non-tuple result from parsedate_tz() (such as None for unparseable
    input) is passed through unchanged.
    """
    parsed = parsedate_tz(data)
    if not isinstance(parsed, tuple):
        return parsed
    return parsed[:9]
138
139
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp.

    data[9] is the zone offset in seconds, or None when the zone is
    unknown.  With a known offset the fields are interpreted as UTC and
    the offset is subtracted, so the result does not depend on the local
    time zone of the machine.  With no offset the fields are treated as
    local time.
    """
    # Imported here so the module-level imports stay unchanged.
    import calendar

    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(data[:8] + (-1,))
    # calendar.timegm() is the UTC inverse of time.gmtime(); unlike
    # time.mktime() it never consults the local zone or its DST rules.
    return calendar.timegm(data[:9]) - data[9]
148
149
def quote(str):
    """Backslash-escape a string for use inside a quoted string.

    Every backslash becomes two backslashes and every double quote gets a
    backslash in front of it.  No surrounding quotes are added.
    """
    # Backslashes first, so the ones added for the quotes are not doubled.
    escaped = str.replace('\\', '\\\\')
    return escaped.replace('"', '\\"')
153
154
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser keeps a cursor, `self.pos', into `self.field'.  Each get*()
    method consumes input from the cursor onwards and leaves the cursor just
    past what it parsed.  Comments met along the way are collected in
    `self.commentlist'.

    Note: this class interface is deprecated and may be removed in the future.
    Use AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Skip whitespace and comments up to the next significant character.

        Skipped comments are appended to self.commentlist.
        """
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list of (realname, address) pairs.  A stretch of input
        that yields no address contributes ('', '').
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (realname, address) pairs: one pair for a single
        mailbox, several for a group, and an empty list when nothing could
        be parsed.  A trailing `,' separator is consumed.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Take a snapshot, not an alias: getphraselist() appends to
        # self.commentlist, and the addr-spec branch below re-parses from
        # oldpos, which would otherwise record those comments twice.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the caller's loop makes progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the cursor is not on `<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # The domain after `@' in a source route is discarded.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character that ended the addr-spec
                # (normally the closing `>').
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns `local@domain', just the local part when no `@' follows it,
        or the empty string when an `@' is followed by an invalid domain.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos += 1
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % self.getquote())
            elif self.field[self.pos] in self.atomends:
                break
            else:
                aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain: return an empty address rather than a bare
            # local part, so the failure is visible to the caller.
            return ''
        return ''.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the empty string if a second `@' is found, so that input
        such as `a@evil.example@good.example' is rejected instead of being
        silently read as `a@evil.example'.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # Must be tested before atomends, which also contains `@'.
                return ''
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        Returns the text between the delimiters with backslash escapes
        resolved.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        escaped = False     # True right after a backslash
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                # Take the character after a backslash literally.
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments found between them are appended
        to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
434
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed (realname, address) pairs are kept in `self.addresslist'.
    The +, +=, - and -= operators treat two AddressLists as sets of pairs.
    """

    def __init__(self, field):
        """Parse `field'; a false value (None, '') gives an empty list."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        return len(self.addresslist)

    def __add__(self, other):
        # Set union
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if x not in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        # Set union, in-place
        for x in other.addresslist:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        # Set difference
        newaddr = AddressList(None)
        for x in self.addresslist:
            if x not in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __isub__(self, other):
        # Set difference, in-place.  The filtered list is built before
        # anything is modified, so `a -= a' empties the list instead of
        # skipping entries while removing from the list being iterated, and
        # every duplicate of a removed entry goes, exactly as in __sub__.
        unwanted = other.addresslist
        self.addresslist[:] = [x for x in self.addresslist
                               if x not in unwanted]
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]