PageRenderTime 41ms CodeModel.GetById 17ms app.highlight 19ms RepoModel.GetById 1ms app.codeStats 0ms

/Lib/email/feedparser.py

http://unladen-swallow.googlecode.com/
Python | 480 lines | 403 code | 9 blank | 68 comment | 61 complexity | 55e31c00d9b7da2a261bb885a98e3743 MD5 | raw file
  1# Copyright (C) 2004-2006 Python Software Foundation
  2# Authors: Baxter, Wouters and Warsaw
  3# Contact: email-sig@python.org
  4
  5"""FeedParser - An email feed parser.
  6
  7The feed parser implements an interface for incrementally parsing an email
  8message, line by line.  This has advantages for certain applications, such as
  9those reading email messages off a socket.
 10
 11FeedParser.feed() is the primary interface for pushing new data into the
 12parser.  It returns when there's nothing more it can do with the available
 13data.  When you have no more data to push into the parser, call .close().
 14This completes the parsing and returns the root message object.
 15
 16The other advantage of this parser is that it will never throw a parsing
 17exception.  Instead, when it finds something unexpected, it adds a 'defect' to
 18the current message.  Defects are just instances that live on the message
 19object's .defects attribute.
 20"""
 21
__all__ = ['FeedParser']

import re

from email import errors
from email import message

# Matches a single line terminator in any of the three conventions: CRLF,
# bare CR, or bare LF.  CRLF is listed first so it is preferred over a lone CR.
NLCRE = re.compile('\r\n|\r|\n')
# The same terminators as a capturing group.  Used with .match(), i.e. to
# find a terminator at the beginning of a string.
NLCRE_bol = re.compile('(\r\n|\r|\n)')
# A terminator anchored at the end of a string.
NLCRE_eol = re.compile('(\r\n|\r|\n)$')
# Used with .split() to crack a chunk of data into lines; the capturing group
# makes re.split keep the terminators in its result.
NLCRE_crack = re.compile('(\r\n|\r|\n)')
# RFC 2822 section 3.6.8, Optional fields.  ftext is %d33-57 / %d59-126, i.e.
# any character except controls, SP, and ":".  The pattern accepts three kinds
# of line: a Unix-from envelope line ("From "), a field name followed by a
# colon, or a continuation line that starts with a tab or a space.
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Sentinel returned by BufferedSubFile.readline() (and yielded by the parsing
# generator) when no complete line is available yet and the input has not
# been closed.
NeedMoreData = object()
 40
 41
 42
 43class BufferedSubFile(object):
 44    """A file-ish object that can have new data loaded into it.
 45
 46    You can also push and pop line-matching predicates onto a stack.  When the
 47    current predicate matches the current line, a false EOF response
 48    (i.e. empty string) is returned instead.  This lets the parser adhere to a
 49    simple abstraction -- it parses until EOF closes the current message.
 50    """
 51    def __init__(self):
 52        # The last partial line pushed into this object.
 53        self._partial = ''
 54        # The list of full, pushed lines, in reverse order
 55        self._lines = []
 56        # The stack of false-EOF checking predicates.
 57        self._eofstack = []
 58        # A flag indicating whether the file has been closed or not.
 59        self._closed = False
 60
 61    def push_eof_matcher(self, pred):
 62        self._eofstack.append(pred)
 63
 64    def pop_eof_matcher(self):
 65        return self._eofstack.pop()
 66
 67    def close(self):
 68        # Don't forget any trailing partial line.
 69        self._lines.append(self._partial)
 70        self._partial = ''
 71        self._closed = True
 72
 73    def readline(self):
 74        if not self._lines:
 75            if self._closed:
 76                return ''
 77            return NeedMoreData
 78        # Pop the line off the stack and see if it matches the current
 79        # false-EOF predicate.
 80        line = self._lines.pop()
 81        # RFC 2046, section 5.1.2 requires us to recognize outer level
 82        # boundaries at any level of inner nesting.  Do this, but be sure it's
 83        # in the order of most to least nested.
 84        for ateof in self._eofstack[::-1]:
 85            if ateof(line):
 86                # We're at the false EOF.  But push the last line back first.
 87                self._lines.append(line)
 88                return ''
 89        return line
 90
 91    def unreadline(self, line):
 92        # Let the consumer push a line back into the buffer.
 93        assert line is not NeedMoreData
 94        self._lines.append(line)
 95
 96    def push(self, data):
 97        """Push some new data into this object."""
 98        # Handle any previous leftovers
 99        data, self._partial = self._partial + data, ''
100        # Crack into lines, but preserve the newlines on the end of each
101        parts = NLCRE_crack.split(data)
102        # The *ahem* interesting behaviour of re.split when supplied grouping
103        # parentheses is that the last element of the resulting list is the
104        # data after the final RE.  In the case of a NL/CR terminated string,
105        # this is the empty string.
106        self._partial = parts.pop()
107        # parts is a list of strings, alternating between the line contents
108        # and the eol character(s).  Gather up a list of lines after
109        # re-attaching the newlines.
110        lines = []
111        for i in range(len(parts) // 2):
112            lines.append(parts[i*2] + parts[i*2+1])
113        self.pushlines(lines)
114
115    def pushlines(self, lines):
116        # Reverse and insert at the front of the lines.
117        self._lines[:0] = lines[::-1]
118
119    def is_closed(self):
120        return self._closed
121
122    def __iter__(self):
123        return self
124
125    def next(self):
126        line = self.readline()
127        if line == '':
128            raise StopIteration
129        return line
130
131
132
class FeedParser:
    """A feed-style parser of email.

    Data is pushed in with feed() in arbitrarily sized chunks; close()
    finishes the parse and returns the root message object.  Problems found
    in the input are recorded as defects on the affected message object's
    .defects list.
    """

    def __init__(self, _factory=message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        self._input = BufferedSubFile()
        # Stack of the message objects currently being parsed; the innermost
        # (current) one is last.
        self._msgstack = []
        # Each call advances the parsing generator until it needs more data.
        self._parse = self._parsegen().next
        # The message currently being parsed (top of the stack).
        self._cur = None
        # The most recently created message; used for newline cleansing at
        # multipart boundaries.
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        # Advance the parse as far as the buffered data allows.  The
        # generator finishing (StopIteration) is not an error.
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        # Create a message, attach it to its parent (if any) and make it the
        # current message.
        msg = self._factory()
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        # Finish the current message and make its parent (or None) current.
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        # Generator that parses one message (headers plus body) from
        # self._input.  It yields NeedMoreData whenever the input runs dry
        # and is resumed once more data has been fed or the input has been
        # closed.  Sub-messages are parsed by recursive generators whose
        # NeedMoreData requests are passed on to our own caller.
        #
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                msg = self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    # Keep the text that follows a lone end boundary instead
                    # of silently dropping it.
                    epilogue.append(line)
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        # Passed a list of lines that make up the headers for the current msg.
        # Each header is stored on self._cur once its last continuation line
        # has been seen; malformed lines are recorded as defects instead.
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            # A non-continuation line ends the previous header, so store it.
            if lastheader:
                # XXX reconsider the joining of folded lines
                lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')