# /Lib/email/feedparser.py
# Python | 480 lines | 403 code | 9 blank | 68 comment | 61 complexity | 55e31c00d9b7da2a261bb885a98e3743 MD5 | raw file
# Copyright (C) 2004-2006 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw
# Contact: email-sig@python.org

"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line.  This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser.  It returns when there's nothing more it can do with the available
data.  When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never throw a parsing
exception.  Instead, when it finds something unexpected, it adds a 'defect' to
the current message.  Defects are just instances that live on the message
object's .defects attribute.
"""

__all__ = ['FeedParser']

import re

from email import errors
from email import message

NLCRE = re.compile('\r\n|\r|\n')
NLCRE_bol = re.compile('(\r\n|\r|\n)')
NLCRE_eol = re.compile('(\r\n|\r|\n)$')
NLCRE_crack = re.compile('(\r\n|\r|\n)')
# RFC 2822 $3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any character
# except controls, SP, and ":".
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Sentinel returned by BufferedSubFile.readline() (and yielded by the parser
# generator) when no complete line is buffered and the input is still open.
NeedMoreData = object()



class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # The last partial line pushed into this object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order (the next line to
        # be read is the last element).
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a false-EOF predicate; it is called with each line read."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the input as complete, flushing any trailing partial line."""
        # Don't forget any trailing partial line.  self._lines is kept in
        # reverse order, so the partial line has to go in at the front (via
        # pushlines) to be read *after* any complete lines still buffered;
        # appending it would hand it out first.
        self.pushlines([self._partial])
        self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next buffered line.

        Returns NeedMoreData when nothing is buffered and the object is still
        open, and '' when nothing is buffered and it has been closed.  '' is
        also returned (a "false EOF") when any pushed predicate matches the
        line; in that case the line is put back so it can be read again once
        the predicate has been popped.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in self._eofstack[::-1]:
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push `line` back so that it is the next line readline() sees."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE.  In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        # If the data ends in a bare '\r' we cannot yet tell whether it is a
        # complete line ending or the first half of a '\r\n' whose '\n' will
        # arrive with the next chunk.  Keep that line as partial data so a
        # CRLF split across two push() calls is not seen as two line endings
        # (which would inject a spurious blank line).  close() still flushes
        # it if no more data ever arrives.
        if not self._partial and parts and parts[-1].endswith('\r'):
            self._partial = parts.pop(-2) + parts.pop()
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s).  Gather up a list of lines after
        # re-attaching the newlines.
        lines = [text + eol for text, eol in zip(parts[::2], parts[1::2])]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue `lines` (in reading order) behind the lines already held."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Iterator protocol: yield lines (or NeedMoreData) until EOF ('')."""
        line = self.readline()
        if line == '':
            raise StopIteration
        return line



class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        self._input = BufferedSubFile()
        # Stack of messages under construction; the root is at the bottom.
        self._msgstack = []
        # Each call to self._parse() resumes the parsing generator until it
        # needs more data (or finishes, raising StopIteration).
        self._parse = self._parsegen().next
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Resume the parsing generator, ignoring its normal termination."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if (root.get_content_maintype() == 'multipart'
                and not root.is_multipart()):
            root.defects.append(errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        msg = self._factory()
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (recursing for subparts).

        It yields NeedMoreData whenever the input buffer runs dry and returns
        once the message's (possibly false) EOF has been reached.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away.  Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                # NOTE(review): the loop below drains the input without
                # appending anything, so the epilogue stored here is always
                # ''.  Left unchanged because callers may rely on it; confirm
                # whether the remaining text was meant to be kept.
                epilogue = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header `lines` on the current message.

        Continuation lines are folded into the preceding header.  Problems
        (a leading continuation line, a misplaced unix-from line, a line with
        no colon) are recorded on the message's .defects list rather than
        raised.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                # XXX reconsider the joining of folded lines
                lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')