/Lib/sre_parse.py
Python | 796 lines | 700 code | 46 blank | 50 comment | 151 complexity | cb303a35856e78bdc9f7d4420c6b15e3 MD5 | raw file
1# 2# Secret Labs' Regular Expression Engine 3# 4# convert re-style regular expression to sre pattern 5# 6# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. 7# 8# See the sre.py file for information on usage and redistribution. 9# 10 11"""Internal support module for sre""" 12 13# XXX: show string offset and offending character for all errors 14 15import sys 16 17from sre_constants import * 18 19def set(seq): 20 s = {} 21 for elem in seq: 22 s[elem] = 1 23 return s 24 25SPECIAL_CHARS = ".\\[{()*+?^$|" 26REPEAT_CHARS = "*+?{" 27 28DIGITS = set("0123456789") 29 30OCTDIGITS = set("01234567") 31HEXDIGITS = set("0123456789abcdefABCDEF") 32 33WHITESPACE = set(" \t\n\r\v\f") 34 35ESCAPES = { 36 r"\a": (LITERAL, ord("\a")), 37 r"\b": (LITERAL, ord("\b")), 38 r"\f": (LITERAL, ord("\f")), 39 r"\n": (LITERAL, ord("\n")), 40 r"\r": (LITERAL, ord("\r")), 41 r"\t": (LITERAL, ord("\t")), 42 r"\v": (LITERAL, ord("\v")), 43 r"\\": (LITERAL, ord("\\")) 44} 45 46CATEGORIES = { 47 r"\A": (AT, AT_BEGINNING_STRING), # start of string 48 r"\b": (AT, AT_BOUNDARY), 49 r"\B": (AT, AT_NON_BOUNDARY), 50 r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), 51 r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), 52 r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), 53 r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), 54 r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), 55 r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), 56 r"\Z": (AT, AT_END_STRING), # end of string 57} 58 59FLAGS = { 60 # standard flags 61 "i": SRE_FLAG_IGNORECASE, 62 "L": SRE_FLAG_LOCALE, 63 "m": SRE_FLAG_MULTILINE, 64 "s": SRE_FLAG_DOTALL, 65 "x": SRE_FLAG_VERBOSE, 66 # extensions 67 "t": SRE_FLAG_TEMPLATE, 68 "u": SRE_FLAG_UNICODE, 69} 70 71class Pattern: 72 # master pattern object. keeps track of global attributes 73 def __init__(self): 74 self.flags = 0 75 self.open = [] 76 self.groups = 1 77 self.groupdict = {} 78 def opengroup(self, name=None): 79 gid = self.groups 80 self.groups = gid + 1 81 if name is not None: 82 ogid = self.groupdict.get(name, None) 83 if ogid is not None: 84 raise error, ("redefinition of group name %s as group %d; " 85 "was group %d" % (repr(name), gid, ogid)) 86 self.groupdict[name] = gid 87 self.open.append(gid) 88 return gid 89 def closegroup(self, gid): 90 self.open.remove(gid) 91 def checkgroup(self, gid): 92 return gid < self.groups and gid not in self.open 93 94class SubPattern: 95 # a subpattern, in intermediate form 96 def __init__(self, pattern, data=None): 97 self.pattern = pattern 98 if data is None: 99 data = [] 100 self.data = data 101 self.width = None 102 def dump(self, level=0): 103 nl = 1 104 seqtypes = type(()), type([]) 105 for op, av in self.data: 106 print level*" " + op,; nl = 0 107 if op == "in": 108 # member sublanguage 109 print; nl = 1 110 for op, a in av: 111 print (level+1)*" " + op, a 112 elif op == "branch": 113 print; nl = 1 114 i = 0 115 for a in av[1]: 116 if i > 0: 117 print level*" " + "or" 118 a.dump(level+1); nl = 1 119 i = i + 1 120 elif type(av) in seqtypes: 121 for a in av: 122 if isinstance(a, SubPattern): 123 if not nl: print 124 a.dump(level+1); nl = 1 125 else: 126 print a, ; nl = 0 127 else: 128 print av, ; nl = 0 129 if not nl: print 130 def __repr__(self): 131 return repr(self.data) 132 def __len__(self): 133 return len(self.data) 134 def __delitem__(self, index): 135 del self.data[index] 136 def __getitem__(self, index): 137 if isinstance(index, slice): 138 return SubPattern(self.pattern, self.data[index]) 139 return self.data[index] 140 def __setitem__(self, index, code): 141 self.data[index] = code 142 def insert(self, index, code): 143 self.data.insert(index, code) 144 def append(self, code): 145 self.data.append(code) 146 def getwidth(self): 147 # determine the width (min, max) for this subpattern 148 if self.width: 149 return self.width 150 lo = hi = 0L 151 UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY) 152 REPEATCODES = (MIN_REPEAT, MAX_REPEAT) 153 for op, av in self.data: 154 if op is BRANCH: 155 i = sys.maxint 156 j = 0 157 for av in av[1]: 158 l, h = av.getwidth() 159 i = min(i, l) 160 j = max(j, h) 161 lo = lo + i 162 hi = hi + j 163 elif op is CALL: 164 i, j = av.getwidth() 165 lo = lo + i 166 hi = hi + j 167 elif op is SUBPATTERN: 168 i, j = av[1].getwidth() 169 lo = lo + i 170 hi = hi + j 171 elif op in REPEATCODES: 172 i, j = av[2].getwidth() 173 lo = lo + long(i) * av[0] 174 hi = hi + long(j) * av[1] 175 elif op in UNITCODES: 176 lo = lo + 1 177 hi = hi + 1 178 elif op == SUCCESS: 179 break 180 self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint)) 181 return self.width 182 183class Tokenizer: 184 def __init__(self, string): 185 self.string = string 186 self.index = 0 187 self.__next() 188 def __next(self): 189 if self.index >= len(self.string): 190 self.next = None 191 return 192 char = self.string[self.index] 193 if char[0] == "\\": 194 try: 195 c = self.string[self.index + 1] 196 except IndexError: 197 raise error, "bogus escape (end of line)" 198 char = char + c 199 self.index = self.index + len(char) 200 self.next = char 201 def match(self, char, skip=1): 202 if char == self.next: 203 if skip: 204 self.__next() 205 return 1 206 return 0 207 def get(self): 208 this = self.next 209 self.__next() 210 return this 211 def tell(self): 212 return self.index, self.next 213 def seek(self, index): 214 self.index, self.next = index 215 216def isident(char): 217 return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_" 218 219def isdigit(char): 220 return "0" <= char <= "9" 221 222def isname(name): 223 # check that group name is a valid string 224 if not isident(name[0]): 225 return False 226 for char in name[1:]: 227 if not isident(char) and not isdigit(char): 228 return False 229 return True 230 231def _class_escape(source, escape): 232 # handle escape code inside character class 233 code = ESCAPES.get(escape) 234 if code: 235 return code 236 code = CATEGORIES.get(escape) 237 if code: 238 return code 239 try: 240 c = escape[1:2] 241 if c == "x": 242 # hexadecimal escape (exactly two digits) 243 while source.next in HEXDIGITS and len(escape) < 4: 244 escape = escape + source.get() 245 escape = escape[2:] 246 if len(escape) != 2: 247 raise error, "bogus escape: %s" % repr("\\" + escape) 248 return LITERAL, int(escape, 16) & 0xff 249 elif c in OCTDIGITS: 250 # octal escape (up to three digits) 251 while source.next in OCTDIGITS and len(escape) < 4: 252 escape = escape + source.get() 253 escape = escape[1:] 254 return LITERAL, int(escape, 8) & 0xff 255 elif c in DIGITS: 256 raise error, "bogus escape: %s" % repr(escape) 257 if len(escape) == 2: 258 return LITERAL, ord(escape[1]) 259 except ValueError: 260 pass 261 raise error, "bogus escape: %s" % repr(escape) 262 263def _escape(source, escape, state): 264 # handle escape code in expression 265 code = CATEGORIES.get(escape) 266 if code: 267 return code 268 code = ESCAPES.get(escape) 269 if code: 270 return code 271 try: 272 c = escape[1:2] 273 if c == "x": 274 # hexadecimal escape 275 while source.next in HEXDIGITS and len(escape) < 4: 276 escape = escape + source.get() 277 if len(escape) != 4: 278 raise ValueError 279 return LITERAL, int(escape[2:], 16) & 0xff 280 elif c == "0": 281 # octal escape 282 while source.next in OCTDIGITS and len(escape) < 4: 283 escape = escape + source.get() 284 return LITERAL, int(escape[1:], 8) & 0xff 285 elif c in DIGITS: 286 # octal escape *or* decimal group reference (sigh) 287 if source.next in DIGITS: 288 escape = escape + source.get() 289 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and 290 source.next in OCTDIGITS): 291 # got three octal digits; this is an octal escape 292 escape = escape + source.get() 293 return LITERAL, int(escape[1:], 8) & 0xff 294 # not an octal escape, so this is a group reference 295 group = int(escape[1:]) 296 if group < state.groups: 297 if not state.checkgroup(group): 298 raise error, "cannot refer to open group" 299 return GROUPREF, group 300 raise ValueError 301 if len(escape) == 2: 302 return LITERAL, ord(escape[1]) 303 except ValueError: 304 pass 305 raise error, "bogus escape: %s" % repr(escape) 306 307def _parse_sub(source, state, nested=1): 308 # parse an alternation: a|b|c 309 310 items = [] 311 itemsappend = items.append 312 sourcematch = source.match 313 while 1: 314 itemsappend(_parse(source, state)) 315 if sourcematch("|"): 316 continue 317 if not nested: 318 break 319 if not source.next or sourcematch(")", 0): 320 break 321 else: 322 raise error, "pattern not properly closed" 323 324 if len(items) == 1: 325 return items[0] 326 327 subpattern = SubPattern(state) 328 subpatternappend = subpattern.append 329 330 # check if all items share a common prefix 331 while 1: 332 prefix = None 333 for item in items: 334 if not item: 335 break 336 if prefix is None: 337 prefix = item[0] 338 elif item[0] != prefix: 339 break 340 else: 341 # all subitems start with a common "prefix". 342 # move it out of the branch 343 for item in items: 344 del item[0] 345 subpatternappend(prefix) 346 continue # check next one 347 break 348 349 # check if the branch can be replaced by a character set 350 for item in items: 351 if len(item) != 1 or item[0][0] != LITERAL: 352 break 353 else: 354 # we can store this as a character set instead of a 355 # branch (the compiler may optimize this even more) 356 set = [] 357 setappend = set.append 358 for item in items: 359 setappend(item[0]) 360 subpatternappend((IN, set)) 361 return subpattern 362 363 subpattern.append((BRANCH, (None, items))) 364 return subpattern 365 366def _parse_sub_cond(source, state, condgroup): 367 item_yes = _parse(source, state) 368 if source.match("|"): 369 item_no = _parse(source, state) 370 if source.match("|"): 371 raise error, "conditional backref with more than two branches" 372 else: 373 item_no = None 374 if source.next and not source.match(")", 0): 375 raise error, "pattern not properly closed" 376 subpattern = SubPattern(state) 377 subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) 378 return subpattern 379 380_PATTERNENDERS = set("|)") 381_ASSERTCHARS = set("=!<") 382_LOOKBEHINDASSERTCHARS = set("=!") 383_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT]) 384 385def _parse(source, state): 386 # parse a simple pattern 387 subpattern = SubPattern(state) 388 389 # precompute constants into local variables 390 subpatternappend = subpattern.append 391 sourceget = source.get 392 sourcematch = source.match 393 _len = len 394 PATTERNENDERS = _PATTERNENDERS 395 ASSERTCHARS = _ASSERTCHARS 396 LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS 397 REPEATCODES = _REPEATCODES 398 399 while 1: 400 401 if source.next in PATTERNENDERS: 402 break # end of subpattern 403 this = sourceget() 404 if this is None: 405 break # end of pattern 406 407 if state.flags & SRE_FLAG_VERBOSE: 408 # skip whitespace and comments 409 if this in WHITESPACE: 410 continue 411 if this == "#": 412 while 1: 413 this = sourceget() 414 if this in (None, "\n"): 415 break 416 continue 417 418 if this and this[0] not in SPECIAL_CHARS: 419 subpatternappend((LITERAL, ord(this))) 420 421 elif this == "[": 422 # character set 423 set = [] 424 setappend = set.append 425## if sourcematch(":"): 426## pass # handle character classes 427 if sourcematch("^"): 428 setappend((NEGATE, None)) 429 # check remaining characters 430 start = set[:] 431 while 1: 432 this = sourceget() 433 if this == "]" and set != start: 434 break 435 elif this and this[0] == "\\": 436 code1 = _class_escape(source, this) 437 elif this: 438 code1 = LITERAL, ord(this) 439 else: 440 raise error, "unexpected end of regular expression" 441 if sourcematch("-"): 442 # potential range 443 this = sourceget() 444 if this == "]": 445 if code1[0] is IN: 446 code1 = code1[1][0] 447 setappend(code1) 448 setappend((LITERAL, ord("-"))) 449 break 450 elif this: 451 if this[0] == "\\": 452 code2 = _class_escape(source, this) 453 else: 454 code2 = LITERAL, ord(this) 455 if code1[0] != LITERAL or code2[0] != LITERAL: 456 raise error, "bad character range" 457 lo = code1[1] 458 hi = code2[1] 459 if hi < lo: 460 raise error, "bad character range" 461 setappend((RANGE, (lo, hi))) 462 else: 463 raise error, "unexpected end of regular expression" 464 else: 465 if code1[0] is IN: 466 code1 = code1[1][0] 467 setappend(code1) 468 469 # XXX: <fl> should move set optimization to compiler! 470 if _len(set)==1 and set[0][0] is LITERAL: 471 subpatternappend(set[0]) # optimization 472 elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: 473 subpatternappend((NOT_LITERAL, set[1][1])) # optimization 474 else: 475 # XXX: <fl> should add charmap optimization here 476 subpatternappend((IN, set)) 477 478 elif this and this[0] in REPEAT_CHARS: 479 # repeat previous item 480 if this == "?": 481 min, max = 0, 1 482 elif this == "*": 483 min, max = 0, MAXREPEAT 484 485 elif this == "+": 486 min, max = 1, MAXREPEAT 487 elif this == "{": 488 if source.next == "}": 489 subpatternappend((LITERAL, ord(this))) 490 continue 491 here = source.tell() 492 min, max = 0, MAXREPEAT 493 lo = hi = "" 494 while source.next in DIGITS: 495 lo = lo + source.get() 496 if sourcematch(","): 497 while source.next in DIGITS: 498 hi = hi + sourceget() 499 else: 500 hi = lo 501 if not sourcematch("}"): 502 subpatternappend((LITERAL, ord(this))) 503 source.seek(here) 504 continue 505 if lo: 506 min = int(lo) 507 if hi: 508 max = int(hi) 509 if max < min: 510 raise error, "bad repeat interval" 511 else: 512 raise error, "not supported" 513 # figure out which item to repeat 514 if subpattern: 515 item = subpattern[-1:] 516 else: 517 item = None 518 if not item or (_len(item) == 1 and item[0][0] == AT): 519 raise error, "nothing to repeat" 520 if item[0][0] in REPEATCODES: 521 raise error, "multiple repeat" 522 if sourcematch("?"): 523 subpattern[-1] = (MIN_REPEAT, (min, max, item)) 524 else: 525 subpattern[-1] = (MAX_REPEAT, (min, max, item)) 526 527 elif this == ".": 528 subpatternappend((ANY, None)) 529 530 elif this == "(": 531 group = 1 532 name = None 533 condgroup = None 534 if sourcematch("?"): 535 group = 0 536 # options 537 if sourcematch("P"): 538 # python extensions 539 if sourcematch("<"): 540 # named group: skip forward to end of name 541 name = "" 542 while 1: 543 char = sourceget() 544 if char is None: 545 raise error, "unterminated name" 546 if char == ">": 547 break 548 name = name + char 549 group = 1 550 if not isname(name): 551 raise error, "bad character in group name" 552 elif sourcematch("="): 553 # named backreference 554 name = "" 555 while 1: 556 char = sourceget() 557 if char is None: 558 raise error, "unterminated name" 559 if char == ")": 560 break 561 name = name + char 562 if not isname(name): 563 raise error, "bad character in group name" 564 gid = state.groupdict.get(name) 565 if gid is None: 566 raise error, "unknown group name" 567 subpatternappend((GROUPREF, gid)) 568 continue 569 else: 570 char = sourceget() 571 if char is None: 572 raise error, "unexpected end of pattern" 573 raise error, "unknown specifier: ?P%s" % char 574 elif sourcematch(":"): 575 # non-capturing group 576 group = 2 577 elif sourcematch("#"): 578 # comment 579 while 1: 580 if source.next is None or source.next == ")": 581 break 582 sourceget() 583 if not sourcematch(")"): 584 raise error, "unbalanced parenthesis" 585 continue 586 elif source.next in ASSERTCHARS: 587 # lookahead assertions 588 char = sourceget() 589 dir = 1 590 if char == "<": 591 if source.next not in LOOKBEHINDASSERTCHARS: 592 raise error, "syntax error" 593 dir = -1 # lookbehind 594 char = sourceget() 595 p = _parse_sub(source, state) 596 if not sourcematch(")"): 597 raise error, "unbalanced parenthesis" 598 if char == "=": 599 subpatternappend((ASSERT, (dir, p))) 600 else: 601 subpatternappend((ASSERT_NOT, (dir, p))) 602 continue 603 elif sourcematch("("): 604 # conditional backreference group 605 condname = "" 606 while 1: 607 char = sourceget() 608 if char is None: 609 raise error, "unterminated name" 610 if char == ")": 611 break 612 condname = condname + char 613 group = 2 614 if isname(condname): 615 condgroup = state.groupdict.get(condname) 616 if condgroup is None: 617 raise error, "unknown group name" 618 else: 619 try: 620 condgroup = int(condname) 621 except ValueError: 622 raise error, "bad character in group name" 623 else: 624 # flags 625 if not source.next in FLAGS: 626 raise error, "unexpected end of pattern" 627 while source.next in FLAGS: 628 state.flags = state.flags | FLAGS[sourceget()] 629 if group: 630 # parse group contents 631 if group == 2: 632 # anonymous group 633 group = None 634 else: 635 group = state.opengroup(name) 636 if condgroup: 637 p = _parse_sub_cond(source, state, condgroup) 638 else: 639 p = _parse_sub(source, state) 640 if not sourcematch(")"): 641 raise error, "unbalanced parenthesis" 642 if group is not None: 643 state.closegroup(group) 644 subpatternappend((SUBPATTERN, (group, p))) 645 else: 646 while 1: 647 char = sourceget() 648 if char is None: 649 raise error, "unexpected end of pattern" 650 if char == ")": 651 break 652 raise error, "unknown extension" 653 654 elif this == "^": 655 subpatternappend((AT, AT_BEGINNING)) 656 657 elif this == "$": 658 subpattern.append((AT, AT_END)) 659 660 elif this and this[0] == "\\": 661 code = _escape(source, this, state) 662 subpatternappend(code) 663 664 else: 665 raise error, "parser error" 666 667 return subpattern 668 669def parse(str, flags=0, pattern=None): 670 # parse 're' pattern into list of (opcode, argument) tuples 671 672 source = Tokenizer(str) 673 674 if pattern is None: 675 pattern = Pattern() 676 pattern.flags = flags 677 pattern.str = str 678 679 p = _parse_sub(source, pattern, 0) 680 681 tail = source.get() 682 if tail == ")": 683 raise error, "unbalanced parenthesis" 684 elif tail: 685 raise error, "bogus characters at end of regular expression" 686 687 if flags & SRE_FLAG_DEBUG: 688 p.dump() 689 690 if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE: 691 # the VERBOSE flag was switched on inside the pattern. to be 692 # on the safe side, we'll parse the whole thing again... 693 return parse(str, p.pattern.flags) 694 695 return p 696 697def parse_template(source, pattern): 698 # parse 're' replacement string into list of literals and 699 # group references 700 s = Tokenizer(source) 701 sget = s.get 702 p = [] 703 a = p.append 704 def literal(literal, p=p, pappend=a): 705 if p and p[-1][0] is LITERAL: 706 p[-1] = LITERAL, p[-1][1] + literal 707 else: 708 pappend((LITERAL, literal)) 709 sep = source[:0] 710 if type(sep) is type(""): 711 makechar = chr 712 else: 713 makechar = unichr 714 while 1: 715 this = sget() 716 if this is None: 717 break # end of replacement string 718 if this and this[0] == "\\": 719 # group 720 c = this[1:2] 721 if c == "g": 722 name = "" 723 if s.match("<"): 724 while 1: 725 char = sget() 726 if char is None: 727 raise error, "unterminated group name" 728 if char == ">": 729 break 730 name = name + char 731 if not name: 732 raise error, "bad group name" 733 try: 734 index = int(name) 735 if index < 0: 736 raise error, "negative group number" 737 except ValueError: 738 if not isname(name): 739 raise error, "bad character in group name" 740 try: 741 index = pattern.groupindex[name] 742 except KeyError: 743 raise IndexError, "unknown group name" 744 a((MARK, index)) 745 elif c == "0": 746 if s.next in OCTDIGITS: 747 this = this + sget() 748 if s.next in OCTDIGITS: 749 this = this + sget() 750 literal(makechar(int(this[1:], 8) & 0xff)) 751 elif c in DIGITS: 752 isoctal = False 753 if s.next in DIGITS: 754 this = this + sget() 755 if (c in OCTDIGITS and this[2] in OCTDIGITS and 756 s.next in OCTDIGITS): 757 this = this + sget() 758 isoctal = True 759 literal(makechar(int(this[1:], 8) & 0xff)) 760 if not isoctal: 761 a((MARK, int(this[1:]))) 762 else: 763 try: 764 this = makechar(ESCAPES[this][1]) 765 except KeyError: 766 pass 767 literal(this) 768 else: 769 literal(this) 770 # convert template to groups and literals lists 771 i = 0 772 groups = [] 773 groupsappend = groups.append 774 literals = [None] * len(p) 775 for c, s in p: 776 if c is MARK: 777 groupsappend((i, s)) 778 # literal[i] is already None 779 else: 780 literals[i] = s 781 i = i + 1 782 return groups, literals 783 784def expand_template(template, match): 785 g = match.group 786 sep = match.string[:0] 787 groups, literals = template 788 literals = literals[:] 789 try: 790 for index, group in groups: 791 literals[index] = s = g(group) 792 if s is None: 793 raise error, "unmatched group" 794 except IndexError: 795 raise error, "invalid group reference" 796 return sep.join(literals)