PageRenderTime 50ms CodeModel.GetById 1ms app.highlight 43ms RepoModel.GetById 1ms app.codeStats 0ms

/Lib/sre_parse.py

http://unladen-swallow.googlecode.com/
Python | 796 lines | 700 code | 46 blank | 50 comment | 151 complexity | cb303a35856e78bdc9f7d4420c6b15e3 MD5 | raw file
  1#
  2# Secret Labs' Regular Expression Engine
  3#
  4# convert re-style regular expression to sre pattern
  5#
  6# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
  7#
  8# See the sre.py file for information on usage and redistribution.
  9#
 10
 11"""Internal support module for sre"""
 12
 13# XXX: show string offset and offending character for all errors
 14
 15import sys
 16
 17from sre_constants import *
 18
 19def set(seq):
 20    s = {}
 21    for elem in seq:
 22        s[elem] = 1
 23    return s
 24
 25SPECIAL_CHARS = ".\\[{()*+?^$|"
 26REPEAT_CHARS = "*+?{"
 27
 28DIGITS = set("0123456789")
 29
 30OCTDIGITS = set("01234567")
 31HEXDIGITS = set("0123456789abcdefABCDEF")
 32
 33WHITESPACE = set(" \t\n\r\v\f")
 34
 35ESCAPES = {
 36    r"\a": (LITERAL, ord("\a")),
 37    r"\b": (LITERAL, ord("\b")),
 38    r"\f": (LITERAL, ord("\f")),
 39    r"\n": (LITERAL, ord("\n")),
 40    r"\r": (LITERAL, ord("\r")),
 41    r"\t": (LITERAL, ord("\t")),
 42    r"\v": (LITERAL, ord("\v")),
 43    r"\\": (LITERAL, ord("\\"))
 44}
 45
 46CATEGORIES = {
 47    r"\A": (AT, AT_BEGINNING_STRING), # start of string
 48    r"\b": (AT, AT_BOUNDARY),
 49    r"\B": (AT, AT_NON_BOUNDARY),
 50    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
 51    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
 52    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
 53    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
 54    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
 55    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
 56    r"\Z": (AT, AT_END_STRING), # end of string
 57}
 58
 59FLAGS = {
 60    # standard flags
 61    "i": SRE_FLAG_IGNORECASE,
 62    "L": SRE_FLAG_LOCALE,
 63    "m": SRE_FLAG_MULTILINE,
 64    "s": SRE_FLAG_DOTALL,
 65    "x": SRE_FLAG_VERBOSE,
 66    # extensions
 67    "t": SRE_FLAG_TEMPLATE,
 68    "u": SRE_FLAG_UNICODE,
 69}
 70
 71class Pattern:
 72    # master pattern object.  keeps track of global attributes
 73    def __init__(self):
 74        self.flags = 0
 75        self.open = []
 76        self.groups = 1
 77        self.groupdict = {}
 78    def opengroup(self, name=None):
 79        gid = self.groups
 80        self.groups = gid + 1
 81        if name is not None:
 82            ogid = self.groupdict.get(name, None)
 83            if ogid is not None:
 84                raise error, ("redefinition of group name %s as group %d; "
 85                              "was group %d" % (repr(name), gid,  ogid))
 86            self.groupdict[name] = gid
 87        self.open.append(gid)
 88        return gid
 89    def closegroup(self, gid):
 90        self.open.remove(gid)
 91    def checkgroup(self, gid):
 92        return gid < self.groups and gid not in self.open
 93
 94class SubPattern:
 95    # a subpattern, in intermediate form
 96    def __init__(self, pattern, data=None):
 97        self.pattern = pattern
 98        if data is None:
 99            data = []
100        self.data = data
101        self.width = None
102    def dump(self, level=0):
103        nl = 1
104        seqtypes = type(()), type([])
105        for op, av in self.data:
106            print level*"  " + op,; nl = 0
107            if op == "in":
108                # member sublanguage
109                print; nl = 1
110                for op, a in av:
111                    print (level+1)*"  " + op, a
112            elif op == "branch":
113                print; nl = 1
114                i = 0
115                for a in av[1]:
116                    if i > 0:
117                        print level*"  " + "or"
118                    a.dump(level+1); nl = 1
119                    i = i + 1
120            elif type(av) in seqtypes:
121                for a in av:
122                    if isinstance(a, SubPattern):
123                        if not nl: print
124                        a.dump(level+1); nl = 1
125                    else:
126                        print a, ; nl = 0
127            else:
128                print av, ; nl = 0
129            if not nl: print
130    def __repr__(self):
131        return repr(self.data)
132    def __len__(self):
133        return len(self.data)
134    def __delitem__(self, index):
135        del self.data[index]
136    def __getitem__(self, index):
137        if isinstance(index, slice):
138            return SubPattern(self.pattern, self.data[index])
139        return self.data[index]
140    def __setitem__(self, index, code):
141        self.data[index] = code
142    def insert(self, index, code):
143        self.data.insert(index, code)
144    def append(self, code):
145        self.data.append(code)
146    def getwidth(self):
147        # determine the width (min, max) for this subpattern
148        if self.width:
149            return self.width
150        lo = hi = 0L
151        UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
152        REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
153        for op, av in self.data:
154            if op is BRANCH:
155                i = sys.maxint
156                j = 0
157                for av in av[1]:
158                    l, h = av.getwidth()
159                    i = min(i, l)
160                    j = max(j, h)
161                lo = lo + i
162                hi = hi + j
163            elif op is CALL:
164                i, j = av.getwidth()
165                lo = lo + i
166                hi = hi + j
167            elif op is SUBPATTERN:
168                i, j = av[1].getwidth()
169                lo = lo + i
170                hi = hi + j
171            elif op in REPEATCODES:
172                i, j = av[2].getwidth()
173                lo = lo + long(i) * av[0]
174                hi = hi + long(j) * av[1]
175            elif op in UNITCODES:
176                lo = lo + 1
177                hi = hi + 1
178            elif op == SUCCESS:
179                break
180        self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
181        return self.width
182
class Tokenizer:
    # splits a string into tokens: each token is a single character, or
    # a backslash together with the character that follows it.  self.next
    # always holds the upcoming token (None at end of string) and
    # self.index is the position just past that token.

    def __init__(self, string):
        self.string = string
        self.index = 0
        self.__next()

    def __next(self):
        # load the upcoming token into self.next and advance self.index
        pos = self.index
        text = self.string
        if pos >= len(text):
            self.next = None
            return
        token = text[pos]
        if token[0] == "\\":
            # a backslash always travels with the following character
            try:
                follower = text[pos + 1]
            except IndexError:
                raise error("bogus escape (end of line)")
            token = token + follower
        self.index = pos + len(token)
        self.next = token

    def match(self, char, skip=1):
        # return 1 if the upcoming token equals char (consuming it unless
        # skip is false), otherwise 0
        if char != self.next:
            return 0
        if skip:
            self.__next()
        return 1

    def get(self):
        # return the upcoming token and advance past it
        token = self.next
        self.__next()
        return token

    def tell(self):
        # snapshot of the tokenizer position, suitable for seek()
        return self.index, self.next

    def seek(self, index):
        # restore a position previously returned by tell()
        self.index, self.next = index
215
def isident(char):
    # true if char may start a group name: an ASCII letter or "_"
    return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"

def isdigit(char):
    # true if char is an ASCII decimal digit
    return "0" <= char <= "9"

def isname(name):
    # check that group name is a valid string: an identifier start
    # character followed by identifier characters or digits
    if not name:
        # an empty name is not valid; without this guard name[0] below
        # raised IndexError, which callers do not expect from a predicate
        return False
    if not isident(name[0]):
        return False
    for char in name[1:]:
        if not (isident(char) or isdigit(char)):
            return False
    return True
230
def _class_escape(source, escape):
    # handle escape code inside character class.
    # escape is the two-character token (backslash plus one character);
    # further digits are pulled from source as needed.  returns an
    # (opcode, argument) pair or raises error for a malformed escape.
    code = ESCAPES.get(escape)
    if code:
        return code
    code = CATEGORIES.get(escape)
    if code:
        return code
    try:
        c = escape[1:2]
        if c == "x":
            # hexadecimal escape (exactly two digits)
            while source.next in HEXDIGITS and len(escape) < 4:
                escape = escape + source.get()
            if len(escape) != 4:
                # report the complete escape, including the "x", the
                # same way _escape does for the same mistake
                raise error("bogus escape: %s" % repr(escape))
            return LITERAL, int(escape[2:], 16) & 0xff
        elif c in OCTDIGITS:
            # octal escape (up to three digits)
            while source.next in OCTDIGITS and len(escape) < 4:
                escape = escape + source.get()
            return LITERAL, int(escape[1:], 8) & 0xff
        elif c in DIGITS:
            # 8 and 9 cannot start an octal escape
            raise error("bogus escape: %s" % repr(escape))
        if len(escape) == 2:
            # any other escaped character stands for itself
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise error("bogus escape: %s" % repr(escape))
262
def _escape(source, escape, state):
    # handle escape code in expression.
    # escape is the two-character token (backslash plus one character);
    # further digits are pulled from source as needed.  state is
    # consulted to validate numeric group references.  returns an
    # (opcode, argument) pair or raises error.
    code = CATEGORIES.get(escape)
    if code:
        return code
    code = ESCAPES.get(escape)
    if code:
        return code
    try:
        kind = escape[1:2]
        if kind == "x":
            # hexadecimal escape: exactly two digits are required
            while len(escape) < 4 and source.next in HEXDIGITS:
                escape = escape + source.get()
            if len(escape) != 4:
                raise ValueError
            return LITERAL, int(escape[2:], 16) & 0xff
        if kind == "0":
            # octal escape
            while len(escape) < 4 and source.next in OCTDIGITS:
                escape = escape + source.get()
            return LITERAL, int(escape[1:], 8) & 0xff
        if kind in DIGITS:
            # octal escape *or* decimal group reference (sigh)
            if source.next in DIGITS:
                escape = escape + source.get()
                if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
                    source.next in OCTDIGITS):
                    # got three octal digits; this is an octal escape
                    escape = escape + source.get()
                    return LITERAL, int(escape[1:], 8) & 0xff
            # not an octal escape, so this is a group reference
            group = int(escape[1:])
            if group >= state.groups:
                # no such group yet; reported as a bogus escape below
                raise ValueError
            if not state.checkgroup(group):
                raise error("cannot refer to open group")
            return GROUPREF, group
        if len(escape) == 2:
            # any other escaped character stands for itself
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise error("bogus escape: %s" % repr(escape))
306
def _parse_sub(source, state, nested=1):
    # parse an alternation: a|b|c
    # returns the single alternative unchanged when there is only one,
    # otherwise a new SubPattern holding any common prefix followed by
    # either a character set or a BRANCH item.

    branches = []
    while 1:
        branches.append(_parse(source, state))
        if source.match("|"):
            continue
        if not nested:
            break
        if not source.next or source.match(")", 0):
            break
        raise error("pattern not properly closed")

    if len(branches) == 1:
        return branches[0]

    subpattern = SubPattern(state)

    # check if all items share a common prefix
    while 1:
        prefix = None
        for item in branches:
            if not item:
                break
            if prefix is None:
                prefix = item[0]
            elif item[0] != prefix:
                break
        else:
            # all subitems start with a common "prefix".
            # move it out of the branch
            for item in branches:
                del item[0]
            subpattern.append(prefix)
            continue # check next one
        break

    # check if the branch can be replaced by a character set
    for item in branches:
        if len(item) != 1 or item[0][0] != LITERAL:
            break
    else:
        # every alternative is a single literal: store them as a
        # character set instead of a branch (the compiler may optimize
        # this even more)
        charset = [item[0] for item in branches]
        subpattern.append((IN, charset))
        return subpattern

    subpattern.append((BRANCH, (None, branches)))
    return subpattern
365
def _parse_sub_cond(source, state, condgroup):
    # parse the body of a conditional group: a "yes" alternative and an
    # optional "no" alternative separated by "|".  returns a SubPattern
    # holding a single GROUPREF_EXISTS item.
    item_yes = _parse(source, state)
    item_no = None
    if source.match("|"):
        item_no = _parse(source, state)
        if source.match("|"):
            raise error("conditional backref with more than two branches")
    if source.next and not source.match(")", 0):
        raise error("pattern not properly closed")
    result = SubPattern(state)
    result.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
    return result
379
# membership tables used by _parse
_PATTERNENDERS = set("|)")          # tokens that end a simple pattern
_ASSERTCHARS = set("=!<")           # may follow "(?" to start an assertion
_LOOKBEHINDASSERTCHARS = set("=!")  # may follow "(?<" in a lookbehind
_REPEATCODES = set((MIN_REPEAT, MAX_REPEAT))
384
def _parse(source, state):
    # parse a simple pattern: everything up to (not including) the next
    # "|" or ")" at this nesting level, or up to the end of the pattern.
    # returns a SubPattern; raises error on malformed input.
    subpattern = SubPattern(state)

    # precompute constants into local variables
    subpatternappend = subpattern.append
    sourceget = source.get
    sourcematch = source.match
    _len = len
    PATTERNENDERS = _PATTERNENDERS
    ASSERTCHARS = _ASSERTCHARS
    LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS
    REPEATCODES = _REPEATCODES

    while 1:

        if source.next in PATTERNENDERS:
            break # end of subpattern
        this = sourceget()
        if this is None:
            break # end of pattern

        if state.flags & SRE_FLAG_VERBOSE:
            # skip whitespace and comments
            if this in WHITESPACE:
                continue
            if this == "#":
                while 1:
                    this = sourceget()
                    if this in (None, "\n"):
                        break
                continue

        if this and this[0] not in SPECIAL_CHARS:
            subpatternappend((LITERAL, ord(this)))

        elif this == "[":
            # character set
            charset = []
            charsetappend = charset.append
            if sourcematch("^"):
                charsetappend((NEGATE, None))
            # check remaining characters
            start = charset[:]
            while 1:
                this = sourceget()
                if this == "]" and charset != start:
                    # a "]" closes the set unless it is the first member
                    break
                if not this:
                    raise error("unexpected end of regular expression")
                if this[0] == "\\":
                    code1 = _class_escape(source, this)
                else:
                    code1 = LITERAL, ord(this)
                if not sourcematch("-"):
                    # plain member; unwrap single-category sets
                    if code1[0] is IN:
                        code1 = code1[1][0]
                    charsetappend(code1)
                    continue
                # potential range
                this = sourceget()
                if not this:
                    raise error("unexpected end of regular expression")
                if this == "]":
                    # trailing "-" is a literal hyphen
                    if code1[0] is IN:
                        code1 = code1[1][0]
                    charsetappend(code1)
                    charsetappend((LITERAL, ord("-")))
                    break
                if this[0] == "\\":
                    code2 = _class_escape(source, this)
                else:
                    code2 = LITERAL, ord(this)
                if code1[0] != LITERAL or code2[0] != LITERAL:
                    raise error("bad character range")
                lo = code1[1]
                hi = code2[1]
                if hi < lo:
                    raise error("bad character range")
                charsetappend((RANGE, (lo, hi)))

            # XXX: <fl> should move set optimization to compiler!
            if _len(charset) == 1 and charset[0][0] is LITERAL:
                subpatternappend(charset[0]) # optimization
            elif (_len(charset) == 2 and charset[0][0] is NEGATE
                  and charset[1][0] is LITERAL):
                subpatternappend((NOT_LITERAL, charset[1][1])) # optimization
            else:
                # XXX: <fl> should add charmap optimization here
                subpatternappend((IN, charset))

        elif this and this[0] in REPEAT_CHARS:
            # repeat previous item
            if this == "?":
                min_count, max_count = 0, 1
            elif this == "*":
                min_count, max_count = 0, MAXREPEAT
            elif this == "+":
                min_count, max_count = 1, MAXREPEAT
            elif this == "{":
                if source.next == "}":
                    # "{}" is not a repeat; keep the brace as a literal
                    subpatternappend((LITERAL, ord(this)))
                    continue
                here = source.tell()
                min_count, max_count = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
                    lo = lo + source.get()
                if sourcematch(","):
                    while source.next in DIGITS:
                        hi = hi + sourceget()
                else:
                    hi = lo
                if not sourcematch("}"):
                    # not a well-formed interval: treat "{" as a literal
                    # and rewind to just after it
                    subpatternappend((LITERAL, ord(this)))
                    source.seek(here)
                    continue
                if lo:
                    min_count = int(lo)
                if hi:
                    max_count = int(hi)
                if max_count < min_count:
                    raise error("bad repeat interval")
            else:
                raise error("not supported")
            # figure out which item to repeat
            if subpattern:
                item = subpattern[-1:]
            else:
                item = None
            if not item or (_len(item) == 1 and item[0][0] == AT):
                raise error("nothing to repeat")
            if item[0][0] in REPEATCODES:
                raise error("multiple repeat")
            # a trailing "?" selects the minimizing variant
            if sourcematch("?"):
                repeat_op = MIN_REPEAT
            else:
                repeat_op = MAX_REPEAT
            subpattern[-1] = (repeat_op, (min_count, max_count, item))

        elif this == ".":
            subpatternappend((ANY, None))

        elif this == "(":
            # group: 1 = capturing, 2 = anonymous, 0 = no group contents
            group = 1
            name = None
            condgroup = None
            if sourcematch("?"):
                group = 0
                # options
                if sourcematch("P"):
                    # python extensions
                    if sourcematch("<"):
                        # named group: skip forward to end of name
                        name = ""
                        while 1:
                            char = sourceget()
                            if char is None:
                                raise error("unterminated name")
                            if char == ">":
                                break
                            name = name + char
                        group = 1
                        if not isname(name):
                            raise error("bad character in group name")
                    elif sourcematch("="):
                        # named backreference
                        name = ""
                        while 1:
                            char = sourceget()
                            if char is None:
                                raise error("unterminated name")
                            if char == ")":
                                break
                            name = name + char
                        if not isname(name):
                            raise error("bad character in group name")
                        gid = state.groupdict.get(name)
                        if gid is None:
                            raise error("unknown group name")
                        subpatternappend((GROUPREF, gid))
                        continue
                    else:
                        char = sourceget()
                        if char is None:
                            raise error("unexpected end of pattern")
                        raise error("unknown specifier: ?P%s" % char)
                elif sourcematch(":"):
                    # non-capturing group
                    group = 2
                elif sourcematch("#"):
                    # comment: skip everything up to the closing ")"
                    while 1:
                        if source.next is None or source.next == ")":
                            break
                        sourceget()
                    if not sourcematch(")"):
                        raise error("unbalanced parenthesis")
                    continue
                elif source.next in ASSERTCHARS:
                    # lookahead/lookbehind assertions
                    char = sourceget()
                    direction = 1
                    if char == "<":
                        if source.next not in LOOKBEHINDASSERTCHARS:
                            raise error("syntax error")
                        direction = -1 # lookbehind
                        char = sourceget()
                    p = _parse_sub(source, state)
                    if not sourcematch(")"):
                        raise error("unbalanced parenthesis")
                    if char == "=":
                        subpatternappend((ASSERT, (direction, p)))
                    else:
                        subpatternappend((ASSERT_NOT, (direction, p)))
                    continue
                elif sourcematch("("):
                    # conditional backreference group
                    condname = ""
                    while 1:
                        char = sourceget()
                        if char is None:
                            raise error("unterminated name")
                        if char == ")":
                            break
                        condname = condname + char
                    group = 2
                    if isname(condname):
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
                            raise error("unknown group name")
                    else:
                        try:
                            condgroup = int(condname)
                        except ValueError:
                            raise error("bad character in group name")
                else:
                    # flags
                    if not source.next in FLAGS:
                        raise error("unexpected end of pattern")
                    while source.next in FLAGS:
                        state.flags = state.flags | FLAGS[sourceget()]
            if group:
                # parse group contents
                if group == 2:
                    # anonymous group
                    group = None
                else:
                    group = state.opengroup(name)
                if condgroup:
                    p = _parse_sub_cond(source, state, condgroup)
                else:
                    p = _parse_sub(source, state)
                if not sourcematch(")"):
                    raise error("unbalanced parenthesis")
                if group is not None:
                    state.closegroup(group)
                subpatternappend((SUBPATTERN, (group, p)))
            else:
                # flags-only group: the next token must close it
                char = sourceget()
                if char is None:
                    raise error("unexpected end of pattern")
                if char != ")":
                    raise error("unknown extension")

        elif this == "^":
            subpatternappend((AT, AT_BEGINNING))

        elif this == "$":
            subpatternappend((AT, AT_END))

        elif this and this[0] == "\\":
            subpatternappend(_escape(source, this, state))

        else:
            raise error("parser error")

    return subpattern
668
def parse(str, flags=0, pattern=None):
    # parse 're' pattern into list of (opcode, argument) tuples.
    # pattern is the state object to use; a fresh Pattern is created
    # when it is omitted.  raises error on malformed input.

    tokens = Tokenizer(str)

    if pattern is None:
        pattern = Pattern()
    pattern.flags = flags
    pattern.str = str

    tree = _parse_sub(tokens, pattern, 0)

    # the top-level parse must have consumed the whole pattern
    leftover = tokens.get()
    if leftover == ")":
        raise error("unbalanced parenthesis")
    if leftover:
        raise error("bogus characters at end of regular expression")

    if flags & SRE_FLAG_DEBUG:
        tree.dump()

    seen_flags = tree.pattern.flags
    if seen_flags & SRE_FLAG_VERBOSE and not (flags & SRE_FLAG_VERBOSE):
        # the VERBOSE flag was switched on inside the pattern.  to be
        # on the safe side, we'll parse the whole thing again...
        return parse(str, seen_flags)

    return tree
696
def parse_template(source, pattern):
    # parse 're' replacement string into list of literals and
    # group references.  returns (groups, literals): groups is a list
    # of (position, group index) pairs and literals is a list with the
    # literal text at each position (None where a group goes).
    tokens = Tokenizer(source)
    nexttoken = tokens.get
    parts = []
    addpart = parts.append

    def addliteral(text):
        # merge adjacent literal text into a single entry
        if parts and parts[-1][0] is LITERAL:
            parts[-1] = LITERAL, parts[-1][1] + text
        else:
            addpart((LITERAL, text))

    # build characters of the same string type as the template
    sep = source[:0]
    if type(sep) is type(""):
        makechar = chr
    else:
        makechar = unichr

    while 1:
        this = nexttoken()
        if this is None:
            break # end of replacement string
        if not (this and this[0] == "\\"):
            addliteral(this)
            continue
        # escape: group reference, octal code or character escape
        c = this[1:2]
        if c == "g":
            # \g<name> or \g<number>
            name = ""
            if tokens.match("<"):
                while 1:
                    char = nexttoken()
                    if char is None:
                        raise error("unterminated group name")
                    if char == ">":
                        break
                    name = name + char
            if not name:
                raise error("bad group name")
            try:
                index = int(name)
                if index < 0:
                    raise error("negative group number")
            except ValueError:
                if not isname(name):
                    raise error("bad character in group name")
                try:
                    index = pattern.groupindex[name]
                except KeyError:
                    raise IndexError("unknown group name")
            addpart((MARK, index))
        elif c == "0":
            # octal escape: up to two more octal digits
            if tokens.next in OCTDIGITS:
                this = this + nexttoken()
                if tokens.next in OCTDIGITS:
                    this = this + nexttoken()
            addliteral(makechar(int(this[1:], 8) & 0xff))
        elif c in DIGITS:
            # three octal digits form a character code; anything else
            # is a one- or two-digit group reference
            isoctal = False
            if tokens.next in DIGITS:
                this = this + nexttoken()
                if (c in OCTDIGITS and this[2] in OCTDIGITS and
                    tokens.next in OCTDIGITS):
                    this = this + nexttoken()
                    isoctal = True
                    addliteral(makechar(int(this[1:], 8) & 0xff))
            if not isoctal:
                addpart((MARK, int(this[1:])))
        else:
            # known character escapes are translated; unknown ones are
            # kept verbatim, backslash included
            try:
                this = makechar(ESCAPES[this][1])
            except KeyError:
                pass
            addliteral(this)

    # convert template to groups and literals lists
    groups = []
    literals = [None] * len(parts)
    for position, (kind, value) in enumerate(parts):
        if kind is MARK:
            groups.append((position, value))
            # literals[position] is already None
        else:
            literals[position] = value
    return groups, literals
783
def expand_template(template, match):
    # fill the group slots of a (groups, literals) template with the
    # text matched by each referenced group and join the pieces.
    # raises error for an unmatched group or an invalid group reference.
    fetch = match.group
    empty = match.string[:0]
    groups, literals = template
    pieces = literals[:] # work on a copy; the template itself is untouched
    try:
        for slot, ref in groups:
            text = fetch(ref)
            pieces[slot] = text
            if text is None:
                raise error("unmatched group")
    except IndexError:
        raise error("invalid group reference")
    return empty.join(pieces)