sre_parse.py - This Python code is a part of a regular expr…

/Lib/sre_parse.py

http://unladen-swallow.googlecode.com/ · Python · 796 lines · 675 code · 56 blank · 65 comment · 260 complexity · cb303a35856e78bdc9f7d4420c6b15e3 MD5 · raw file

#
# Secret Labs' Regular Expression Engine
#
# convert re-style regular expression to sre pattern
#
# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

# XXX: show string offset and offending character for all errors

import sys

from sre_constants import *

def set(seq):
    s = {}
    for elem in seq:
        s[elem] = 1
    return s

SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{"

DIGITS = set("0123456789")

OCTDIGITS = set("01234567")
HEXDIGITS = set("0123456789abcdefABCDEF")

WHITESPACE = set(" \t\n\r\v\f")

ESCAPES = {
    r"\a": (LITERAL, ord("\a")),
    r"\b": (LITERAL, ord("\b")),
    r"\f": (LITERAL, ord("\f")),
    r"\n": (LITERAL, ord("\n")),
    r"\r": (LITERAL, ord("\r")),
    r"\t": (LITERAL, ord("\t")),
    r"\v": (LITERAL, ord("\v")),
    r"\\": (LITERAL, ord("\\"))
}

CATEGORIES = {
    r"\A": (AT, AT_BEGINNING_STRING), # start of string
    r"\b": (AT, AT_BOUNDARY),
    r"\B": (AT, AT_NON_BOUNDARY),
    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
    r"\Z": (AT, AT_END_STRING), # end of string
}

FLAGS = {
    # standard flags
    "i": SRE_FLAG_IGNORECASE,
    "L": SRE_FLAG_LOCALE,
    "m": SRE_FLAG_MULTILINE,
    "s": SRE_FLAG_DOTALL,
    "x": SRE_FLAG_VERBOSE,
    # extensions
    "t": SRE_FLAG_TEMPLATE,
    "u": SRE_FLAG_UNICODE,
}

class Pattern:
    # master pattern object.  keeps track of global attributes
    def __init__(self):
        self.flags = 0
        self.open = []
        self.groups = 1
        self.groupdict = {}
    def opengroup(self, name=None):
        gid = self.groups
        self.groups = gid + 1
        if name is not None:
            ogid = self.groupdict.get(name, None)
            if ogid is not None:
                raise error, ("redefinition of group name %s as group %d; "
                              "was group %d" % (repr(name), gid,  ogid))
            self.groupdict[name] = gid
        self.open.append(gid)
        return gid
    def closegroup(self, gid):
        self.open.remove(gid)
    def checkgroup(self, gid):
        return gid < self.groups and gid not in self.open

class SubPattern:
    # a subpattern, in intermediate form
    def __init__(self, pattern, data=None):
        self.pattern = pattern
        if data is None:
            data = []
        self.data = data
        self.width = None
    def dump(self, level=0):
        nl = 1
        seqtypes = type(()), type([])
        for op, av in self.data:
            print level*"  " + op,; nl = 0
            if op == "in":
                # member sublanguage
                print; nl = 1
                for op, a in av:
                    print (level+1)*"  " + op, a
            elif op == "branch":
                print; nl = 1
                i = 0
                for a in av[1]:
                    if i > 0:
                        print level*"  " + "or"
                    a.dump(level+1); nl = 1
                    i = i + 1
            elif type(av) in seqtypes:
                for a in av:
                    if isinstance(a, SubPattern):
                        if not nl: print
                        a.dump(level+1); nl = 1
                    else:
                        print a, ; nl = 0
            else:
                print av, ; nl = 0
            if not nl: print
    def __repr__(self):
        return repr(self.data)
    def __len__(self):
        return len(self.data)
    def __delitem__(self, index):
        del self.data[index]
    def __getitem__(self, index):
        if isinstance(index, slice):
            return SubPattern(self.pattern, self.data[index])
        return self.data[index]
    def __setitem__(self, index, code):
        self.data[index] = code
    def insert(self, index, code):
        self.data.insert(index, code)
    def append(self, code):
        self.data.append(code)
    def getwidth(self):
        # determine the width (min, max) for this subpattern
        if self.width:
            return self.width
        lo = hi = 0L
        UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
        REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
        for op, av in self.data:
            if op is BRANCH:
                i = sys.maxint
                j = 0
                for av in av[1]:
                    l, h = av.getwidth()
                    i = min(i, l)
                    j = max(j, h)
                lo = lo + i
                hi = hi + j
            elif op is CALL:
                i, j = av.getwidth()
                lo = lo + i
                hi = hi + j
            elif op is SUBPATTERN:
                i, j = av[1].getwidth()
                lo = lo + i
                hi = hi + j
            elif op in REPEATCODES:
                i, j = av[2].getwidth()
                lo = lo + long(i) * av[0]
                hi = hi + long(j) * av[1]
            elif op in UNITCODES:
                lo = lo + 1
                hi = hi + 1
            elif op == SUCCESS:
                break
        self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
        return self.width

class Tokenizer:
    def __init__(self, string):
        self.string = string
        self.index = 0
        self.__next()
    def __next(self):
        if self.index >= len(self.string):
            self.next = None
            return
        char = self.string[self.index]
        if char[0] == "\\":
            try:
                c = self.string[self.index + 1]
            except IndexError:
                raise error, "bogus escape (end of line)"
            char = char + c
        self.index = self.index + len(char)
        self.next = char
    def match(self, char, skip=1):
        if char == self.next:
            if skip:
                self.__next()
            return 1
        return 0
    def get(self):
        this = self.next
        self.__next()
        return this
    def tell(self):
        return self.index, self.next
    def seek(self, index):
        self.index, self.next = index

def isident(char):
    return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"

def isdigit(char):
    return "0" <= char <= "9"

def isname(name):
    # check that group name is a valid string
    if not isident(name[0]):
        return False
    for char in name[1:]:
        if not isident(char) and not isdigit(char):
            return False
    return True

def _class_escape(source, escape):
    # handle escape code inside character class
    code = ESCAPES.get(escape)
    if code:
        return code
    code = CATEGORIES.get(escape)
    if code:
        return code
    try:
        c = escape[1:2]
        if c == "x":
            # hexadecimal escape (exactly two digits)
            while source.next in HEXDIGITS and len(escape) < 4:
                escape = escape + source.get()
            escape = escape[2:]
            if len(escape) != 2:
                raise error, "bogus escape: %s" % repr("\\" + escape)
            return LITERAL, int(escape, 16) & 0xff
        elif c in OCTDIGITS:
            # octal escape (up to three digits)
            while source.next in OCTDIGITS and len(escape) < 4:
                escape = escape + source.get()
            escape = escape[1:]
            return LITERAL, int(escape, 8) & 0xff
        elif c in DIGITS:
            raise error, "bogus escape: %s" % repr(escape)
        if len(escape) == 2:
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise error, "bogus escape: %s" % repr(escape)

def _escape(source, escape, state):
    # handle escape code in expression
    code = CATEGORIES.get(escape)
    if code:
        return code
    code = ESCAPES.get(escape)
    if code:
        return code
    try:
        c = escape[1:2]
        if c == "x":
            # hexadecimal escape
            while source.next in HEXDIGITS and len(escape) < 4:
                escape = escape + source.get()
            if len(escape) != 4:
                raise ValueError
            return LITERAL, int(escape[2:], 16) & 0xff
        elif c == "0":
            # octal escape
            while source.next in OCTDIGITS and len(escape) < 4:
                escape = escape + source.get()
            return LITERAL, int(escape[1:], 8) & 0xff
        elif c in DIGITS:
            # octal escape *or* decimal group reference (sigh)
            if source.next in DIGITS:
                escape = escape + source.get()
                if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
                    source.next in OCTDIGITS):
                    # got three octal digits; this is an octal escape
                    escape = escape + source.get()
                    return LITERAL, int(escape[1:], 8) & 0xff
            # not an octal escape, so this is a group reference
            group = int(escape[1:])
            if group < state.groups:
                if not state.checkgroup(group):
                    raise error, "cannot refer to open group"
                return GROUPREF, group
            raise ValueError
        if len(escape) == 2:
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise error, "bogus escape: %s" % repr(escape)

def _parse_sub(source, state, nested=1):
    # parse an alternation: a|b|c

    items = []
    itemsappend = items.append
    sourcematch = source.match
    while 1:
        itemsappend(_parse(source, state))
        if sourcematch("|"):
            continue
        if not nested:
            break
        if not source.next or sourcematch(")", 0):
            break
        else:
            raise error, "pattern not properly closed"

    if len(items) == 1:
        return items[0]

    subpattern = SubPattern(state)
    subpatternappend = subpattern.append

    # check if all items share a common prefix
    while 1:
        prefix = None
        for item in items:
            if not item:
                break
            if prefix is None:
                prefix = item[0]
            elif item[0] != prefix:
                break
        else:
            # all subitems start with a common "prefix".
            # move it out of the branch
            for item in items:
                del item[0]
            subpatternappend(prefix)
            continue # check next one
        break

    # check if the branch can be replaced by a character set
    for item in items:
        if len(item) != 1 or item[0][0] != LITERAL:
            break
    else:
        # we can store this as a character set instead of a
        # branch (the compiler may optimize this even more)
        set = []
        setappend = set.append
        for item in items:
            setappend(item[0])
        subpatternappend((IN, set))
        return subpattern

    subpattern.append((BRANCH, (None, items)))
    return subpattern

def _parse_sub_cond(source, state, condgroup):
    item_yes = _parse(source, state)
    if source.match("|"):
        item_no = _parse(source, state)
        if source.match("|"):
            raise error, "conditional backref with more than two branches"
    else:
        item_no = None
    if source.next and not source.match(")", 0):
        raise error, "pattern not properly closed"
    subpattern = SubPattern(state)
    subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
    return subpattern

_PATTERNENDERS = set("|)")
_ASSERTCHARS = set("=!<")
_LOOKBEHINDASSERTCHARS = set("=!")
_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT])

def _parse(source, state):
    # parse a simple pattern
    subpattern = SubPattern(state)

    # precompute constants into local variables
    subpatternappend = subpattern.append
    sourceget = source.get
    sourcematch = source.match
    _len = len
    PATTERNENDERS = _PATTERNENDERS
    ASSERTCHARS = _ASSERTCHARS
    LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS
    REPEATCODES = _REPEATCODES

    while 1:

        if source.next in PATTERNENDERS:
            break # end of subpattern
        this = sourceget()
        if this is None:
            break # end of pattern

        if state.flags & SRE_FLAG_VERBOSE:
            # skip whitespace and comments
            if this in WHITESPACE:
                continue
            if this == "#":
                while 1:
                    this = sourceget()
                    if this in (None, "\n"):
                        break
                continue

        if this and this[0] not in SPECIAL_CHARS:
            subpatternappend((LITERAL, ord(this)))

        elif this == "[":
            # character set
            set = []
            setappend = set.append
##          if sourcematch(":"):
##              pass # handle character classes
            if sourcematch("^"):
                setappend((NEGATE, None))
            # check remaining characters
            start = set[:]
            while 1:
                this = sourceget()
                if this == "]" and set != start:
                    break
                elif this and this[0] == "\\":
                    code1 = _class_escape(source, this)
                elif this:
                    code1 = LITERAL, ord(this)
                else:
                    raise error, "unexpected end of regular expression"
                if sourcematch("-"):
                    # potential range
                    this = sourceget()
                    if this == "]":
                        if code1[0] is IN:
                            code1 = code1[1][0]
                        setappend(code1)
                        setappend((LITERAL, ord("-")))
                        break
                    elif this:
                        if this[0] == "\\":
                            code2 = _class_escape(source, this)
                        else:
                            code2 = LITERAL, ord(this)
                        if code1[0] != LITERAL or code2[0] != LITERAL:
                            raise error, "bad character range"
                        lo = code1[1]
                        hi = code2[1]
                        if hi < lo:
                            raise error, "bad character range"
                        setappend((RANGE, (lo, hi)))
                    else:
                        raise error, "unexpected end of regular expression"
                else:
                    if code1[0] is IN:
                        code1 = code1[1][0]
                    setappend(code1)

            # XXX: <fl> should move set optimization to compiler!
            if _len(set)==1 and set[0][0] is LITERAL:
                subpatternappend(set[0]) # optimization
            elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
                subpatternappend((NOT_LITERAL, set[1][1])) # optimization
            else:
                # XXX: <fl> should add charmap optimization here
                subpatternappend((IN, set))

        elif this and this[0] in REPEAT_CHARS:
            # repeat previous item
            if this == "?":
                min, max = 0, 1
            elif this == "*":
                min, max = 0, MAXREPEAT

            elif this == "+":
                min, max = 1, MAXREPEAT
            elif this == "{":
                if source.next == "}":
                    subpatternappend((LITERAL, ord(this)))
                    continue
                here = source.tell()
                min, max = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
                    lo = lo + source.get()
                if sourcematch(","):
                    while source.next in DIGITS:
                        hi = hi + sourceget()
                else:
                    hi = lo
                if not sourcematch("}"):
                    subpatternappend((LITERAL, ord(this)))
                    source.seek(here)
                    continue
                if lo:
                    min = int(lo)
                if hi:
                    max = int(hi)
                if max < min:
                    raise error, "bad repeat interval"
            else:
                raise error, "not supported"
            # figure out which item to repeat
            if subpattern:
                item = subpattern[-1:]
            else:
                item = None
            if not item or (_len(item) == 1 and item[0][0] == AT):
                raise error, "nothing to repeat"
            if item[0][0] in REPEATCODES:
                raise error, "multiple repeat"
            if sourcematch("?"):
                subpattern[-1] = (MIN_REPEAT, (min, max, item))
            else:
                subpattern[-1] = (MAX_REPEAT, (min, max, item))

        elif this == ".":
            subpatternappend((ANY, None))

        elif this == "(":
            group = 1
            name = None
            condgroup = None
            if sourcematch("?"):
                group = 0
                # options
                if sourcematch("P"):
                    # python extensions
                    if sourcematch("<"):
                        # named group: skip forward to end of name
                        name = ""
                        while 1:
                            char = sourceget()
                            if char is None:
                                raise error, "unterminated name"
                            if char == ">":
                                break
                            name = name + char
                        group = 1
                        if not isname(name):
                            raise error, "bad character in group name"
                    elif sourcematch("="):
                        # named backreference
                        name = ""
                        while 1:
                            char = sourceget()
                            if char is None:
                                raise error, "unterminated name"
                            if char == ")":
                                break
                            name = name + char
                        if not isname(name):
                            raise error, "bad character in group name"
                        gid = state.groupdict.get(name)
                        if gid is None:
                            raise error, "unknown group name"
                        subpatternappend((GROUPREF, gid))
                        continue
                    else:
                        char = sourceget()
                        if char is None:
                            raise error, "unexpected end of pattern"
                        raise error, "unknown specifier: ?P%s" % char
                elif sourcematch(":"):
                    # non-capturing group
                    group = 2
                elif sourcematch("#"):
                    # comment
                    while 1:
                        if source.next is None or source.next == ")":
                            break
                        sourceget()
                    if not sourcematch(")"):
                        raise error, "unbalanced parenthesis"
                    continue
                elif source.next in ASSERTCHARS:
                    # lookahead assertions
                    char = sourceget()
                    dir = 1
                    if char == "<":
                        if source.next not in LOOKBEHINDASSERTCHARS:
                            raise error, "syntax error"
                        dir = -1 # lookbehind
                        char = sourceget()
                    p = _parse_sub(source, state)
                    if not sourcematch(")"):
                        raise error, "unbalanced parenthesis"
                    if char == "=":
                        subpatternappend((ASSERT, (dir, p)))
                    else:
                        subpatternappend((ASSERT_NOT, (dir, p)))
                    continue
                elif sourcematch("("):
                    # conditional backreference group
                    condname = ""
                    while 1:
                        char = sourceget()
                        if char is None:
                            raise error, "unterminated name"
                        if char == ")":
                            break
                        condname = condname + char
                    group = 2
                    if isname(condname):
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
                            raise error, "unknown group name"
                    else:
                        try:
                            condgroup = int(condname)
                        except ValueError:
                            raise error, "bad character in group name"
                else:
                    # flags
                    if not source.next in FLAGS:
                        raise error, "unexpected end of pattern"
                    while source.next in FLAGS:
                        state.flags = state.flags | FLAGS[sourceget()]
            if group:
                # parse group contents
                if group == 2:
                    # anonymous group
                    group = None
                else:
                    group = state.opengroup(name)
                if condgroup:
                    p = _parse_sub_cond(source, state, condgroup)
                else:
                    p = _parse_sub(source, state)
                if not sourcematch(")"):
                    raise error, "unbalanced parenthesis"
                if group is not None:
                    state.closegroup(group)
                subpatternappend((SUBPATTERN, (group, p)))
            else:
                while 1:
                    char = sourceget()
                    if char is None:
                        raise error, "unexpected end of pattern"
                    if char == ")":
                        break
                    raise error, "unknown extension"

        elif this == "^":
            subpatternappend((AT, AT_BEGINNING))

        elif this == "$":
            subpattern.append((AT, AT_END))

        elif this and this[0] == "\\":
            code = _escape(source, this, state)
            subpatternappend(code)

        else:
            raise error, "parser error"

    return subpattern

def parse(str, flags=0, pattern=None):
    # parse 're' pattern into list of (opcode, argument) tuples

    source = Tokenizer(str)

    if pattern is None:
        pattern = Pattern()
    pattern.flags = flags
    pattern.str = str

    p = _parse_sub(source, pattern, 0)

    tail = source.get()
    if tail == ")":
        raise error, "unbalanced parenthesis"
    elif tail:
        raise error, "bogus characters at end of regular expression"

    if flags & SRE_FLAG_DEBUG:
        p.dump()

    if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
        # the VERBOSE flag was switched on inside the pattern.  to be
        # on the safe side, we'll parse the whole thing again...
        return parse(str, p.pattern.flags)

    return p

def parse_template(source, pattern):
    # parse 're' replacement string into list of literals and
    # group references
    s = Tokenizer(source)
    sget = s.get
    p = []
    a = p.append
    def literal(literal, p=p, pappend=a):
        if p and p[-1][0] is LITERAL:
            p[-1] = LITERAL, p[-1][1] + literal
        else:
            pappend((LITERAL, literal))
    sep = source[:0]
    if type(sep) is type(""):
        makechar = chr
    else:
        makechar = unichr
    while 1:
        this = sget()
        if this is None:
            break # end of replacement string
        if this and this[0] == "\\":
            # group
            c = this[1:2]
            if c == "g":
                name = ""
                if s.match("<"):
                    while 1:
                        char = sget()
                        if char is None:
                            raise error, "unterminated group name"
                        if char == ">":
                            break
                        name = name + char
                if not name:
                    raise error, "bad group name"
                try:
                    index = int(name)
                    if index < 0:
                        raise error, "negative group number"
                except ValueError:
                    if not isname(name):
                        raise error, "bad character in group name"
                    try:
                        index = pattern.groupindex[name]
                    except KeyError:
                        raise IndexError, "unknown group name"
                a((MARK, index))
            elif c == "0":
                if s.next in OCTDIGITS:
                    this = this + sget()
                    if s.next in OCTDIGITS:
                        this = this + sget()
                literal(makechar(int(this[1:], 8) & 0xff))
            elif c in DIGITS:
                isoctal = False
                if s.next in DIGITS:
                    this = this + sget()
                    if (c in OCTDIGITS and this[2] in OCTDIGITS and
                        s.next in OCTDIGITS):
                        this = this + sget()
                        isoctal = True
                        literal(makechar(int(this[1:], 8) & 0xff))
                if not isoctal:
                    a((MARK, int(this[1:])))
            else:
                try:
                    this = makechar(ESCAPES[this][1])
                except KeyError:
                    pass
                literal(this)
        else:
            literal(this)
    # convert template to groups and literals lists
    i = 0
    groups = []
    groupsappend = groups.append
    literals = [None] * len(p)
    for c, s in p:
        if c is MARK:
            groupsappend((i, s))
            # literal[i] is already None
        else:
            literals[i] = s
        i = i + 1
    return groups, literals

def expand_template(template, match):
    g = match.group
    sep = match.string[:0]
    groups, literals = template
    literals = literals[:]
    try:
        for index, group in groups:
            literals[index] = s = g(group)
            if s is None:
                raise error, "unmatched group"
    except IndexError:
        raise error, "invalid group reference"
    return sep.join(literals)
Summary ✨

This Python code is a part of a regular expression engine, specifically for parsing and compiling regular expressions. It takes a string input and breaks it down into a list of (opcode, argument) tuples that can be used to match patterns in strings. The code also handles escape sequences, groups, and replacement templates.
Tech Fingerprint

Alerts (33)

'import *' Avoid to prevent namespace pollution; import specific names or use aliases
17
'def' Ensure functions have docstrings for documentation
19 78 89 91 102 142 144 146 201 207 211 213 216 219 222 669 697 704 784
'global' Avoid global variables; use function parameters or class attributes for better scope management
72
'type(' Use isinstance() for type checking instead of type()
104 120 710
Complexity hotspot; lines 120 to 123 (total complexity: 4)
120 121 122 123
'isinstance(' Overuse may indicate design issues; consider polymorphism
122 137
'del' Avoid unless necessary; Python's garbage collector typically handles object deletion
135 344
'try:' Ensure try blocks have corresponding except or finally blocks
239