
/Lib/lib2to3/pgen2/tokenize.py

http://unladen-swallow.googlecode.com/
# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
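# Editorial note (not part of the original module): these helpers simply build
# alternation patterns, for example
#   group('a', 'b')   -> '(a|b)'
#   any(r'\d', r'_')  -> '(\d|_)*'
#   maybe(r'[lL]')    -> '([lL])?'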

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
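# Quick sanity check of the number patterns (illustrative sketch only):
#   re.match(Number, '0x1fL').group()    -> '0x1fL'    (Hexnumber)
#   re.match(Number, '3.14e-2').group()  -> '3.14e-2'  (Pointfloat + Exponent)
#   re.match(Number, '10j').group()      -> '10j'      (Imagnumber)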

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")
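# For example, because r"\*\*=?" is listed before r"[+\-*/%&|^=<>]=?", the text
# '**=' is matched as a single operator token rather than '*' followed by '*='.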

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
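# Illustrative sketch (not part of the original module): PseudoToken skips any
# leading whitespace and captures exactly one candidate token in group 1, e.g.
#   re.match(PseudoToken, '   x = 1').span(1)  -> (3, 4), i.e. the NAME 'x'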

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}
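# Editorial note: endprogs maps an opening quote (with any string prefix) to the
# compiled pattern for the matching closing quote.  The prefix-only keys
# ('r', 'u', 'b', ...) map to None so the fallback chain
# endprogs[initial] or endprogs[token[1]] or endprogs[token[2]], used below for
# continued single-quoted strings, never raises KeyError on a bare prefix.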

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t
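# Editorial note: triple_quoted and single_quoted are used purely as sets of
# legal opening-quote spellings; membership tests such as
#   "ur'''" in triple_quoted   -> True
#   "x'" in single_quoted      -> False
# drive the STRING branches in generate_tokens() below.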

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
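# Illustrative usage sketch (the filename 'example.py' is hypothetical):
#   tokenize(open('example.py').readline)   # printtoken() writes one line per token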

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two-element tuples are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
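# Round-trip sketch for a stream of full 5-tuples (illustrative only; assumes
# 'source' holds a complete, well-formed module as a string):
#   from StringIO import StringIO
#   tokens = list(generate_tokens(StringIO(source).readline))
#   assert untokenize(tokens) == source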

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object that provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable that signals the end of input by raising StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
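# Illustrative usage sketch (not part of the original module):
#   from StringIO import StringIO
#   for toknum, tokval, start, end, line in generate_tokens(StringIO("x = 1\n").readline):
#       print tok_name[toknum], repr(tokval)
# which would print: NAME 'x', OP '=', NUMBER '1', NEWLINE '\n', ENDMARKER ''.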
if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)