
/Lib/tokenize.py

http://unladen-swallow.googlecode.com/
  1"""Tokenization help for Python programs.
  2
  3generate_tokens(readline) is a generator that breaks a stream of
  4text into Python tokens.  It accepts a readline-like method which is called
  5repeatedly to get the next line of input (or "" for EOF).  It generates
  65-tuples with these members:
  7
  8    the token type (see token.py)
  9    the token (a string)
 10    the starting (row, column) indices of the token (a 2-tuple of ints)
 11    the ending (row, column) indices of the token (a 2-tuple of ints)
 12    the original line (string)
 13
 14It is designed to match the working of the Python tokenizer exactly, except
 15that it produces COMMENT tokens for comments and gives type OP for all
 16operators
 17
 18Older entry points
 19    tokenize_loop(readline, tokeneater)
 20    tokenize(readline, tokeneater=printtoken)
 21are the same, except instead of generating tokens, tokeneater is a callback
 22function to which the 5 fields described above are passed as 5 arguments,
 23each time a new token is found."""
 24
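# Illustrative usage sketch (not part of the original module): from client
# code, generate_tokens() can be driven by any readline-style callable, for
# example one taken from a StringIO object.  The snippet below assumes this
# file is importable as ``tokenize`` under Python 2.
#
#     from StringIO import StringIO
#     import tokenize
#     readline = StringIO("x = 1 + 2\n").readline
#     for tok_type, tok_str, start, end, logical_line in \
#             tokenize.generate_tokens(readline):
#         print tokenize.tok_name[tok_type], repr(tok_str), start, end
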
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro, Raymond Hettinger'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL", "untokenize"]
del x
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
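
# Quick sanity checks (illustrative, not in the original source): the composed
# Number pattern should match complete Python 2 numeric literals, e.g.
#     re.match(Number, '0x1fL').group()    == '0x1fL'
#     re.match(Number, '3.14e-10').group() == '3.14e-10'
#     re.match(Number, '2j').group()       == '2j'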

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
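
# Illustrative note (not in the original source): the main loop in
# generate_tokens() matches pseudoprog against the rest of the line and takes
# span(1), i.e. the span of the token with leading whitespace stripped, e.g.
#     pseudoprog.match("    x = 1", 0).span(1) == (4, 5)     # the name 'x'
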
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"' ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

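# Illustrative sketch (not in the original source): any callable with the same
# signature as printtoken() can be passed as tokeneater, e.g. one that collects
# only comment tokens.  The names comment_eater, comments and 'some_module.py'
# below are hypothetical.
#
#     from tokenize import tokenize, COMMENT
#     comments = []
#     def comment_eater(tok_type, tok_str, start, end, logical_line):
#         if tok_type == COMMENT:
#             comments.append((start[0], tok_str))
#     tokenize(open('some_module.py').readline, comment_eater)
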
class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    each token sequence has only those two elements, the resulting
    output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
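
# Illustrative round-trip sketch (not in the original source): feeding full
# 5-tuples back through untokenize() should reproduce the source exactly, per
# the invariant stated in the docstring above.  ``source`` below is a
# hypothetical string of Python code.
#
#     from StringIO import StringIO
#     toks = list(generate_tokens(StringIO(source).readline))
#     assert untokenize(toks) == source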

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be any callable that signals end of input by raising StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)
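
# Illustrative command-line usage (not part of the original file): when run as
# a script, the block above tokenizes the named file, or stdin, using
# printtoken() as the default tokeneater.  'some_module.py' is a hypothetical
# file name.
#     python tokenize.py some_module.py
#     python tokenize.py < some_module.py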