1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
8 the token type (see token.py)
9 the token string
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all operators.
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
25 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
27 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
33 __all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
34 "generate_tokens", "NL", "untokenize"]
39 tok_name[COMMENT] = 'COMMENT'
def group(*choices):
    """Return a regex that matches any one of *choices*.

    The alternatives are joined with ``|`` and wrapped in one pair of
    parentheses, so the result is also a capturing group.
    """
    alternation = '|'.join(choices)
    return '(' + alternation + ')'


def any(*choices):
    """Return a regex matching zero or more repetitions of group(*choices).

    NOTE: this module-level helper shadows the built-in any() on Python
    versions that provide one.
    """
    repeated = group(*choices)
    return repeated + '*'


def maybe(*choices):
    """Return a regex matching group(*choices) zero or one time."""
    optional = group(*choices)
    return optional + '?'
# Pattern-string building blocks for the tokenizer's regular expressions.
# Each name holds a regex *string*; they are combined with group(), any()
# and maybe() into larger patterns.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# Whitespace, then any number of backslash-newline continuations (each
# followed by more whitespace), then an optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

# Integer literals; each accepts an optional l/L (long) suffix.
# A hex literal must have at least one digit after the 0x prefix, so a bare
# "0x" is not treated as a complete number.
Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
# Also matches a lone "0", which Decnumber (leading digit 1-9) does not.
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)

# Floating point literals: either a decimal point (exponent optional) or
# digits followed by a mandatory exponent.
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)

# Imaginary literals: digits or a float, followed by j/J.
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
# Alternatives are tried left to right, so the imaginary and float forms come
# before the integer form that would otherwise match only their leading digits.
Number = group(Imagnumber, Floatnumber, Intnumber)
# The four "tail end" patterns below describe the REST of a string literal,
# i.e. they are meant to be matched starting just after the opening quote(s).
# Each one consumes ordinary characters and backslash-escaped characters up to
# and including the closing quote(s).

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
# (A lone quote is allowed inside the body as long as it does not start the
# closing triple quote.)
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Opening of a triple-quoted string: optional u/U and r/R prefixes followed
# by ''' or """.
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
# (Complete literal including optional prefix and both quotes; an unescaped
# newline is not allowed inside it.)
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
77 # Because of leftmost-then-longest match semantics, be sure to put the
78 # longest operators first (e.g., if = came before ==, == would get
79 # recognized as two instances of =).
80 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
# A newline (optionally preceded by a carriage return) or one of the
# single-character punctuation marks : ; . , ` @
Special = group(r'\r?\n', r'[:;.,`@]')
# Everything that is neither a number, a string nor a name.
Funny = group(Operator, Bracket, Special)

# One complete token, and the same token preceded by ignorable text
# (whitespace, backslash continuations, an optional comment).
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
# (The literal either closes with its quote on this line or ends in a
# backslash-newline, meaning it continues on the next line.)
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# Things the line scanner must notice besides ordinary tokens: a
# backslash-newline continuation, a comment, or a triple-quote opener.
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# Leading whitespace followed by one capturing group holding the token text.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
# Compiled forms of the patterns that are matched directly.
tokenprog = re.compile(Token)
pseudoprog = re.compile(PseudoToken)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

# Maps a string opener to the compiled pattern that finds the end of that
# string.  The bare prefix letters map to None so that a lookup chained with
# "or" falls through to the next character of the token.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            'r': None, 'R': None, 'u': None, 'U': None}
# Every accepted prefix spelling, with each kind of triple quote.
for _prefix in ('', 'r', 'u', 'ur', 'R', 'U', 'uR', 'Ur', 'UR'):
    endprogs[_prefix + "'''"] = single3prog
    endprogs[_prefix + '"""'] = double3prog
del _prefix
115 for t in ("'''", '"""',
116 "r'''", 'r"""', "R'''", 'R"""',
117 "u'''", 'u"""', "U'''", 'U"""',
118 "ur'''", 'ur"""', "Ur'''", 'Ur"""',
119 "uR'''", 'uR"""', "UR'''", 'UR"""'):
123 "r'", 'r"', "R'", 'R"',
124 "u'", 'u"', "U'", 'U"',
125 "ur'", 'ur"', "Ur'", 'Ur"',
126 "uR'", 'uR"', "UR'", 'UR"' ):
class TokenError(Exception):
    """Raised by generate_tokens() when input ends inside a multi-line
    string or a multi-line statement."""


class StopTokenizing(Exception):
    """Exception that tokenize() catches around its token loop."""
135 def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
136 print "%d,%d-%d,%d:\t%s\t%s" % \
137 (srow, scol, erow, ecol, tok_name[type], repr(token))
139 def tokenize(readline, tokeneater=printtoken):
141 The tokenize() function accepts two parameters: one representing the
142 input stream, and one providing an output mechanism for tokenize().
144 The first parameter, readline, must be a callable object which provides
145 the same interface as the readline() method of built-in file objects.
146 Each call to the function should return one line of input as a string.
148 The second parameter, tokeneater, must also be a callable object. It is
149 called once for each token, with five arguments, corresponding to the
150 tuples generated by generate_tokens().
153 tokenize_loop(readline, tokeneater)
154 except StopTokenizing:
157 # backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Call *tokeneater* once per token produced from *readline*.

    Each 5-tuple yielded by generate_tokens(readline) is passed to
    *tokeneater* as five positional arguments: type, token string, start
    position, end position and source line.
    """
    token_stream = generate_tokens(readline)
    for tok_type, tok_string, start, end, source_line in token_stream:
        tokeneater(tok_type, tok_string, start, end, source_line)
163 def untokenize(iterable):
164 """Transform tokens back into Python source code.
166 Each element returned by the iterable must be a token sequence
167 with at least two elements, a token number and token value.
169 Round-trip invariant:
170 # Output text will tokenize back to the input
171 t1 = [tok[:2] for tok in generate_tokens(f.readline)]
172 newcode = untokenize(t1)
173 readline = iter(newcode.splitlines(1)).next
174 t2 = [tok[:2] for tok in generate_tokens(readline)]
181 toks_append = toks.append
183 toknum, tokval = tok[:2]
185 if toknum in (NAME, NUMBER):
189 indents.append(tokval)
191 elif toknum == DEDENT:
194 elif toknum in (NEWLINE, COMMENT, NL):
196 elif startline and indents:
197 toks_append(indents[-1])
203 def generate_tokens(readline):
205 The generate_tokens() generator requires one argument, readline, which
206 must be a callable object which provides the same interface as the
207 readline() method of built-in file objects. Each call to the function
208 should return one line of input as a string. Alternately, readline
209 can be a callable function terminating with StopIteration:
210 readline = open(myfile).next # Example of alternate readline
212 The generator produces 5-tuples with these members: the token type; the
213 token string; a 2-tuple (srow, scol) of ints specifying the row and
214 column where the token begins in the source; a 2-tuple (erow, ecol) of
215 ints specifying the row and column where the token ends in the source;
216 and the line on which the token was found. The line passed is the
217 logical line; continuation lines are included.
219 lnum = parenlev = continued = 0
220 namechars, numchars = string.ascii_letters + '_', '0123456789'
221 contstr, needcont = '', 0
225 while 1: # loop over lines in stream
228 except StopIteration:
231 pos, max = 0, len(line)
233 if contstr: # continued string
235 raise TokenError, ("EOF in multi-line string", strstart)
236 endmatch = endprog.match(line)
238 pos = end = endmatch.end(0)
239 yield (STRING, contstr + line[:end],
240 strstart, (lnum, end), contline + line)
241 contstr, needcont = '', 0
243 elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
244 yield (ERRORTOKEN, contstr + line,
245 strstart, (lnum, len(line)), contline)
250 contstr = contstr + line
251 contline = contline + line
254 elif parenlev == 0 and not continued: # new statement
257 while pos < max: # measure leading whitespace
258 if line[pos] == ' ': column = column + 1
259 elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
260 elif line[pos] == '\f': column = 0
265 if line[pos] in '#\r\n': # skip comments or blank lines
266 yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
267 (lnum, pos), (lnum, len(line)), line)
270 if column > indents[-1]: # count indents or dedents
271 indents.append(column)
272 yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
273 while column < indents[-1]:
274 if column not in indents:
275 raise IndentationError(
276 "unindent does not match any outer indentation level",
277 ("<tokenize>", lnum, pos, line))
278 indents = indents[:-1]
279 yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
281 else: # continued statement
283 raise TokenError, ("EOF in multi-line statement", (lnum, 0))
287 pseudomatch = pseudoprog.match(line, pos)
288 if pseudomatch: # scan for tokens
289 start, end = pseudomatch.span(1)
290 spos, epos, pos = (lnum, start), (lnum, end), end
291 token, initial = line[start:end], line[start]
293 if initial in numchars or \
294 (initial == '.' and token != '.'): # ordinary number
295 yield (NUMBER, token, spos, epos, line)
296 elif initial in '\r\n':
297 yield (parenlev > 0 and NL or NEWLINE,
298 token, spos, epos, line)
300 yield (COMMENT, token, spos, epos, line)
301 elif token in triple_quoted:
302 endprog = endprogs[token]
303 endmatch = endprog.match(line, pos)
304 if endmatch: # all on one line
305 pos = endmatch.end(0)
306 token = line[start:pos]
307 yield (STRING, token, spos, (lnum, pos), line)
309 strstart = (lnum, start) # multiple lines
310 contstr = line[start:]
313 elif initial in single_quoted or \
314 token[:2] in single_quoted or \
315 token[:3] in single_quoted:
316 if token[-1] == '\n': # continued string
317 strstart = (lnum, start)
318 endprog = (endprogs[initial] or endprogs[token[1]] or
320 contstr, needcont = line[start:], 1
323 else: # ordinary string
324 yield (STRING, token, spos, epos, line)
325 elif initial in namechars: # ordinary name
326 yield (NAME, token, spos, epos, line)
327 elif initial == '\\': # continued stmt
330 if initial in '([{': parenlev = parenlev + 1
331 elif initial in ')]}': parenlev = parenlev - 1
332 yield (OP, token, spos, epos, line)
334 yield (ERRORTOKEN, line[pos],
335 (lnum, pos), (lnum, pos+1), line)
338 for indent in indents[1:]: # pop remaining indent levels
339 yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
340 yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
342 if __name__ == '__main__': # testing
344 if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
345 else: tokenize(sys.stdin.readline)