1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
8 the token type (see token.py)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
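# For example, tokenizing the line "x = 1\n" yields 5-tuples like
# (NAME, 'x', (1, 0), (1, 1), 'x = 1\n'), followed by OP, NUMBER,
# NEWLINE and ENDMARKER tuples.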
from __future__ import generators

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "NL"]

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return apply(group, choices) + '*'
def maybe(*choices): return apply(group, choices) + '?'
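# For example, group('a', 'b') produces '(a|b)', any('a') produces '(a)*',
# and maybe('a') produces '(a)?'; every pattern below is composed this way.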
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
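# Examples: Intnumber matches '0x1fL' and '237', Floatnumber matches
# '3.14e-2' and '.5', and Imagnumber matches '1j' and '2.5J'.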
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`]')
Funny = group(Operator, Bracket, Special)
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
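# PseudoToken drives the scanner below: the leading whitespace is matched
# outside of group 1, so pseudomatch.span(1) brackets exactly one token (or
# the opening line of a continued string).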
tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}

tabsize = 8
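# endprogs maps an opening quote (with any u/r prefix) to the pattern that
# matches the rest of the string; the None entries let single prefix letters
# be indexed harmlessly in the or-chain inside generate_tokens.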
class TokenError(Exception): pass

class StopTokenizing(Exception): pass
def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))
def tokenize(readline, tokeneater=printtoken):
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        apply(tokeneater, token_info)
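# Usage sketch (illustrative, not part of the original module): gather
# tokens through the callback interface with a custom tokeneater.
#
#     from StringIO import StringIO
#     def eater(type, token, start, end, line):
#         print tok_name[type], repr(token)
#     tokenize(StringIO("x = 1\n").readline, eater)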
def generate_tokens(readline):
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, max = 0, len(line)
        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue
        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0
        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (parenlev > 0 and NL or NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in ("'''", '"""',               # triple-quoted
                               "r'''", 'r"""', "R'''", 'R"""',
                               "u'''", 'u"""', "U'''", 'U"""',
                               "ur'''", 'ur"""', "Ur'''", 'Ur"""',
                               "uR'''", 'uR"""', "UR'''", 'UR"""'):
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in ("'", '"') or \
                    token[:2] in ("r'", 'r"', "R'", 'R"',
                                  "u'", 'u"', "U'", 'U"') or \
                    token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
                                  "uR'", 'uR"', "UR'", 'UR"'):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1
    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
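# Usage sketch (illustrative, not part of the original module): the
# generator interface is driven the same way, one readline at a time.
#
#     from StringIO import StringIO
#     source = StringIO("if x:\n    y = 2\n")
#     for type, token, start, end, line in generate_tokens(source.readline):
#         print tok_name[type], repr(token), start, end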
if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)