Vega Strike Python Modules doc 0.5.1
Documentation of the "Modules" folder of Vega Strike
tokenize.py
1 """Tokenization help for Python programs.
2 
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
7 
8  the token type (see token.py)
9  the token (a string)
10  the starting (row, column) indices of the token (a 2-tuple of ints)
11  the ending (row, column) indices of the token (a 2-tuple of ints)
12  the original line (string)
13 
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all
16 operators
17 
18 Older entry points
19  tokenize_loop(readline, tokeneater)
20  tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
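
# Example added for this documentation (a sketch, assuming Python 2, which
# this module targets): drive generate_tokens with any readline-style
# callable and unpack the 5-tuples it yields.
#
#     from StringIO import StringIO
#     for tok_type, tok_string, start, end, line in \
#             generate_tokens(StringIO('x = 1\n').readline):
#         print tok_name[tok_type], repr(tok_string), start, end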

from __future__ import generators

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize", "NL"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return apply(group, choices) + '*'
def maybe(*choices): return apply(group, choices) + '?'
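
# Note added for this documentation: these helpers build regex alternations,
# e.g. group('a', 'b') == '(a|b)', any('a', 'b') == '(a|b)*', and
# maybe('a', 'b') == '(a|b)?'.  apply(f, args) is the Python 2 spelling
# of f(*args).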

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
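
# Note added for this documentation: Number matches e.g. '0xFF', '0777L',
# '42', '3.14', '1e-9' and '2j'; the alternatives are ordered so the longest
# forms (imaginary, then float) are tried before plain integers.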

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}
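
# Note added for this documentation: the bare prefix keys ('r', 'R', 'u',
# 'U') map to None on purpose, so generate_tokens can evaluate
# endprogs[initial] or endprogs[token[1]] or endprogs[token[2]] and fall
# through the prefix characters until it reaches the quote that selects a
# compiled pattern.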

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        apply(tokeneater, token_info)

def generate_tokens(readline):
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (parenlev > 0 and NL or NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in ("'''", '"""',               # triple-quoted
                               "r'''", 'r"""', "R'''", 'R"""',
                               "u'''", 'u"""', "U'''", 'U"""',
                               "ur'''", 'ur"""', "Ur'''", 'Ur"""',
                               "uR'''", 'uR"""', "UR'''", 'UR"""'):
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in ("'", '"') or \
                    token[:2] in ("r'", 'r"', "R'", 'R"',
                                  "u'", 'u"', "U'", 'U"') or \
                    token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
                                  "uR'", 'uR"', "UR'", 'UR"'):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)
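
# Example session added for this documentation (a sketch, assuming Python 2
# and a StringIO-backed readline; the lines shown are what the default
# printtoken callback would print):
#
#     >>> from StringIO import StringIO
#     >>> tokenize(StringIO('1 + 2\n').readline)
#     1,0-1,1:    NUMBER     '1'
#     1,2-1,3:    OP         '+'
#     1,4-1,5:    NUMBER     '2'
#     1,5-1,6:    NEWLINE    '\n'
#     2,0-2,0:    ENDMARKER  ''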