Vega Strike Python Modules documentation 0.5.1
Documentation of the "Modules" folder of Vega Strike
sre_compile.py
#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

import _sre, sys

from sre_constants import *

assert _sre.MAGIC == MAGIC, "SRE module mismatch"

MAXCODE = 65535

def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern
    emit = code.append
    for op, av in pattern:
        if op in (LITERAL, NOT_LITERAL):
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = lambda x: x
            skip = len(code); emit(0)
            _compile_charset(av, flags, code, fixup)
            code[skip] = len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])
            else:
                emit(OPCODES[ANY])
        elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
            if flags & SRE_FLAG_TEMPLATE:
                raise error, "internal: unsupported template operator"
                emit(OPCODES[REPEAT])
                skip = len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = len(code) - skip
            elif _simple(av) and op == MAX_REPEAT:
                emit(OPCODES[REPEAT_ONE])
                skip = len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = len(code) - skip
            else:
                emit(OPCODES[REPEAT])
                skip = len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = len(code) - skip
                if op == MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            # _compile_info(code, av[1], flags)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in (SUCCESS, FAILURE):
            emit(OPCODES[op])
        elif op in (ASSERT, ASSERT_NOT):
            emit(OPCODES[op])
            skip = len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error, "look-behind requires fixed-width pattern"
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = len(code) - skip
        elif op is AT:
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            emit(OPCODES[op])
            tail = []
            for av in av[1]:
                skip = len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                tail.append(len(code)); emit(0)
                code[skip] = len(code) - skip
            emit(0) # end of branch
            for tail in tail:
                code[tail] = len(code) - tail
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
        else:
            raise ValueError, ("unsupported operand type", op)

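The dispatcher above consumes the (op, av) pairs produced by sre_parse and appends integer opcodes to the code list. A minimal sketch of that calling convention, assuming sre_parse and sre_constants are importable next to this module (as they are in the bundled library):

import sre_parse
from sre_constants import OPCODES, SUCCESS
from sre_compile import _compile

p = sre_parse.parse("a+b")      # SubPattern of (op, av) pairs
code = []
_compile(code, p.data, 0)       # emits REPEAT_ONE ... LITERAL 'a' ... then LITERAL 'b'
code.append(OPCODES[SUCCESS])
print(code)                     # flat list of integers understood by _sre
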
def _compile_charset(charset, flags, code, fixup=None):
    # compile charset subprogram
    emit = code.append
    if not fixup:
        fixup = lambda x: x
    for op, av in _optimize_charset(charset, fixup):
        emit(OPCODES[op])
        if op is NEGATE:
            pass
        elif op is LITERAL:
            emit(fixup(av))
        elif op is RANGE:
            emit(fixup(av[0]))
            emit(fixup(av[1]))
        elif op is CHARSET:
            code.extend(av)
        elif op is BIGCHARSET:
            code.extend(av)
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                emit(CHCODES[CH_LOCALE[av]])
            elif flags & SRE_FLAG_UNICODE:
                emit(CHCODES[CH_UNICODE[av]])
            else:
                emit(CHCODES[av])
        else:
            raise error, "internal: unsupported set operator"
    emit(OPCODES[FAILURE])

def _optimize_charset(charset, fixup):
    # internal: optimize character set
    out = []
    charmap = [0]*256
    try:
        for op, av in charset:
            if op is NEGATE:
                out.append((op, av))
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could append to charmap tail
                return charset # cannot compress
    except IndexError:
        # character set contains unicode characters
        return _optimize_unicode(charset, fixup)
    # compress character map
    i = p = n = 0
    runs = []
    for c in charmap:
        if c:
            if n == 0:
                p = i
            n = n + 1
        elif n:
            runs.append((p, n))
            n = 0
        i = i + 1
    if n:
        runs.append((p, n))
    if len(runs) <= 2:
        # use literal/range
        for p, n in runs:
            if n == 1:
                out.append((LITERAL, p))
            else:
                out.append((RANGE, (p, p+n-1)))
        if len(out) < len(charset):
            return out
    else:
        # use bitmap
        data = _mk_bitmap(charmap)
        out.append((CHARSET, data))
        return out
    return charset

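A sketch of the two compression outcomes, assuming an identity fixup (the helper name nofix below is illustrative only):

from sre_constants import LITERAL
from sre_compile import _optimize_charset

nofix = lambda x: x
# up to two runs of adjacent characters collapse into LITERAL/RANGE entries
print(_optimize_charset([(LITERAL, ord(c)) for c in "abc"], nofix))
# -> [(RANGE, (97, 99))]
# more than two runs fall back to a 16-word CHARSET bitmap from _mk_bitmap
print(_optimize_charset([(LITERAL, ord(c)) for c in "aeiou"], nofix))
# -> [(CHARSET, [...16 words...])]
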
def _mk_bitmap(bits):
    data = []
    m = 1; v = 0
    for c in bits:
        if c:
            v = v + m
        m = m << 1
        if m > MAXCODE:
            data.append(v)
            m = 1; v = 0
    return data

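_mk_bitmap packs one bit per charmap entry into 16-bit words, least-significant bit first, so every 16 entries yield one word. A small sketch:

from sre_compile import _mk_bitmap

bits = [1, 0, 1] + [0] * 253    # a 256-entry charmap with bits 0 and 2 set
words = _mk_bitmap(bits)
print(len(words), words[0])     # 16 5  (16 words; the first is binary 101)
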
# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 16-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (128 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of chunks (16 words each).

# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.

# Matching can also be done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).

def _optimize_unicode(charset, fixup):
    charmap = [0]*65536
    negate = 0
    for op, av in charset:
        if op is NEGATE:
            negate = 1
        elif op is LITERAL:
            charmap[fixup(av)] = 1
        elif op is RANGE:
            for i in range(fixup(av[0]), fixup(av[1])+1):
                charmap[i] = 1
        elif op is CATEGORY:
            # XXX: could expand category
            return charset # cannot compress
    if negate:
        for i in range(65536):
            charmap[i] = not charmap[i]
    comps = {}
    mapping = [0]*256
    block = 0
    data = []
    for i in range(256):
        chunk = tuple(charmap[i*256:(i+1)*256])
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            block += 1
            data += _mk_bitmap(chunk)
    header = [block]
    assert MAXCODE == 65535
    for i in range(128):
        if sys.byteorder == 'big':
            header.append(256*mapping[2*i]+mapping[2*i+1])
        else:
            header.append(mapping[2*i]+256*mapping[2*i+1])
    data[0:0] = header
    return [(BIGCHARSET, data)]

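For example, a set holding only the Cyrillic block U+0400-U+04FF yields two distinct chunks (one all-zero, one all-one), so the emitted data is 1 count word + 128 mapping words + 2*16 chunk words = 161 words. A sketch, with an identity fixup:

from sre_constants import RANGE, BIGCHARSET
from sre_compile import _optimize_unicode

[(op, data)] = _optimize_unicode([(RANGE, (0x0400, 0x04FF))], lambda x: x)
print(op is BIGCHARSET, len(data))   # True 161
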
def _simple(av):
    # check if av is a "simple" operator
    lo, hi = av[2].getwidth()
    if lo == 0 and hi == MAXREPEAT:
        raise error, "nothing to repeat"
    return lo == hi == 1 and av[2][0][0] != SUBPATTERN

def _compile_info(code, pattern, flags):
    # internal: compile an info block. in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    lo, hi = pattern.getwidth()
    if lo == 0:
        return # not worth it
    # look for a literal prefix
    prefix = []
    prefix_skip = 0
    charset = [] # not used
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        for op, av in pattern.data:
            if op is LITERAL:
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
                prefix.append(av)
            elif op is SUBPATTERN and len(av[1]) == 1:
                op, av = av[1][0]
                if op is LITERAL:
                    prefix.append(av)
                else:
                    break
            else:
                break
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                op, av = av[1][0]
                if op is LITERAL:
                    charset.append((op, av))
                elif op is BRANCH:
                    c = []
                    for p in av[1]:
                        if not p:
                            break
                        op, av = p[0]
                        if op is LITERAL:
                            c.append((op, av))
                        else:
                            break
                    else:
                        charset = c
            elif op is BRANCH:
                c = []
                for p in av[1]:
                    if not p:
                        break
                    op, av = p[0]
                    if op is LITERAL:
                        c.append((op, av))
                    else:
                        break
                else:
                    charset = c
            elif op is IN:
                charset = av
##     if prefix:
##         print "*** PREFIX", prefix, prefix_skip
##     if charset:
##         print "*** CHARSET", charset
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if len(prefix) == prefix_skip == len(pattern.data):
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
        emit(hi)
    else:
        emit(0)
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table
        table = [-1] + ([0]*len(prefix))
        for i in range(len(prefix)):
            table[i+1] = table[i]+1
            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
                table[i+1] = table[table[i+1]-1]+1
        code.extend(table[1:]) # don't store first entry
    elif charset:
        _compile_charset(charset, 0, code)
    code[skip] = len(code) - skip

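The overlap table built above is a KMP-style failure function over the literal prefix: entry i gives the length of the longest proper prefix of prefix[:i+1] that is also its suffix. Recomputing it by hand for a hypothetical prefix "ababc":

prefix = [ord(c) for c in "ababc"]
table = [-1] + [0] * len(prefix)
for i in range(len(prefix)):
    table[i+1] = table[i] + 1
    while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
        table[i+1] = table[table[i+1]-1] + 1
print(table[1:])   # [0, 0, 1, 2, 0], the entries stored after the prefix
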
STRING_TYPES = [type("")]

try:
    STRING_TYPES.append(type(unicode("")))
except NameError:
    pass

def _code(p, flags):

    flags = p.pattern.flags | flags
    code = []

    # compile info block
    _compile_info(code, p, flags)

    # compile the pattern
    _compile(code, p.data, flags)

    code.append(OPCODES[SUCCESS])

    return code

def compile(p, flags=0):
    # internal: convert pattern list to internal format

    if type(p) in STRING_TYPES:
        import sre_parse
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    # print code

    # XXX: <fl> get rid of this limitation!
    assert p.pattern.groups <= 100,\
           "sorry, but this version only supports 100 named groups"

    # map in either direction
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in groupindex.items():
        indexgroup[i] = k

    return _sre.compile(
        pattern, flags, code,
        p.pattern.groups-1,
        groupindex, indexgroup
        )
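
A minimal end-to-end sketch, assuming this module is importable as sre_compile (in normal use it is reached through the sre front end rather than called directly):

import sre_compile

pat = sre_compile.compile("v(ega)+")     # parse, compile, and hand off to _sre
m = pat.match("vegaega strike")
print(m.group(0))                        # 'vegaega'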