11 """Internal support module for sre"""
18 from sre_constants
import *
20 SPECIAL_CHARS =
".\\[{()*+?^$|"
26 HEXDIGITS =
tuple(
"0123456789abcdefABCDEF")
28 WHITESPACE =
tuple(
" \t\n\r\v\f")
31 r"\a": (LITERAL, ord(
"\a")),
32 r"\b": (LITERAL, ord(
"\b")),
33 r"\f": (LITERAL, ord(
"\f")),
34 r"\n": (LITERAL, ord(
"\n")),
35 r"\r": (LITERAL, ord(
"\r")),
36 r"\t": (LITERAL, ord(
"\t")),
37 r"\v": (LITERAL, ord(
"\v")),
38 r"\\": (LITERAL, ord(
"\\"))
42 r"\A": (AT, AT_BEGINNING_STRING),
43 r"\b": (AT, AT_BOUNDARY),
44 r"\B": (AT, AT_NON_BOUNDARY),
45 r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
46 r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
47 r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
48 r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
49 r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
50 r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
51 r"\Z": (AT, AT_END_STRING),
56 "i": SRE_FLAG_IGNORECASE,
58 "m": SRE_FLAG_MULTILINE,
60 "x": SRE_FLAG_VERBOSE,
62 "t": SRE_FLAG_TEMPLATE,
63 "u": SRE_FLAG_UNICODE,
84 ogid = self.groupdict.get(name,
None)
86 raise error, (
"redefinition of group name %s as group %d; "
87 "was group %d" % (
repr(name), gid, ogid))
94 return gid < self.
groups and gid
not in self.
open
106 for op, av
in self.
data:
107 print level*
" " + op,; nl = 0
112 print (level+1)*
" " + op, a
118 print level*
" " +
"or"
119 a.dump(level+1); nl = 1
121 elif type(av)
in (type(()), type([])):
123 if isinstance(a, SubPattern):
125 a.dump(level+1); nl = 1
134 return len(self.
data)
138 return self.
data[index]
140 self.
data[index] = code
144 self.data.insert(index, code)
146 self.data.append(code)
152 for op, av
in self.
data:
166 elif op
is SUBPATTERN:
170 elif op
in (MIN_REPEAT, MAX_REPEAT):
172 lo = lo + long(i) * av[0]
173 hi = hi + long(j) * av[1]
174 elif op
in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
179 self.
width = int(
min(lo, sys.maxint)), int(
min(hi, sys.maxint))
196 raise error,
"bogus escape (end of line)"
201 if char == self.
next:
216 return "a" <= char <=
"z" or "A" <= char <=
"Z" or char ==
"_"
219 return "0" <= char <=
"9"
230 def _group(escape, groups):
233 gid =
atoi(escape[1:])
234 if gid
and gid < groups:
240 def _class_escape(source, escape):
242 code = ESCAPES.get(escape)
245 code = CATEGORIES.get(escape)
249 if escape[1:2] ==
"x":
251 while source.next
in HEXDIGITS
and len(escape) < 4:
252 escape = escape + source.get()
255 raise error,
"bogus escape: %s" %
repr(
"\\" + escape)
256 return LITERAL,
atoi(escape, 16) & 0xff
257 elif str(escape[1:2])
in OCTDIGITS:
259 while source.next
in OCTDIGITS
and len(escape) < 5:
260 escape = escape + source.get()
262 return LITERAL,
atoi(escape, 8) & 0xff
264 return LITERAL, ord(escape[1])
267 raise error,
"bogus escape: %s" %
repr(escape)
269 def _escape(source, escape, state):
271 code = CATEGORIES.get(escape)
274 code = ESCAPES.get(escape)
278 if escape[1:2] ==
"x":
280 while source.next
in HEXDIGITS
and len(escape) < 4:
281 escape = escape + source.get()
284 return LITERAL,
atoi(escape[2:], 16) & 0xff
285 elif escape[1:2] ==
"0":
287 while source.next
in OCTDIGITS
and len(escape) < 4:
288 escape = escape + source.get()
289 return LITERAL,
atoi(escape[1:], 8) & 0xff
290 elif escape[1:2]
in DIGITS:
293 if source.next
in DIGITS:
294 escape = escape + source.get()
295 if (escape[1]
in OCTDIGITS
and escape[2]
in OCTDIGITS
and
296 source.next
in OCTDIGITS):
298 escape = escape + source.get()
299 return LITERAL,
atoi(escape[1:], 8) & 0xff
301 group = _group(escape, state.groups)
303 if not state.checkgroup(group):
304 raise error,
"cannot refer to open group"
305 return GROUPREF, group
308 return LITERAL, ord(escape[1])
311 raise error,
"bogus escape: %s" %
repr(escape)
313 def _parse_sub(source, state, nested=1):
318 items.append(_parse(source, state))
319 if source.match(
"|"):
323 if not source.next
or source.match(
")", 0):
326 raise error,
"pattern not properly closed"
341 elif item[0] != prefix:
348 subpattern.append(prefix)
354 if len(item) != 1
or item[0][0] != LITERAL:
362 subpattern.append((IN, set))
365 subpattern.append((BRANCH, (
None, items)))
368 def _parse(source, state):
375 if source.next
in (
"|",
")"):
381 if state.flags & SRE_FLAG_VERBOSE:
383 if this
in WHITESPACE:
388 if this
in (
None,
"\n"):
392 if this
and this[0]
not in SPECIAL_CHARS:
393 subpattern.append((LITERAL, ord(this)))
400 if source.match(
"^"):
401 set.append((NEGATE,
None))
406 if this ==
"]" and set != start:
408 elif this
and this[0] ==
"\\":
409 code1 = _class_escape(source, this)
411 code1 = LITERAL, ord(this)
413 raise error,
"unexpected end of regular expression"
414 if source.match(
"-"):
421 set.append((LITERAL, ord(
"-")))
425 code2 = _class_escape(source, this)
427 code2 = LITERAL, ord(this)
428 if code1[0] != LITERAL
or code2[0] != LITERAL:
429 raise error,
"bad character range"
433 raise error,
"bad character range"
434 set.append((RANGE, (lo, hi)))
441 if len(set)==1
and set[0][0]
is LITERAL:
442 subpattern.append(set[0])
443 elif len(set)==2
and set[0][0]
is NEGATE
and set[1][0]
is LITERAL:
444 subpattern.append((NOT_LITERAL, set[1][1]))
447 subpattern.append((IN, set))
449 elif this
and this[0]
in REPEAT_CHARS:
454 min, max = 0, MAXREPEAT
457 min, max = 1, MAXREPEAT
460 min, max = 0, MAXREPEAT
462 while source.next
in DIGITS:
463 lo = lo + source.get()
464 if source.match(
","):
465 while source.next
in DIGITS:
466 hi = hi + source.get()
469 if not source.match(
"}"):
470 subpattern.append((LITERAL, ord(this)))
478 raise error,
"bad repeat interval"
480 raise error,
"not supported"
483 item = subpattern[-1:]
486 if not item
or (len(item) == 1
and item[0][0] == AT):
487 raise error,
"nothing to repeat"
488 if item[0][0]
in (MIN_REPEAT, MAX_REPEAT):
489 raise error,
"multiple repeat"
490 if source.match(
"?"):
491 subpattern[-1] = (MIN_REPEAT, (min, max, item))
493 subpattern[-1] = (MAX_REPEAT, (min, max, item))
496 subpattern.append((ANY,
None))
501 if source.match(
"?"):
504 if source.match(
"P"):
506 if source.match(
"<"):
512 raise error,
"unterminated name"
518 raise error,
"bad character in group name"
519 elif source.match(
"="):
525 raise error,
"unterminated name"
530 raise error,
"bad character in group name"
531 gid = state.groupdict.get(name)
533 raise error,
"unknown group name"
534 subpattern.append((GROUPREF, gid))
539 raise error,
"unexpected end of pattern"
540 raise error,
"unknown specifier: ?P%s" % char
541 elif source.match(
":"):
544 elif source.match(
"#"):
547 if source.next
is None or source.next ==
")":
550 if not source.match(
")"):
551 raise error,
"unbalanced parenthesis"
553 elif source.next
in (
"=",
"!",
"<"):
558 if source.next
not in (
"=",
"!"):
559 raise error,
"syntax error"
562 p = _parse_sub(source, state)
563 if not source.match(
")"):
564 raise error,
"unbalanced parenthesis"
566 subpattern.append((ASSERT, (dir, p)))
568 subpattern.append((ASSERT_NOT, (dir, p)))
572 if not FLAGS.has_key(source.next):
573 raise error,
"unexpected end of pattern"
574 while FLAGS.has_key(source.next):
575 state.flags = state.flags | FLAGS[source.get()]
582 group = state.opengroup(name)
583 p = _parse_sub(source, state)
584 if not source.match(
")"):
585 raise error,
"unbalanced parenthesis"
586 if group
is not None:
587 state.closegroup(group)
588 subpattern.append((SUBPATTERN, (group, p)))
593 raise error,
"unexpected end of pattern"
596 raise error,
"unknown extension"
599 subpattern.append((AT, AT_BEGINNING))
602 subpattern.append((AT, AT_END))
604 elif this
and this[0] ==
"\\":
605 code = _escape(source, this, state)
606 subpattern.append(code)
609 raise error,
"parser error"
613 def parse(str, flags=0, pattern=None):
620 pattern.flags = flags
623 p = _parse_sub(source, pattern, 0)
627 raise error,
"unbalanced parenthesis"
629 raise error,
"bogus characters at end of regular expression"
631 if flags & SRE_FLAG_DEBUG:
634 if not (flags & SRE_FLAG_VERBOSE)
and p.pattern.flags & SRE_FLAG_VERBOSE:
637 return parse(str, p.pattern.flags)
647 def literal(literal, p=p):
648 if p
and p[-1][0]
is LITERAL:
649 p[-1] = LITERAL, p[-1][1] + literal
651 p.append((LITERAL, literal))
653 if type(sep)
is type(
""):
661 if this
and this[0] ==
"\\":
669 raise error,
"unterminated group name"
674 raise error,
"bad group name"
679 raise error,
"bad character in group name"
681 index = pattern.groupindex[name]
683 raise IndexError,
"unknown group name"
685 elif len(this) > 1
and this[1]
in DIGITS:
688 group = _group(this, pattern.groups+1)
690 if (s.next
not in DIGITS
or
691 not _group(this + s.next, pattern.groups+1)):
694 elif s.next
in OCTDIGITS:
695 this = this + s.get()
700 code = LITERAL, makechar(
atoi(this[-6:], 8) & 0xff)
701 if code[0]
is LITERAL:
707 this = makechar(ESCAPES[this][1])
719 groups.append((i, s))
720 literals.append(
None)
724 return groups, literals
728 sep = match.string[:0]
729 groups, literals = template
730 literals = literals[:]
732 for index, group
in groups:
733 literals[index] = s =
g(group)
737 raise error,
"empty group"