1 """A parser for SGML, using the derived class as a static DTD."""
15 __all__ = [
"SGMLParser"]
19 interesting = re.compile(
'[&<]')
20 incomplete = re.compile(
'&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
25 entityref = re.compile(
'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
26 charref = re.compile(
'&#([0-9]+)[^0-9]')
28 starttagopen = re.compile(
'<[>a-zA-Z]')
29 shorttagopen = re.compile(
'<[a-zA-Z][-.a-zA-Z0-9]*/')
30 shorttag = re.compile(
'<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
31 piclose = re.compile(
'>')
32 endbracket = re.compile(
'[<>]')
33 commentclose = re.compile(
r'--\s*>')
34 tagfind = re.compile(
'[a-zA-Z][-_.a-zA-Z0-9]*')
35 attrfind = re.compile(
36 r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
37 r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~\'"]*))?')
41 """Exception raised for all parse errors."""
59 """Initialize and reset this instance."""
64 """Reset this instance. Loses all unprocessed data."""
70 markupbase.ParserBase.reset(self)
73 """Enter literal mode (CDATA) till EOF.
75 Intended for derived classes only.
80 """Enter literal mode (CDATA).
82 Intended for derived classes only.
87 """Feed some data to the parser.
89 Call this as often as you want, with as little or as much text
90 as you want (may include '\n'). (This just saves the text,
91 all the processing is done by goahead().)
98 """Handle the remaining data."""
116 match = interesting.search(rawdata, i)
117 if match: j = match.start()
123 if rawdata[i] ==
'<':
124 if starttagopen.match(rawdata, i):
133 if rawdata.startswith(
"</", i):
147 if rawdata.startswith(
"<!--", i):
152 if rawdata.startswith(
"<?", i):
157 if rawdata.startswith(
"<!", i):
161 k = self.parse_declaration(i)
165 elif rawdata[i] ==
'&':
170 match = charref.match(rawdata, i)
172 name = match.group(1)
175 if rawdata[i-1] !=
';': i = i-1
177 match = entityref.match(rawdata, i)
179 name = match.group(1)
182 if rawdata[i-1] !=
';': i = i-1
185 self.
error(
'neither < nor & ??')
188 match = incomplete.match(rawdata, i)
208 if rawdata[i:i+4] !=
'<!--':
209 self.
error(
'unexpected call to parse_comment()')
210 match = commentclose.search(rawdata, i+4)
219 _decl_otherchars =
'='
224 if rawdata[i:i+2] !=
'<?':
225 self.
error(
'unexpected call to parse_pi()')
226 match = piclose.search(rawdata, i+2)
234 __starttag_text =
None
243 if shorttagopen.match(rawdata, i):
248 match = shorttag.match(rawdata, i)
251 tag, data = match.group(1, 2)
259 match = endbracket.search(rawdata, i+1)
265 if rawdata[i:i+2] ==
'<>':
270 match = tagfind.match(rawdata, i+1)
272 self.
error(
'unexpected call to parse_starttag')
274 tag = rawdata[i+1:k].
lower()
277 match = attrfind.match(rawdata, k)
279 attrname, rest, attrvalue = match.group(1, 2, 3)
282 elif attrvalue[:1] ==
'\'' == attrvalue[-1:]
or \
283 attrvalue[:1] ==
'"' == attrvalue[-1:]:
284 attrvalue = attrvalue[1:-1]
285 attrs.append((attrname.lower(), attrvalue))
287 if rawdata[j] ==
'>':
296 match = endbracket.search(rawdata, i+1)
301 if rawdata[j] ==
'>':
316 method = getattr(self,
'start_' + tag)
317 except AttributeError:
319 method = getattr(self,
'do_' + tag)
320 except AttributeError:
327 self.stack.append(tag)
334 found = len(self.
stack) - 1
339 if tag
not in self.
stack:
341 method = getattr(self,
'end_' + tag)
342 except AttributeError:
347 found = len(self.
stack)
348 for i
in range(found):
349 if self.
stack[i] == tag: found = i
350 while len(self.
stack) > found:
353 method = getattr(self,
'end_' + tag)
354 except AttributeError:
373 print '*** Unbalanced </' + tag +
'>'
374 print '*** Stack:', self.
stack
377 """Handle character reference, no need to override."""
383 if not 0 <= n <= 255:
390 {
'lt':
'<',
'gt':
'>',
'amp':
'&',
'quot':
'"',
'apos':
'\''}
393 """Handle entity references.
395 There should be no need to override this method; it can be
396 tailored by setting up the self.entitydefs mapping appropriately.
399 if table.has_key(name):
432 SGMLParser.__init__(self, verbose)
443 print 'data:', `data`
449 r = r[:32] +
'...' + r[-32:]
455 print 'start tag: <' + tag +
'>'
457 print 'start tag: <' + tag,
458 for name, value
in attrs:
459 print name +
'=' +
'"' + value +
'"',
464 print 'end tag: </' + tag +
'>'
468 print '*** unknown entity ref: &' + ref +
';'
472 print '*** unknown char ref: &#' + ref +
';'
475 SGMLParser.close(self)
485 if args
and args[0] ==
'-s':
489 klass = TestSGMLParser
506 if f
is not sys.stdin:
515 if __name__ ==
'__main__':