1 """A parser for XML, using the derived class as static DTD."""
18 _Name =
'[a-zA-Z_:][-a-zA-Z0-9._:]*'
19 _QStr =
"(?:'[^']*'|\"[^\"]*\")"
20 illegal = re.compile(
'[^\t\r\n -\176\240-\377]')
21 interesting = re.compile(
'[]&<]')
24 ref = re.compile(
'&(' + _Name +
'|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
25 entityref = re.compile(
'&(?P<name>' + _Name +
')[^-a-zA-Z0-9._:]')
26 charref = re.compile(
'&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
27 space = re.compile(_S +
'$')
28 newline = re.compile(
'\n')
30 attrfind = re.compile(
31 _S +
'(?P<name>' + _Name +
')'
32 '(' + _opS +
'=' + _opS +
33 '(?P<value>'+_QStr+
'|[-a-zA-Z0-9.:+*%?!\(\)_#=~]+))?')
34 starttagopen = re.compile(
'<' + _Name)
35 starttagend = re.compile(_opS +
'(?P<slash>/?)>')
36 starttagmatch = re.compile(
'<(?P<tagname>'+_Name+
')'
37 '(?P<attrs>(?:'+attrfind.pattern+
')*)'+
39 endtagopen = re.compile(
'</')
40 endbracket = re.compile(_opS +
'>')
41 endbracketfind = re.compile(
'(?:[^>\'"]|'+_QStr+
')*>')
42 tagfind = re.compile(_Name)
43 cdataopen = re.compile(
r'<!\[CDATA\[')
44 cdataclose = re.compile(
r'\]\]>')
48 _SystemLiteral =
'(?P<%s>'+_QStr+
')'
49 _PublicLiteral =
'(?P<%s>"[-\'\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
50 "'[-\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
51 _ExternalId =
'(?:SYSTEM|' \
52 'PUBLIC'+_S+_PublicLiteral%
'pubid'+ \
53 ')'+_S+_SystemLiteral%
'syslit'
54 doctype = re.compile(
'<!DOCTYPE'+_S+
'(?P<name>'+_Name+
')'
55 '(?:'+_S+_ExternalId+
')?'+_opS)
56 xmldecl = re.compile(
'<\?xml'+_S+
57 'version'+_opS+
'='+_opS+
'(?P<version>'+_QStr+
')'+
58 '(?:'+_S+
'encoding'+_opS+
'='+_opS+
59 "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
60 '"[A-Za-z][-A-Za-z0-9._]*"))?'
61 '(?:'+_S+
'standalone'+_opS+
'='+_opS+
62 '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
64 procopen = re.compile(
r'<\?(?P<proc>' + _Name +
')' + _opS)
65 procclose = re.compile(_opS +
r'\?>')
66 commentopen = re.compile(
'<!--')
67 commentclose = re.compile(
'-->')
68 doubledash = re.compile(
'--')
72 _NCName =
'[a-zA-Z_][-a-zA-Z0-9._]*'
73 ncname = re.compile(_NCName +
'$')
74 qname = re.compile(
'(?:(?P<prefix>' + _NCName +
'):)?'
75 '(?P<local>' + _NCName +
')$')
77 xmlns = re.compile(
'xmlns(?::(?P<ncname>'+_NCName+
'))?$')
92 __accept_unquoted_attributes = 0
93 __accept_missing_endtag_name = 0
96 __translate_attribute_references = 1
101 if kw.has_key(
'accept_unquoted_attributes'):
102 self.__accept_unquoted_attributes = kw[
'accept_unquoted_attributes']
103 if kw.has_key(
'accept_missing_endtag_name'):
104 self.__accept_missing_endtag_name = kw[
'accept_missing_endtag_name']
105 if kw.has_key(
'map_case'):
106 self.__map_case = kw[
'map_case']
107 if kw.has_key(
'accept_utf8'):
108 self.__accept_utf8 = kw[
'accept_utf8']
109 if kw.has_key(
'translate_attribute_references'):
110 self.__translate_attribute_references = kw[
'translate_attribute_references']
113 def __fixelements(self):
116 self.__fixdict(self.__dict__)
117 self.__fixclass(self.__class__)
119 def __fixclass(self, kl):
120 self.__fixdict(kl.__dict__)
121 for k
in kl.__bases__:
124 def __fixdict(self, dict):
125 for key
in dict.keys():
126 if key[:6] ==
'start_':
128 start, end = self.elements.get(tag, (
None,
None))
130 self.elements[tag] = getattr(self, key), end
131 elif key[:4] ==
'end_':
133 start, end = self.elements.get(tag, (
None,
None))
135 self.elements[tag] = start, getattr(self, key)
145 self.__seen_doctype =
None
146 self.__seen_starttag = 0
147 self.__use_namespaces = 0
148 self.__namespaces = {
'xml':
None}
151 if self.elements
is XMLParser.elements:
155 def setnomoretags(self):
156 self.nomoretags = self.literal = 1
159 def setliteral(self, *args):
166 def feed(self, data):
167 self.rawdata = self.rawdata + data
179 def translate_references(self, data, all = 1):
180 if not self.__translate_attribute_references:
184 res = amp.search(data, i)
188 res = ref.match(data, s)
190 self.syntax_error(
"bogus `&'")
198 str = chr(int(str[2:], 16))
200 str = chr(int(str[1:]))
201 if data[i - 1] !=
';':
202 self.syntax_error(
"`;' missing after char reference")
205 if self.entitydefs.has_key(str):
206 str = self.entitydefs[str]
208 elif data[i - 1] !=
';':
209 self.syntax_error(
"bogus `&'")
213 self.syntax_error(
"reference to unknown entity `&%s;'" % str)
214 str =
'&' + str +
';'
215 elif data[i - 1] !=
';':
216 self.syntax_error(
"bogus `&'")
222 data = data[:s] + str + data[i:]
229 def getnamespace(self):
231 for t, d, nst
in self.stack:
238 def goahead(self, end):
239 rawdata = self.rawdata
247 self.handle_data(data)
248 self.lineno = self.lineno + data.count(
'\n')
251 res = interesting.search(rawdata, i)
258 if self.__at_start
and space.match(data)
is None:
259 self.syntax_error(
'illegal data at start of file')
261 if not self.stack
and space.match(data)
is None:
262 self.syntax_error(
'data not in content')
263 if not self.__accept_utf8
and illegal.search(data):
264 self.syntax_error(
'illegal character in content')
265 self.handle_data(data)
266 self.lineno = self.lineno + data.count(
'\n')
269 if rawdata[i] ==
'<':
270 if starttagopen.match(rawdata, i):
273 self.handle_data(data)
274 self.lineno = self.lineno + data.count(
'\n')
277 k = self.parse_starttag(i)
279 self.__seen_starttag = 1
280 self.lineno = self.lineno + rawdata[i:k].
count(
'\n')
283 if endtagopen.match(rawdata, i):
284 k = self.parse_endtag(i)
286 self.lineno = self.lineno + rawdata[i:k].
count(
'\n')
289 if commentopen.match(rawdata, i):
292 self.handle_data(data)
293 self.lineno = self.lineno + data.count(
'\n')
296 k = self.parse_comment(i)
298 self.lineno = self.lineno + rawdata[i:k].
count(
'\n')
301 if cdataopen.match(rawdata, i):
302 k = self.parse_cdata(i)
304 self.lineno = self.lineno + rawdata[i:k].
count(
'\n')
307 res = xmldecl.match(rawdata, i)
309 if not self.__at_start:
310 self.syntax_error(
"<?xml?> declaration not at start of document")
311 version, encoding, standalone = res.group(
'version',
314 if version[1:-1] !=
'1.0':
315 raise Error(
'only XML version 1.0 supported')
316 if encoding: encoding = encoding[1:-1]
317 if standalone: standalone = standalone[1:-1]
318 self.handle_xml(encoding, standalone)
321 res = procopen.match(rawdata, i)
323 k = self.parse_proc(i)
325 self.lineno = self.lineno + rawdata[i:k].
count(
'\n')
328 res = doctype.match(rawdata, i)
332 self.handle_data(data)
333 self.lineno = self.lineno + data.count(
'\n')
336 if self.__seen_doctype:
337 self.syntax_error(
'multiple DOCTYPE elements')
338 if self.__seen_starttag:
339 self.syntax_error(
'DOCTYPE not at beginning of document')
340 k = self.parse_doctype(res)
342 self.__seen_doctype = res.group(
'name')
344 self.__seen_doctype = self.__seen_doctype.lower()
345 self.lineno = self.lineno + rawdata[i:k].
count(
'\n')
348 elif rawdata[i] ==
'&':
351 self.handle_data(data)
354 res = charref.match(rawdata, i)
357 if rawdata[i-1] !=
';':
358 self.syntax_error(
"`;' missing in charref")
361 self.syntax_error(
'data not in content')
362 self.handle_charref(res.group(
'char')[:-1])
363 self.lineno = self.lineno + res.group(0).
count(
'\n')
365 res = entityref.match(rawdata, i)
368 if rawdata[i-1] !=
';':
369 self.syntax_error(
"`;' missing in entityref")
371 name = res.group(
'name')
374 if self.entitydefs.has_key(name):
375 self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
379 self.unknown_entityref(name)
380 self.lineno = self.lineno + res.group(0).
count(
'\n')
382 elif rawdata[i] ==
']':
385 self.handle_data(data)
390 if cdataclose.match(rawdata, i):
391 self.syntax_error(
"bogus `]]>'")
392 self.handle_data(rawdata[i])
396 raise Error(
'neither < nor & ??')
405 self.syntax_error(
"bogus `%s'" % data)
406 if not self.__accept_utf8
and illegal.search(data):
407 self.syntax_error(
'illegal character in content')
408 self.handle_data(data)
409 self.lineno = self.lineno + data.count(
'\n')
410 self.rawdata = rawdata[i+1:]
411 return self.goahead(end)
412 self.rawdata = rawdata[i:]
414 if not self.__seen_starttag:
415 self.syntax_error(
'no elements in file')
417 self.syntax_error(
'missing end tags')
419 self.finish_endtag(self.stack[-1][0])
422 def parse_comment(self, i):
423 rawdata = self.rawdata
424 if rawdata[i:i+4] !=
'<!--':
425 raise Error(
'unexpected call to handle_comment')
426 res = commentclose.search(rawdata, i+4)
429 if doubledash.search(rawdata, i+4, res.start(0)):
430 self.syntax_error(
"`--' inside comment")
431 if rawdata[res.start(0)-1] ==
'-':
432 self.syntax_error(
'comment cannot end in three dashes')
433 if not self.__accept_utf8
and \
434 illegal.search(rawdata, i+4, res.start(0)):
435 self.syntax_error(
'illegal character in comment')
436 self.handle_comment(rawdata[i+4: res.start(0)])
440 def parse_doctype(self, res):
441 rawdata = self.rawdata
443 name = res.group(
'name')
446 pubid, syslit = res.group(
'pubid',
'syslit')
447 if pubid
is not None:
449 pubid =
' '.
join(pubid.split())
450 if syslit
is not None: syslit = syslit[1:-1]
454 if rawdata[k] ==
'[':
460 if not sq
and c ==
'"':
462 elif not dq
and c ==
"'":
466 elif level <= 0
and c ==
']':
467 res = endbracket.match(rawdata, k+1)
470 self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])
477 self.syntax_error(
"bogus `>' in DOCTYPE")
479 res = endbracketfind.match(rawdata, k)
482 if endbracket.match(rawdata, k)
is None:
483 self.syntax_error(
'garbage in DOCTYPE')
484 self.handle_doctype(name, pubid, syslit,
None)
488 def parse_cdata(self, i):
489 rawdata = self.rawdata
490 if rawdata[i:i+9] !=
'<![CDATA[':
491 raise Error(
'unexpected call to parse_cdata')
492 res = cdataclose.search(rawdata, i+9)
495 if not self.__accept_utf8
and \
496 illegal.search(rawdata, i+9, res.start(0)):
497 self.syntax_error(
'illegal character in CDATA')
499 self.syntax_error(
'CDATA not in content')
500 self.handle_cdata(rawdata[i+9:res.start(0)])
503 __xml_namespace_attributes = {
'ns':
None,
'src':
None,
'prefix':
None}
505 def parse_proc(self, i):
506 rawdata = self.rawdata
507 end = procclose.search(rawdata, i)
511 if not self.__accept_utf8
and illegal.search(rawdata, i+2, j):
512 self.syntax_error(
'illegal character in processing instruction')
513 res = tagfind.match(rawdata, i+2)
515 raise Error(
'unexpected call to parse_proc')
520 if name ==
'xml:namespace':
521 self.syntax_error(
'old-fashioned namespace declaration')
522 self.__use_namespaces = -1
526 if self.__seen_doctype
or self.__seen_starttag:
527 self.syntax_error(
'xml:namespace declaration too late in document')
528 attrdict, namespace, k = self.parse_attributes(name, k, j)
530 self.syntax_error(
'namespace declaration inside namespace declaration')
531 for attrname
in attrdict.keys():
532 if not self.__xml_namespace_attributes.has_key(attrname):
533 self.syntax_error(
"unknown attribute `%s' in xml:namespace tag" % attrname)
534 if not attrdict.has_key(
'ns')
or not attrdict.has_key(
'prefix'):
535 self.syntax_error(
'xml:namespace without required attributes')
536 prefix = attrdict.get(
'prefix')
537 if ncname.match(prefix)
is None:
538 self.syntax_error(
'xml:namespace illegal prefix value')
540 if self.__namespaces.has_key(prefix):
541 self.syntax_error(
'xml:namespace prefix not unique')
542 self.__namespaces[prefix] = attrdict[
'ns']
544 if name.lower() ==
'xml':
545 self.syntax_error(
'illegal processing instruction target name')
546 self.handle_proc(name, rawdata[k:j])
550 def parse_attributes(self, tag, i, j):
551 rawdata = self.rawdata
555 res = attrfind.match(rawdata, i)
558 attrname, attrvalue = res.group(
'name',
'value')
560 attrname = attrname.lower()
562 if attrvalue
is None:
563 self.syntax_error(
"no value specified for attribute `%s'" % attrname)
565 elif attrvalue[:1] ==
"'" == attrvalue[-1:]
or \
566 attrvalue[:1] ==
'"' == attrvalue[-1:]:
567 attrvalue = attrvalue[1:-1]
568 elif not self.__accept_unquoted_attributes:
569 self.syntax_error(
"attribute `%s' value not quoted" % attrname)
570 res = xmlns.match(attrname)
573 ncname = res.group(
'ncname')
574 namespace[ncname
or ''] = attrvalue
or None
575 if not self.__use_namespaces:
576 self.__use_namespaces = len(self.stack)+1
579 self.syntax_error(
"`<' illegal in attribute value")
580 if attrdict.has_key(attrname):
581 self.syntax_error(
"attribute `%s' specified twice" % attrname)
582 attrvalue = attrvalue.translate(attrtrans)
583 attrdict[attrname] = self.translate_references(attrvalue)
584 return attrdict, namespace, i
587 def parse_starttag(self, i):
588 rawdata = self.rawdata
590 end = endbracketfind.match(rawdata, i+1)
593 tag = starttagmatch.match(rawdata, i)
594 if tag
is None or tag.end(0) != end.end(0):
595 self.syntax_error(
'garbage in starttag')
597 nstag = tagname = tag.group(
'tagname')
599 nstag = tagname = nstag.lower()
600 if not self.__seen_starttag
and self.__seen_doctype
and \
601 tagname != self.__seen_doctype:
602 self.syntax_error(
'starttag does not match DOCTYPE')
603 if self.__seen_starttag
and not self.stack:
604 self.syntax_error(
'multiple elements on top level')
605 k, j = tag.span(
'attrs')
606 attrdict, nsdict, k = self.parse_attributes(tagname, k, j)
607 self.stack.append((tagname, nsdict, nstag))
608 if self.__use_namespaces:
609 res = qname.match(tagname)
613 prefix, nstag = res.group(
'prefix',
'local')
617 for t, d, nst
in self.stack:
618 if d.has_key(prefix):
620 if ns
is None and prefix !=
'':
621 ns = self.__namespaces.get(prefix)
623 nstag = ns +
' ' + nstag
625 nstag = prefix +
':' + nstag
626 self.stack[-1] = tagname, nsdict, nstag
629 for key
in attrdict.keys():
630 attrnamemap[key] = key
631 if self.__use_namespaces:
633 for key, val
in attrdict.items():
635 res = qname.match(key)
637 aprefix, key = res.group(
'prefix',
'local')
643 for t, d, nst
in self.stack:
644 if d.has_key(aprefix):
646 if ans
is None and aprefix !=
'':
647 ans = self.__namespaces.get(aprefix)
649 key = ans +
' ' + key
651 key = aprefix +
':' + key
655 attrnamemap[key] = okey
657 attributes = self.attributes.get(nstag)
658 if attributes
is not None:
659 for key
in attrdict.keys():
660 if not attributes.has_key(key):
661 self.syntax_error(
"unknown attribute `%s' in tag `%s'" % (attrnamemap[key], tagname))
662 for key, val
in attributes.items():
663 if val
is not None and not attrdict.has_key(key):
665 method = self.elements.get(nstag, (
None,
None))[0]
666 self.finish_starttag(nstag, attrdict, method)
667 if tag.group(
'slash') ==
'/':
668 self.finish_endtag(tagname)
672 def parse_endtag(self, i):
673 rawdata = self.rawdata
674 end = endbracketfind.match(rawdata, i+1)
677 res = tagfind.match(rawdata, i+2)
680 self.handle_data(rawdata[i])
682 if not self.__accept_missing_endtag_name:
683 self.syntax_error(
'no name specified in end tag')
684 tag = self.stack[-1][0]
691 if not self.stack
or tag != self.stack[-1][0]:
692 self.handle_data(rawdata[i])
695 if endbracket.match(rawdata, k)
is None:
696 self.syntax_error(
'garbage in end tag')
697 self.finish_endtag(tag)
701 def finish_starttag(self, tagname, attrdict, method):
702 if method
is not None:
703 self.handle_starttag(tagname, method, attrdict)
705 self.unknown_starttag(tagname, attrdict)
708 def finish_endtag(self, tag):
711 self.syntax_error(
'name-less end tag')
712 found = len(self.stack) - 1
714 self.unknown_endtag(tag)
718 for i
in range(len(self.stack)):
719 if tag == self.stack[i][0]:
722 self.syntax_error(
'unopened end tag')
724 while len(self.stack) > found:
725 if found < len(self.stack) - 1:
726 self.syntax_error(
'missing close tag for %s' % self.stack[-1][2])
727 nstag = self.stack[-1][2]
728 method = self.elements.get(nstag, (
None,
None))[1]
729 if method
is not None:
730 self.handle_endtag(nstag, method)
732 self.unknown_endtag(nstag)
733 if self.__use_namespaces == len(self.stack):
734 self.__use_namespaces = 0
738 def handle_xml(self, encoding, standalone):
742 def handle_doctype(self, tag, pubid, syslit, data):
746 def handle_starttag(self, tag, method, attrs):
750 def handle_endtag(self, tag, method):
754 def handle_charref(self, name):
757 n = int(name[1:], 16)
761 self.unknown_charref(name)
763 if not 0 <= n <= 255:
764 self.unknown_charref(name)
766 self.handle_data(chr(n))
769 entitydefs = {
'lt':
'<',
777 def handle_data(self, data):
781 def handle_cdata(self, data):
785 def handle_comment(self, data):
789 def handle_proc(self, name, data):
793 def syntax_error(self, message):
794 raise Error(
'Syntax error at line %d: %s' % (self.lineno, message))
797 def unknown_starttag(self, tag, attrs):
pass
798 def unknown_endtag(self, tag):
pass
799 def unknown_charref(self, ref):
pass
800 def unknown_entityref(self, name):
801 self.syntax_error(
"reference to unknown entity `&%s;'" % name)
804 class TestXMLParser(XMLParser):
808 apply(XMLParser.__init__, (self,), kw)
810 def handle_xml(self, encoding, standalone):
812 print 'xml: encoding =',encoding,
'standalone =',standalone
814 def handle_doctype(self, tag, pubid, syslit, data):
816 print 'DOCTYPE:',tag, `data`
818 def handle_data(self, data):
819 self.testdata = self.testdata + data
820 if len(`self.testdata`) >= 70:
827 print 'data:', `data`
829 def handle_cdata(self, data):
831 print 'cdata:', `data`
833 def handle_proc(self, name, data):
835 print 'processing:',name,`data`
837 def handle_comment(self, data):
841 r = r[:32] +
'...' + r[-32:]
844 def syntax_error(self, message):
845 print 'error at line %d:' % self.lineno, message
847 def unknown_starttag(self, tag, attrs):
850 print 'start tag: <' + tag +
'>'
852 print 'start tag: <' + tag,
853 for name, value
in attrs.items():
854 print name +
'=' +
'"' + value +
'"',
857 def unknown_endtag(self, tag):
859 print 'end tag: </' + tag +
'>'
861 def unknown_entityref(self, ref):
863 print '*** unknown entity ref: &' + ref +
';'
865 def unknown_charref(self, ref):
867 print '*** unknown char ref: &#' + ref +
';'
870 XMLParser.close(self)
873 def test(args = None):
875 from time
import time
881 klass = TestXMLParser
904 if f
is not sys.stdin:
921 print 'total time: %g' % (t1-t0)
925 print 'total time: %g' % (t1-t0)
928 if __name__ ==
'__main__':