Vega Strike Python Modules doc 0.5.1
Documentation of the "Modules" folder of Vega Strike
 All Data Structures Namespaces Files Functions Variables
xmllib.py
Go to the documentation of this file.
1 """A parser for XML, using the derived class as static DTD."""
2 
3 # Author: Sjoerd Mullender.
4 
5 import re
6 import string
7 
8 
# Version string of this parser module.
version = '0.3'


class Error(RuntimeError):
    """Exception raised by the parser for XML it cannot process."""
13 
14 # Regular expressions used for parsing
15 
# NOTE: pieces that contain regex-level backslash escapes are written as raw
# strings (or with doubled backslashes) so that the text handed to
# re.compile() is unchanged while the Python literals themselves contain no
# invalid escape sequences.

_S = '[ \t\r\n]+'                       # white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string
illegal = re.compile('[^\t\r\n -\176\240-\377]')  # illegal chars in content
interesting = re.compile('[]&<]')

amp = re.compile('&')
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
space = re.compile(_S + '$')
newline = re.compile('\n')

attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>' + _QStr + r'|[-a-zA-Z0-9.:+*%?!\(\)_#=~]+))?')
starttagopen = re.compile('<' + _Name)
starttagend = re.compile(_opS + '(?P<slash>/?)>')
starttagmatch = re.compile('<(?P<tagname>' + _Name + ')'
                           '(?P<attrs>(?:' + attrfind.pattern + ')*)' +
                           starttagend.pattern)
endtagopen = re.compile('</')
endbracket = re.compile(_opS + '>')
endbracketfind = re.compile('(?:[^>\'"]|' + _QStr + ')*>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
# this matches one of the following:
# SYSTEM SystemLiteral
# PUBLIC PubidLiteral SystemLiteral
_SystemLiteral = '(?P<%s>' + _QStr + ')'
# '%%' is a literal percent sign: the string is %-formatted with the group
# name before it is compiled.
_PublicLiteral = '(?P<%s>"[-\'\\(\\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                 "'[-\\(\\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId = '(?:SYSTEM|' \
              'PUBLIC' + _S + _PublicLiteral % 'pubid' + \
              ')' + _S + _SystemLiteral % 'syslit'
doctype = re.compile('<!DOCTYPE' + _S + '(?P<name>' + _Name + ')'
                     '(?:' + _S + _ExternalId + ')?' + _opS)
xmldecl = re.compile(r'<\?xml' + _S +
                     'version' + _opS + '=' + _opS +
                     '(?P<version>' + _QStr + ')' +
                     '(?:' + _S + 'encoding' + _opS + '=' + _opS +
                     "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                     '"[A-Za-z][-A-Za-z0-9._]*"))?'
                     '(?:' + _S + 'standalone' + _opS + '=' + _opS +
                     '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?' +
                     _opS + r'\?>')
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(_opS + r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')

# Translation table mapping each XML white-space character to a plain space;
# used to normalise attribute values.  Both arguments must have the same
# length (four characters each).  string.maketrans only exists on Python 2,
# str.maketrans only on Python 3; both results are accepted by str.translate.
try:
    attrtrans = string.maketrans(' \r\n\t', '    ')
except AttributeError:
    attrtrans = str.maketrans(' \r\n\t', '    ')

# definitions for XML namespaces
_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*'    # XML Name, minus the ":"
ncname = re.compile(_NCName + '$')
qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?'  # optional prefix
                   '(?P<local>' + _NCName + ')$')

xmlns = re.compile('xmlns(?::(?P<ncname>' + _NCName + '))?$')
78 
79 # XML parser base class -- find tags and call handler functions.
80 # Usage: p = XMLParser(); p.feed(data); ...; p.close().
81 # The dtd is defined by deriving a class which defines methods with
82 # special names to handle tags: start_foo and end_foo to handle <foo>
83 # and </foo>, respectively. The data between tags is passed to the
84 # parser by calling self.handle_data() with some data as argument (the
85 # data may be split up in arbitrary chunks).
86 
class XMLParser:
    """Incremental XML parser that dispatches to handler methods.

    Feed text with feed() and finish with close().  For every start tag
    <foo> the parser looks up a handler in self.elements (filled in from
    methods named start_foo / end_foo when the subclass does not override
    `elements`) and calls it; tags without a handler go to
    unknown_starttag() / unknown_endtag().  Character data is passed to
    handle_data(), possibly split into several chunks.

    Keyword options accepted by the constructor:
    accept_unquoted_attributes, accept_missing_endtag_name, map_case,
    accept_utf8, translate_attribute_references.
    """

    attributes = {}                     # default, to be overridden
    elements = {}                       # default, to be overridden

    # parsing options, settable using keyword args in __init__
    __accept_unquoted_attributes = 0
    __accept_missing_endtag_name = 0
    __map_case = 0
    __accept_utf8 = 0
    __translate_attribute_references = 1

    # Interface -- initialize and reset this instance
    def __init__(self, **kw):
        """Store the recognised keyword options and reset the parser.

        Unrecognised keywords are silently ignored.
        """
        self.__fixed = 0
        if 'accept_unquoted_attributes' in kw:
            self.__accept_unquoted_attributes = kw['accept_unquoted_attributes']
        if 'accept_missing_endtag_name' in kw:
            self.__accept_missing_endtag_name = kw['accept_missing_endtag_name']
        if 'map_case' in kw:
            self.__map_case = kw['map_case']
        if 'accept_utf8' in kw:
            self.__accept_utf8 = kw['accept_utf8']
        if 'translate_attribute_references' in kw:
            self.__translate_attribute_references = kw['translate_attribute_references']
        self.reset()

    def __fixelements(self):
        """Build self.elements from start_*/end_* names on the instance and
        on its class hierarchy."""
        self.__fixed = 1
        self.elements = {}
        self.__fixdict(self.__dict__)
        self.__fixclass(self.__class__)

    def __fixclass(self, kl):
        """Register the handlers of class `kl`, then those of its bases."""
        self.__fixdict(kl.__dict__)
        for base in kl.__bases__:
            self.__fixclass(base)

    def __fixdict(self, mapping):
        """Register every start_<tag>/end_<tag> key of `mapping` in
        self.elements; entries that are already set are left alone."""
        # Iterate over a snapshot of the keys so the mapping is never
        # walked while getattr() is being evaluated.
        for key in list(mapping.keys()):
            if key[:6] == 'start_':
                tag = key[6:]
                start, end = self.elements.get(tag, (None, None))
                if start is None:
                    self.elements[tag] = getattr(self, key), end
            elif key[:4] == 'end_':
                tag = key[4:]
                start, end = self.elements.get(tag, (None, None))
                if end is None:
                    self.elements[tag] = start, getattr(self, key)

    # Interface -- reset this instance.  Loses all unprocessed data
    def reset(self):
        """Discard buffered data and return to the start-of-document state."""
        self.rawdata = ''
        self.stack = []                 # (tagname, namespace dict, nstag)
        self.nomoretags = 0
        self.literal = 0
        self.lineno = 1
        self.__at_start = 1
        self.__seen_doctype = None
        self.__seen_starttag = 0
        self.__use_namespaces = 0
        self.__namespaces = {'xml': None}  # xml is implicitly declared
        # backward compatibility hack: if elements not overridden,
        # fill it in ourselves
        if self.elements is XMLParser.elements:
            self.__fixelements()

    # For derived classes only -- enter literal mode (CDATA) till EOF
    def setnomoretags(self):
        """Treat all remaining input as literal character data."""
        self.nomoretags = self.literal = 1

    # For derived classes only -- enter literal mode (CDATA)
    def setliteral(self, *args):
        """Switch to literal mode; finish_endtag() switches it off again."""
        self.literal = 1

    # Interface -- feed some data to the parser.  Call this as
    # often as you want, with as little or as much text as you
    # want (may include '\n').  (This just saves the text, all the
    # processing is done by goahead().)
    def feed(self, data):
        """Append `data` to the buffer and parse as much as possible."""
        self.rawdata = self.rawdata + data
        self.goahead(0)

    # Interface -- handle the remaining data
    def close(self):
        """Parse whatever is left in the buffer as if EOF followed it."""
        self.goahead(1)
        if self.__fixed:
            self.__fixed = 0
            # remove self.elements so that we don't leak
            del self.elements

    # Interface -- translate references
    def translate_references(self, data, all = 1):
        """Return `data` with character (and, if `all`, entity) references
        replaced.

        Returns `data` untouched when the translate_attribute_references
        option is off.  Replacement text taken from self.entitydefs is
        scanned again for further references.
        """
        if not self.__translate_attribute_references:
            return data
        i = 0
        while 1:
            res = amp.search(data, i)
            if res is None:
                return data
            s = res.start(0)
            res = ref.match(data, s)
            if res is None:
                self.syntax_error("bogus `&'")
                i = s+1
                continue
            i = res.end(0)
            text = res.group(1)
            rescan = 0
            if text[0] == '#':
                if text[1] == 'x':
                    text = chr(int(text[2:], 16))
                else:
                    text = chr(int(text[1:]))
                if data[i - 1] != ';':
                    self.syntax_error("`;' missing after char reference")
                    i = i-1
            elif all:
                if text in self.entitydefs:
                    text = self.entitydefs[text]
                    rescan = 1
                elif data[i - 1] != ';':
                    self.syntax_error("bogus `&'")
                    i = s + 1           # just past the &
                    continue
                else:
                    self.syntax_error("reference to unknown entity `&%s;'" % text)
                    text = '&' + text + ';'
            elif data[i - 1] != ';':
                self.syntax_error("bogus `&'")
                i = s + 1               # just past the &
                continue
            # NOTE(review): with all false, "&name;" falls through here and
            # is replaced by the bare name -- confirm that this is intended.

            # when we get here, text contains the translated text and i
            # points to the end of the string that is to be replaced
            data = data[:s] + text + data[i:]
            if rescan:
                i = s
            else:
                i = s + len(text)

    # Interface - return a dictionary of all namespaces currently valid
    def getnamespace(self):
        """Merge the namespace dicts of all open elements, innermost last."""
        nsdict = {}
        for entry in self.stack:
            nsdict.update(entry[1])
        return nsdict

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Parse the buffered data; see the comment above for `end`."""
        # Each pass that has to give up on one stray character at EOF asks
        # for another pass.  Looping here (instead of recursing once per
        # stray character) keeps the stack depth constant on badly
        # malformed input.
        while self.__process(end):
            pass

    # Internal -- one parsing pass over self.rawdata.  Returns true when
    # a further pass is needed.
    def __process(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if i > 0:
                self.__at_start = 0
            if self.nomoretags:
                data = rawdata[i:n]
                self.handle_data(data)
                self.lineno = self.lineno + data.count('\n')
                i = n
                break
            res = interesting.search(rawdata, i)
            if res:
                j = res.start(0)
            else:
                j = n
            if i < j:
                # plain character data up to the next markup character
                data = rawdata[i:j]
                if self.__at_start and space.match(data) is None:
                    self.syntax_error('illegal data at start of file')
                self.__at_start = 0
                if not self.stack and space.match(data) is None:
                    self.syntax_error('data not in content')
                if not self.__accept_utf8 and illegal.search(data):
                    self.syntax_error('illegal character in content')
                self.handle_data(data)
                self.lineno = self.lineno + data.count('\n')
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + data.count('\n')
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    self.__seen_starttag = 1
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                if endtagopen.match(rawdata, i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                if commentopen.match(rawdata, i):
                    if self.literal:
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + data.count('\n')
                        i = i+1
                        continue
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                if cdataopen.match(rawdata, i):
                    k = self.parse_cdata(i)
                    if k < 0:
                        break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                res = xmldecl.match(rawdata, i)
                if res:
                    if not self.__at_start:
                        self.syntax_error("<?xml?> declaration not at start of document")
                    version, encoding, standalone = res.group('version',
                                                              'encoding',
                                                              'standalone')
                    if version[1:-1] != '1.0':
                        raise Error('only XML version 1.0 supported')
                    if encoding:
                        encoding = encoding[1:-1]
                    if standalone:
                        standalone = standalone[1:-1]
                    self.handle_xml(encoding, standalone)
                    i = res.end(0)
                    continue
                res = procopen.match(rawdata, i)
                if res:
                    k = self.parse_proc(i)
                    if k < 0:
                        break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                res = doctype.match(rawdata, i)
                if res:
                    if self.literal:
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + data.count('\n')
                        i = i+1
                        continue
                    if self.__seen_doctype:
                        self.syntax_error('multiple DOCTYPE elements')
                    if self.__seen_starttag:
                        self.syntax_error('DOCTYPE not at beginning of document')
                    k = self.parse_doctype(res)
                    if k < 0:
                        break
                    self.__seen_doctype = res.group('name')
                    if self.__map_case:
                        self.__seen_doctype = self.__seen_doctype.lower()
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    data = rawdata[i]
                    self.handle_data(data)
                    i = i+1
                    continue
                res = charref.match(rawdata, i)
                if res is not None:
                    i = res.end(0)
                    if rawdata[i-1] != ';':
                        self.syntax_error("`;' missing in charref")
                        i = i-1
                    if not self.stack:
                        self.syntax_error('data not in content')
                    self.handle_charref(res.group('char')[:-1])
                    self.lineno = self.lineno + res.group(0).count('\n')
                    continue
                res = entityref.match(rawdata, i)
                if res is not None:
                    i = res.end(0)
                    if rawdata[i-1] != ';':
                        self.syntax_error("`;' missing in entityref")
                        i = i-1
                    name = res.group('name')
                    if self.__map_case:
                        name = name.lower()
                    if name in self.entitydefs:
                        # splice the replacement text into the buffer and
                        # parse it in place of the reference
                        self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
                        n = len(rawdata)
                        i = res.start(0)
                    else:
                        self.unknown_entityref(name)
                    self.lineno = self.lineno + res.group(0).count('\n')
                    continue
            elif rawdata[i] == ']':
                if self.literal:
                    data = rawdata[i]
                    self.handle_data(data)
                    i = i+1
                    continue
                if n-i < 3:
                    break
                if cdataclose.match(rawdata, i):
                    self.syntax_error("bogus `]]>'")
                self.handle_data(rawdata[i])
                i = i+1
                continue
            else:
                raise Error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            break
        # end while
        if i > 0:
            self.__at_start = 0
        if end and i < n:
            # At EOF an incomplete construct can never be completed: emit
            # its first character as data and ask for another pass.
            data = rawdata[i]
            self.syntax_error("bogus `%s'" % data)
            if not self.__accept_utf8 and illegal.search(data):
                self.syntax_error('illegal character in content')
            self.handle_data(data)
            self.lineno = self.lineno + data.count('\n')
            self.rawdata = rawdata[i+1:]
            return 1
        self.rawdata = rawdata[i:]
        if end:
            if not self.__seen_starttag:
                self.syntax_error('no elements in file')
            if self.stack:
                self.syntax_error('missing end tags')
                while self.stack:
                    self.finish_endtag(self.stack[-1][0])
        return 0

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i):
        """Handle the comment starting at index `i`.

        Returns the index just past `-->`, or -1 if the comment is not yet
        complete.  Raises Error if `i` does not point at `<!--`.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            raise Error('unexpected call to parse_comment')
        res = commentclose.search(rawdata, i+4)
        if res is None:
            return -1
        close = res.start(0)
        if doubledash.search(rawdata, i+4, close):
            self.syntax_error("`--' inside comment")
        # Only look at the character before `-->' when the comment has a
        # body; for the empty comment `<!---->' that character belongs to
        # the opening `<!--'.
        if close > i+4 and rawdata[close-1] == '-':
            self.syntax_error('comment cannot end in three dashes')
        if not self.__accept_utf8 and \
           illegal.search(rawdata, i+4, close):
            self.syntax_error('illegal character in comment')
        self.handle_comment(rawdata[i+4:close])
        return res.end(0)

    # Internal -- handle DOCTYPE tag, return length or -1 if not terminated
    def parse_doctype(self, res):
        """Handle the DOCTYPE declaration matched by `res`.

        `res` is a match object of the module-level `doctype` pattern.
        Returns the index just past the closing `>`, or -1 if the
        declaration is not yet complete.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        name = res.group('name')
        if self.__map_case:
            name = name.lower()
        pubid, syslit = res.group('pubid', 'syslit')
        if pubid is not None:
            pubid = pubid[1:-1]         # remove quotes
            pubid = ' '.join(pubid.split())  # normalize
        if syslit is not None:
            syslit = syslit[1:-1]       # remove quotes
        j = k = res.end(0)
        if k >= n:
            return -1
        if rawdata[k] == '[':
            # internal subset: scan for the matching `]', skipping quoted
            # strings and nested <...> declarations
            level = 0
            k = k+1
            dq = sq = 0
            while k < n:
                c = rawdata[k]
                if not sq and c == '"':
                    dq = not dq
                elif not dq and c == "'":
                    sq = not sq
                elif sq or dq:
                    pass
                elif level <= 0 and c == ']':
                    res = endbracket.match(rawdata, k+1)
                    if res is None:
                        return -1
                    self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])
                    return res.end(0)
                elif c == '<':
                    level = level + 1
                elif c == '>':
                    level = level - 1
                    if level < 0:
                        self.syntax_error("bogus `>' in DOCTYPE")
                k = k+1
        res = endbracketfind.match(rawdata, k)
        if res is None:
            return -1
        if endbracket.match(rawdata, k) is None:
            self.syntax_error('garbage in DOCTYPE')
        self.handle_doctype(name, pubid, syslit, None)
        return res.end(0)

    # Internal -- handle CDATA tag, return length or -1 if not terminated
    def parse_cdata(self, i):
        """Handle the CDATA section starting at index `i`.

        Returns the index just past `]]>`, or -1 if the section is not yet
        complete.  Raises Error if `i` does not point at `<![CDATA[`.
        """
        rawdata = self.rawdata
        if rawdata[i:i+9] != '<![CDATA[':
            raise Error('unexpected call to parse_cdata')
        res = cdataclose.search(rawdata, i+9)
        if res is None:
            return -1
        if not self.__accept_utf8 and \
           illegal.search(rawdata, i+9, res.start(0)):
            self.syntax_error('illegal character in CDATA')
        if not self.stack:
            self.syntax_error('CDATA not in content')
        self.handle_cdata(rawdata[i+9:res.start(0)])
        return res.end(0)

    # attribute names allowed in an old-style <?xml:namespace ...?> tag
    __xml_namespace_attributes = {'ns': None, 'src': None, 'prefix': None}

    # Internal -- handle a processing instruction tag
    def parse_proc(self, i):
        """Handle the processing instruction starting at index `i`.

        Returns the index just past `?>`, or -1 if the instruction is not
        yet complete.  An `xml:namespace` instruction registers a
        prefix -> namespace mapping; anything else goes to handle_proc().
        """
        rawdata = self.rawdata
        end = procclose.search(rawdata, i)
        if end is None:
            return -1
        j = end.start(0)
        if not self.__accept_utf8 and illegal.search(rawdata, i+2, j):
            self.syntax_error('illegal character in processing instruction')
        res = tagfind.match(rawdata, i+2)
        if res is None:
            raise Error('unexpected call to parse_proc')
        k = res.end(0)
        name = res.group(0)
        if self.__map_case:
            name = name.lower()
        if name == 'xml:namespace':
            self.syntax_error('old-fashioned namespace declaration')
            self.__use_namespaces = -1
            # namespace declaration
            # this must come after the <?xml?> declaration (if any)
            # and before the <!DOCTYPE> (if any).
            if self.__seen_doctype or self.__seen_starttag:
                self.syntax_error('xml:namespace declaration too late in document')
            attrdict, namespace, k = self.parse_attributes(name, k, j)
            if namespace:
                self.syntax_error('namespace declaration inside namespace declaration')
            for attrname in attrdict.keys():
                if attrname not in self.__xml_namespace_attributes:
                    self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname)
            if 'ns' not in attrdict or 'prefix' not in attrdict:
                self.syntax_error('xml:namespace without required attributes')
                # syntax_error() may be overridden so that it returns;
                # without both attributes there is nothing to register.
                return end.end(0)
            prefix = attrdict.get('prefix')
            if ncname.match(prefix) is None:
                self.syntax_error('xml:namespace illegal prefix value')
                return end.end(0)
            if prefix in self.__namespaces:
                self.syntax_error('xml:namespace prefix not unique')
            self.__namespaces[prefix] = attrdict['ns']
        else:
            if name.lower() == 'xml':
                self.syntax_error('illegal processing instruction target name')
            self.handle_proc(name, rawdata[k:j])
        return end.end(0)

    # Internal -- parse attributes between i and j
    def parse_attributes(self, tag, i, j):
        """Parse the attributes found in self.rawdata[i:j].

        Returns (attrdict, namespace, index): the ordinary attributes, the
        namespace declarations (`xmlns` / `xmlns:prefix` attributes, keyed
        by prefix with '' for the default), and the index where scanning
        stopped.  `tag` is not used by this implementation.
        """
        rawdata = self.rawdata
        attrdict = {}
        namespace = {}
        while i < j:
            res = attrfind.match(rawdata, i)
            if res is None:
                break
            attrname, attrvalue = res.group('name', 'value')
            if self.__map_case:
                attrname = attrname.lower()
            i = res.end(0)
            if attrvalue is None:
                self.syntax_error("no value specified for attribute `%s'" % attrname)
                attrvalue = attrname
            elif attrvalue[:1] == "'" == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            elif not self.__accept_unquoted_attributes:
                self.syntax_error("attribute `%s' value not quoted" % attrname)
            res = xmlns.match(attrname)
            if res is not None:
                # namespace declaration
                nsprefix = res.group('ncname')
                namespace[nsprefix or ''] = attrvalue or None
                if not self.__use_namespaces:
                    self.__use_namespaces = len(self.stack)+1
                continue
            if '<' in attrvalue:
                self.syntax_error("`<' illegal in attribute value")
            if attrname in attrdict:
                self.syntax_error("attribute `%s' specified twice" % attrname)
            attrvalue = attrvalue.translate(attrtrans)
            attrdict[attrname] = self.translate_references(attrvalue)
        return attrdict, namespace, i

    # Internal -- handle starttag, return length or -1 if not terminated
    def parse_starttag(self, i):
        """Handle the start tag beginning at index `i`.

        Returns the index just past the closing `>`, or -1 if the tag is
        not yet complete.  Pushes the element on self.stack, resolves
        namespace prefixes when namespaces are in use, applies the
        defaults from self.attributes and calls finish_starttag() (and
        finish_endtag() for an empty-element tag).
        """
        rawdata = self.rawdata
        # i points to start of tag
        end = endbracketfind.match(rawdata, i+1)
        if end is None:
            return -1
        tag = starttagmatch.match(rawdata, i)
        if tag is None or tag.end(0) != end.end(0):
            self.syntax_error('garbage in starttag')
            return end.end(0)
        nstag = tagname = tag.group('tagname')
        if self.__map_case:
            nstag = tagname = nstag.lower()
        if not self.__seen_starttag and self.__seen_doctype and \
           tagname != self.__seen_doctype:
            self.syntax_error('starttag does not match DOCTYPE')
        if self.__seen_starttag and not self.stack:
            self.syntax_error('multiple elements on top level')
        k, j = tag.span('attrs')
        attrdict, nsdict, k = self.parse_attributes(tagname, k, j)
        self.stack.append((tagname, nsdict, nstag))
        if self.__use_namespaces:
            res = qname.match(tagname)
        else:
            res = None
        # Namespace of the element itself.  It must be bound even when the
        # tag name does not match qname, because the attribute loop below
        # reads it.
        ns = None
        if res is not None:
            prefix, nstag = res.group('prefix', 'local')
            if prefix is None:
                prefix = ''
            for entry in self.stack:
                scope = entry[1]
                if prefix in scope:
                    ns = scope[prefix]  # innermost declaration wins
            if ns is None and prefix != '':
                ns = self.__namespaces.get(prefix)
            if ns is not None:
                nstag = ns + ' ' + nstag
            elif prefix != '':
                nstag = prefix + ':' + nstag  # undo split
            self.stack[-1] = tagname, nsdict, nstag
        # translate namespace of attributes
        attrnamemap = {}  # map from new name to old name (used for error reporting)
        for key in attrdict.keys():
            attrnamemap[key] = key
        if self.__use_namespaces:
            nattrdict = {}
            for key, val in attrdict.items():
                okey = key
                res = qname.match(key)
                if res is not None:
                    aprefix, key = res.group('prefix', 'local')
                    if self.__map_case:
                        key = key.lower()
                    if aprefix is None:
                        aprefix = ''
                    ans = None
                    for entry in self.stack:
                        scope = entry[1]
                        if aprefix in scope:
                            ans = scope[aprefix]
                    if ans is None and aprefix != '':
                        ans = self.__namespaces.get(aprefix)
                    if ans is not None:
                        key = ans + ' ' + key
                    elif aprefix != '':
                        key = aprefix + ':' + key
                    elif ns is not None:
                        key = ns + ' ' + key
                nattrdict[key] = val
                attrnamemap[key] = okey
            attrdict = nattrdict
        attributes = self.attributes.get(nstag)
        if attributes is not None:
            for key in attrdict.keys():
                if key not in attributes:
                    self.syntax_error("unknown attribute `%s' in tag `%s'" % (attrnamemap[key], tagname))
            for key, val in attributes.items():
                if val is not None and key not in attrdict:
                    attrdict[key] = val
        method = self.elements.get(nstag, (None, None))[0]
        self.finish_starttag(nstag, attrdict, method)
        if tag.group('slash') == '/':
            self.finish_endtag(tagname)
        return tag.end(0)

    # Internal -- parse endtag
    def parse_endtag(self, i):
        """Handle the end tag beginning at index `i`.

        Returns the index just past the closing `>`, -1 if the tag is not
        yet complete, or i+1 when the `<` was passed on as literal data.
        """
        rawdata = self.rawdata
        end = endbracketfind.match(rawdata, i+1)
        if end is None:
            return -1
        res = tagfind.match(rawdata, i+2)
        if res is None:
            if self.literal:
                self.handle_data(rawdata[i])
                return i+1
            if not self.__accept_missing_endtag_name:
                self.syntax_error('no name specified in end tag')
            # A name-less end tag closes the innermost open element.  With
            # nothing open, pass an empty name so that finish_endtag()
            # reports it instead of indexing an empty stack here.
            if self.stack:
                tag = self.stack[-1][0]
            else:
                tag = ''
            k = i+2
        else:
            tag = res.group(0)
            if self.__map_case:
                tag = tag.lower()
            if self.literal:
                if not self.stack or tag != self.stack[-1][0]:
                    self.handle_data(rawdata[i])
                    return i+1
            k = res.end(0)
        if endbracket.match(rawdata, k) is None:
            self.syntax_error('garbage in end tag')
        self.finish_endtag(tag)
        return end.end(0)

    # Internal -- finish processing of start tag
    def finish_starttag(self, tagname, attrdict, method):
        """Dispatch a start tag to its handler, or to unknown_starttag()."""
        if method is not None:
            self.handle_starttag(tagname, method, attrdict)
        else:
            self.unknown_starttag(tagname, attrdict)

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
        """Close element `tag` together with everything opened inside it.

        Leaves literal mode.  An empty `tag` closes the innermost open
        element.  Elements that are closed implicitly are reported through
        syntax_error().
        """
        self.literal = 0
        if not tag:
            self.syntax_error('name-less end tag')
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            # index of the innermost open element with this name
            found = -1
            for pos in range(len(self.stack)):
                if tag == self.stack[pos][0]:
                    found = pos
            if found == -1:
                self.syntax_error('unopened end tag')
                return
        while len(self.stack) > found:
            if found < len(self.stack) - 1:
                self.syntax_error('missing close tag for %s' % self.stack[-1][2])
            nstag = self.stack[-1][2]
            method = self.elements.get(nstag, (None, None))[1]
            if method is not None:
                self.handle_endtag(nstag, method)
            else:
                self.unknown_endtag(nstag)
            # namespaces were switched on by this element: switch them off
            if self.__use_namespaces == len(self.stack):
                self.__use_namespaces = 0
            del self.stack[-1]

    # Overridable -- handle xml processing instruction
    def handle_xml(self, encoding, standalone):
        pass

    # Overridable -- handle DOCTYPE
    def handle_doctype(self, tag, pubid, syslit, data):
        pass

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        method()

    # Example -- handle character reference, no need to override
    def handle_charref(self, name):
        """Pass the character for reference `name` to handle_data().

        `name` is the text between `&#` and `;` (decimal, or `x` followed
        by hex digits).  Values that cannot be parsed or lie outside
        0..255 go to unknown_charref().
        """
        try:
            if name[0] == 'x':
                n = int(name[1:], 16)
            else:
                n = int(name)
        except ValueError:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))

    # Definition of entities -- derived classes may override
    entitydefs = {'lt': '&#60;',        # must use charref
                  'gt': '&#62;',
                  'amp': '&#38;',       # must use charref
                  'quot': '&#34;',
                  'apos': '&#39;',
                  }

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle cdata, could be overridden
    def handle_cdata(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle processing instructions, could be overridden
    def handle_proc(self, name, data):
        pass

    # Example -- handle relatively harmless syntax errors, could be overridden
    def syntax_error(self, message):
        raise Error('Syntax error at line %d: %s' % (self.lineno, message))

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, name):
        self.syntax_error("reference to unknown entity `&%s;'" % name)
802 
803 
class TestXMLParser(XMLParser):
    """Parser that prints every event it receives to standard output.

    Character data is accumulated in self.testdata and written out by
    flush(), either when its repr() reaches 70 characters or just before
    any other event is reported.  syntax_error() prints the problem
    instead of raising, so parsing continues after errors.

    Every print call is given one pre-formatted string so that the output
    is the same whether `print` is a statement or a function.
    """

    def __init__(self, **kw):
        # testdata must exist before the base constructor runs
        self.testdata = ""
        XMLParser.__init__(self, **kw)

    def handle_xml(self, encoding, standalone):
        self.flush()
        print('xml: encoding = %s standalone = %s' % (encoding, standalone))

    def handle_doctype(self, tag, pubid, syslit, data):
        self.flush()
        print('DOCTYPE: %s %s' % (tag, repr(data)))

    def handle_data(self, data):
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the accumulated character data, if any."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: %s' % repr(data))

    def handle_cdata(self, data):
        self.flush()
        print('cdata: %s' % repr(data))

    def handle_proc(self, name, data):
        self.flush()
        print('processing: %s %s' % (name, repr(data)))

    def handle_comment(self, data):
        self.flush()
        r = repr(data)
        if len(r) > 68:
            # keep only the first and last 32 characters of long comments
            r = r[:32] + '...' + r[-32:]
        print('comment: %s' % r)

    def syntax_error(self, message):
        print('error at line %d: %s' % (self.lineno, message))

    def unknown_starttag(self, tag, attrs):
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            # one line: the tag, each attribute, then '>', space separated
            parts = ['start tag: <' + tag]
            for name, value in attrs.items():
                parts.append(name + '=' + '"' + value + '"')
            parts.append('>')
            print(' '.join(parts))

    def unknown_endtag(self, tag):
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def close(self):
        XMLParser.close(self)
        self.flush()
872 
def test(args = None):
    """Command-line driver: parse one XML file and report the result.

    `args` defaults to sys.argv[1:].  Options:
      -s  use the silent XMLParser base class instead of TestXMLParser
      -t  feed the whole file in one call and print the elapsed time
          (without -t the data is fed one character at a time)
    The optional positional argument is the file to parse ('-' for
    standard input); the default is 'test.xml'.  Exits with status 1 if
    the file cannot be opened or the parser raises Error.
    """
    import sys
    import getopt
    from time import time

    if not args:
        args = sys.argv[1:]

    opts, args = getopt.getopt(args, 'st')
    klass = TestXMLParser
    do_time = 0
    for o, a in opts:
        if o == '-s':
            klass = XMLParser
        elif o == '-t':
            do_time = 1

    if args:
        filename = args[0]
    else:
        filename = 'test.xml'

    if filename == '-':
        f = sys.stdin
    else:
        try:
            f = open(filename, 'r')
        except IOError as msg:
            print('%s : %s' % (filename, msg))
            sys.exit(1)

    # Close the file even when read() fails; stdin is left open.
    try:
        data = f.read()
    finally:
        if f is not sys.stdin:
            f.close()

    x = klass()
    t0 = time()
    try:
        if do_time:
            x.feed(data)
            x.close()
        else:
            for c in data:
                x.feed(c)
            x.close()
    except Error as msg:
        t1 = time()
        print(msg)
        if do_time:
            print('total time: %g' % (t1-t0))
        sys.exit(1)
    t1 = time()
    if do_time:
        print('total time: %g' % (t1-t0))


# Run the driver when the module is executed as a script.
if __name__ == '__main__':
    test()