Vega Strike Python Modules doc 0.5.1
Documentation of the "Modules" folder of Vega Strike
 All Data Structures Namespaces Files Functions Variables
sgmllib.py
Go to the documentation of this file.
1 """A parser for SGML, using the derived class as a static DTD."""
2 
3 # XXX This only supports those SGML features used by HTML.
4 
5 # XXX There should be a way to distinguish between PCDATA (parsed
6 # character data -- the normal case), RCDATA (replaceable character
7 # data -- only char and entity references and end tags are special)
8 # and CDATA (character data -- only end tags are special). RCDATA is
9 # not supported at all.
10 
11 
12 import markupbase
13 import re
14 
# Public API of this module.  SGMLParseError is exported as well so that
# "from sgmllib import *" users can catch the exception SGMLParser.error()
# raises.
__all__ = ["SGMLParser", "SGMLParseError"]
16 
17 # Regular expressions used for parsing
18 
# Next character that can start markup ('<') or a reference ('&').
interesting = re.compile(r'[&<]')

# A reference or tag that has been opened but may still be continued by
# input that has not arrived yet.
incomplete = re.compile(
    r'&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
    r'<([a-zA-Z][^<>]*|'
    r'/([a-zA-Z][^<>]*)?|'
    r'![^<>]*)?'
)

# Complete references; group 1 is the name / decimal number.  Each pattern
# also consumes the single character that terminates the reference.
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile(r'&#([0-9]+)[^0-9]')

# Start-tag recognisers, including the SGML short forms "<>" and
# "<tag/data/".
starttagopen = re.compile(r'<[>a-zA-Z]')
shorttagopen = re.compile(r'<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile(r'<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')

# Terminators for processing instructions, tags and comments.
piclose = re.compile(r'>')
endbracket = re.compile(r'[<>]')
commentclose = re.compile(r'--\s*>')

# Tag name, and one attribute: group 1 = name, group 2 = the optional
# "= value" part, group 3 = the value (still quoted if it was quoted).
tagfind = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*')
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~\'"]*))?'
)
38 
39 
class SGMLParseError(RuntimeError):
    """Signals a parse failure; this is what SGMLParser.error() raises."""
43 
44 
45 # SGML parser base class -- find tags and call handler functions.
46 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
47 # The dtd is defined by deriving a class which defines methods
48 # with special names to handle tags: start_foo and end_foo to handle
49 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
50 # (Tags are converted to lower case for this purpose.) The data
51 # between tags is passed to the parser by calling self.handle_data()
52 # with some data as argument (the data may be split up in arbitrary
53 # chunks). Entity references are passed by calling
54 # self.handle_entityref() with the entity reference as argument.
55 
class SGMLParser(markupbase.ParserBase):
    """Base class that finds SGML tags and dispatches to handler methods.

    Usage: p = SGMLParser(); p.feed(data); ...; p.close().

    A derived class acts as the DTD by defining start_foo()/end_foo()
    (for <foo> ... </foo>) or do_foo() (for a stand-alone <foo>).  Tag
    names are lower-cased before the lookup.  Text between tags goes to
    handle_data(), possibly split into arbitrary chunks.
    """

    def __init__(self, verbose=0):
        """Initialize and reset this instance."""
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.rawdata = ''       # input received but not yet consumed
        self.stack = []         # open tags that have a start_xxx handler
        self.lasttag = '???'    # tag name reused by the "<>" shorthand
        self.nomoretags = 0     # true: treat everything up to EOF as data
        self.literal = 0        # true: CDATA mode until the next end tag
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Any positional arguments are accepted and ignored.
        Intended for derived classes only.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (it may include newlines).  The text is appended to
        the buffer and goahead() does all the processing.
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError carrying *message*."""
        raise SGMLParseError(message)

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Consume as much of self.rawdata as can be parsed right now.

        Whatever could not be parsed yet (an unterminated tag, comment,
        reference, ...) stays in self.rawdata for the next call, unless
        *end* is true, in which case the remainder is reported as data.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                self.handle_data(rawdata[i:n])
                i = n
                break
            # Everything up to the next '<' or '&' is plain data.
            match = interesting.search(rawdata, i)
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # In literal mode a start tag is just text.
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    # Any end tag terminates literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    # parse_pi() returns a length, not an absolute index.
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    # parse_declaration() is not defined in this class;
                    # presumably inherited from markupbase.ParserBase.
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern swallowed one terminating character;
                    # give it back unless it was the ';'.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break  # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Internal -- parse comment
    def parse_comment(self, i, report=1):
        """Parse the comment starting at rawdata[i].

        Returns the index just past the closing "-->" (an absolute
        position in rawdata, not a length), or -1 if the comment is not
        terminated yet.  handle_comment() is called with the comment
        text unless *report* is false.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Extensions for the DOCTYPE scanner:
    _decl_otherchars = '='

    # Internal -- parse processing instr
    def parse_pi(self, i):
        """Parse the processing instruction starting at rawdata[i].

        Returns the LENGTH of the instruction including its closing '>'
        (the caller adds it to i), or -1 if it is not terminated yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    __starttag_text = None

    def get_starttag_text(self):
        """Return the source text recorded by the last parse_starttag()."""
        return self.__starttag_text

    # Internal -- handle starttag
    def parse_starttag(self, i):
        """Parse the start tag beginning at rawdata[i].

        Returns the index just past the tag (an absolute position in
        rawdata, not a length), or -1 if the tag is not terminated yet.
        """
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Bare attribute: its value is its own name.
                attrvalue = attrname
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip one pair of matching quotes.
                attrvalue = attrvalue[1:-1]
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        # A tag ended by '<' (the start of the next tag) does not
        # consume that character.
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    # Internal -- parse endtag
    def parse_endtag(self, i):
        """Parse the end tag beginning at rawdata[i].

        Returns the index just past the tag (absolute position), or -1
        if the tag is not terminated yet.
        """
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
    def finish_shorttag(self, tag, data):
        """Report <tag/data/ as a start tag, its data, and an end tag."""
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    # Internal -- finish processing of start tag
    def finish_starttag(self, tag, attrs):
        """Dispatch a start tag to start_<tag>, do_<tag> or unknown_starttag.

        Returns -1 for unknown tag, 0 for open-only tag (do_<tag>),
        1 for balanced tag (start_<tag>; the tag is pushed on the stack).
        """
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
        """Close *tag* and every tag opened after it.

        An empty *tag* (from "</>") closes the innermost open tag.  A tag
        that is not on the stack is reported through report_unbalanced()
        when an end_<tag> handler exists, else through unknown_endtag().
        """
        if not tag:
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                try:
                    getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # Find the LAST (innermost) occurrence of the tag.
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        """Invoke the start/do handler *method* with the attribute list."""
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        """Invoke the end handler *method*."""
        method()

    # Example -- report an unbalanced </...> tag.
    def report_unbalanced(self, tag):
        """Print a diagnostic for an unbalanced end tag when verbose."""
        if self.verbose:
            # Single-argument print calls give the same output as the
            # former print statements and also parse on Python 3.
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack: %s' % (self.stack,))

    def handle_charref(self, name):
        """Handle character reference, no need to override.

        References that are not a decimal number in 0..255 are passed to
        unknown_charref(); the others are reported as data via chr().
        """
        try:
            n = int(name)
        except ValueError:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))

    # Definition of entities -- derived classes may override
    entitydefs = \
        {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def handle_entityref(self, name):
        """Handle entity references.

        There should be no need to override this method; it can be
        tailored by setting up the self.entitydefs mapping appropriately.
        """
        table = self.entitydefs
        if name in table:
            self.handle_data(table[name])
        else:
            self.unknown_entityref(name)

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        """Receive a chunk of character data; does nothing by default."""
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        """Receive the text of a comment; does nothing by default."""
        pass

    # Example -- handle declaration, could be overridden
    def handle_decl(self, decl):
        """Receive a declaration; does nothing by default."""
        pass

    # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        """Receive a processing instruction; does nothing by default."""
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass

    def unknown_charref(self, ref):
        pass

    def unknown_entityref(self, ref):
        pass
426 
427 
429 
430  def __init__(self, verbose=0):
431  self.testdata = ""
432  SGMLParser.__init__(self, verbose)
433 
434  def handle_data(self, data):
435  self.testdata = self.testdata + data
436  if len(`self.testdata`) >= 70:
437  self.flush()
438 
439  def flush(self):
440  data = self.testdata
441  if data:
442  self.testdata = ""
443  print 'data:', `data`
444 
445  def handle_comment(self, data):
446  self.flush()
447  r = `data`
448  if len(r) > 68:
449  r = r[:32] + '...' + r[-32:]
450  print 'comment:', r
451 
452  def unknown_starttag(self, tag, attrs):
453  self.flush()
454  if not attrs:
455  print 'start tag: <' + tag + '>'
456  else:
457  print 'start tag: <' + tag,
458  for name, value in attrs:
459  print name + '=' + '"' + value + '"',
460  print '>'
461 
462  def unknown_endtag(self, tag):
463  self.flush()
464  print 'end tag: </' + tag + '>'
465 
466  def unknown_entityref(self, ref):
467  self.flush()
468  print '*** unknown entity ref: &' + ref + ';'
469 
470  def unknown_charref(self, ref):
471  self.flush()
472  print '*** unknown char ref: &#' + ref + ';'
473 
474  def close(self):
475  SGMLParser.close(self)
476  self.flush()
477 
478 
def test(args = None):
    """Command-line self test: parse a file one character at a time.

    *args* defaults to sys.argv[1:].  A leading '-s' selects the silent
    SGMLParser instead of TestSGMLParser.  The next argument is the file
    to parse ('test.html' if omitted, '-' for standard input).  If the
    file cannot be opened, the error is printed and the process exits
    with status 1.
    """
    import sys

    if not args:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    # Named 'path' rather than 'file' so the builtin is not shadowed.
    if args:
        path = args[0]
    else:
        path = 'test.html'

    if path == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(path, 'r')
        except IOError:
            # sys.exc_info() avoids the version-specific
            # "except IOError, msg" / "except IOError as msg" spellings.
            msg = sys.exc_info()[1]
            print('%s : %s' % (path, msg))
            sys.exit(1)
        # Close the file even if read() fails.
        try:
            data = f.read()
        finally:
            f.close()

    # Feeding one character per call exercises the parser's handling of
    # input split at arbitrary points.
    x = klass()
    for c in data:
        x.feed(c)
    x.close()


if __name__ == '__main__':
    test()