Vega strike Python Modules doc  0.5.1
Documentation of the "Modules" folder of Vega Strike
 All Data Structures Namespaces Files Functions Variables
HTMLParser.py
Go to the documentation of this file.
1 """A parser for HTML and XHTML."""
2 
3 # This file is based on sgmllib.py, but the API is slightly different.
4 
5 # XXX There should be a way to distinguish between PCDATA (parsed
6 # character data -- the normal case), RCDATA (replaceable character
7 # data -- only char and entity references and end tags are special)
8 # and CDATA (character data -- only end tags are special).
9 
10 
11 import markupbase
12 import re
13 
# Regular expressions used for parsing.
#
# Every pattern is written as a raw string so that regex escapes such as
# \s and \Z reach the re module untouched, instead of depending on Python
# passing unrecognised string escapes through unchanged.

# Normal mode: the next '&' or '<' ends a run of character data.
interesting_normal = re.compile(r'[&<]')
# CDATA mode: only '</', or a '<' that is the very last character of the
# buffer, ends a run of character data.
interesting_cdata = re.compile(r'<(/|\Z)')
# Beginning of something that could still become an entity or char reference.
incomplete = re.compile(r'&[a-zA-Z#]')

# Both reference patterns also consume the single character that terminates
# the reference (';' or anything else that cannot continue it).
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile(r'&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile(r'<[a-zA-Z]')
piclose = re.compile(r'>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile(r'[a-zA-Z][-.a-zA-Z0-9:_]*')
# Groups: 1 = attribute name, 2 = the whole "= value" part (None when the
# attribute has no value), 3 = the value including any surrounding quotes.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')

# Matches a start tag from '<' up to, but not including, the closing '>'
# or '/>'.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |[^'">\s]+                 # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile(r'>')
# Group 1 is the tag name of a complete end tag such as '</ name >'.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
47 
48 
class HTMLParseError(Exception):
    """Error type used by the parser for every kind of parse failure.

    Attributes:
        msg: the description passed to the constructor.
        lineno: first item of ``position``, or None when unknown.
        offset: second item of ``position`` (zero-based), or None.
    """

    def __init__(self, msg, position=(None, None)):
        """Record the message and the (lineno, offset) pair.

        An empty or otherwise false ``msg`` trips the assertion.
        """
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message followed by whichever location parts are known."""
        text = self.msg
        line = self.lineno
        column = self.offset
        if line is not None:
            text = text + ", at line %d" % line
        if column is not None:
            # The stored offset is zero-based; it is shown one-based.
            text = text + ", column %d" % (column + 1)
        return text
65 
66 
class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.

    Position tracking (getpos/updatepos) and declaration parsing
    (parse_declaration) are inherited from markupbase.ParserBase.
    """

    # After a start tag for one of these elements the parser switches to
    # CDATA mode, in which only '</' interrupts character data.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        """Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include newlines).  Input that ends in the
        middle of a construct stays buffered for the next call.
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data as if it were followed by EOF."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for `message` at the current position."""
        raise HTMLParseError(message, self.getpos())

    # Source text of the most recently parsed start tag; None until one
    # has been parsed completely.
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self):
        """Make only '</' (or a trailing '<') interrupt character data."""
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Return to normal scanning, where '&' and '<' are special."""
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Process self.rawdata and keep the unconsumed tail buffered.

        Raises whatever self.error() raises when `end` is true and the
        buffer stops in the middle of a construct or reference.
        """
        rawdata = self.rawdata
        # rawdata is never rebound below, so bind the method once.
        startswith = rawdata.startswith
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                    if k >= 0:
                        # A complete end tag always leaves CDATA mode.
                        self.clear_cdata_mode()
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_declaration(i)
                elif (i + 1) < n:
                    # A '<' that starts no known construct is plain data.
                    self.handle_data("<")
                    k = i + 1
                else:
                    # Lone '<' at the end of the buffer: wait for more.
                    break
                if k < 0:
                    # The construct is not complete yet.
                    if end:
                        self.error("EOF in middle of construct")
                    break
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    # Drop the leading '&#' and the terminating character.
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        # The terminator was not ';', so it is not part
                        # of the reference; leave it in the input.
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        # Same terminator handling as for char references.
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            # At EOF whatever is left over is delivered as plain data.
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse comment, return end or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the comment starting at index `i` of the buffer.

        Calls self.handle_comment() with the comment text unless
        `report` is false.  Returns the index just past the comment
        close, or -1 when the comment is not terminated yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
        if report:
            j = match.start()
            self.handle_comment(rawdata[i+4: j])
        j = match.end()
        return j

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the processing instruction starting at index `i`.

        Calls self.handle_pi() with the text between '<?' and the next
        '>'.  Returns the index just past that '>', or -1 when there is
        no '>' in the buffer yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag at index `i` and dispatch to a handler.

        The tag name and attribute names are lower-cased.  Attributes
        are (name, value) pairs; value is None when the attribute has
        no '=value' part, and quoted values have their quotes removed
        and are passed through self.unescape().

        Returns the index just past the tag, or a negative value when
        the tag is not complete yet.  Reports junk between the last
        attribute and the tag end through self.error().
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # self.error() takes the position from getpos(), so no
            # separate line/column computation is needed here.
            self.error("junk characters in start tag: %s"
                       % repr(rawdata[k:endpos][:20]))
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode()
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the index just past the start tag at `i`, or -1.

        -1 means the buffer ends before the tag does.  A tag that can
        no longer become well formed is reported through self.error().
        """
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            following = rawdata[j:j+1]
            if following == ">":
                return j + 1
            if following == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if j + 1 == len(rawdata):
                    # buffer boundary: the '/' is the last character we
                    # have, so a '>' may still arrive.
                    return -1
                # else bogus input: '/' followed by something other
                # than '>'
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if following == "":
                # end of input
                return -1
            if following in ("abcdefghijklmnopqrstuvwxyz=/"
                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            self.error("malformed start tag")
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag at index `i` and call self.handle_endtag().

        The tag name is lower-cased.  Returns the index just past the
        next '>', or -1 when there is no '>' in the buffer yet.  Text
        up to that '>' that is not a well-formed end tag is reported
        through self.error().
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            self.error("bad end tag: %s" % repr(rawdata[i:j]))
        tag = match.group(1)
        self.handle_endtag(tag.lower())
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle '<tag ... />' as a start tag followed by an end tag."""
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Called with the lower-cased tag name and the attribute list."""
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Called with the lower-cased name of an end tag."""
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Called with the text between '&#' and the terminator."""
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Called with the entity name that follows '&'."""
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        """Called with a chunk of character data."""
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        """Called with the text between '<!--' and the comment close."""
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Declaration hook; not called from this class itself."""
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Called with the text between '<?' and the next '>'."""
        pass

    def unknown_decl(self, data):
        """Report an unrecognised declaration through self.error()."""
        self.error("unknown declaration: " + repr(data))

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        """Return `s` with the five predefined entities replaced.

        Only &lt; &gt; &apos; &quot; and &amp; are handled; any other
        reference is left as written.
        """
        if '&' not in s:
            return s
        s = s.replace("&lt;", "<")
        s = s.replace("&gt;", ">")
        s = s.replace("&apos;", "'")
        s = s.replace("&quot;", '"')
        s = s.replace("&amp;", "&")  # Must be last
        return s
383  return s