1 """A parser for HTML and XHTML."""
# Patterns that drive the scanner.  In normal mode the next point of interest
# is any '&' or '<'; in CDATA mode it is only '</' or a '<' that ends the
# buffered text.
interesting_normal = re.compile('[&<]')
interesting_cdata = re.compile(r'<(/|\Z)')
# Prefix of something that may still become an entity or character reference.
incomplete = re.compile('&[a-zA-Z#]')

# Complete references; each match includes the single terminating character.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

# Markup delimiters and tag names.
starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# One attribute: group 1 is the name, group 2 the optional "= value" part and
# group 3 the value alone (single-quoted, double-quoted or bare).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')
31 locatestarttagend = re.compile(
r"""
32 <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
33 (?:\s+ # whitespace before attribute name
34 (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
35 (?:\s*=\s* # value indicator
36 (?:'[^']*' # LITA-enclosed value
37 |\"[^\"]*\" # LIT-enclosed value
38 |[^'\">\s]+ # bare value
43 \s* # trailing whitespace
# End-tag patterns: endendtag finds the closing '>', endtagfind matches a
# whole '</name>' and captures the tag name in group 1.
endendtag = re.compile('>')
# Raw string so that "\s" reaches the regex engine as a character class
# instead of being read as an (invalid) string escape.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
49 class HTMLParseError(Exception):
50 """Exception raised for all parse errors."""
52 def __init__(self, msg, position=(
None,
None)):
55 self.lineno = position[0]
56 self.offset = position[1]
60 if self.lineno
is not None:
61 result = result +
", at line %d" % self.lineno
62 if self.offset
is not None:
63 result = result +
", column %d" % (self.offset + 1)
68 """Find tags and other markup and call handler functions.
76 Start tags are handled by calling self.handle_starttag() or
77 self.handle_startendtag(); end tags by self.handle_endtag(). The
78 data between tags is passed from the parser to the derived class
79 by calling self.handle_data() with the data as argument (the data
80 may be split up in arbitrary chunks). Entity references are
81 passed by calling self.handle_entityref() with the entity
82 reference as the argument. Numeric character references are
83 passed to self.handle_charref() with the string containing the
84 reference as the argument.
# Tags after whose start tag the parser switches to CDATA scanning.
CDATA_CONTENT_ELEMENTS = ("script", "style")
91 """Initialize and reset this instance."""
95 """Reset this instance. Loses all unprocessed data."""
98 self.interesting = interesting_normal
99 markupbase.ParserBase.reset(self)
101 def feed(self, data):
102 """Feed data to the parser.
104 Call this as often as you want, with as little or as much text
105 as you want (may include '\n').
107 self.rawdata = self.rawdata + data
111 """Handle any buffered data."""
def error(self, message):
    """Raise HTMLParseError for *message* at the position given by getpos()."""
    position = self.getpos()
    raise HTMLParseError(message, position)
# Source text of the start tag currently/last handled by parse_starttag();
# None until a start tag has been parsed.
__starttag_text = None

def get_starttag_text(self):
    """Return full source of start tag: '<...>'.

    The value is None when no start tag text has been recorded.
    """
    return self.__starttag_text
def set_cdata_mode(self):
    """Make the scanner search with the CDATA pattern from now on."""
    self.interesting = interesting_cdata
def clear_cdata_mode(self):
    """Make the scanner search with the normal-mode pattern again."""
    self.interesting = interesting_normal
132 def goahead(self, end):
133 rawdata = self.rawdata
137 match = self.interesting.search(rawdata, i)
142 if i < j: self.handle_data(rawdata[i:j])
143 i = self.updatepos(i, j)
145 startswith = rawdata.startswith
146 if startswith(
'<', i):
147 if starttagopen.match(rawdata, i):
148 k = self.parse_starttag(i)
149 elif startswith(
"</", i):
150 k = self.parse_endtag(i)
152 self.clear_cdata_mode()
153 elif startswith(
"<!--", i):
154 k = self.parse_comment(i)
155 elif startswith(
"<?", i):
157 elif startswith(
"<!", i):
158 k = self.parse_declaration(i)
160 self.handle_data(
"<")
166 self.error(
"EOF in middle of construct")
168 i = self.updatepos(i, k)
169 elif startswith(
"&#", i):
170 match = charref.match(rawdata, i)
172 name = match.group()[2:-1]
173 self.handle_charref(name)
175 if not startswith(
';', k-1):
177 i = self.updatepos(i, k)
181 elif startswith(
'&', i):
182 match = entityref.match(rawdata, i)
184 name = match.group(1)
185 self.handle_entityref(name)
187 if not startswith(
';', k-1):
189 i = self.updatepos(i, k)
191 match = incomplete.match(rawdata, i)
194 if end
and match.group() == rawdata[i:]:
195 self.error(
"EOF in middle of entity or char ref")
201 self.handle_data(
"&")
202 i = self.updatepos(i, i + 1)
206 assert 0,
"interesting.search() lied"
209 self.handle_data(rawdata[i:n])
210 i = self.updatepos(i, n)
211 self.rawdata = rawdata[i:]
214 def parse_comment(self, i, report=1):
215 rawdata = self.rawdata
216 assert rawdata[i:i+4] ==
'<!--',
'unexpected call to parse_comment()'
217 match = commentclose.search(rawdata, i+4)
222 self.handle_comment(rawdata[i+4: j])
227 def parse_pi(self, i):
228 rawdata = self.rawdata
229 assert rawdata[i:i+2] ==
'<?',
'unexpected call to parse_pi()'
230 match = piclose.search(rawdata, i+2)
234 self.handle_pi(rawdata[i+2: j])
239 def parse_starttag(self, i):
240 self.__starttag_text =
None
241 endpos = self.check_for_whole_start_tag(i)
244 rawdata = self.rawdata
245 self.__starttag_text = rawdata[i:endpos]
249 match = tagfind.match(rawdata, i+1)
250 assert match,
'unexpected call to parse_starttag()'
252 self.lasttag = tag = rawdata[i+1:k].
lower()
255 m = attrfind.match(rawdata, k)
258 attrname, rest, attrvalue = m.group(1, 2, 3)
261 elif attrvalue[:1] ==
'\'' == attrvalue[-1:]
or \
262 attrvalue[:1] ==
'"' == attrvalue[-1:]:
263 attrvalue = attrvalue[1:-1]
264 attrvalue = self.unescape(attrvalue)
265 attrs.append((attrname.lower(), attrvalue))
268 end = rawdata[k:endpos].
strip()
269 if end
not in (
">",
"/>"):
270 lineno, offset = self.getpos()
271 if "\n" in self.__starttag_text:
272 lineno = lineno + self.__starttag_text.count(
"\n")
273 offset = len(self.__starttag_text) \
274 - self.__starttag_text.rfind(
"\n")
276 offset = offset + len(self.__starttag_text)
277 self.error(
"junk characters in start tag: %s"
278 % `rawdata[k:endpos][:20]`)
279 if end.endswith(
'/>'):
281 self.handle_startendtag(tag, attrs)
283 self.handle_starttag(tag, attrs)
284 if tag
in self.CDATA_CONTENT_ELEMENTS:
285 self.set_cdata_mode()
290 def check_for_whole_start_tag(self, i):
291 rawdata = self.rawdata
292 m = locatestarttagend.match(rawdata, i)
295 next = rawdata[j:j+1]
299 if rawdata.startswith(
"/>", j):
301 if rawdata.startswith(
"/", j):
305 self.updatepos(i, j + 1)
306 self.error(
"malformed empty start tag")
310 if next
in (
"abcdefghijklmnopqrstuvwxyz=/"
311 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
316 self.error(
"malformed start tag")
317 raise AssertionError(
"we should not get here!")
320 def parse_endtag(self, i):
321 rawdata = self.rawdata
322 assert rawdata[i:i+2] ==
"</",
"unexpected call to parse_endtag"
323 match = endendtag.search(rawdata, i+1)
327 match = endtagfind.match(rawdata, i)
329 self.error(
"bad end tag: %s" % `rawdata[i:j]`)
331 self.handle_endtag(tag.lower())
def handle_startendtag(self, tag, attrs):
    """Handle an empty-element tag as a start tag followed by an end tag.

    Calls handle_starttag(tag, attrs) and then handle_endtag(tag).
    """
    self.handle_starttag(tag, attrs)
    self.handle_endtag(tag)
340 def handle_starttag(self, tag, attrs):
344 def handle_endtag(self, tag):
348 def handle_charref(self, name):
352 def handle_entityref(self, name):
356 def handle_data(self, data):
360 def handle_comment(self, data):
364 def handle_decl(self, decl):
368 def handle_pi(self, data):
def unknown_decl(self, data):
    """Report an unrecognised declaration through self.error().

    The message is "unknown declaration: " followed by the repr of *data*.
    """
    # repr() produces the same text as the old backtick syntax but is valid
    # on every Python version.
    self.error("unknown declaration: " + repr(data))
def unescape(self, s):
    """Return *s* with the five predefined character entities replaced.

    ``&lt;``, ``&gt;``, ``&apos;``, ``&quot;`` and ``&amp;`` become
    ``<``, ``>``, ``'``, ``"`` and ``&``.  Any other reference is left
    unchanged.
    """
    if '&' not in s:
        # No entity reference can be present; return the input untouched.
        return s
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace("&apos;", "'")
    s = s.replace("&quot;", '"')
    # "&amp;" goes last so that "&amp;lt;" becomes "&lt;" and not "<".
    s = s.replace("&amp;", "&")
    return s