1 """Shared support for scanning document type declarations in HTML and XHTML."""
6 _declname_match = re.compile(
r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
7 _declstringlit_match = re.compile(
r'(\'[^\']*\'|"[^"]*")\s*').match
13 """Parser base class which provides some common support methods used
14 by the SGML/HTML and XHTML parsers."""
17 if self.__class__
is ParserBase:
19 "markupbase.ParserBase must be subclassed")
def error(self, message):
    """Report a parse error; concrete parser classes must override this.

    The base implementation does not use *message*: it always raises
    NotImplementedError.
    """
    reason = "subclasses of ParserBase must override error()"
    raise NotImplementedError(reason)
30 """Return current line number and offset."""
31 return self.lineno, self.offset
37 def updatepos(self, i, j):
40 rawdata = self.rawdata
43 self.lineno = self.lineno + nlines
45 self.offset = j-(pos+1)
47 self.offset = self.offset + j-i
53 def parse_declaration(self, i):
57 rawdata = self.rawdata
59 assert rawdata[i:j] ==
"<!",
"unexpected call to parse_declaration"
60 if rawdata[j:j+1]
in (
"-",
""):
66 decltype, j = self._scan_name(j, i)
69 if decltype ==
"doctype":
70 self._decl_otherchars =
''
76 if decltype ==
"doctype":
77 self.handle_decl(data)
79 self.unknown_decl(data)
82 m = _declstringlit_match(rawdata, j)
86 elif c
in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
87 name, j = self._scan_name(j, i)
88 elif c
in self._decl_otherchars:
91 if decltype ==
"doctype":
92 j = self._parse_doctype_subset(j + 1, i)
94 self.error(
"unexpected '[' char in declaration")
97 "unexpected %s char in declaration" % `rawdata[j]`)
104 def _parse_doctype_subset(self, i, declstartpos):
105 rawdata = self.rawdata
116 self.updatepos(declstartpos, j + 1)
117 self.error(
"unexpected char in internal subset (in %s)"
125 if rawdata[j:j+4] ==
"<!--":
126 j = self.parse_comment(j, report=0)
130 name, j = self._scan_name(j + 2, declstartpos)
133 if name
not in (
"attlist",
"element",
"entity",
"notation"):
134 self.updatepos(declstartpos, j + 2)
136 "unknown declaration %s in internal subset" % `name`)
138 meth = getattr(self,
"_parse_doctype_" + name)
139 j = meth(j, declstartpos)
147 s, j = self._scan_name(j + 1, declstartpos)
150 if rawdata[j] ==
";":
154 while j < n
and rawdata[j]
in string.whitespace:
157 if rawdata[j] ==
">":
159 self.updatepos(declstartpos, j)
160 self.error(
"unexpected char after internal subset")
163 elif c
in string.whitespace:
166 self.updatepos(declstartpos, j)
167 self.error(
"unexpected char %s in internal subset" % `c`)
172 def _parse_doctype_element(self, i, declstartpos):
173 name, j = self._scan_name(i, declstartpos)
177 rawdata = self.rawdata
178 if '>' in rawdata[j:]:
183 def _parse_doctype_attlist(self, i, declstartpos):
184 rawdata = self.rawdata
185 name, j = self._scan_name(i, declstartpos)
194 name, j = self._scan_name(j, declstartpos)
202 if ")" in rawdata[j:]:
206 while rawdata[j:j+1]
in string.whitespace:
212 name, j = self._scan_name(j, declstartpos)
217 m = _declstringlit_match(rawdata, j)
226 if rawdata[j:] ==
"#":
229 name, j = self._scan_name(j + 1, declstartpos)
240 def _parse_doctype_notation(self, i, declstartpos):
241 name, j = self._scan_name(i, declstartpos)
244 rawdata = self.rawdata
253 m = _declstringlit_match(rawdata, j)
258 name, j = self._scan_name(j, declstartpos)
263 def _parse_doctype_entity(self, i, declstartpos):
264 rawdata = self.rawdata
265 if rawdata[i:i+1] ==
"%":
271 if c
in string.whitespace:
277 name, j = self._scan_name(j, declstartpos)
281 c = self.rawdata[j:j+1]
285 m = _declstringlit_match(rawdata, j)
293 name, j = self._scan_name(j, declstartpos)
299 def _scan_name(self, i, declstartpos):
300 rawdata = self.rawdata
308 if (i + len(s)) == n:
312 self.updatepos(declstartpos, i)
313 self.error(
"expected name token")
316 def unknown_decl(self, data):