Vega Strike Python Modules doc  0.5.1
Documentation of the "Modules" folder of Vega Strike
 All Data Structures Namespaces Files Functions Variables
markupbase.py
Go to the documentation of this file.
1 """Shared support for scanning document type declarations in HTML and XHTML."""
2 
3 import re
4 import string
5 
6 _declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
7 _declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
8 
9 del re
10 
11 
class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The methods below read the buffered input from ``self.rawdata`` and
    report problems through ``self.error()``; neither is provided by this
    class.  ``parse_declaration()`` also calls ``self.handle_decl()`` and
    ``_parse_doctype_subset()`` calls ``self.parse_comment()``, which must
    likewise come from the subclass.

    Scanning methods return a non-negative buffer index on success, or -1
    when the buffer ends before the construct being scanned is complete.
    """

    def __init__(self):
        """Refuse direct instantiation; only subclasses may be created."""
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a parse error; this base version always raises
        NotImplementedError because subclasses must override it."""
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Set the tracked position back to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Advance lineno/offset over rawdata[i:j] and return j.

        Nothing is updated when i >= j.
        """
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            # Cannot fail: count() just found a newline in this range.
            pos = rawdata.rindex("\n", i, j)
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra characters accepted inside a declaration; empty by default and
    # reset to empty on the instance whenever a doctype declaration starts.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!...>" declaration starting at rawdata[i].

        Calls handle_decl() for a doctype declaration and unknown_decl()
        for anything else, passing the text between "<!" and ">".
        Returns the index just past the closing ">", or -1 if the
        declaration is incomplete (or is the start of a comment).
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # in practice, this should look like: ((name|stringlit) S*)+ '>'
        n = len(rawdata)
        decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                else:
                    self.error("unexpected '[' char in declaration")
            else:
                self.error(
                    "unexpected %s char in declaration" % repr(rawdata[j]))
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the internal subset that starts at rawdata[i].

        Returns the index of the ">" found after the closing "]" and any
        whitespace following it, or -1 if the buffer ends first.
        declstartpos is the start of the enclosing declaration and is used
        to update the position before an error is reported.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %s)"
                               % repr(s))
                if (j + 4) > n:
                    # end of buffer; incomplete (too short to tell a
                    # comment opener from a declaration keyword)
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %s in internal subset"
                        % repr(name))
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j] in string.whitespace:
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                else:
                    return -1
            elif c in string.whitespace:
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %s in internal subset" % repr(c))
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT declaration whose name starts at rawdata[i].

        Returns the index just past the next ">", or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST declaration whose name starts at rawdata[i].

        Returns the index just past the closing ">", or -1 if incomplete.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return -1
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while 1:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                # Bound the scan by the buffer length: an empty slice is
                # "in" every string, so an unbounded membership test on
                # rawdata[j:j+1] would never stop at the end of the buffer.
                while j < n and rawdata[j] in string.whitespace:
                    j = j + 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return -1
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if rawdata[j:] == "#":
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a <!NOTATION declaration whose name starts at rawdata[i].

        Returns the index just past the closing ">", or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while 1:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY declaration whose body starts at rawdata[i].

        A leading "%" and the whitespace after it are skipped first.
        Returns the index just past the closing ">", or -1 if incomplete.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while 1:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c in string.whitespace:
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while 1:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token and the new position and the token, or
    # return -1 if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan a name token at rawdata[i].

        Returns (lowercased name, index past the name and its trailing
        whitespace).  Returns (None, -1) when i is the end of the buffer
        or the match runs up to the end of the buffer, since more input
        could still extend the token.  If no name starts at i, the
        position is updated and error() is called; should error() return
        instead of raising, this method returns None.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token")

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Receive the text of a non-doctype declaration; does nothing
        by default."""
        pass