Vega Strike Python Modules doc 0.5.1
Documentation of the "Modules" folder of Vega Strike
 All Data Structures Namespaces Files Functions Variables
htmllib.py
Go to the documentation of this file.
"""HTML 2.0 parser.

See the HTML 2.0 specification:
http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
"""


from sgmllib import SGMLParser
from formatter import AS_IS

# Public API of this module.
__all__ = ["HTMLParser"]
12 
14 
15  from htmlentitydefs import entitydefs
16 
17  def __init__(self, formatter, verbose=0):
18  SGMLParser.__init__(self, verbose)
19  self.formatter = formatter
20  self.savedata = None
21  self.isindex = 0
22  self.title = None
23  self.base = None
24  self.anchor = None
25  self.anchorlist = []
26  self.nofill = 0
27  self.list_stack = []
28 
29  # ------ Methods used internally; some may be overridden
30 
31  # --- Formatter interface, taking care of 'savedata' mode;
32  # shouldn't need to be overridden
33 
34  def handle_data(self, data):
35  if self.savedata is not None:
36  self.savedata = self.savedata + data
37  else:
38  if self.nofill:
39  self.formatter.add_literal_data(data)
40  else:
41  self.formatter.add_flowing_data(data)
42 
43  # --- Hooks to save data; shouldn't need to be overridden
44 
45  def save_bgn(self):
46  self.savedata = ''
47 
48  def save_end(self):
49  data = self.savedata
50  self.savedata = None
51  if not self.nofill:
52  data = ' '.join(data.split())
53  return data
54 
55  # --- Hooks for anchors; should probably be overridden
56 
57  def anchor_bgn(self, href, name, type):
58  self.anchor = href
59  if self.anchor:
60  self.anchorlist.append(href)
61 
62  def anchor_end(self):
63  if self.anchor:
64  self.handle_data("[%d]" % len(self.anchorlist))
65  self.anchor = None
66 
67  # --- Hook for images; should probably be overridden
68 
69  def handle_image(self, src, alt, *args):
70  self.handle_data(alt)
71 
72  # --------- Top level elememts
73 
74  def start_html(self, attrs): pass
75  def end_html(self): pass
76 
77  def start_head(self, attrs): pass
78  def end_head(self): pass
79 
80  def start_body(self, attrs): pass
81  def end_body(self): pass
82 
83  # ------ Head elements
84 
85  def start_title(self, attrs):
86  self.save_bgn()
87 
88  def end_title(self):
89  self.title = self.save_end()
90 
91  def do_base(self, attrs):
92  for a, v in attrs:
93  if a == 'href':
94  self.base = v
95 
96  def do_isindex(self, attrs):
97  self.isindex = 1
98 
99  def do_link(self, attrs):
100  pass
101 
102  def do_meta(self, attrs):
103  pass
104 
105  def do_nextid(self, attrs): # Deprecated
106  pass
107 
108  # ------ Body elements
109 
110  # --- Headings
111 
112  def start_h1(self, attrs):
113  self.formatter.end_paragraph(1)
114  self.formatter.push_font(('h1', 0, 1, 0))
115 
116  def end_h1(self):
117  self.formatter.end_paragraph(1)
118  self.formatter.pop_font()
119 
120  def start_h2(self, attrs):
121  self.formatter.end_paragraph(1)
122  self.formatter.push_font(('h2', 0, 1, 0))
123 
124  def end_h2(self):
125  self.formatter.end_paragraph(1)
126  self.formatter.pop_font()
127 
128  def start_h3(self, attrs):
129  self.formatter.end_paragraph(1)
130  self.formatter.push_font(('h3', 0, 1, 0))
131 
132  def end_h3(self):
133  self.formatter.end_paragraph(1)
134  self.formatter.pop_font()
135 
136  def start_h4(self, attrs):
137  self.formatter.end_paragraph(1)
138  self.formatter.push_font(('h4', 0, 1, 0))
139 
140  def end_h4(self):
141  self.formatter.end_paragraph(1)
142  self.formatter.pop_font()
143 
144  def start_h5(self, attrs):
145  self.formatter.end_paragraph(1)
146  self.formatter.push_font(('h5', 0, 1, 0))
147 
148  def end_h5(self):
149  self.formatter.end_paragraph(1)
150  self.formatter.pop_font()
151 
152  def start_h6(self, attrs):
153  self.formatter.end_paragraph(1)
154  self.formatter.push_font(('h6', 0, 1, 0))
155 
156  def end_h6(self):
157  self.formatter.end_paragraph(1)
158  self.formatter.pop_font()
159 
160  # --- Block Structuring Elements
161 
162  def do_p(self, attrs):
163  self.formatter.end_paragraph(1)
164 
165  def start_pre(self, attrs):
166  self.formatter.end_paragraph(1)
167  self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
168  self.nofill = self.nofill + 1
169 
170  def end_pre(self):
171  self.formatter.end_paragraph(1)
172  self.formatter.pop_font()
173  self.nofill = max(0, self.nofill - 1)
174 
175  def start_xmp(self, attrs):
176  self.start_pre(attrs)
177  self.setliteral('xmp') # Tell SGML parser
178 
179  def end_xmp(self):
180  self.end_pre()
181 
182  def start_listing(self, attrs):
183  self.start_pre(attrs)
184  self.setliteral('listing') # Tell SGML parser
185 
186  def end_listing(self):
187  self.end_pre()
188 
189  def start_address(self, attrs):
190  self.formatter.end_paragraph(0)
191  self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
192 
193  def end_address(self):
194  self.formatter.end_paragraph(0)
195  self.formatter.pop_font()
196 
197  def start_blockquote(self, attrs):
198  self.formatter.end_paragraph(1)
199  self.formatter.push_margin('blockquote')
200 
201  def end_blockquote(self):
202  self.formatter.end_paragraph(1)
203  self.formatter.pop_margin()
204 
205  # --- List Elements
206 
207  def start_ul(self, attrs):
208  self.formatter.end_paragraph(not self.list_stack)
209  self.formatter.push_margin('ul')
210  self.list_stack.append(['ul', '*', 0])
211 
212  def end_ul(self):
213  if self.list_stack: del self.list_stack[-1]
214  self.formatter.end_paragraph(not self.list_stack)
215  self.formatter.pop_margin()
216 
217  def do_li(self, attrs):
218  self.formatter.end_paragraph(0)
219  if self.list_stack:
220  [dummy, label, counter] = top = self.list_stack[-1]
221  top[2] = counter = counter+1
222  else:
223  label, counter = '*', 0
224  self.formatter.add_label_data(label, counter)
225 
226  def start_ol(self, attrs):
227  self.formatter.end_paragraph(not self.list_stack)
228  self.formatter.push_margin('ol')
229  label = '1.'
230  for a, v in attrs:
231  if a == 'type':
232  if len(v) == 1: v = v + '.'
233  label = v
234  self.list_stack.append(['ol', label, 0])
235 
236  def end_ol(self):
237  if self.list_stack: del self.list_stack[-1]
238  self.formatter.end_paragraph(not self.list_stack)
239  self.formatter.pop_margin()
240 
241  def start_menu(self, attrs):
242  self.start_ul(attrs)
243 
244  def end_menu(self):
245  self.end_ul()
246 
247  def start_dir(self, attrs):
248  self.start_ul(attrs)
249 
250  def end_dir(self):
251  self.end_ul()
252 
253  def start_dl(self, attrs):
254  self.formatter.end_paragraph(1)
255  self.list_stack.append(['dl', '', 0])
256 
257  def end_dl(self):
258  self.ddpop(1)
259  if self.list_stack: del self.list_stack[-1]
260 
261  def do_dt(self, attrs):
262  self.ddpop()
263 
264  def do_dd(self, attrs):
265  self.ddpop()
266  self.formatter.push_margin('dd')
267  self.list_stack.append(['dd', '', 0])
268 
269  def ddpop(self, bl=0):
270  self.formatter.end_paragraph(bl)
271  if self.list_stack:
272  if self.list_stack[-1][0] == 'dd':
273  del self.list_stack[-1]
274  self.formatter.pop_margin()
275 
276  # --- Phrase Markup
277 
278  # Idiomatic Elements
279 
280  def start_cite(self, attrs): self.start_i(attrs)
281  def end_cite(self): self.end_i()
282 
283  def start_code(self, attrs): self.start_tt(attrs)
284  def end_code(self): self.end_tt()
285 
286  def start_em(self, attrs): self.start_i(attrs)
287  def end_em(self): self.end_i()
288 
289  def start_kbd(self, attrs): self.start_tt(attrs)
290  def end_kbd(self): self.end_tt()
291 
292  def start_samp(self, attrs): self.start_tt(attrs)
293  def end_samp(self): self.end_tt()
294 
295  def start_strong(self, attrs): self.start_b(attrs)
296  def end_strong(self): self.end_b()
297 
298  def start_var(self, attrs): self.start_i(attrs)
299  def end_var(self): self.end_i()
300 
301  # Typographic Elements
302 
303  def start_i(self, attrs):
304  self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
305  def end_i(self):
306  self.formatter.pop_font()
307 
308  def start_b(self, attrs):
309  self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
310  def end_b(self):
311  self.formatter.pop_font()
312 
313  def start_tt(self, attrs):
314  self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
315  def end_tt(self):
316  self.formatter.pop_font()
317 
318  def start_a(self, attrs):
319  href = ''
320  name = ''
321  type = ''
322  for attrname, value in attrs:
323  value = value.strip()
324  if attrname == 'href':
325  href = value
326  if attrname == 'name':
327  name = value
328  if attrname == 'type':
329  type = value.lower()
330  self.anchor_bgn(href, name, type)
331 
332  def end_a(self):
333  self.anchor_end()
334 
335  # --- Line Break
336 
337  def do_br(self, attrs):
338  self.formatter.add_line_break()
339 
340  # --- Horizontal Rule
341 
342  def do_hr(self, attrs):
343  self.formatter.add_hor_rule()
344 
345  # --- Image
346 
347  def do_img(self, attrs):
348  align = ''
349  alt = '(image)'
350  ismap = ''
351  src = ''
352  width = 0
353  height = 0
354  for attrname, value in attrs:
355  if attrname == 'align':
356  align = value
357  if attrname == 'alt':
358  alt = value
359  if attrname == 'ismap':
360  ismap = value
361  if attrname == 'src':
362  src = value
363  if attrname == 'width':
364  try: width = int(value)
365  except ValueError: pass
366  if attrname == 'height':
367  try: height = int(value)
368  except ValueError: pass
369  self.handle_image(src, alt, ismap, align, width, height)
370 
371  # --- Really Old Unofficial Deprecated Stuff
372 
373  def do_plaintext(self, attrs):
374  self.start_pre(attrs)
375  self.setnomoretags() # Tell SGML parser
376 
377  # --- Unhandled tags
378 
379  def unknown_starttag(self, tag, attrs):
380  pass
381 
382  def unknown_endtag(self, tag):
383  pass
384 
385 
def test(args = None):
    """Command-line self-test: parse one HTML file with HTMLParser.

    args -- optional argument list; sys.argv[1:] is used when it is empty
            or None.  A leading '-s' selects silent mode.  The next
            argument names the file to parse: '-' means standard input,
            and 'test.html' is used when no file is given.

    Prints "<file> : <error>" and exits with status 1 if the file cannot
    be opened.
    """
    import sys
    import formatter

    if args:
        # Work on a copy so 'del args[0]' below cannot alter the caller's list.
        args = list(args)
    else:
        args = sys.argv[1:]

    silent = args and args[0] == '-s'
    if silent:
        del args[0]

    if args:
        file = args[0]
    else:
        file = 'test.html'

    if file == '-':
        f = sys.stdin
    else:
        try:
            f = open(file, 'r')
        except IOError:
            # sys.exc_info() and write() parse on old and new interpreters
            # alike; the output matches 'print file, ":", msg'.
            msg = sys.exc_info()[1]
            sys.stdout.write("%s : %s\n" % (file, msg))
            sys.exit(1)

    try:
        data = f.read()
    finally:
        # Close the file even if the read fails; never close stdin.
        if f is not sys.stdin:
            f.close()

    # These two assignments were missing from the listing, leaving both
    # branches empty; restored as in the standard library's htmllib.
    if silent:
        f = formatter.NullFormatter()
    else:
        f = formatter.AbstractFormatter(formatter.DumbWriter())

    p = HTMLParser(f)
    p.feed(data)
    p.close()
423 
424 
# Run the command-line self-test when this module is executed as a script.
if __name__ == '__main__':
    test()