3 See the HTML 2.0 specification:
4 http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
8 from sgmllib
import SGMLParser
9 from formatter
import AS_IS
11 __all__ = [
"HTMLParser"]
15 from htmlentitydefs
import entitydefs
18 SGMLParser.__init__(self, verbose)
39 self.formatter.add_literal_data(data)
41 self.formatter.add_flowing_data(data)
52 data =
' '.
join(data.split())
60 self.anchorlist.append(href)
113 self.formatter.end_paragraph(1)
114 self.formatter.push_font((
'h1', 0, 1, 0))
117 self.formatter.end_paragraph(1)
118 self.formatter.pop_font()
121 self.formatter.end_paragraph(1)
122 self.formatter.push_font((
'h2', 0, 1, 0))
125 self.formatter.end_paragraph(1)
126 self.formatter.pop_font()
129 self.formatter.end_paragraph(1)
130 self.formatter.push_font((
'h3', 0, 1, 0))
133 self.formatter.end_paragraph(1)
134 self.formatter.pop_font()
137 self.formatter.end_paragraph(1)
138 self.formatter.push_font((
'h4', 0, 1, 0))
141 self.formatter.end_paragraph(1)
142 self.formatter.pop_font()
145 self.formatter.end_paragraph(1)
146 self.formatter.push_font((
'h5', 0, 1, 0))
149 self.formatter.end_paragraph(1)
150 self.formatter.pop_font()
153 self.formatter.end_paragraph(1)
154 self.formatter.push_font((
'h6', 0, 1, 0))
157 self.formatter.end_paragraph(1)
158 self.formatter.pop_font()
163 self.formatter.end_paragraph(1)
166 self.formatter.end_paragraph(1)
167 self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
171 self.formatter.end_paragraph(1)
172 self.formatter.pop_font()
190 self.formatter.end_paragraph(0)
191 self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
194 self.formatter.end_paragraph(0)
195 self.formatter.pop_font()
198 self.formatter.end_paragraph(1)
199 self.formatter.push_margin(
'blockquote')
202 self.formatter.end_paragraph(1)
203 self.formatter.pop_margin()
208 self.formatter.end_paragraph(
not self.
list_stack)
209 self.formatter.push_margin(
'ul')
210 self.list_stack.append([
'ul',
'*', 0])
214 self.formatter.end_paragraph(
not self.
list_stack)
215 self.formatter.pop_margin()
218 self.formatter.end_paragraph(0)
220 [dummy, label, counter] = top = self.
list_stack[-1]
221 top[2] = counter = counter+1
223 label, counter =
'*', 0
224 self.formatter.add_label_data(label, counter)
227 self.formatter.end_paragraph(
not self.
list_stack)
228 self.formatter.push_margin(
'ol')
232 if len(v) == 1: v = v +
'.'
234 self.list_stack.append([
'ol', label, 0])
238 self.formatter.end_paragraph(
not self.
list_stack)
239 self.formatter.pop_margin()
254 self.formatter.end_paragraph(1)
255 self.list_stack.append([
'dl',
'', 0])
266 self.formatter.push_margin(
'dd')
267 self.list_stack.append([
'dd',
'', 0])
270 self.formatter.end_paragraph(bl)
274 self.formatter.pop_margin()
304 self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
306 self.formatter.pop_font()
309 self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
311 self.formatter.pop_font()
314 self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
316 self.formatter.pop_font()
322 for attrname, value
in attrs:
323 value = value.strip()
324 if attrname ==
'href':
326 if attrname ==
'name':
328 if attrname ==
'type':
338 self.formatter.add_line_break()
343 self.formatter.add_hor_rule()
354 for attrname, value
in attrs:
355 if attrname ==
'align':
357 if attrname ==
'alt':
359 if attrname ==
'ismap':
361 if attrname ==
'src':
363 if attrname ==
'width':
364 try: width = int(value)
365 except ValueError:
pass
366 if attrname ==
'height':
367 try: height = int(value)
368 except ValueError:
pass
369 self.
handle_image(src, alt, ismap, align, width, height)
387 import sys, formatter
392 silent = args
and args[0] ==
'-s'
412 if f
is not sys.stdin:
425 if __name__ ==
'__main__':