» Core Development > Code coverage > Lib/htmllib.py

Python code coverage for Lib/htmllib.py

# count content
1n/a"""HTML 2.0 parser.
2n/a
3n/aSee the HTML 2.0 specification:
4n/ahttp://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
51"""
6n/a
71from warnings import warnpy3k
81warnpy3k("the htmllib module has been removed in Python 3.0",
91 stacklevel=2)
101del warnpy3k
11n/a
121import sgmllib
13n/a
141from formatter import AS_IS
15n/a
161__all__ = ["HTMLParser", "HTMLParseError"]
17n/a
18n/a
192class HTMLParseError(sgmllib.SGMLParseError):
201 """Error raised when an HTML document can't be parsed."""
21n/a
22n/a
232class HTMLParser(sgmllib.SGMLParser):
24n/a """This is the basic HTML parser class.
25n/a
26n/a It supports all entity names required by the XHTML 1.0 Recommendation.
27n/a It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
28n/a elements.
29n/a
301 """
31n/a
321 from htmlentitydefs import entitydefs
33n/a
341 def __init__(self, formatter, verbose=0):
35n/a """Creates an instance of the HTMLParser class.
36n/a
37n/a The formatter parameter is the formatter instance associated with
38n/a the parser.
39n/a
40n/a """
412 sgmllib.SGMLParser.__init__(self, verbose)
422 self.formatter = formatter
43n/a
441 def error(self, message):
450 raise HTMLParseError(message)
46n/a
471 def reset(self):
482 sgmllib.SGMLParser.reset(self)
492 self.savedata = None
502 self.isindex = 0
512 self.title = None
522 self.base = None
532 self.anchor = None
542 self.anchorlist = []
552 self.nofill = 0
562 self.list_stack = []
57n/a
58n/a # ------ Methods used internally; some may be overridden
59n/a
60n/a # --- Formatter interface, taking care of 'savedata' mode;
61n/a # shouldn't need to be overridden
62n/a
631 def handle_data(self, data):
6412 if self.savedata is not None:
650 self.savedata = self.savedata + data
66n/a else:
6712 if self.nofill:
680 self.formatter.add_literal_data(data)
69n/a else:
7012 self.formatter.add_flowing_data(data)
71n/a
72n/a # --- Hooks to save data; shouldn't need to be overridden
73n/a
741 def save_bgn(self):
75n/a """Begins saving character data in a buffer instead of sending it
76n/a to the formatter object.
77n/a
78n/a Retrieve the stored data via the save_end() method. Use of the
79n/a save_bgn() / save_end() pair may not be nested.
80n/a
81n/a """
820 self.savedata = ''
83n/a
841 def save_end(self):
85n/a """Ends buffering character data and returns all data saved since
86n/a the preceding call to the save_bgn() method.
87n/a
88n/a If the nofill flag is false, whitespace is collapsed to single
89n/a spaces. A call to this method without a preceding call to the
90n/a save_bgn() method will raise a TypeError exception.
91n/a
92n/a """
930 data = self.savedata
940 self.savedata = None
950 if not self.nofill:
960 data = ' '.join(data.split())
970 return data
98n/a
99n/a # --- Hooks for anchors; should probably be overridden
100n/a
1011 def anchor_bgn(self, href, name, type):
102n/a """This method is called at the start of an anchor region.
103n/a
104n/a The arguments correspond to the attributes of the <A> tag with
105n/a the same names. The default implementation maintains a list of
106n/a hyperlinks (defined by the HREF attribute for <A> tags) within
107n/a the document. The list of hyperlinks is available as the data
108n/a attribute anchorlist.
109n/a
110n/a """
1110 self.anchor = href
1120 if self.anchor:
1130 self.anchorlist.append(href)
114n/a
1151 def anchor_end(self):
116n/a """This method is called at the end of an anchor region.
117n/a
118n/a The default implementation adds a textual footnote marker using an
119n/a index into the list of hyperlinks created by the anchor_bgn() method.
120n/a
121n/a """
1223 if self.anchor:
1230 self.handle_data("[%d]" % len(self.anchorlist))
1240 self.anchor = None
125n/a
126n/a # --- Hook for images; should probably be overridden
127n/a
1281 def handle_image(self, src, alt, *args):
129n/a """This method is called to handle images.
130n/a
131n/a The default implementation simply passes the alt value to the
132n/a handle_data() method.
133n/a
134n/a """
1350 self.handle_data(alt)
136n/a
137n/a # --------- Top level elements
138n/a
1392 def start_html(self, attrs): pass
1402 def end_html(self): pass
141n/a
1421 def start_head(self, attrs): pass
1431 def end_head(self): pass
144n/a
1452 def start_body(self, attrs): pass
1462 def end_body(self): pass
147n/a
148n/a # ------ Head elements
149n/a
1501 def start_title(self, attrs):
1510 self.save_bgn()
152n/a
1531 def end_title(self):
1540 self.title = self.save_end()
155n/a
1561 def do_base(self, attrs):
1570 for a, v in attrs:
1580 if a == 'href':
1590 self.base = v
160n/a
1611 def do_isindex(self, attrs):
1620 self.isindex = 1
163n/a
1641 def do_link(self, attrs):
1650 pass
166n/a
1671 def do_meta(self, attrs):
1680 pass
169n/a
1701 def do_nextid(self, attrs): # Deprecated
1710 pass
172n/a
173n/a # ------ Body elements
174n/a
175n/a # --- Headings
176n/a
1771 def start_h1(self, attrs):
1780 self.formatter.end_paragraph(1)
1790 self.formatter.push_font(('h1', 0, 1, 0))
180n/a
1811 def end_h1(self):
1820 self.formatter.end_paragraph(1)
1830 self.formatter.pop_font()
184n/a
1851 def start_h2(self, attrs):
1860 self.formatter.end_paragraph(1)
1870 self.formatter.push_font(('h2', 0, 1, 0))
188n/a
1891 def end_h2(self):
1900 self.formatter.end_paragraph(1)
1910 self.formatter.pop_font()
192n/a
1931 def start_h3(self, attrs):
1940 self.formatter.end_paragraph(1)
1950 self.formatter.push_font(('h3', 0, 1, 0))
196n/a
1971 def end_h3(self):
1980 self.formatter.end_paragraph(1)
1990 self.formatter.pop_font()
200n/a
2011 def start_h4(self, attrs):
2020 self.formatter.end_paragraph(1)
2030 self.formatter.push_font(('h4', 0, 1, 0))
204n/a
2051 def end_h4(self):
2060 self.formatter.end_paragraph(1)
2070 self.formatter.pop_font()
208n/a
2091 def start_h5(self, attrs):
2100 self.formatter.end_paragraph(1)
2110 self.formatter.push_font(('h5', 0, 1, 0))
212n/a
2131 def end_h5(self):
2140 self.formatter.end_paragraph(1)
2150 self.formatter.pop_font()
216n/a
2171 def start_h6(self, attrs):
2180 self.formatter.end_paragraph(1)
2190 self.formatter.push_font(('h6', 0, 1, 0))
220n/a
2211 def end_h6(self):
2220 self.formatter.end_paragraph(1)
2230 self.formatter.pop_font()
224n/a
225n/a # --- Block Structuring Elements
226n/a
2271 def do_p(self, attrs):
2280 self.formatter.end_paragraph(1)
229n/a
2301 def start_pre(self, attrs):
2310 self.formatter.end_paragraph(1)
2320 self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
2330 self.nofill = self.nofill + 1
234n/a
2351 def end_pre(self):
2360 self.formatter.end_paragraph(1)
2370 self.formatter.pop_font()
2380 self.nofill = max(0, self.nofill - 1)
239n/a
2401 def start_xmp(self, attrs):
2410 self.start_pre(attrs)
2420 self.setliteral('xmp') # Tell SGML parser
243n/a
2441 def end_xmp(self):
2450 self.end_pre()
246n/a
2471 def start_listing(self, attrs):
2480 self.start_pre(attrs)
2490 self.setliteral('listing') # Tell SGML parser
250n/a
2511 def end_listing(self):
2520 self.end_pre()
253n/a
2541 def start_address(self, attrs):
2550 self.formatter.end_paragraph(0)
2560 self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
257n/a
2581 def end_address(self):
2590 self.formatter.end_paragraph(0)
2600 self.formatter.pop_font()
261n/a
2621 def start_blockquote(self, attrs):
2630 self.formatter.end_paragraph(1)
2640 self.formatter.push_margin('blockquote')
265n/a
2661 def end_blockquote(self):
2670 self.formatter.end_paragraph(1)
2680 self.formatter.pop_margin()
269n/a
270n/a # --- List Elements
271n/a
2721 def start_ul(self, attrs):
2730 self.formatter.end_paragraph(not self.list_stack)
2740 self.formatter.push_margin('ul')
2750 self.list_stack.append(['ul', '*', 0])
276n/a
2771 def end_ul(self):
2780 if self.list_stack: del self.list_stack[-1]
2790 self.formatter.end_paragraph(not self.list_stack)
2800 self.formatter.pop_margin()
281n/a
2821 def do_li(self, attrs):
2830 self.formatter.end_paragraph(0)
2840 if self.list_stack:
2850 [dummy, label, counter] = top = self.list_stack[-1]
2860 top[2] = counter = counter+1
287n/a else:
2880 label, counter = '*', 0
2890 self.formatter.add_label_data(label, counter)
290n/a
2911 def start_ol(self, attrs):
2920 self.formatter.end_paragraph(not self.list_stack)
2930 self.formatter.push_margin('ol')
2940 label = '1.'
2950 for a, v in attrs:
2960 if a == 'type':
2970 if len(v) == 1: v = v + '.'
2980 label = v
2990 self.list_stack.append(['ol', label, 0])
300n/a
3011 def end_ol(self):
3020 if self.list_stack: del self.list_stack[-1]
3030 self.formatter.end_paragraph(not self.list_stack)
3040 self.formatter.pop_margin()
305n/a
3061 def start_menu(self, attrs):
3070 self.start_ul(attrs)
308n/a
3091 def end_menu(self):
3100 self.end_ul()
311n/a
3121 def start_dir(self, attrs):
3130 self.start_ul(attrs)
314n/a
3151 def end_dir(self):
3160 self.end_ul()
317n/a
3181 def start_dl(self, attrs):
3190 self.formatter.end_paragraph(1)
3200 self.list_stack.append(['dl', '', 0])
321n/a
3221 def end_dl(self):
3230 self.ddpop(1)
3240 if self.list_stack: del self.list_stack[-1]
325n/a
3261 def do_dt(self, attrs):
3270 self.ddpop()
328n/a
3291 def do_dd(self, attrs):
3300 self.ddpop()
3310 self.formatter.push_margin('dd')
3320 self.list_stack.append(['dd', '', 0])
333n/a
3341 def ddpop(self, bl=0):
3350 self.formatter.end_paragraph(bl)
3360 if self.list_stack:
3370 if self.list_stack[-1][0] == 'dd':
3380 del self.list_stack[-1]
3390 self.formatter.pop_margin()
340n/a
341n/a # --- Phrase Markup
342n/a
343n/a # Idiomatic Elements
344n/a
3451 def start_cite(self, attrs): self.start_i(attrs)
3461 def end_cite(self): self.end_i()
347n/a
3481 def start_code(self, attrs): self.start_tt(attrs)
3491 def end_code(self): self.end_tt()
350n/a
3511 def start_em(self, attrs): self.start_i(attrs)
3521 def end_em(self): self.end_i()
353n/a
3541 def start_kbd(self, attrs): self.start_tt(attrs)
3551 def end_kbd(self): self.end_tt()
356n/a
3571 def start_samp(self, attrs): self.start_tt(attrs)
3581 def end_samp(self): self.end_tt()
359n/a
3601 def start_strong(self, attrs): self.start_b(attrs)
3611 def end_strong(self): self.end_b()
362n/a
3631 def start_var(self, attrs): self.start_i(attrs)
3641 def end_var(self): self.end_i()
365n/a
366n/a # Typographic Elements
367n/a
3681 def start_i(self, attrs):
3690 self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
3701 def end_i(self):
3710 self.formatter.pop_font()
372n/a
3731 def start_b(self, attrs):
3740 self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
3751 def end_b(self):
3760 self.formatter.pop_font()
377n/a
3781 def start_tt(self, attrs):
3790 self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
3801 def end_tt(self):
3810 self.formatter.pop_font()
382n/a
3831 def start_a(self, attrs):
3843 href = ''
3853 name = ''
3863 type = ''
3877 for attrname, value in attrs:
3884 value = value.strip()
3894 if attrname == 'href':
3902 href = value
3914 if attrname == 'name':
3922 name = value
3934 if attrname == 'type':
3940 type = value.lower()
3953 self.anchor_bgn(href, name, type)
396n/a
3971 def end_a(self):
3983 self.anchor_end()
399n/a
400n/a # --- Line Break
401n/a
4021 def do_br(self, attrs):
4030 self.formatter.add_line_break()
404n/a
405n/a # --- Horizontal Rule
406n/a
4071 def do_hr(self, attrs):
4080 self.formatter.add_hor_rule()
409n/a
410n/a # --- Image
411n/a
4121 def do_img(self, attrs):
4130 align = ''
4140 alt = '(image)'
4150 ismap = ''
4160 src = ''
4170 width = 0
4180 height = 0
4190 for attrname, value in attrs:
4200 if attrname == 'align':
4210 align = value
4220 if attrname == 'alt':
4230 alt = value
4240 if attrname == 'ismap':
4250 ismap = value
4260 if attrname == 'src':
4270 src = value
4280 if attrname == 'width':
4290 try: width = int(value)
4300 except ValueError: pass
4310 if attrname == 'height':
4320 try: height = int(value)
4330 except ValueError: pass
4340 self.handle_image(src, alt, ismap, align, width, height)
435n/a
436n/a # --- Really Old Unofficial Deprecated Stuff
437n/a
4381 def do_plaintext(self, attrs):
4390 self.start_pre(attrs)
4400 self.setnomoretags() # Tell SGML parser
441n/a
442n/a # --- Unhandled tags
443n/a
4441 def unknown_starttag(self, tag, attrs):
4450 pass
446n/a
4471 def unknown_endtag(self, tag):
4480 pass
449n/a
450n/a
4511def test(args = None):
4520 import sys, formatter
453n/a
4540 if not args:
4550 args = sys.argv[1:]
456n/a
4570 silent = args and args[0] == '-s'
4580 if silent:
4590 del args[0]
460n/a
4610 if args:
4620 file = args[0]
463n/a else:
4640 file = 'test.html'
465n/a
4660 if file == '-':
4670 f = sys.stdin
468n/a else:
4690 try:
4700 f = open(file, 'r')
4710 except IOError, msg:
4720 print file, ":", msg
4730 sys.exit(1)
474n/a
4750 data = f.read()
476n/a
4770 if f is not sys.stdin:
4780 f.close()
479n/a
4800 if silent:
4810 f = formatter.NullFormatter()
482n/a else:
4830 f = formatter.AbstractFormatter(formatter.DumbWriter())
484n/a
4850 p = HTMLParser(f)
4860 p.feed(data)
4870 p.close()
488n/a
489n/a
4901if __name__ == '__main__':
4910 test()