» Core Development > Code coverage > Lib/HTMLParser.py

Python code coverage for Lib/HTMLParser.py

# | count | content
11"""A parser for HTML and XHTML."""
2n/a
3n/a# This file is based on sgmllib.py, but the API is slightly different.
4n/a
5n/a# XXX There should be a way to distinguish between PCDATA (parsed
6n/a# character data -- the normal case), RCDATA (replaceable character
7n/a# data -- only char and entity references and end tags are special)
8n/a# and CDATA (character data -- only end tags are special).
9n/a
10n/a
111import markupbase
121import re
13n/a
14n/a# Regular expressions used for parsing
15n/a
161interesting_normal = re.compile('[&<]')
171interesting_cdata = re.compile(r'<(/|\Z)')
181incomplete = re.compile('&[a-zA-Z#]')
19n/a
201entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
211charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
22n/a
231starttagopen = re.compile('<[a-zA-Z]')
241piclose = re.compile('>')
251commentclose = re.compile(r'--\s*>')
261tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
271attrfind = re.compile(
281 r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
29n/a r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
30n/a
311locatestarttagend = re.compile(r"""
32n/a <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
33n/a (?:\s+ # whitespace before attribute name
34n/a (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
35n/a (?:\s*=\s* # value indicator
36n/a (?:'[^']*' # LITA-enclosed value
37n/a |\"[^\"]*\" # LIT-enclosed value
38n/a |[^'\">\s]+ # bare value
39n/a )
40n/a )?
41n/a )
42n/a )*
43n/a \s* # trailing whitespace
441""", re.VERBOSE)
451endendtag = re.compile('>')
461endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
47n/a
48n/a
492class HTMLParseError(Exception):
501 """Exception raised for all parse errors."""
51n/a
521 def __init__(self, msg, position=(None, None)):
5316 assert msg
5416 self.msg = msg
5516 self.lineno = position[0]
5616 self.offset = position[1]
57n/a
581 def __str__(self):
590 result = self.msg
600 if self.lineno is not None:
610 result = result + ", at line %d" % self.lineno
620 if self.offset is not None:
630 result = result + ", column %d" % (self.offset + 1)
640 return result
65n/a
66n/a
672class HTMLParser(markupbase.ParserBase):
68n/a """Find tags and other markup and call handler functions.
69n/a
70n/a Usage:
71n/a p = HTMLParser()
72n/a p.feed(data)
73n/a ...
74n/a p.close()
75n/a
76n/a Start tags are handled by calling self.handle_starttag() or
77n/a self.handle_startendtag(); end tags by self.handle_endtag(). The
78n/a data between tags is passed from the parser to the derived class
79n/a by calling self.handle_data() with the data as argument (the data
80n/a may be split up in arbitrary chunks). Entity references are
81n/a passed by calling self.handle_entityref() with the entity
82n/a reference as the argument. Numeric character references are
83n/a passed to self.handle_charref() with the string containing the
84n/a reference as the argument.
851 """
86n/a
871 CDATA_CONTENT_ELEMENTS = ("script", "style")
88n/a
89n/a
901 def __init__(self):
91n/a """Initialize and reset this instance."""
9267 self.reset()
93n/a
941 def reset(self):
95n/a """Reset this instance. Loses all unprocessed data."""
9667 self.rawdata = ''
9767 self.lasttag = '???'
9867 self.interesting = interesting_normal
9967 markupbase.ParserBase.reset(self)
100n/a
1011 def feed(self, data):
102n/a """Feed data to the parser.
103n/a
104n/a Call this as often as you want, with as little or as much text
105n/a as you want (may include '\n').
106n/a """
1071322 self.rawdata = self.rawdata + data
1081322 self.goahead(0)
109n/a
1101 def close(self):
111n/a """Handle any buffered data."""
11260 self.goahead(1)
113n/a
1141 def error(self, message):
11516 raise HTMLParseError(message, self.getpos())
116n/a
1171 __starttag_text = None
118n/a
1191 def get_starttag_text(self):
120n/a """Return full source of start tag: '<...>'."""
1211 return self.__starttag_text
122n/a
1231 def set_cdata_mode(self):
1242 self.interesting = interesting_cdata
125n/a
1261 def clear_cdata_mode(self):
1278 self.interesting = interesting_normal
128n/a
129n/a # Internal -- handle data as far as reasonable. May leave state
130n/a # and data to be processed by a subsequent call. If 'end' is
131n/a # true, force handling all data as if followed by EOF marker.
1321 def goahead(self, end):
1331382 rawdata = self.rawdata
1341382 i = 0
1351382 n = len(rawdata)
1361454 while i < n:
1371336 match = self.interesting.search(rawdata, i) # < or &
1381336 if match:
1391175 j = match.start()
140n/a else:
141161 j = n
1421336 if i < j: self.handle_data(rawdata[i:j])
1431336 i = self.updatepos(i, j)
1441336 if i == n: break
1451175 startswith = rawdata.startswith
1461175 if startswith('<', i):
1471131 if starttagopen.match(rawdata, i): # < + letter
148446 k = self.parse_starttag(i)
149685 elif startswith("</", i):
15044 k = self.parse_endtag(i)
151641 elif startswith("<!--", i):
152100 k = self.parse_comment(i)
153541 elif startswith("<?", i):
15450 k = self.parse_pi(i)
155491 elif startswith("<!", i):
156442 k = self.parse_declaration(i)
15749 elif (i + 1) < n:
1583 self.handle_data("<")
1593 k = i + 1
160n/a else:
16146 break
1621078 if k < 0:
1631012 if end:
1649 self.error("EOF in middle of construct")
1651003 break
16666 i = self.updatepos(i, k)
16744 elif startswith("&#", i):
16816 match = charref.match(rawdata, i)
16916 if match:
1702 name = match.group()[2:-1]
1712 self.handle_charref(name)
1722 k = match.end()
1732 if not startswith(';', k-1):
1740 k = k - 1
1752 i = self.updatepos(i, k)
1762 continue
177n/a else:
17814 if ";" in rawdata[i:]: #bail by consuming &#
1791 self.handle_data(rawdata[0:2])
1801 i = self.updatepos(i, 2)
18114 break
18228 elif startswith('&', i):
18328 match = entityref.match(rawdata, i)
18428 if match:
1852 name = match.group(1)
1862 self.handle_entityref(name)
1872 k = match.end()
1882 if not startswith(';', k-1):
1891 k = k - 1
1902 i = self.updatepos(i, k)
1912 continue
19226 match = incomplete.match(rawdata, i)
19326 if match:
194n/a # match.group() will contain at least 2 chars
19515 if end and match.group() == rawdata[i:]:
1960 self.error("EOF in middle of entity or char ref")
197n/a # incomplete
19815 break
19911 elif (i + 1) < n:
200n/a # not the end of the buffer, and can't be confused
201n/a # with some other construct
2022 self.handle_data("&")
2032 i = self.updatepos(i, i + 1)
204n/a else:
2059 break
206n/a else:
2070 assert 0, "interesting.search() lied"
208n/a # end while
2091366 if end and i < n:
2101 self.handle_data(rawdata[i:n])
2111 i = self.updatepos(i, n)
2121366 self.rawdata = rawdata[i:]
213n/a
214n/a # Internal -- parse processing instr, return end or -1 if not terminated
2151 def parse_pi(self, i):
21650 rawdata = self.rawdata
21750 assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
21850 match = piclose.search(rawdata, i+2) # >
21950 if not match:
22048 return -1
2212 j = match.start()
2222 self.handle_pi(rawdata[i+2: j])
2232 j = match.end()
2242 return j
225n/a
226n/a # Internal -- handle starttag, return end or -1 if not terminated
2271 def parse_starttag(self, i):
228446 self.__starttag_text = None
229446 endpos = self.check_for_whole_start_tag(i)
230444 if endpos < 0:
231407 return endpos
23237 rawdata = self.rawdata
23337 self.__starttag_text = rawdata[i:endpos]
234n/a
235n/a # Now parse the data between i+1 and j into a tag and attrs
23637 attrs = []
23737 match = tagfind.match(rawdata, i+1)
23837 assert match, 'unexpected call to parse_starttag()'
23937 k = match.end()
24037 self.lasttag = tag = rawdata[i+1:k].lower()
241n/a
24284 while k < endpos:
24384 m = attrfind.match(rawdata, k)
24484 if not m:
24537 break
24647 attrname, rest, attrvalue = m.group(1, 2, 3)
24747 if not rest:
2485 attrvalue = None
24942 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
25016 attrvalue[:1] == '"' == attrvalue[-1:]:
25133 attrvalue = attrvalue[1:-1]
25233 attrvalue = self.unescape(attrvalue)
25347 attrs.append((attrname.lower(), attrvalue))
25447 k = m.end()
255n/a
25637 end = rawdata[k:endpos].strip()
25737 if end not in (">", "/>"):
2580 lineno, offset = self.getpos()
2590 if "\n" in self.__starttag_text:
2600 lineno = lineno + self.__starttag_text.count("\n")
2610 offset = len(self.__starttag_text) \
2620 - self.__starttag_text.rfind("\n")
263n/a else:
2640 offset = offset + len(self.__starttag_text)
2650 self.error("junk characters in start tag: %r"
2660 % (rawdata[k:endpos][:20],))
26737 if end.endswith('/>'):
268n/a # XHTML-style empty tag: <span attr="value" />
2692 self.handle_startendtag(tag, attrs)
270n/a else:
27135 self.handle_starttag(tag, attrs)
27235 if tag in self.CDATA_CONTENT_ELEMENTS:
2732 self.set_cdata_mode()
27437 return endpos
275n/a
276n/a # Internal -- check to see if we have a complete starttag; return end
277n/a # or -1 if incomplete.
2781 def check_for_whole_start_tag(self, i):
279446 rawdata = self.rawdata
280446 m = locatestarttagend.match(rawdata, i)
281446 if m:
282446 j = m.end()
283446 next = rawdata[j:j+1]
284446 if next == ">":
28535 return j + 1
286411 if next == "/":
2874 if rawdata.startswith("/>", j):
2882 return j + 2
2892 if rawdata.startswith("/", j):
290n/a # buffer boundary
2912 return -1
292n/a # else bogus input
2930 self.updatepos(i, j + 1)
2940 self.error("malformed empty start tag")
295407 if next == "":
296n/a # end of input
297236 return -1
298171 if next in ("abcdefghijklmnopqrstuvwxyz=/"
299n/a "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
300n/a # end of input in or before attribute value, or we have the
301n/a # '/' from a '/>' ending
302169 return -1
3032 self.updatepos(i, j)
3042 self.error("malformed start tag")
3050 raise AssertionError("we should not get here!")
306n/a
307n/a # Internal -- parse endtag, return end or -1 if incomplete
3081 def parse_endtag(self, i):
30944 rawdata = self.rawdata
31044 assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
31144 match = endendtag.search(rawdata, i+1) # >
31244 if not match:
31333 return -1
31411 j = match.end()
31511 match = endtagfind.match(rawdata, i) # </ + tag + >
31611 if not match:
3173 self.error("bad end tag: %r" % (rawdata[i:j],))
3188 tag = match.group(1)
3198 self.handle_endtag(tag.lower())
3208 self.clear_cdata_mode()
3218 return j
322n/a
323n/a # Overridable -- finish processing of start+end tag: <tag.../>
3241 def handle_startendtag(self, tag, attrs):
3250 self.handle_starttag(tag, attrs)
3260 self.handle_endtag(tag)
327n/a
328n/a # Overridable -- handle start tag
3291 def handle_starttag(self, tag, attrs):
3300 pass
331n/a
332n/a # Overridable -- handle end tag
3331 def handle_endtag(self, tag):
3340 pass
335n/a
336n/a # Overridable -- handle character reference
3371 def handle_charref(self, name):
3380 pass
339n/a
340n/a # Overridable -- handle entity reference
3411 def handle_entityref(self, name):
3420 pass
343n/a
344n/a # Overridable -- handle data
3451 def handle_data(self, data):
3460 pass
347n/a
348n/a # Overridable -- handle comment
3491 def handle_comment(self, data):
3500 pass
351n/a
352n/a # Overridable -- handle declaration
3531 def handle_decl(self, decl):
3540 pass
355n/a
356n/a # Overridable -- handle processing instruction
3571 def handle_pi(self, data):
3580 pass
359n/a
3601 def unknown_decl(self, data):
3610 self.error("unknown declaration: %r" % (data,))
362n/a
363n/a # Internal -- helper to remove special character quoting
3641 entitydefs = None
3651 def unescape(self, s):
36633 if '&' not in s:
36731 return s
3682 def replaceEntities(s):
3699 s = s.groups()[0]
3709 if s[0] == "#":
3712 s = s[1:]
3722 if s[0] in ['x','X']:
3731 c = int(s[1:], 16)
374n/a else:
3751 c = int(s)
3762 return unichr(c)
377n/a else:
378n/a # Cannot use name2codepoint directly, because HTMLParser supports apos,
379n/a # which is not part of HTML 4
3807 import htmlentitydefs
3817 if HTMLParser.entitydefs is None:
3821 entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
383253 for k, v in htmlentitydefs.name2codepoint.iteritems():
384252 entitydefs[k] = unichr(v)
3857 try:
3867 return self.entitydefs[s]
3870 except KeyError:
3880 return '&'+s+';'
389n/a
3902 return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)