Python code coverage for Lib/html/parser.py

1n/a"""A parser for HTML and XHTML."""
2n/a
3n/a# This file is based on sgmllib.py, but the API is slightly different.
4n/a
5n/a# XXX There should be a way to distinguish between PCDATA (parsed
6n/a# character data -- the normal case), RCDATA (replaceable character
7n/a# data -- only char and entity references and end tags are special)
8n/a# and CDATA (character data -- only end tags are special).
9n/a
10n/a
11n/aimport re
12n/aimport warnings
13n/aimport _markupbase
14n/a
15n/afrom html import unescape
16n/a
17n/a
18n/a__all__ = ['HTMLParser']
19n/a
20n/a# Regular expressions used for parsing
21n/a
22n/ainteresting_normal = re.compile('[&<]')
23n/aincomplete = re.compile('&[a-zA-Z#]')
24n/a
25n/aentityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
26n/acharref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
27n/a
28n/astarttagopen = re.compile('<[a-zA-Z]')
29n/apiclose = re.compile('>')
30n/acommentclose = re.compile(r'--\s*>')
31n/a# Note:
32n/a# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
33n/a# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
34n/a# explode, so don't do it.
35n/a# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
36n/a# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
37n/atagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
38n/aattrfind_tolerant = re.compile(
39n/a r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
40n/a r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
41n/alocatestarttagend_tolerant = re.compile(r"""
42n/a <[a-zA-Z][^\t\n\r\f />\x00]* # tag name
43n/a (?:[\s/]* # optional whitespace before attribute name
44n/a (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
45n/a (?:\s*=+\s* # value indicator
46n/a (?:'[^']*' # LITA-enclosed value
47n/a |"[^"]*" # LIT-enclosed value
48n/a |(?!['"])[^>\s]* # bare value
49n/a )
50n/a (?:\s*,)* # possibly followed by a comma
51n/a )?(?:\s|/(?!>))*
52n/a )*
53n/a )?
54n/a \s* # trailing whitespace
55n/a""", re.VERBOSE)
56n/aendendtag = re.compile('>')
57n/a# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
58n/a# </ and the tag name, so maybe this should be fixed
59n/aendtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
60n/a
61n/a
62n/a
63n/aclass HTMLParser(_markupbase.ParserBase):
64n/a """Find tags and other markup and call handler functions.
65n/a
66n/a Usage:
67n/a p = HTMLParser()
68n/a p.feed(data)
69n/a ...
70n/a p.close()
71n/a
72n/a Start tags are handled by calling self.handle_starttag() or
73n/a self.handle_startendtag(); end tags by self.handle_endtag(). The
74n/a data between tags is passed from the parser to the derived class
75n/a by calling self.handle_data() with the data as argument (the data
76n/a may be split up in arbitrary chunks). If convert_charrefs is
77n/a True the character references are converted automatically to the
78n/a corresponding Unicode character (and self.handle_data() is no
79n/a longer split in chunks), otherwise they are passed by calling
80n/a self.handle_entityref() or self.handle_charref() with the string
81n/a containing respectively the named or numeric reference as the
82n/a argument.
83n/a """
84n/a
85n/a CDATA_CONTENT_ELEMENTS = ("script", "style")
86n/a
87n/a def __init__(self, *, convert_charrefs=True):
88n/a """Initialize and reset this instance.
89n/a
90n/a If convert_charrefs is True (the default), all character references
91n/a are automatically converted to the corresponding Unicode characters.
92n/a """
93n/a self.convert_charrefs = convert_charrefs
94n/a self.reset()
95n/a
96n/a def reset(self):
97n/a """Reset this instance. Loses all unprocessed data."""
98n/a self.rawdata = ''
99n/a self.lasttag = '???'
100n/a self.interesting = interesting_normal
101n/a self.cdata_elem = None
102n/a _markupbase.ParserBase.reset(self)
103n/a
104n/a def feed(self, data):
105n/a r"""Feed data to the parser.
106n/a
107n/a Call this as often as you want, with as little or as much text
108n/a as you want (may include '\n').
109n/a """
110n/a self.rawdata = self.rawdata + data
111n/a self.goahead(0)
112n/a
113n/a def close(self):
114n/a """Handle any buffered data."""
115n/a self.goahead(1)
116n/a
117n/a __starttag_text = None
118n/a
119n/a def get_starttag_text(self):
120n/a """Return full source of start tag: '<...>'."""
121n/a return self.__starttag_text
122n/a
123n/a def set_cdata_mode(self, elem):
124n/a self.cdata_elem = elem.lower()
125n/a self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
126n/a
127n/a def clear_cdata_mode(self):
128n/a self.interesting = interesting_normal
129n/a self.cdata_elem = None
130n/a
131n/a # Internal -- handle data as far as reasonable. May leave state
132n/a # and data to be processed by a subsequent call. If 'end' is
133n/a # true, force handling all data as if followed by EOF marker.
134n/a def goahead(self, end):
135n/a rawdata = self.rawdata
136n/a i = 0
137n/a n = len(rawdata)
138n/a while i < n:
139n/a if self.convert_charrefs and not self.cdata_elem:
140n/a j = rawdata.find('<', i)
141n/a if j < 0:
142n/a # if we can't find the next <, either we are at the end
143n/a # or there's more text incoming. If the latter is True,
144n/a # we can't pass the text to handle_data in case we have
145n/a # a charref cut in half at end. Try to determine if
146n/a # this is the case before proceeding by looking for an
147n/a # & near the end and see if it's followed by a space or ;.
148n/a amppos = rawdata.rfind('&', max(i, n-34))
149n/a if (amppos >= 0 and
150n/a not re.compile(r'[\s;]').search(rawdata, amppos)):
151n/a break # wait till we get all the text
152n/a j = n
153n/a else:
154n/a match = self.interesting.search(rawdata, i) # < or &
155n/a if match:
156n/a j = match.start()
157n/a else:
158n/a if self.cdata_elem:
159n/a break
160n/a j = n
161n/a if i < j:
162n/a if self.convert_charrefs and not self.cdata_elem:
163n/a self.handle_data(unescape(rawdata[i:j]))
164n/a else:
165n/a self.handle_data(rawdata[i:j])
166n/a i = self.updatepos(i, j)
167n/a if i == n: break
168n/a startswith = rawdata.startswith
169n/a if startswith('<', i):
170n/a if starttagopen.match(rawdata, i): # < + letter
171n/a k = self.parse_starttag(i)
172n/a elif startswith("</", i):
173n/a k = self.parse_endtag(i)
174n/a elif startswith("<!--", i):
175n/a k = self.parse_comment(i)
176n/a elif startswith("<?", i):
177n/a k = self.parse_pi(i)
178n/a elif startswith("<!", i):
179n/a k = self.parse_html_declaration(i)
180n/a elif (i + 1) < n:
181n/a self.handle_data("<")
182n/a k = i + 1
183n/a else:
184n/a break
185n/a if k < 0:
186n/a if not end:
187n/a break
188n/a k = rawdata.find('>', i + 1)
189n/a if k < 0:
190n/a k = rawdata.find('<', i + 1)
191n/a if k < 0:
192n/a k = i + 1
193n/a else:
194n/a k += 1
195n/a if self.convert_charrefs and not self.cdata_elem:
196n/a self.handle_data(unescape(rawdata[i:k]))
197n/a else:
198n/a self.handle_data(rawdata[i:k])
199n/a i = self.updatepos(i, k)
200n/a elif startswith("&#", i):
201n/a match = charref.match(rawdata, i)
202n/a if match:
203n/a name = match.group()[2:-1]
204n/a self.handle_charref(name)
205n/a k = match.end()
206n/a if not startswith(';', k-1):
207n/a k = k - 1
208n/a i = self.updatepos(i, k)
209n/a continue
210n/a else:
211n/a if ";" in rawdata[i:]: # bail by consuming &#
212n/a self.handle_data(rawdata[i:i+2])
213n/a i = self.updatepos(i, i+2)
214n/a break
215n/a elif startswith('&', i):
216n/a match = entityref.match(rawdata, i)
217n/a if match:
218n/a name = match.group(1)
219n/a self.handle_entityref(name)
220n/a k = match.end()
221n/a if not startswith(';', k-1):
222n/a k = k - 1
223n/a i = self.updatepos(i, k)
224n/a continue
225n/a match = incomplete.match(rawdata, i)
226n/a if match:
227n/a # match.group() will contain at least 2 chars
228n/a if end and match.group() == rawdata[i:]:
229n/a k = match.end()
230n/a if k <= i:
231n/a k = n
232n/a i = self.updatepos(i, i + 1)
233n/a # incomplete
234n/a break
235n/a elif (i + 1) < n:
236n/a # not the end of the buffer, and can't be confused
237n/a # with some other construct
238n/a self.handle_data("&")
239n/a i = self.updatepos(i, i + 1)
240n/a else:
241n/a break
242n/a else:
243n/a assert 0, "interesting.search() lied"
244n/a # end while
245n/a if end and i < n and not self.cdata_elem:
246n/a if self.convert_charrefs and not self.cdata_elem:
247n/a self.handle_data(unescape(rawdata[i:n]))
248n/a else:
249n/a self.handle_data(rawdata[i:n])
250n/a i = self.updatepos(i, n)
251n/a self.rawdata = rawdata[i:]
252n/a
253n/a # Internal -- parse html declarations, return length or -1 if not terminated
254n/a # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
255n/a # See also parse_declaration in _markupbase
256n/a def parse_html_declaration(self, i):
257n/a rawdata = self.rawdata
258n/a assert rawdata[i:i+2] == '<!', ('unexpected call to '
259n/a 'parse_html_declaration()')
260n/a if rawdata[i:i+4] == '<!--':
261n/a # this case is actually already handled in goahead()
262n/a return self.parse_comment(i)
263n/a elif rawdata[i:i+3] == '<![':
264n/a return self.parse_marked_section(i)
265n/a elif rawdata[i:i+9].lower() == '<!doctype':
266n/a # find the closing >
267n/a gtpos = rawdata.find('>', i+9)
268n/a if gtpos == -1:
269n/a return -1
270n/a self.handle_decl(rawdata[i+2:gtpos])
271n/a return gtpos+1
272n/a else:
273n/a return self.parse_bogus_comment(i)
274n/a
275n/a # Internal -- parse bogus comment, return length or -1 if not terminated
276n/a # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
277n/a def parse_bogus_comment(self, i, report=1):
278n/a rawdata = self.rawdata
279n/a assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
280n/a 'parse_comment()')
281n/a pos = rawdata.find('>', i+2)
282n/a if pos == -1:
283n/a return -1
284n/a if report:
285n/a self.handle_comment(rawdata[i+2:pos])
286n/a return pos + 1
287n/a
288n/a # Internal -- parse processing instr, return end or -1 if not terminated
289n/a def parse_pi(self, i):
290n/a rawdata = self.rawdata
291n/a assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
292n/a match = piclose.search(rawdata, i+2) # >
293n/a if not match:
294n/a return -1
295n/a j = match.start()
296n/a self.handle_pi(rawdata[i+2: j])
297n/a j = match.end()
298n/a return j
299n/a
300n/a # Internal -- handle starttag, return end or -1 if not terminated
301n/a def parse_starttag(self, i):
302n/a self.__starttag_text = None
303n/a endpos = self.check_for_whole_start_tag(i)
304n/a if endpos < 0:
305n/a return endpos
306n/a rawdata = self.rawdata
307n/a self.__starttag_text = rawdata[i:endpos]
308n/a
309n/a # Now parse the data between i+1 and j into a tag and attrs
310n/a attrs = []
311n/a match = tagfind_tolerant.match(rawdata, i+1)
312n/a assert match, 'unexpected call to parse_starttag()'
313n/a k = match.end()
314n/a self.lasttag = tag = match.group(1).lower()
315n/a while k < endpos:
316n/a m = attrfind_tolerant.match(rawdata, k)
317n/a if not m:
318n/a break
319n/a attrname, rest, attrvalue = m.group(1, 2, 3)
320n/a if not rest:
321n/a attrvalue = None
322n/a elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
323n/a attrvalue[:1] == '"' == attrvalue[-1:]:
324n/a attrvalue = attrvalue[1:-1]
325n/a if attrvalue:
326n/a attrvalue = unescape(attrvalue)
327n/a attrs.append((attrname.lower(), attrvalue))
328n/a k = m.end()
329n/a
330n/a end = rawdata[k:endpos].strip()
331n/a if end not in (">", "/>"):
332n/a lineno, offset = self.getpos()
333n/a if "\n" in self.__starttag_text:
334n/a lineno = lineno + self.__starttag_text.count("\n")
335n/a offset = len(self.__starttag_text) \
336n/a - self.__starttag_text.rfind("\n")
337n/a else:
338n/a offset = offset + len(self.__starttag_text)
339n/a self.handle_data(rawdata[i:endpos])
340n/a return endpos
341n/a if end.endswith('/>'):
342n/a # XHTML-style empty tag: <span attr="value" />
343n/a self.handle_startendtag(tag, attrs)
344n/a else:
345n/a self.handle_starttag(tag, attrs)
346n/a if tag in self.CDATA_CONTENT_ELEMENTS:
347n/a self.set_cdata_mode(tag)
348n/a return endpos
349n/a
350n/a # Internal -- check to see if we have a complete starttag; return end
351n/a # or -1 if incomplete.
352n/a def check_for_whole_start_tag(self, i):
353n/a rawdata = self.rawdata
354n/a m = locatestarttagend_tolerant.match(rawdata, i)
355n/a if m:
356n/a j = m.end()
357n/a next = rawdata[j:j+1]
358n/a if next == ">":
359n/a return j + 1
360n/a if next == "/":
361n/a if rawdata.startswith("/>", j):
362n/a return j + 2
363n/a if rawdata.startswith("/", j):
364n/a # buffer boundary
365n/a return -1
366n/a # else bogus input
367n/a if j > i:
368n/a return j
369n/a else:
370n/a return i + 1
371n/a if next == "":
372n/a # end of input
373n/a return -1
374n/a if next in ("abcdefghijklmnopqrstuvwxyz=/"
375n/a "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
376n/a # end of input in or before attribute value, or we have the
377n/a # '/' from a '/>' ending
378n/a return -1
379n/a if j > i:
380n/a return j
381n/a else:
382n/a return i + 1
383n/a raise AssertionError("we should not get here!")
384n/a
385n/a # Internal -- parse endtag, return end or -1 if incomplete
386n/a def parse_endtag(self, i):
387n/a rawdata = self.rawdata
388n/a assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
389n/a match = endendtag.search(rawdata, i+1) # >
390n/a if not match:
391n/a return -1
392n/a gtpos = match.end()
393n/a match = endtagfind.match(rawdata, i) # </ + tag + >
394n/a if not match:
395n/a if self.cdata_elem is not None:
396n/a self.handle_data(rawdata[i:gtpos])
397n/a return gtpos
398n/a # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
399n/a namematch = tagfind_tolerant.match(rawdata, i+2)
400n/a if not namematch:
401n/a # w3.org/TR/html5/tokenization.html#end-tag-open-state
402n/a if rawdata[i:i+3] == '</>':
403n/a return i+3
404n/a else:
405n/a return self.parse_bogus_comment(i)
406n/a tagname = namematch.group(1).lower()
407n/a # consume and ignore other stuff between the name and the >
408n/a # Note: this is not 100% correct, since we might have things like
409n/a # </tag attr=">">, but looking for > after tha name should cover
410n/a # most of the cases and is much simpler
411n/a gtpos = rawdata.find('>', namematch.end())
412n/a self.handle_endtag(tagname)
413n/a return gtpos+1
414n/a
415n/a elem = match.group(1).lower() # script or style
416n/a if self.cdata_elem is not None:
417n/a if elem != self.cdata_elem:
418n/a self.handle_data(rawdata[i:gtpos])
419n/a return gtpos
420n/a
421n/a self.handle_endtag(elem.lower())
422n/a self.clear_cdata_mode()
423n/a return gtpos
424n/a
425n/a # Overridable -- finish processing of start+end tag: <tag.../>
426n/a def handle_startendtag(self, tag, attrs):
427n/a self.handle_starttag(tag, attrs)
428n/a self.handle_endtag(tag)
429n/a
430n/a # Overridable -- handle start tag
431n/a def handle_starttag(self, tag, attrs):
432n/a pass
433n/a
434n/a # Overridable -- handle end tag
435n/a def handle_endtag(self, tag):
436n/a pass
437n/a
438n/a # Overridable -- handle character reference
439n/a def handle_charref(self, name):
440n/a pass
441n/a
442n/a # Overridable -- handle entity reference
443n/a def handle_entityref(self, name):
444n/a pass
445n/a
446n/a # Overridable -- handle data
447n/a def handle_data(self, data):
448n/a pass
449n/a
450n/a # Overridable -- handle comment
451n/a def handle_comment(self, data):
452n/a pass
453n/a
454n/a # Overridable -- handle declaration
455n/a def handle_decl(self, decl):
456n/a pass
457n/a
458n/a # Overridable -- handle processing instruction
459n/a def handle_pi(self, data):
460n/a pass
461n/a
462n/a def unknown_decl(self, data):
463n/a pass
464n/a
465n/a # Internal -- helper to remove special character quoting
466n/a def unescape(self, s):
467n/a warnings.warn('The unescape method is deprecated and will be removed '
468n/a 'in 3.5, use html.unescape() instead.',
469n/a DeprecationWarning, stacklevel=2)
470n/a return unescape(s)
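
The class docstring above describes the intended usage pattern: subclass HTMLParser, override the handle_* callbacks you care about, and push markup through feed()/close(). A minimal sketch follows; the LinkCollector class and the sample markup are invented here for illustration and are not part of the module.

    from html.parser import HTMLParser

    class LinkCollector(HTMLParser):
        """Collect href values from <a> tags and echo text and comments."""

        def __init__(self):
            # convert_charrefs=True is the default: character references reach
            # handle_data() already converted to the corresponding characters.
            super().__init__(convert_charrefs=True)
            self.links = []

        def handle_starttag(self, tag, attrs):
            # attrs is a list of (name, value) pairs; value is None for bare attributes
            if tag == 'a':
                for name, value in attrs:
                    if name == 'href':
                        self.links.append(value)

        def handle_data(self, data):
            print('data:', data)

        def handle_comment(self, data):
            print('comment:', data)

    parser = LinkCollector()
    # feed() may be called with arbitrarily small or large chunks
    parser.feed('<p>See <a href="https://example.com">this &amp; that</a></p>')
    parser.feed('<!-- a comment, possibly split across feeds -->')
    parser.close()   # flush any buffered, incomplete markup
    print(parser.links)   # ['https://example.com']

With convert_charrefs left at its default of True, the "&amp;" above reaches handle_data() already unescaped as "&"; constructing the parser with convert_charrefs=False would instead route it to handle_entityref('amp') (and numeric references to handle_charref()). Content inside <script> and <style> elements (CDATA_CONTENT_ELEMENTS) is delivered to handle_data() verbatim, without character-reference conversion, until the matching end tag, which is what set_cdata_mode() above arranges.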