» Core Development > Code coverage > Lib/_markupbase.py

Python code coverage for Lib/_markupbase.py

#  count  content
1n/a"""Shared support for scanning document type declarations in HTML and XHTML.
2n/a
3n/aThis module is used as a foundation for the html.parser module. It has no
4n/adocumented public API and should not be used directly.
5n/a
6n/a"""
7n/a
8n/aimport re
9n/a
# Anchored matchers, stored as bound ``match`` methods: each consumes one
# token starting at the given position plus any whitespace that follows it.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match

# Terminators, stored as compiled patterns and located with ``search()``:
# "-->" ends a comment and "]]>" ends a marked section.  Optional whitespace
# is accepted inside either terminator.
_commentclose = re.compile(r'--\s*>')
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf
# Marked sections written in that dialect end with a single "]" before ">".
_msmarkedsectionclose = re.compile(r']\s*>')

# Everything is compiled; remove ``re`` from the module namespace.
del re
21n/a
22n/a
class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The class never assigns ``self.rawdata`` and defines neither
    ``handle_decl()`` nor ``handle_comment()``, yet its scanning methods read
    and call them; a concrete subclass therefore has to supply all three and
    override ``error()``.

    Scanning methods take an index into ``self.rawdata``.  They return the
    index at which scanning should resume, or a negative value when the
    construct cannot be completed from the data buffered so far.
    """

    def __init__(self):
        # The base class is abstract; only subclasses may be instantiated.
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "_markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report the syntax problem described by *message*.

        The base implementation always raises NotImplementedError, so
        subclasses must override it.  Except in parse_comment(), a scanner
        that has called error() gives up and returns -1 if an override
        returns instead of raising.
        """
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Rewind the recorded position to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Account for rawdata[i:j] in lineno/offset and return *j*."""
        if i >= j:
            return j
        rawdata = self.rawdata
        newlines = rawdata.count("\n", i, j)
        if newlines:
            self.lineno = self.lineno + newlines
            # A newline exists in [i, j), so rindex() cannot fail here.
            last_newline = rawdata.rindex("\n", i, j)
            self.offset = j - (last_newline + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra characters that parse_declaration() skips one at a time.  It is
    # reset to '' on the instance whenever a DOCTYPE declaration is scanned.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Scan the "<!...>" declaration that starts at index *i*.

        A DOCTYPE declaration is passed to handle_decl(); any other named
        declaration is passed to unknown_decl().  Returns the index just
        past the closing ">", or a negative value if the declaration is
        incomplete or error() returned.  Raises AssertionError if
        rawdata[i:i+2] is not "<!".
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] == ">":
            # the empty comment <!>
            return j + 1
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        if rawdata[j:j+2] == '--':
            # Locate --.*-- as the body of the comment.
            # NOTE: the check above already returned for a leading "-", so
            # this branch cannot be reached from here; it is kept as is.
            return self.parse_comment(i)
        if rawdata[j] == '[':
            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web" function
            # to include [if...] and [endif].
            return self.parse_marked_section(i)

        # All other declaration elements.
        decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    # According to the HTML5 specs sections "8.2.4.44 Bogus
                    # comment state" and "8.2.4.45 Markup declaration open
                    # state", a comment token should be emitted.
                    # Calling unknown_decl provides more flexibility though.
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                _, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in {"attlist", "linktype", "link", "element"}:
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    self.error("unsupported '[' char in %s declaration" % decltype)
                    # j was not advanced; stop rather than loop forever
                    # if error() returned.
                    return -1
                else:
                    self.error("unexpected '[' char in declaration")
                    return -1
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
                # j was not advanced; stop rather than loop forever if
                # error() returned.
                return -1
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Scan the marked section that starts with "<![" at index *i*.

        When *report* is true, the text between "<![" and the terminator
        is passed to unknown_decl().  Returns the index just past the
        terminator, or a negative value if the section is unterminated or
        its status keyword is unknown and error() returned.  Raises
        AssertionError if rawdata[i:i+3] is not "<![".
        """
        rawdata = self.rawdata
        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
        sectName, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        if sectName in {"temp", "cdata", "ignore", "include", "rcdata"}:
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i + 3)
        elif sectName in {"if", "else", "endif"}:
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i + 3)
        else:
            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
            # No terminator pattern applies to an unknown keyword, so there
            # is nothing to search for if error() returned.
            return -1
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Scan the comment that starts with "<!--" at index *i*.

        When *report* is true, the text between "<!--" and the terminator
        is passed to handle_comment().  Returns the index just past the
        terminator, or -1 if none is found.  error() is called when
        rawdata[i:i+4] is not "<!--"; if it returns, the search for the
        terminator still starts at i+4.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = _commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan a DOCTYPE internal subset whose first character is at *i*.

        *declstartpos* is the index of the enclosing declaration; it is only
        used for position updates before error() is called.  On success the
        returned index is that of the ">" following the closing "]".
        Returns a negative value if the buffer ends first or error()
        returned.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)" % s)
                    # Only "<!" constructs are understood below.
                    return -1
                if (j + 2) == n:
                    # end of buffer; incomplete
                    return -1
                if (j + 4) > n:
                    # end of buffer; incomplete
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in {"attlist", "element", "entity", "notation"}:
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                    # There is no _parse_doctype_<name> method to dispatch to.
                    return -1
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                _, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                # _scan_name() never succeeds at the very end of the buffer,
                # so rawdata[j] exists here.
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j >= n:
                    return -1
                if rawdata[j] == ">":
                    return j
                self.updatepos(declstartpos, j)
                self.error("unexpected char after internal subset")
                # j still points at the offending character.
                return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
                # j still points at the offending character.
                return -1
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT declaration; *i* is just past the keyword.

        Returns the index just past the next ">", or -1 if the element name
        or the ">" is not in the buffer yet.
        """
        _, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST declaration; *i* is just past the keyword.

        Returns the index just past the closing ">", or a negative value if
        the buffer ends first or a name could not be scanned.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        _, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while True:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            _, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                _, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if j + 1 == n:
                    # end of buffer
                    return -1
                _, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip an <!NOTATION declaration; *i* is just past the keyword.

        After the notation name, any sequence of names and quoted strings is
        accepted up to ">".  Returns the index just past the ">", or a
        negative value if the buffer ends first.
        """
        _, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while True:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                _, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY declaration; *i* is just past the keyword.

        A leading "%" and the whitespace after it are skipped before the
        entity name.  Returns the index just past the closing ">", or a
        negative value if the buffer ends first.
        """
        rawdata = self.rawdata
        j = i
        if rawdata[i:i+1] == "%":
            j = i + 1
            while True:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if not c.isspace():
                    break
                j = j + 1
        _, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while True:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c == ">":
                return j + 1
            else:
                _, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token and the new position and the token, or
    # return -1 if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan a name token (plus trailing whitespace) at index *i*.

        Returns ``(name, end)`` with *name* lower-cased and stripped of the
        trailing whitespace.  Returns ``(None, -1)`` when *i* is the end of
        the buffer or the token runs up to the end of the buffer, since more
        data could still extend it.  If no name starts at *i*, the position
        is updated from *declstartpos*, error() is called, and ``(None, -1)``
        is returned should error() return.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if not m:
            self.updatepos(declstartpos, i)
            self.error("expected name token at %r"
                       % rawdata[declstartpos:declstartpos+20])
            # Always hand callers a 2-tuple they can unpack.
            return None, -1
        s = m.group()
        if (i + len(s)) == n:
            return None, -1  # end of buffer
        return s.strip().lower(), m.end()

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Receive the text of an unrecognised declaration; does nothing."""
        pass