» Core Development > Code coverage > Lib/markupbase.py

Python code coverage for Lib/markupbase.py

# | count | content
"""Shared support for scanning document type declarations in HTML and XHTML.

This module is used as a foundation for the HTMLParser and sgmllib
modules (indirectly, for htmllib as well).  It has no documented
public API and should not be used directly.

"""

import re

# The two ``*_match`` helpers are bound ``match`` methods of compiled
# patterns: call them as ``helper(string, pos)`` to match anchored at ``pos``.
# Both patterns also swallow any whitespace that follows the token.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
# Terminators that are searched for (not anchored): "-->" and "]]>".
_commentclose = re.compile(r'--\s*>')
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

# Terminator of the MS Office style marked sections: "]>".
_msmarkedsectionclose = re.compile(r']\s*>')

# Only the compiled helpers above are meant to stay in the module namespace.
del re


class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The methods read the input buffer from ``self.rawdata`` and report what
    they find through ``self.handle_decl()``, ``self.handle_comment()`` and
    ``self.unknown_decl()``; only the last one is defined here, the others
    must be supplied by the subclass.

    Return convention of the ``parse_*`` / ``_parse_*`` methods: the index
    just past the construct that was scanned, or a negative number (-1)
    when the buffer ends before the construct is complete.
    """

    def __init__(self):
        """Refuse direct instantiation; only subclasses may be created."""
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a parse error; subclasses must override this.

        The scanning methods do not recover after calling error(), so an
        override is expected to raise.
        """
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the position bookkeeping to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Account for rawdata[i:j] in lineno/offset and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno += nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            # Offset restarts after the last newline inside the slice.
            self.offset = j - (pos + 1)
        else:
            self.offset += j - i
        return j

    # Characters that parse_declaration() skips one at a time inside a
    # declaration; reset to '' on the instance whenever a doctype is seen.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!...>" declaration starting at rawdata[i].

        Calls handle_decl() for a doctype and unknown_decl() for any other
        named declaration, passing the text between "<!" and ">".  Returns
        the index past the closing ">", or a negative number if the
        declaration is not complete yet.  Raises AssertionError when
        rawdata[i:i+2] is not "<!".
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        # Explicit raise instead of ``assert`` so the check survives -O.
        if rawdata[i:j] != "<!":
            raise AssertionError("unexpected call to parse_declaration")
        if rawdata[j:j+1] == ">":
            # the empty comment <!>
            return j + 1
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        if rawdata[j:j+2] == '--':  # comment
            # NOTE: the "-" test above already returned for this input, so
            # this branch is kept only for fidelity with the original flow.
            # Locate --.*-- as the body of the comment
            return self.parse_comment(i)
        elif rawdata[j] == '[':  # marked section
            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web" function
            # to include [if...] and [endif].
            return self.parse_marked_section(i)
        else:  # all other declaration elements
            decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j += 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in ("attlist", "linktype", "link", "element"):
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    self.error("unsupported '[' char in %s declaration" % decltype)
                else:
                    self.error("unexpected '[' char in declaration")
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
            if j < 0:
                # A helper ran off the end of the buffer.
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the "<![...]...>" marked section starting at rawdata[i].

        Standard status keywords (temp, cdata, ignore, include, rcdata) end
        at "]]>"; the MS Office keywords (if, else, endif) end at "]>".
        When *report* is true the text between "<![" and the terminator is
        passed to unknown_decl().  Returns the index past the terminator,
        or a negative number if it has not been seen yet.  Raises
        AssertionError when rawdata[i:i+3] is not "<![".
        """
        rawdata = self.rawdata
        # Explicit raise instead of ``assert`` so the check survives -O.
        if rawdata[i:i+3] != '<![':
            raise AssertionError("unexpected call to parse_marked_section()")
        sectName, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i + 3)
        elif sectName in ("if", "else", "endif"):
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i + 3)
        else:
            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the "<!--...-->" comment starting at rawdata[i].

        When *report* is true the comment body is passed to
        handle_comment().  Returns the index past the closing "-->", or -1
        if the comment is not terminated in the buffer.  Calls error() when
        rawdata[i:i+4] is not "<!--".
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = _commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the doctype internal subset whose body starts at rawdata[i].

        *declstartpos* is the index of the enclosing declaration; it is only
        used for position updates and error reporting.  Returns the index of
        the ">" that follows the closing "]", or -1 if the buffer ends first.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)" % s)
                if (j + 2) == n:
                    # end of buffer; incomplete
                    return -1
                if (j + 4) > n:
                    # end of buffer; incomplete
                    return -1
                if rawdata[j:j+4] == "<!--":
                    # Comments inside the subset are skipped, not reported.
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j += 1
            elif c == "]":
                j += 1
                while j < n and rawdata[j].isspace():
                    j += 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                else:
                    return -1
            elif c.isspace():
                j += 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT declaration whose name starts at rawdata[i].

        Returns the index past the next ">", or -1 if the name or the ">"
        is not in the buffer yet.
        """
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        # One find() replaces the former "'>' in rawdata[j:]" pre-check,
        # which copied the buffer tail before searching it a second time.
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST declaration whose element name starts at rawdata[i].

        Returns the index past the closing ">", or a negative number if the
        buffer ends before the declaration is complete.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            # Name runs up to the end of the buffer; incomplete.
            return -1
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while True:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j += 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    # Type name runs up to the end of the buffer.
                    return -1
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if rawdata[j:] == "#":
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a <!NOTATION declaration whose name starts at rawdata[i].

        Returns the index past the closing ">", or a negative number if the
        buffer ends before the declaration is complete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while True:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY declaration whose body starts at rawdata[i].

        A leading "%" (parameter entity) and the whitespace after it are
        skipped before the entity name.  Returns the index past the closing
        ">", or a negative number if the buffer ends first.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while True:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j += 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while True:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token; return the token and the new position,
    # or (None, -1) if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan the name token (and trailing whitespace) at rawdata[i].

        Returns ``(lowercased_name, index_after_token)``.  Returns
        ``(None, -1)`` when *i* is the end of the buffer or the token runs
        up to the end of the buffer, because more input could still extend
        it.  Calls error() when rawdata[i] does not start a name.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token at %r"
                       % rawdata[declstartpos:declstartpos+20])

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Handle a non-doctype declaration or marked section; no-op here."""
        pass