Python code coverage for Lib/_markupbase.py

#	count	content
1	n/a	"""Shared support for scanning document type declarations in HTML and XHTML.
2	n/a
3	n/a	This module is used as a foundation for the html.parser module. It has no
4	n/a	documented public API and should not be used directly.
5	n/a
6	n/a	"""
7	n/a
8	n/a	import re
9	n/a
# Pre-compiled patterns shared by the ParserBase methods below.  The bound
# ``match`` methods are anchored at the position the caller passes in.

# A declaration name followed by any amount of whitespace.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# A single- or double-quoted string literal followed by optional whitespace.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
# End of a comment: "--", optional whitespace, ">".
_commentclose = re.compile(r'--\s*>')
# End of a standard marked section: "]", "]", ">" with optional whitespace.
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

# End of an MS Office style marked section: "]", optional whitespace, ">".
_msmarkedsectionclose = re.compile(r']\s*>')

del re


class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The class reads ``self.rawdata`` (the text buffer being scanned) and
    calls ``self.handle_decl()`` and ``self.handle_comment()``; none of
    these are defined here, so subclasses must supply them.  Subclasses
    must also override ``error()``.

    The ``parse_*`` / ``_parse_*`` methods return the index just past the
    construct they consumed, or -1 when the buffer ends before the
    construct is complete.  When ``error()`` has been overridden so that
    it returns instead of raising, the scanners also give up with -1
    rather than continuing with inconsistent state.
    """

    def __init__(self):
        """Refuse direct instantiation of the base class."""
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "_markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a parse error; subclasses must override this."""
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the position tracking to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Advance lineno/offset over rawdata[i:j] and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            # Offset restarts after the last newline in the span.
            self.offset = j-(pos+1)
        else:
            self.offset = self.offset + j-i
        return j

    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!...>" construct starting at rawdata[i].

        Comments and marked sections are delegated to parse_comment() and
        parse_marked_section().  A DOCTYPE declaration is passed to
        handle_decl(); any other declaration goes to unknown_decl().
        Returns the index past the closing ">" or -1 if incomplete.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] == ">":
            # the empty comment <!>
            return j + 1
        if rawdata[j:j+2] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.  Two characters are inspected
            # so that a complete "--" still reaches the comment branch.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        if rawdata[j:j+2] == '--':  # comment
            # Locate --.*-- as the body of the comment
            return self.parse_comment(i)
        elif rawdata[j] == '[':  # marked section
            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web" function
            # to include [if...] and [endif].
            return self.parse_marked_section(i)
        else:  # all other declaration elements
            decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    # According to the HTML5 specs sections "8.2.4.44 Bogus
                    # comment state" and "8.2.4.45 Markup declaration open
                    # state", a comment token should be emitted.
                    # Calling unknown_decl provides more flexibility though.
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                _, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in {"attlist", "linktype", "link", "element"}:
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    self.error("unsupported '[' char in %s declaration" % decltype)
                    # error() returned: stop instead of looping on the same char.
                    return -1
                else:
                    self.error("unexpected '[' char in declaration")
                    return -1
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
                return -1
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the marked section starting at rawdata[i] ("<![").

        If *report* is true, the section text (without the "<![" prefix
        and the closing delimiter) is passed to unknown_decl().  Returns
        the index past the closing delimiter or -1 if incomplete.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
        sectName, j = self._scan_name(i+3, i)
        if j < 0:
            return j
        if sectName in {"temp", "cdata", "ignore", "include", "rcdata"}:
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i+3)
        elif sectName in {"if", "else", "endif"}:
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i+3)
        else:
            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
            # error() returned: there is no terminator pattern to search for.
            return -1
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the comment starting at rawdata[i] ("<!--").

        If *report* is true, the comment body is passed to
        handle_comment().  Returns the index past the closing "-->" or
        -1 if the comment is not terminated in the buffer.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = _commentclose.search(rawdata, i+4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the DOCTYPE internal subset whose content starts at i.

        *declstartpos* is the index of the enclosing declaration and is
        only used for position tracking and error messages.  Returns the
        index of the ">" that closes the declaration, or -1 if incomplete.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)" % s)
                    return -1
                if (j + 2) == n:
                    # end of buffer; incomplete
                    return -1
                if (j + 4) > n:
                    # end of buffer; incomplete
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j < 0:
                    return j
                if name not in {"attlist", "element", "entity", "notation"}:
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                    # No _parse_doctype_<name> handler exists for this name.
                    return -1
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                    return -1
                else:
                    return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
                return -1
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT declaration whose name starts at i.

        Returns the index past the closing ">" or -1 if incomplete.
        """
        _, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST declaration whose element name starts at i.

        Returns the index past the closing ">" or -1 if incomplete.
        """
        rawdata = self.rawdata
        _, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while 1:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            _, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if j >= len(rawdata):
                    # end of buffer, incomplete
                    return -1
            else:
                _, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if rawdata[j:] == "#":
                    # end of buffer
                    return -1
                _, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a <!NOTATION declaration whose name starts at i.

        Returns the index past the closing ">" or -1 if incomplete.
        """
        _, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while 1:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                _, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY declaration whose content starts at i.

        A leading "%" (parameter entity) and the whitespace after it are
        skipped first.  Returns the index past the closing ">" or -1 if
        incomplete.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while 1:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        _, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while 1:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                _, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token and the new position and the token, or
    # return -1 if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan a name token at rawdata[i].

        Returns ``(name, end)`` where *name* is lower-cased and *end* is
        the index past the name and any whitespace following it.  Returns
        ``(None, -1)`` when the buffer ends at or right after the name,
        or when no name is present and error() returns instead of raising.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                # The name may continue in data not yet received.
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token at %r"
                       % rawdata[declstartpos:declstartpos+20])
            # Keep the (name, position) shape that every caller unpacks.
            return None, -1

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Handle a declaration this class does not interpret; no-op here."""
        pass