Python code coverage for Lib/markupbase.py

#	count	content
1	n/a	"""Shared support for scanning document type declarations in HTML and XHTML.
2	n/a
3	n/a	This module is used as a foundation for the HTMLParser and sgmllib
4	n/a	modules (indirectly, for htmllib as well). It has no documented
5	n/a	public API and should not be used directly.
6	n/a
7	1	"""
8	n/a
9	1	import re
10	n/a
11	1	_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]\s').match
12	1	_declstringlit_match = re.compile(r'(\'[^\']\'\|"[^"]")\s*').match
13	1	_commentclose = re.compile(r'--\s*>')
14	1	_markedsectionclose = re.compile(r']\s]\s>')
15	n/a
16	n/a	# An analysis of the MS-Word extensions is available at
17	n/a	# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf
18	n/a
19	1	_msmarkedsectionclose = re.compile(r']\s*>')
20	n/a
21	1	del re
22	n/a
23	n/a
24	2	class ParserBase:
25	n/a	"""Parser base class which provides some common support methods used
26	1	by the SGML/HTML and XHTML parsers."""
27	n/a
28	1	def __init__(self):
29	0	if self.__class__ is ParserBase:
30	0	raise RuntimeError(
31	0	"markupbase.ParserBase must be subclassed")
32	n/a
33	1	def error(self, message):
34	0	raise NotImplementedError(
35	0	"subclasses of ParserBase must override error()")
36	n/a
37	1	def reset(self):
38	101	self.lineno = 1
39	101	self.offset = 0
40	n/a
41	1	def getpos(self):
42	n/a	"""Return current line number and offset."""
43	16	return self.lineno, self.offset
44	n/a
45	n/a	# Internal -- update line number and offset. This should be
46	n/a	# called for each piece of data exactly once, in order -- in other
47	n/a	# words the concatenation of all the input strings to this
48	n/a	# function should be exactly the entire input.
49	1	def updatepos(self, i, j):
50	1412	if i >= j:
51	1174	return j
52	238	rawdata = self.rawdata
53	238	nlines = rawdata.count("\n", i, j)
54	238	if nlines:
55	14	self.lineno = self.lineno + nlines
56	14	pos = rawdata.rindex("\n", i, j) # Should not fail
57	14	self.offset = j-(pos+1)
58	n/a	else:
59	224	self.offset = self.offset + j-i
60	238	return j
61	n/a
62	1	_decl_otherchars = ''
63	n/a
64	n/a	# Internal -- parse declaration (for use by subclasses).
65	1	def parse_declaration(self, i):
66	n/a	# This is some sort of declaration; in "HTML as
67	n/a	# deployed," this should only be the document type
68	n/a	# declaration ("<!DOCTYPE html...>").
69	n/a	# ISO 8879:1986, however, has more complex
70	n/a	# declaration syntax for elements in <!...>, including:
71	n/a	# --comment--
72	n/a	# [marked section]
73	n/a	# name in the following list: ENTITY, DOCTYPE, ELEMENT,
74	n/a	# ATTLIST, NOTATION, SHORTREF, USEMAP,
75	n/a	# LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
76	574	rawdata = self.rawdata
77	574	j = i + 2
78	574	assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
79	574	if rawdata[j:j+1] == ">":
80	n/a	# the empty comment <!>
81	1	return j + 1
82	573	if rawdata[j:j+1] in ("-", ""):
83	n/a	# Start of comment followed by buffer boundary,
84	n/a	# or just a buffer boundary.
85	16	return -1
86	n/a	# A simple, practical version could look like: ((name\|stringlit) S*) + '>'
87	557	n = len(rawdata)
88	557	if rawdata[j:j+2] == '--': #comment
89	n/a	# Locate --.*-- as the body of the comment
90	0	return self.parse_comment(i)
91	557	elif rawdata[j] == '[': #marked section
92	n/a	# Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
93	n/a	# Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
94	n/a	# Note that this is extended by Microsoft Office "Save as Web" function
95	n/a	# to include [if...] and [endif].
96	2	return self.parse_marked_section(i)
97	n/a	else: #all other declaration elements
98	555	decltype, j = self._scan_name(j, i)
99	555	if j < 0:
100	39	return j
101	516	if decltype == "doctype":
102	490	self._decl_otherchars = ''
103	1103	while j < n:
104	1096	c = rawdata[j]
105	1096	if c == ">":
106	n/a	# end of declaration syntax
107	6	data = rawdata[i+2:j]
108	6	if decltype == "doctype":
109	5	self.handle_decl(data)
110	n/a	else:
111	1	self.unknown_decl(data)
112	6	return j + 1
113	1090	if c in "\"'":
114	61	m = _declstringlit_match(rawdata, j)
115	61	if not m:
116	39	return -1 # incomplete
117	22	j = m.end()
118	1029	elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
119	578	name, j = self._scan_name(j, i)
120	451	elif c in self._decl_otherchars:
121	27	j = j + 1
122	424	elif c == "[":
123	n/a	# this could be handled in a separate doctype parser
124	421	if decltype == "doctype":
125	421	j = self._parse_doctype_subset(j + 1, i)
126	0	elif decltype in ("attlist", "linktype", "link", "element"):
127	n/a	# must tolerate []'d groups in a content model in an element declaration
128	n/a	# also in data attribute specifications of attlist declaration
129	n/a	# also link type declaration subsets in linktype declarations
130	n/a	# also link attribute specification lists in link declarations
131	0	self.error("unsupported '[' char in %s declaration" % decltype)
132	n/a	else:
133	0	self.error("unexpected '[' char in declaration")
134	n/a	else:
135	3	self.error(
136	3	"unexpected %r char in declaration" % rawdata[j])
137	1048	if j < 0:
138	461	return j
139	7	return -1 # incomplete
140	n/a
141	n/a	# Internal -- parse a marked section
142	n/a	# Override this to handle MS-word extension syntax <![if word]>content<![endif]>
143	1	def parse_marked_section(self, i, report=1):
144	2	rawdata= self.rawdata
145	2	assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
146	2	sectName, j = self._scan_name( i+3, i )
147	2	if j < 0:
148	0	return j
149	2	if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
150	n/a	# look for standard ]]> ending
151	0	match= _markedsectionclose.search(rawdata, i+3)
152	2	elif sectName in ("if", "else", "endif"):
153	n/a	# look for MS Office ]> ending
154	2	match= _msmarkedsectionclose.search(rawdata, i+3)
155	n/a	else:
156	0	self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
157	2	if not match:
158	0	return -1
159	2	if report:
160	2	j = match.start(0)
161	2	self.unknown_decl(rawdata[i+3: j])
162	2	return match.end(0)
163	n/a
164	n/a	# Internal -- parse comment, return length or -1 if not terminated
165	1	def parse_comment(self, i, report=1):
166	137	rawdata = self.rawdata
167	137	if rawdata[i:i+4] != '<!--':
168	0	self.error('unexpected call to parse_comment()')
169	137	match = _commentclose.search(rawdata, i+4)
170	137	if not match:
171	111	return -1
172	26	if report:
173	21	j = match.start(0)
174	21	self.handle_comment(rawdata[i+4: j])
175	26	return match.end(0)
176	n/a
177	n/a	# Internal -- scan past the internal subset in a <!DOCTYPE declaration,
178	n/a	# returning the index just past any whitespace following the trailing ']'.
179	1	def _parse_doctype_subset(self, i, declstartpos):
180	421	rawdata = self.rawdata
181	421	n = len(rawdata)
182	421	j = i
183	6167	while j < n:
184	6131	c = rawdata[j]
185	6131	if c == "<":
186	1505	s = rawdata[j:j+2]
187	1505	if s == "<":
188	n/a	# end of buffer; incomplete
189	8	return -1
190	1497	if s != "<!":
191	0	self.updatepos(declstartpos, j + 1)
192	0	self.error("unexpected char in internal subset (in %r)" % s)
193	1497	if (j + 2) == n:
194	n/a	# end of buffer; incomplete
195	8	return -1
196	1489	if (j + 4) > n:
197	n/a	# end of buffer; incomplete
198	8	return -1
199	1481	if rawdata[j:j+4] == "<!--":
200	17	j = self.parse_comment(j, report=0)
201	17	if j < 0:
202	12	return j
203	0	continue
204	1464	name, j = self._scan_name(j + 2, declstartpos)
205	1464	if j == -1:
206	47	return -1
207	1417	if name not in ("attlist", "element", "entity", "notation"):
208	0	self.updatepos(declstartpos, j + 2)
209	0	self.error(
210	0	"unknown declaration %r in internal subset" % name)
211	n/a	# handle the individual names
212	1417	meth = getattr(self, "_parse_doctype_" + name)
213	1417	j = meth(j, declstartpos)
214	1417	if j < 0:
215	285	return j
216	4626	elif c == "%":
217	n/a	# parameter entity reference
218	36	if (j + 1) == n:
219	n/a	# end of buffer; incomplete
220	1	return -1
221	35	s, j = self._scan_name(j + 1, declstartpos)
222	35	if j < 0:
223	11	return j
224	24	if rawdata[j] == ";":
225	24	j = j + 1
226	4590	elif c == "]":
227	5	j = j + 1
228	5	while j < n and rawdata[j].isspace():
229	0	j = j + 1
230	5	if j < n:
231	3	if rawdata[j] == ">":
232	3	return j
233	0	self.updatepos(declstartpos, j)
234	0	self.error("unexpected char after internal subset")
235	n/a	else:
236	2	return -1
237	4585	elif c.isspace():
238	4585	j = j + 1
239	n/a	else:
240	0	self.updatepos(declstartpos, j)
241	0	self.error("unexpected char %r in internal subset" % c)
242	n/a	# end of buffer reached
243	36	return -1
244	n/a
245	n/a	# Internal -- scan past <!ELEMENT declarations
246	1	def _parse_doctype_element(self, i, declstartpos):
247	376	name, j = self._scan_name(i, declstartpos)
248	376	if j == -1:
249	5	return -1
250	n/a	# style content model; just skip until '>'
251	371	rawdata = self.rawdata
252	371	if '>' in rawdata[j:]:
253	362	return rawdata.find(">", j) + 1
254	9	return -1
255	n/a
256	n/a	# Internal -- scan past <!ATTLIST declarations
257	1	def _parse_doctype_attlist(self, i, declstartpos):
258	368	rawdata = self.rawdata
259	368	name, j = self._scan_name(i, declstartpos)
260	368	c = rawdata[j:j+1]
261	368	if c == "":
262	15	return -1
263	353	if c == ">":
264	0	return j + 1
265	353	while 1:
266	n/a	# scan a series of attribute descriptions; simplified:
267	n/a	# name type [value] [#constraint]
268	661	name, j = self._scan_name(j, declstartpos)
269	661	if j < 0:
270	21	return j
271	640	c = rawdata[j:j+1]
272	640	if c == "":
273	0	return -1
274	640	if c == "(":
275	n/a	# an enumerated type; look for ')'
276	11	if ")" in rawdata[j:]:
277	5	j = rawdata.find(")", j) + 1
278	n/a	else:
279	6	return -1
280	9	while rawdata[j:j+1].isspace():
281	4	j = j + 1
282	5	if not rawdata[j:]:
283	n/a	# end of buffer, incomplete
284	2	return -1
285	n/a	else:
286	629	name, j = self._scan_name(j, declstartpos)
287	632	c = rawdata[j:j+1]
288	632	if not c:
289	12	return -1
290	620	if c in "'\"":
291	294	m = _declstringlit_match(rawdata, j)
292	294	if m:
293	283	j = m.end()
294	n/a	else:
295	11	return -1
296	283	c = rawdata[j:j+1]
297	283	if not c:
298	1	return -1
299	608	if c == "#":
300	323	if rawdata[j:] == "#":
301	n/a	# end of buffer
302	1	return -1
303	322	name, j = self._scan_name(j + 1, declstartpos)
304	322	if j < 0:
305	14	return j
306	308	c = rawdata[j:j+1]
307	308	if not c:
308	0	return -1
309	593	if c == '>':
310	n/a	# all done
311	285	return j + 1
312	n/a
313	n/a	# Internal -- scan past <!NOTATION declarations
314	1	def _parse_doctype_notation(self, i, declstartpos):
315	267	name, j = self._scan_name(i, declstartpos)
316	267	if j < 0:
317	9	return j
318	258	rawdata = self.rawdata
319	258	while 1:
320	714	c = rawdata[j:j+1]
321	714	if not c:
322	n/a	# end of buffer; incomplete
323	1	return -1
324	713	if c == '>':
325	204	return j + 1
326	509	if c in "'\"":
327	251	m = _declstringlit_match(rawdata, j)
328	251	if not m:
329	46	return -1
330	205	j = m.end()
331	n/a	else:
332	258	name, j = self._scan_name(j, declstartpos)
333	258	if j < 0:
334	7	return j
335	n/a
336	n/a	# Internal -- scan past <!ENTITY declarations
337	1	def _parse_doctype_entity(self, i, declstartpos):
338	406	rawdata = self.rawdata
339	406	if rawdata[i:i+1] == "%":
340	70	j = i + 1
341	70	while 1:
342	139	c = rawdata[j:j+1]
343	139	if not c:
344	2	return -1
345	137	if c.isspace():
346	69	j = j + 1
347	n/a	else:
348	68	break
349	n/a	else:
350	336	j = i
351	404	name, j = self._scan_name(j, declstartpos)
352	404	if j < 0:
353	30	return j
354	374	while 1:
355	787	c = self.rawdata[j:j+1]
356	787	if not c:
357	3	return -1
358	784	if c in "'\"":
359	367	m = _declstringlit_match(rawdata, j)
360	367	if m:
361	284	j = m.end()
362	n/a	else:
363	83	return -1 # incomplete
364	417	elif c == ">":
365	281	return j + 1
366	n/a	else:
367	136	name, j = self._scan_name(j, declstartpos)
368	136	if j < 0:
369	7	return j
370	n/a
371	n/a	# Internal -- scan a name token and the new position and the token, or
372	n/a	# return -1 if we've reached the end of the buffer.
373	1	def _scan_name(self, i, declstartpos):
374	6055	rawdata = self.rawdata
375	6055	n = len(rawdata)
376	6055	if i == n:
377	0	return None, -1
378	6055	m = _declname_match(rawdata, i)
379	6055	if m:
380	6055	s = m.group()
381	6055	name = s.strip()
382	6055	if (i + len(s)) == n:
383	260	return None, -1 # end of buffer
384	5795	return name.lower(), m.end()
385	n/a	else:
386	0	self.updatepos(declstartpos, i)
387	0	self.error("expected name token at %r"
388	0	% rawdata[declstartpos:declstartpos+20])
389	n/a
390	n/a	# To be overridden -- handlers for unknown objects
391	1	def unknown_decl(self, data):
392	0	pass