» Core Development > Code coverage > Lib/xmllib.py

Python code coverage for Lib/xmllib.py

# | count | content
11"""A parser for XML, using the derived class as static DTD."""
2n/a
3n/a# Author: Sjoerd Mullender.
4n/a
51import re
61import string
7n/a
81import warnings
91warnings.warn("The xmllib module is obsolete. Use xml.sax instead.",
101 DeprecationWarning, 2)
111del warnings
12n/a
131version = '0.3'
14n/a
152class Error(RuntimeError):
161 pass
17n/a
18n/a# Regular expressions used for parsing
19n/a
201_S = '[ \t\r\n]+' # white space
211_opS = '[ \t\r\n]*' # optional white space
221_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*' # valid XML name
231_QStr = "(?:'[^']*'|\"[^\"]*\")" # quoted XML string
241illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
251interesting = re.compile('[]&<]')
26n/a
271amp = re.compile('&')
281ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
291entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
301charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
311space = re.compile(_S + '$')
321newline = re.compile('\n')
33n/a
341attrfind = re.compile(
35n/a _S + '(?P<name>' + _Name + ')'
36n/a '(' + _opS + '=' + _opS +
371 '(?P<value>'+_QStr+'|[-a-zA-Z0-9.:+*%?!\(\)_#=~]+))?')
381starttagopen = re.compile('<' + _Name)
391starttagend = re.compile(_opS + '(?P<slash>/?)>')
401starttagmatch = re.compile('<(?P<tagname>'+_Name+')'
411 '(?P<attrs>(?:'+attrfind.pattern+')*)'+
421 starttagend.pattern)
431endtagopen = re.compile('</')
441endbracket = re.compile(_opS + '>')
451endbracketfind = re.compile('(?:[^>\'"]|'+_QStr+')*>')
461tagfind = re.compile(_Name)
471cdataopen = re.compile(r'<!\[CDATA\[')
481cdataclose = re.compile(r'\]\]>')
49n/a# this matches one of the following:
50n/a# SYSTEM SystemLiteral
51n/a# PUBLIC PubidLiteral SystemLiteral
521_SystemLiteral = '(?P<%s>'+_QStr+')'
531_PublicLiteral = '(?P<%s>"[-\'\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
54n/a "'[-\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
55n/a_ExternalId = '(?:SYSTEM|' \
56n/a 'PUBLIC'+_S+_PublicLiteral%'pubid'+ \
571 ')'+_S+_SystemLiteral%'syslit'
581doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
591 '(?:'+_S+_ExternalId+')?'+_opS)
601xmldecl = re.compile('<\?xml'+_S+
61n/a 'version'+_opS+'='+_opS+'(?P<version>'+_QStr+')'+
62n/a '(?:'+_S+'encoding'+_opS+'='+_opS+
63n/a "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
64n/a '"[A-Za-z][-A-Za-z0-9._]*"))?'
65n/a '(?:'+_S+'standalone'+_opS+'='+_opS+
66n/a '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
671 _opS+'\?>')
681procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
691procclose = re.compile(_opS + r'\?>')
701commentopen = re.compile('<!--')
711commentclose = re.compile('-->')
721doubledash = re.compile('--')
731attrtrans = string.maketrans(' \r\n\t', ' ')
74n/a
75n/a# definitions for XML namespaces
761_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*' # XML Name, minus the ":"
771ncname = re.compile(_NCName + '$')
781qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix
791 '(?P<local>' + _NCName + ')$')
80n/a
811xmlns = re.compile('xmlns(?::(?P<ncname>'+_NCName+'))?$')
82n/a
83n/a# XML parser base class -- find tags and call handler functions.
84n/a# Usage: p = XMLParser(); p.feed(data); ...; p.close().
85n/a# The dtd is defined by deriving a class which defines methods with
86n/a# special names to handle tags: start_foo and end_foo to handle <foo>
87n/a# and </foo>, respectively. The data between tags is passed to the
88n/a# parser by calling self.handle_data() with some data as argument (the
89n/a# data may be split up in arbitrary chunks).
90n/a
912class XMLParser:
921 attributes = {} # default, to be overridden
931 elements = {} # default, to be overridden
94n/a
95n/a # parsing options, settable using keyword args in __init__
961 __accept_unquoted_attributes = 0
971 __accept_missing_endtag_name = 0
981 __map_case = 0
991 __accept_utf8 = 0
1001 __translate_attribute_references = 1
101n/a
102n/a # Interface -- initialize and reset this instance
1031 def __init__(self, **kw):
1042 self.__fixed = 0
1052 if 'accept_unquoted_attributes' in kw:
1060 self.__accept_unquoted_attributes = kw['accept_unquoted_attributes']
1072 if 'accept_missing_endtag_name' in kw:
1080 self.__accept_missing_endtag_name = kw['accept_missing_endtag_name']
1092 if 'map_case' in kw:
1100 self.__map_case = kw['map_case']
1112 if 'accept_utf8' in kw:
1120 self.__accept_utf8 = kw['accept_utf8']
1132 if 'translate_attribute_references' in kw:
1140 self.__translate_attribute_references = kw['translate_attribute_references']
1152 self.reset()
116n/a
1171 def __fixelements(self):
1182 self.__fixed = 1
1192 self.elements = {}
1202 self.__fixdict(self.__dict__)
1212 self.__fixclass(self.__class__)
122n/a
1231 def __fixclass(self, kl):
1243 self.__fixdict(kl.__dict__)
1254 for k in kl.__bases__:
1261 self.__fixclass(k)
127n/a
1281 def __fixdict(self, dict):
129124 for key in dict.keys():
130119 if key[:6] == 'start_':
1310 tag = key[6:]
1320 start, end = self.elements.get(tag, (None, None))
1330 if start is None:
1340 self.elements[tag] = getattr(self, key), end
135119 elif key[:4] == 'end_':
1360 tag = key[4:]
1370 start, end = self.elements.get(tag, (None, None))
1380 if end is None:
1390 self.elements[tag] = start, getattr(self, key)
140n/a
141n/a # Interface -- reset this instance. Loses all unprocessed data
1421 def reset(self):
1432 self.rawdata = ''
1442 self.stack = []
1452 self.nomoretags = 0
1462 self.literal = 0
1472 self.lineno = 1
1482 self.__at_start = 1
1492 self.__seen_doctype = None
1502 self.__seen_starttag = 0
1512 self.__use_namespaces = 0
1522 self.__namespaces = {'xml':None} # xml is implicitly declared
153n/a # backward compatibility hack: if elements not overridden,
154n/a # fill it in ourselves
1552 if self.elements is XMLParser.elements:
1562 self.__fixelements()
157n/a
158n/a # For derived classes only -- enter literal mode (CDATA) till EOF
1591 def setnomoretags(self):
1600 self.nomoretags = self.literal = 1
161n/a
162n/a # For derived classes only -- enter literal mode (CDATA)
1631 def setliteral(self, *args):
1640 self.literal = 1
165n/a
166n/a # Interface -- feed some data to the parser. Call this as
167n/a # often as you want, with as little or as much text as you
168n/a # want (may include '\n'). (This just saves the text, all the
169n/a # processing is done by goahead().)
1701 def feed(self, data):
171330 self.rawdata = self.rawdata + data
172330 self.goahead(0)
173n/a
174n/a # Interface -- handle the remaining data
1751 def close(self):
1762 self.goahead(1)
1772 if self.__fixed:
1782 self.__fixed = 0
179n/a # remove self.elements so that we don't leak
1802 del self.elements
181n/a
182n/a # Interface -- translate references
1831 def translate_references(self, data, all = 1):
1841 if not self.__translate_attribute_references:
1850 return data
1861 i = 0
1871 while 1:
1881 res = amp.search(data, i)
1891 if res is None:
1901 return data
1910 s = res.start(0)
1920 res = ref.match(data, s)
1930 if res is None:
1940 self.syntax_error("bogus `&'")
1950 i = s+1
1960 continue
1970 i = res.end(0)
1980 str = res.group(1)
1990 rescan = 0
2000 if str[0] == '#':
2010 if str[1] == 'x':
2020 str = chr(int(str[2:], 16))
203n/a else:
2040 str = chr(int(str[1:]))
2050 if data[i - 1] != ';':
2060 self.syntax_error("`;' missing after char reference")
2070 i = i-1
2080 elif all:
2090 if str in self.entitydefs:
2100 str = self.entitydefs[str]
2110 rescan = 1
2120 elif data[i - 1] != ';':
2130 self.syntax_error("bogus `&'")
2140 i = s + 1 # just past the &
2150 continue
216n/a else:
2170 self.syntax_error("reference to unknown entity `&%s;'" % str)
2180 str = '&' + str + ';'
2190 elif data[i - 1] != ';':
2200 self.syntax_error("bogus `&'")
2210 i = s + 1 # just past the &
2220 continue
223n/a
224n/a # when we get here, str contains the translated text and i points
225n/a # to the end of the string that is to be replaced
2260 data = data[:s] + str + data[i:]
2270 if rescan:
2280 i = s
229n/a else:
2300 i = s + len(str)
231n/a
232n/a # Interface - return a dictionary of all namespaces currently valid
2331 def getnamespace(self):
2340 nsdict = {}
2350 for t, d, nst in self.stack:
2360 nsdict.update(d)
2370 return nsdict
238n/a
239n/a # Internal -- handle data as far as reasonable. May leave state
240n/a # and data to be processed by a subsequent call. If 'end' is
241n/a # true, force handling all data as if followed by EOF marker.
2421 def goahead(self, end):
243332 rawdata = self.rawdata
244332 i = 0
245332 n = len(rawdata)
246339 while i < n:
247330 if i > 0:
2480 self.__at_start = 0
249330 if self.nomoretags:
2500 data = rawdata[i:n]
2510 self.handle_data(data)
2520 self.lineno = self.lineno + data.count('\n')
2530 i = n
2540 break
255330 res = interesting.search(rawdata, i)
256330 if res:
257312 j = res.start(0)
258n/a else:
25918 j = n
260330 if i < j:
26118 data = rawdata[i:j]
26218 if self.__at_start and space.match(data) is None:
2630 self.syntax_error('illegal data at start of file')
26418 self.__at_start = 0
26518 if not self.stack and space.match(data) is None:
2660 self.syntax_error('data not in content')
26718 if not self.__accept_utf8 and illegal.search(data):
2680 self.syntax_error('illegal character in content')
26918 self.handle_data(data)
27018 self.lineno = self.lineno + data.count('\n')
271330 i = j
272330 if i == n: break
273312 if rawdata[i] == '<':
274312 if starttagopen.match(rawdata, i):
27510 if self.literal:
2760 data = rawdata[i]
2770 self.handle_data(data)
2780 self.lineno = self.lineno + data.count('\n')
2790 i = i+1
2800 continue
28110 k = self.parse_starttag(i)
28210 if k < 0: break
2832 self.__seen_starttag = 1
2842 self.lineno = self.lineno + rawdata[i:k].count('\n')
2852 i = k
2862 continue
287302 if endtagopen.match(rawdata, i):
28810 k = self.parse_endtag(i)
28910 if k < 0: break
2901 self.lineno = self.lineno + rawdata[i:k].count('\n')
2911 i = k
2921 continue
293292 if commentopen.match(rawdata, i):
294105 if self.literal:
2950 data = rawdata[i]
2960 self.handle_data(data)
2970 self.lineno = self.lineno + data.count('\n')
2980 i = i+1
2990 continue
300105 k = self.parse_comment(i)
301105 if k < 0: break
3021 self.lineno = self.lineno + rawdata[i:k].count('\n')
3031 i = k
3041 continue
305187 if cdataopen.match(rawdata, i):
3060 k = self.parse_cdata(i)
3070 if k < 0: break
3080 self.lineno = self.lineno + rawdata[i:k].count('\n')
3090 i = k
3100 continue
311187 res = xmldecl.match(rawdata, i)
312187 if res:
3131 if not self.__at_start:
3140 self.syntax_error("<?xml?> declaration not at start of document")
3151 version, encoding, standalone = res.group('version',
3161 'encoding',
3171 'standalone')
3181 if version[1:-1] != '1.0':
3190 raise Error('only XML version 1.0 supported')
3201 if encoding: encoding = encoding[1:-1]
3211 if standalone: standalone = standalone[1:-1]
3221 self.handle_xml(encoding, standalone)
3231 i = res.end(0)
3241 continue
325186 res = procopen.match(rawdata, i)
326186 if res:
327122 k = self.parse_proc(i)
328122 if k < 0: break
3291 self.lineno = self.lineno + rawdata[i:k].count('\n')
3301 i = k
3311 continue
33264 res = doctype.match(rawdata, i)
33364 if res:
33445 if self.literal:
3350 data = rawdata[i]
3360 self.handle_data(data)
3370 self.lineno = self.lineno + data.count('\n')
3380 i = i+1
3390 continue
34045 if self.__seen_doctype:
3410 self.syntax_error('multiple DOCTYPE elements')
34245 if self.__seen_starttag:
3430 self.syntax_error('DOCTYPE not at beginning of document')
34445 k = self.parse_doctype(res)
34545 if k < 0: break
3461 self.__seen_doctype = res.group('name')
3471 if self.__map_case:
3480 self.__seen_doctype = self.__seen_doctype.lower()
3491 self.lineno = self.lineno + rawdata[i:k].count('\n')
3501 i = k
3511 continue
3520 elif rawdata[i] == '&':
3530 if self.literal:
3540 data = rawdata[i]
3550 self.handle_data(data)
3560 i = i+1
3570 continue
3580 res = charref.match(rawdata, i)
3590 if res is not None:
3600 i = res.end(0)
3610 if rawdata[i-1] != ';':
3620 self.syntax_error("`;' missing in charref")
3630 i = i-1
3640 if not self.stack:
3650 self.syntax_error('data not in content')
3660 self.handle_charref(res.group('char')[:-1])
3670 self.lineno = self.lineno + res.group(0).count('\n')
3680 continue
3690 res = entityref.match(rawdata, i)
3700 if res is not None:
3710 i = res.end(0)
3720 if rawdata[i-1] != ';':
3730 self.syntax_error("`;' missing in entityref")
3740 i = i-1
3750 name = res.group('name')
3760 if self.__map_case:
3770 name = name.lower()
3780 if name in self.entitydefs:
3790 self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
3800 n = len(rawdata)
3810 i = res.start(0)
382n/a else:
3830 self.unknown_entityref(name)
3840 self.lineno = self.lineno + res.group(0).count('\n')
3850 continue
3860 elif rawdata[i] == ']':
3870 if self.literal:
3880 data = rawdata[i]
3890 self.handle_data(data)
3900 i = i+1
3910 continue
3920 if n-i < 3:
3930 break
3940 if cdataclose.match(rawdata, i):
3950 self.syntax_error("bogus `]]>'")
3960 self.handle_data(rawdata[i])
3970 i = i+1
3980 continue
399n/a else:
4000 raise Error('neither < nor & ??')
401n/a # We get here only if incomplete matches but
402n/a # nothing else
40319 break
404n/a # end while
405332 if i > 0:
40625 self.__at_start = 0
407332 if end and i < n:
4080 data = rawdata[i]
4090 self.syntax_error("bogus `%s'" % data)
4100 if not self.__accept_utf8 and illegal.search(data):
4110 self.syntax_error('illegal character in content')
4120 self.handle_data(data)
4130 self.lineno = self.lineno + data.count('\n')
4140 self.rawdata = rawdata[i+1:]
4150 return self.goahead(end)
416332 self.rawdata = rawdata[i:]
417332 if end:
4182 if not self.__seen_starttag:
4190 self.syntax_error('no elements in file')
4202 if self.stack:
4210 self.syntax_error('missing end tags')
4220 while self.stack:
4230 self.finish_endtag(self.stack[-1][0])
424n/a
425n/a # Internal -- parse comment, return length or -1 if not terminated
4261 def parse_comment(self, i):
427105 rawdata = self.rawdata
428105 if rawdata[i:i+4] != '<!--':
4290 raise Error('unexpected call to handle_comment')
430105 res = commentclose.search(rawdata, i+4)
431105 if res is None:
432104 return -1
4331 if doubledash.search(rawdata, i+4, res.start(0)):
4340 self.syntax_error("`--' inside comment")
4351 if rawdata[res.start(0)-1] == '-':
4360 self.syntax_error('comment cannot end in three dashes')
4371 if not self.__accept_utf8 and \
4381 illegal.search(rawdata, i+4, res.start(0)):
4390 self.syntax_error('illegal character in comment')
4401 self.handle_comment(rawdata[i+4: res.start(0)])
4411 return res.end(0)
442n/a
443n/a # Internal -- handle DOCTYPE tag, return length or -1 if not terminated
4441 def parse_doctype(self, res):
44545 rawdata = self.rawdata
44645 n = len(rawdata)
44745 name = res.group('name')
44845 if self.__map_case:
4490 name = name.lower()
45045 pubid, syslit = res.group('pubid', 'syslit')
45145 if pubid is not None:
4520 pubid = pubid[1:-1] # remove quotes
4530 pubid = ' '.join(pubid.split()) # normalize
45445 if syslit is not None: syslit = syslit[1:-1] # remove quotes
45545 j = k = res.end(0)
45645 if k >= n:
4579 return -1
45836 if rawdata[k] == '[':
45936 level = 0
46036 k = k+1
46136 dq = sq = 0
462663 while k < n:
463629 c = rawdata[k]
464629 if not sq and c == '"':
4650 dq = not dq
466629 elif not dq and c == "'":
4670 sq = not sq
468629 elif sq or dq:
4690 pass
470629 elif level <= 0 and c == ']':
4712 res = endbracket.match(rawdata, k+1)
4722 if res is None:
4731 return -1
4741 self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])
4751 return res.end(0)
476627 elif c == '<':
47732 level = level + 1
478595 elif c == '>':
4794 level = level - 1
4804 if level < 0:
4810 self.syntax_error("bogus `>' in DOCTYPE")
482627 k = k+1
48334 res = endbracketfind.match(rawdata, k)
48434 if res is None:
48534 return -1
4860 if endbracket.match(rawdata, k) is None:
4870 self.syntax_error('garbage in DOCTYPE')
4880 self.handle_doctype(name, pubid, syslit, None)
4890 return res.end(0)
490n/a
491n/a # Internal -- handle CDATA tag, return length or -1 if not terminated
4921 def parse_cdata(self, i):
4930 rawdata = self.rawdata
4940 if rawdata[i:i+9] != '<![CDATA[':
4950 raise Error('unexpected call to parse_cdata')
4960 res = cdataclose.search(rawdata, i+9)
4970 if res is None:
4980 return -1
4990 if not self.__accept_utf8 and \
5000 illegal.search(rawdata, i+9, res.start(0)):
5010 self.syntax_error('illegal character in CDATA')
5020 if not self.stack:
5030 self.syntax_error('CDATA not in content')
5040 self.handle_cdata(rawdata[i+9:res.start(0)])
5050 return res.end(0)
506n/a
5071 __xml_namespace_attributes = {'ns':None, 'src':None, 'prefix':None}
508n/a # Internal -- handle a processing instruction tag
5091 def parse_proc(self, i):
510122 rawdata = self.rawdata
511122 end = procclose.search(rawdata, i)
512122 if end is None:
513121 return -1
5141 j = end.start(0)
5151 if not self.__accept_utf8 and illegal.search(rawdata, i+2, j):
5160 self.syntax_error('illegal character in processing instruction')
5171 res = tagfind.match(rawdata, i+2)
5181 if res is None:
5190 raise Error('unexpected call to parse_proc')
5201 k = res.end(0)
5211 name = res.group(0)
5221 if self.__map_case:
5230 name = name.lower()
5241 if name == 'xml:namespace':
5250 self.syntax_error('old-fashioned namespace declaration')
5260 self.__use_namespaces = -1
527n/a # namespace declaration
528n/a # this must come after the <?xml?> declaration (if any)
529n/a # and before the <!DOCTYPE> (if any).
5300 if self.__seen_doctype or self.__seen_starttag:
5310 self.syntax_error('xml:namespace declaration too late in document')
5320 attrdict, namespace, k = self.parse_attributes(name, k, j)
5330 if namespace:
5340 self.syntax_error('namespace declaration inside namespace declaration')
5350 for attrname in attrdict.keys():
5360 if not attrname in self.__xml_namespace_attributes:
5370 self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname)
5380 if not 'ns' in attrdict or not 'prefix' in attrdict:
5390 self.syntax_error('xml:namespace without required attributes')
5400 prefix = attrdict.get('prefix')
5410 if ncname.match(prefix) is None:
5420 self.syntax_error('xml:namespace illegal prefix value')
5430 return end.end(0)
5440 if prefix in self.__namespaces:
5450 self.syntax_error('xml:namespace prefix not unique')
5460 self.__namespaces[prefix] = attrdict['ns']
547n/a else:
5481 if name.lower() == 'xml':
5490 self.syntax_error('illegal processing instruction target name')
5501 self.handle_proc(name, rawdata[k:j])
5511 return end.end(0)
552n/a
553n/a # Internal -- parse attributes between i and j
5541 def parse_attributes(self, tag, i, j):
5552 rawdata = self.rawdata
5562 attrdict = {}
5572 namespace = {}
5584 while i < j:
5592 res = attrfind.match(rawdata, i)
5602 if res is None:
5610 break
5622 attrname, attrvalue = res.group('name', 'value')
5632 if self.__map_case:
5640 attrname = attrname.lower()
5652 i = res.end(0)
5662 if attrvalue is None:
5670 self.syntax_error("no value specified for attribute `%s'" % attrname)
5680 attrvalue = attrname
5692 elif attrvalue[:1] == "'" == attrvalue[-1:] or \
5700 attrvalue[:1] == '"' == attrvalue[-1:]:
5712 attrvalue = attrvalue[1:-1]
5720 elif not self.__accept_unquoted_attributes:
5730 self.syntax_error("attribute `%s' value not quoted" % attrname)
5742 res = xmlns.match(attrname)
5752 if res is not None:
576n/a # namespace declaration
5771 ncname = res.group('ncname')
5781 namespace[ncname or ''] = attrvalue or None
5791 if not self.__use_namespaces:
5801 self.__use_namespaces = len(self.stack)+1
5810 continue
5821 if '<' in attrvalue:
5830 self.syntax_error("`<' illegal in attribute value")
5841 if attrname in attrdict:
5850 self.syntax_error("attribute `%s' specified twice" % attrname)
5861 attrvalue = attrvalue.translate(attrtrans)
5871 attrdict[attrname] = self.translate_references(attrvalue)
5882 return attrdict, namespace, i
589n/a
590n/a # Internal -- handle starttag, return length or -1 if not terminated
5911 def parse_starttag(self, i):
59210 rawdata = self.rawdata
593n/a # i points to start of tag
59410 end = endbracketfind.match(rawdata, i+1)
59510 if end is None:
5968 return -1
5972 tag = starttagmatch.match(rawdata, i)
5982 if tag is None or tag.end(0) != end.end(0):
5990 self.syntax_error('garbage in starttag')
6000 return end.end(0)
6012 nstag = tagname = tag.group('tagname')
6022 if self.__map_case:
6030 nstag = tagname = nstag.lower()
6042 if not self.__seen_starttag and self.__seen_doctype and \
6051 tagname != self.__seen_doctype:
6060 self.syntax_error('starttag does not match DOCTYPE')
6072 if self.__seen_starttag and not self.stack:
6080 self.syntax_error('multiple elements on top level')
6092 k, j = tag.span('attrs')
6102 attrdict, nsdict, k = self.parse_attributes(tagname, k, j)
6112 self.stack.append((tagname, nsdict, nstag))
6122 if self.__use_namespaces:
6131 res = qname.match(tagname)
614n/a else:
6151 res = None
6162 if res is not None:
6171 prefix, nstag = res.group('prefix', 'local')
6181 if prefix is None:
6191 prefix = ''
6201 ns = None
6212 for t, d, nst in self.stack:
6221 if prefix in d:
6231 ns = d[prefix]
6241 if ns is None and prefix != '':
6250 ns = self.__namespaces.get(prefix)
6261 if ns is not None:
6271 nstag = ns + ' ' + nstag
6280 elif prefix != '':
6290 nstag = prefix + ':' + nstag # undo split
6301 self.stack[-1] = tagname, nsdict, nstag
631n/a # translate namespace of attributes
6322 attrnamemap = {} # map from new name to old name (used for error reporting)
6333 for key in attrdict.keys():
6341 attrnamemap[key] = key
6352 if self.__use_namespaces:
6361 nattrdict = {}
6372 for key, val in attrdict.items():
6381 okey = key
6391 res = qname.match(key)
6401 if res is not None:
6411 aprefix, key = res.group('prefix', 'local')
6421 if self.__map_case:
6430 key = key.lower()
6441 if aprefix is not None:
6450 ans = None
6460 for t, d, nst in self.stack:
6470 if aprefix in d:
6480 ans = d[aprefix]
6490 if ans is None:
6500 ans = self.__namespaces.get(aprefix)
6510 if ans is not None:
6520 key = ans + ' ' + key
653n/a else:
6540 key = aprefix + ':' + key
6551 nattrdict[key] = val
6561 attrnamemap[key] = okey
6571 attrdict = nattrdict
6582 attributes = self.attributes.get(nstag)
6592 if attributes is not None:
6600 for key in attrdict.keys():
6610 if not key in attributes:
6620 self.syntax_error("unknown attribute `%s' in tag `%s'" % (attrnamemap[key], tagname))
6630 for key, val in attributes.items():
6640 if val is not None and not key in attrdict:
6650 attrdict[key] = val
6662 method = self.elements.get(nstag, (None, None))[0]
6672 self.finish_starttag(nstag, attrdict, method)
6682 if tag.group('slash') == '/':
6691 self.finish_endtag(tagname)
6702 return tag.end(0)
671n/a
672n/a # Internal -- parse endtag
6731 def parse_endtag(self, i):
67410 rawdata = self.rawdata
67510 end = endbracketfind.match(rawdata, i+1)
67610 if end is None:
6779 return -1
6781 res = tagfind.match(rawdata, i+2)
6791 if res is None:
6800 if self.literal:
6810 self.handle_data(rawdata[i])
6820 return i+1
6830 if not self.__accept_missing_endtag_name:
6840 self.syntax_error('no name specified in end tag')
6850 tag = self.stack[-1][0]
6860 k = i+2
687n/a else:
6881 tag = res.group(0)
6891 if self.__map_case:
6900 tag = tag.lower()
6911 if self.literal:
6920 if not self.stack or tag != self.stack[-1][0]:
6930 self.handle_data(rawdata[i])
6940 return i+1
6951 k = res.end(0)
6961 if endbracket.match(rawdata, k) is None:
6970 self.syntax_error('garbage in end tag')
6981 self.finish_endtag(tag)
6991 return end.end(0)
700n/a
701n/a # Internal -- finish processing of start tag
7021 def finish_starttag(self, tagname, attrdict, method):
7032 if method is not None:
7040 self.handle_starttag(tagname, method, attrdict)
705n/a else:
7062 self.unknown_starttag(tagname, attrdict)
707n/a
708n/a # Internal -- finish processing of end tag
7091 def finish_endtag(self, tag):
7102 self.literal = 0
7112 if not tag:
7120 self.syntax_error('name-less end tag')
7130 found = len(self.stack) - 1
7140 if found < 0:
7150 self.unknown_endtag(tag)
7160 return
717n/a else:
7182 found = -1
7194 for i in range(len(self.stack)):
7202 if tag == self.stack[i][0]:
7212 found = i
7222 if found == -1:
7230 self.syntax_error('unopened end tag')
7240 return
7254 while len(self.stack) > found:
7262 if found < len(self.stack) - 1:
7270 self.syntax_error('missing close tag for %s' % self.stack[-1][2])
7282 nstag = self.stack[-1][2]
7292 method = self.elements.get(nstag, (None, None))[1]
7302 if method is not None:
7310 self.handle_endtag(nstag, method)
732n/a else:
7332 self.unknown_endtag(nstag)
7342 if self.__use_namespaces == len(self.stack):
7351 self.__use_namespaces = 0
7362 del self.stack[-1]
737n/a
738n/a # Overridable -- handle xml processing instruction
7391 def handle_xml(self, encoding, standalone):
7401 pass
741n/a
742n/a # Overridable -- handle DOCTYPE
7431 def handle_doctype(self, tag, pubid, syslit, data):
7441 pass
745n/a
746n/a # Overridable -- handle start tag
7471 def handle_starttag(self, tag, method, attrs):
7480 method(attrs)
749n/a
750n/a # Overridable -- handle end tag
7511 def handle_endtag(self, tag, method):
7520 method()
753n/a
754n/a # Example -- handle character reference, no need to override
7551 def handle_charref(self, name):
7560 try:
7570 if name[0] == 'x':
7580 n = int(name[1:], 16)
759n/a else:
7600 n = int(name)
7610 except ValueError:
7620 self.unknown_charref(name)
7630 return
7640 if not 0 <= n <= 255:
7650 self.unknown_charref(name)
7660 return
7670 self.handle_data(chr(n))
768n/a
769n/a # Definition of entities -- derived classes may override
7701 entitydefs = {'lt': '&#60;', # must use charref
7711 'gt': '&#62;',
7721 'amp': '&#38;', # must use charref
7731 'quot': '&#34;',
7741 'apos': '&#39;',
775n/a }
776n/a
777n/a # Example -- handle data, should be overridden
7781 def handle_data(self, data):
77918 pass
780n/a
781n/a # Example -- handle cdata, could be overridden
7821 def handle_cdata(self, data):
7830 pass
784n/a
785n/a # Example -- handle comment, could be overridden
7861 def handle_comment(self, data):
7871 pass
788n/a
789n/a # Example -- handle processing instructions, could be overridden
7901 def handle_proc(self, name, data):
7911 pass
792n/a
793n/a # Example -- handle relatively harmless syntax errors, could be overridden
7941 def syntax_error(self, message):
7950 raise Error('Syntax error at line %d: %s' % (self.lineno, message))
796n/a
797n/a # To be overridden -- handlers for unknown objects
7982 def unknown_starttag(self, tag, attrs): pass
7993 def unknown_endtag(self, tag): pass
8001 def unknown_charref(self, ref): pass
8011 def unknown_entityref(self, name):
8020 self.syntax_error("reference to unknown entity `&%s;'" % name)
803n/a
804n/a
8052class TestXMLParser(XMLParser):
806n/a
8071 def __init__(self, **kw):
8080 self.testdata = ""
8090 XMLParser.__init__(self, **kw)
810n/a
8111 def handle_xml(self, encoding, standalone):
8120 self.flush()
8130 print 'xml: encoding =',encoding,'standalone =',standalone
814n/a
8151 def handle_doctype(self, tag, pubid, syslit, data):
8160 self.flush()
8170 print 'DOCTYPE:',tag, repr(data)
818n/a
8191 def handle_data(self, data):
8200 self.testdata = self.testdata + data
8210 if len(repr(self.testdata)) >= 70:
8220 self.flush()
823n/a
8241 def flush(self):
8250 data = self.testdata
8260 if data:
8270 self.testdata = ""
8280 print 'data:', repr(data)
829n/a
8301 def handle_cdata(self, data):
8310 self.flush()
8320 print 'cdata:', repr(data)
833n/a
8341 def handle_proc(self, name, data):
8350 self.flush()
8360 print 'processing:',name,repr(data)
837n/a
8381 def handle_comment(self, data):
8390 self.flush()
8400 r = repr(data)
8410 if len(r) > 68:
8420 r = r[:32] + '...' + r[-32:]
8430 print 'comment:', r
844n/a
8451 def syntax_error(self, message):
8460 print 'error at line %d:' % self.lineno, message
847n/a
8481 def unknown_starttag(self, tag, attrs):
8490 self.flush()
8500 if not attrs:
8510 print 'start tag: <' + tag + '>'
852n/a else:
8530 print 'start tag: <' + tag,
8540 for name, value in attrs.items():
8550 print name + '=' + '"' + value + '"',
8560 print '>'
857n/a
8581 def unknown_endtag(self, tag):
8590 self.flush()
8600 print 'end tag: </' + tag + '>'
861n/a
8621 def unknown_entityref(self, ref):
8630 self.flush()
8640 print '*** unknown entity ref: &' + ref + ';'
865n/a
8661 def unknown_charref(self, ref):
8670 self.flush()
8680 print '*** unknown char ref: &#' + ref + ';'
869n/a
8701 def close(self):
8710 XMLParser.close(self)
8720 self.flush()
873n/a
8741def test(args = None):
8750 import sys, getopt
8760 from time import time
877n/a
8780 if not args:
8790 args = sys.argv[1:]
880n/a
8810 opts, args = getopt.getopt(args, 'st')
8820 klass = TestXMLParser
8830 do_time = 0
8840 for o, a in opts:
8850 if o == '-s':
8860 klass = XMLParser
8870 elif o == '-t':
8880 do_time = 1
889n/a
8900 if args:
8910 file = args[0]
892n/a else:
8930 file = 'test.xml'
894n/a
8950 if file == '-':
8960 f = sys.stdin
897n/a else:
8980 try:
8990 f = open(file, 'r')
9000 except IOError, msg:
9010 print file, ":", msg
9020 sys.exit(1)
903n/a
9040 data = f.read()
9050 if f is not sys.stdin:
9060 f.close()
907n/a
9080 x = klass()
9090 t0 = time()
9100 try:
9110 if do_time:
9120 x.feed(data)
9130 x.close()
914n/a else:
9150 for c in data:
9160 x.feed(c)
9170 x.close()
9180 except Error, msg:
9190 t1 = time()
9200 print msg
9210 if do_time:
9220 print 'total time: %g' % (t1-t0)
9230 sys.exit(1)
9240 t1 = time()
9250 if do_time:
9260 print 'total time: %g' % (t1-t0)
927n/a
928n/a
9291if __name__ == '__main__':
9300 test()