| 1 | 1 | """A parser for XML, using the derived class as static DTD.""" |
|---|
| 2 | n/a | |
|---|
| 3 | n/a | # Author: Sjoerd Mullender. |
|---|
| 4 | n/a | |
|---|
| 5 | 1 | import re |
|---|
| 6 | 1 | import string |
|---|
| 7 | n/a | |
|---|
| 8 | 1 | import warnings |
|---|
# Warn at import time that this module is obsolete.  The third positional
# argument of warnings.warn() is the stack level (2 here).
warnings.warn("The xmllib module is obsolete. Use xml.sax instead.",
              DeprecationWarning, 2)
# The warnings module is not needed after this point.
del warnings

# Version string of this module.
version = '0.3'
|---|
| 14 | n/a | |
|---|
class Error(RuntimeError):
    """Exception type raised by this module's parser code."""
|---|
| 17 | n/a | |
|---|
# Regular expressions used for parsing.
#
# The building blocks (_S, _opS, _Name, _QStr) are plain pattern strings
# that are concatenated into the compiled expressions below.  Literals that
# contain regex backslash escapes are written as raw strings (or with the
# backslash doubled) so that Python never has to guess at an unknown string
# escape; the resulting pattern text is unchanged.

_S = '[ \t\r\n]+'                       # white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string
illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
interesting = re.compile('[]&<]')       # characters that end a run of plain data

amp = re.compile('&')
# The three reference patterns below also consume the single character that
# follows the reference, so callers can check whether it was a ';'.
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
space = re.compile(_S + '$')
newline = re.compile('\n')

attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')' +
    '(' + _opS + '=' + _opS +
    '(?P<value>' + _QStr + r'|[-a-zA-Z0-9.:+*%?!\(\)_#=~]+))?')
starttagopen = re.compile('<' + _Name)
starttagend = re.compile(_opS + '(?P<slash>/?)>')
starttagmatch = re.compile('<(?P<tagname>' + _Name + ')' +
                           '(?P<attrs>(?:' + attrfind.pattern + ')*)' +
                           starttagend.pattern)
endtagopen = re.compile('</')
endbracket = re.compile(_opS + '>')
endbracketfind = re.compile('(?:[^>\'"]|' + _QStr + ')*>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
# this matches one of the following:
# SYSTEM SystemLiteral
# PUBLIC PubidLiteral SystemLiteral
_SystemLiteral = '(?P<%s>' + _QStr + ')'
_PublicLiteral = '(?P<%s>"[-\'\\(\\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                 "'[-\\(\\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId = '(?:SYSTEM|' \
              'PUBLIC' + _S + _PublicLiteral % 'pubid' + \
              ')' + _S + _SystemLiteral % 'syslit'
doctype = re.compile('<!DOCTYPE' + _S + '(?P<name>' + _Name + ')' +
                     '(?:' + _S + _ExternalId + ')?' + _opS)
xmldecl = re.compile(r'<\?xml' + _S +
                     'version' + _opS + '=' + _opS +
                     '(?P<version>' + _QStr + ')' +
                     '(?:' + _S + 'encoding' + _opS + '=' + _opS +
                     "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                     '"[A-Za-z][-A-Za-z0-9._]*"))?'
                     '(?:' + _S + 'standalone' + _opS + '=' + _opS +
                     '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?' +
                     _opS + r'\?>')
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(_opS + r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')

# Translation table that turns every white space character inside an
# attribute value into a plain space.  maketrans() requires both arguments
# to have the same length, so the replacement is four spaces -- one for each
# of ' ', '\r', '\n' and '\t'.
try:
    _maketrans = string.maketrans       # byte-string table (Python 2)
except AttributeError:
    _maketrans = str.maketrans          # str.translate() mapping
attrtrans = _maketrans(' \r\n\t', '    ')

# definitions for XML namespaces
_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*'    # XML Name, minus the ":"
ncname = re.compile(_NCName + '$')
qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' + # optional prefix
                   '(?P<local>' + _NCName + ')$')

xmlns = re.compile('xmlns(?::(?P<ncname>' + _NCName + '))?$')
|---|
| 82 | n/a | |
|---|
| 83 | n/a | # XML parser base class -- find tags and call handler functions. |
|---|
| 84 | n/a | # Usage: p = XMLParser(); p.feed(data); ...; p.close(). |
|---|
| 85 | n/a | # The dtd is defined by deriving a class which defines methods with |
|---|
| 86 | n/a | # special names to handle tags: start_foo and end_foo to handle <foo> |
|---|
| 87 | n/a | # and </foo>, respectively. The data between tags is passed to the |
|---|
| 88 | n/a | # parser by calling self.handle_data() with some data as argument (the |
|---|
| 89 | n/a | # data may be split up in arbitrary chunks). |
|---|
| 90 | n/a | |
|---|
class XMLParser:
    """Base class for XML parsers: find markup and call handler methods.

    The class-level names below are defaults.  The five name-mangled
    options can be overridden per instance through keyword arguments of
    the same (unmangled) name passed to __init__.
    """
    attributes = {}                     # default, to be overridden
    elements = {}                       # default, to be overridden

    # parsing options, settable using keyword args in __init__
    __accept_unquoted_attributes = 0    # if true, unquoted attribute values are not reported
    __accept_missing_endtag_name = 0    # if true, an end tag without a name is not reported
    __map_case = 0                      # if true, names are lower-cased before use
    __accept_utf8 = 0                   # if true, the illegal-character checks are skipped
    __translate_attribute_references = 1 # if false, translate_references() returns its input as is
|---|
| 101 | n/a | |
|---|
| 102 | n/a | # Interface -- initialize and reset this instance |
|---|
| 103 | 1 | def __init__(self, **kw): |
|---|
| 104 | 2 | self.__fixed = 0 |
|---|
| 105 | 2 | if 'accept_unquoted_attributes' in kw: |
|---|
| 106 | 0 | self.__accept_unquoted_attributes = kw['accept_unquoted_attributes'] |
|---|
| 107 | 2 | if 'accept_missing_endtag_name' in kw: |
|---|
| 108 | 0 | self.__accept_missing_endtag_name = kw['accept_missing_endtag_name'] |
|---|
| 109 | 2 | if 'map_case' in kw: |
|---|
| 110 | 0 | self.__map_case = kw['map_case'] |
|---|
| 111 | 2 | if 'accept_utf8' in kw: |
|---|
| 112 | 0 | self.__accept_utf8 = kw['accept_utf8'] |
|---|
| 113 | 2 | if 'translate_attribute_references' in kw: |
|---|
| 114 | 0 | self.__translate_attribute_references = kw['translate_attribute_references'] |
|---|
| 115 | 2 | self.reset() |
|---|
| 116 | n/a | |
|---|
| 117 | 1 | def __fixelements(self): |
|---|
| 118 | 2 | self.__fixed = 1 |
|---|
| 119 | 2 | self.elements = {} |
|---|
| 120 | 2 | self.__fixdict(self.__dict__) |
|---|
| 121 | 2 | self.__fixclass(self.__class__) |
|---|
| 122 | n/a | |
|---|
| 123 | 1 | def __fixclass(self, kl): |
|---|
| 124 | 3 | self.__fixdict(kl.__dict__) |
|---|
| 125 | 4 | for k in kl.__bases__: |
|---|
| 126 | 1 | self.__fixclass(k) |
|---|
| 127 | n/a | |
|---|
| 128 | 1 | def __fixdict(self, dict): |
|---|
| 129 | 124 | for key in dict.keys(): |
|---|
| 130 | 119 | if key[:6] == 'start_': |
|---|
| 131 | 0 | tag = key[6:] |
|---|
| 132 | 0 | start, end = self.elements.get(tag, (None, None)) |
|---|
| 133 | 0 | if start is None: |
|---|
| 134 | 0 | self.elements[tag] = getattr(self, key), end |
|---|
| 135 | 119 | elif key[:4] == 'end_': |
|---|
| 136 | 0 | tag = key[4:] |
|---|
| 137 | 0 | start, end = self.elements.get(tag, (None, None)) |
|---|
| 138 | 0 | if end is None: |
|---|
| 139 | 0 | self.elements[tag] = start, getattr(self, key) |
|---|
| 140 | n/a | |
|---|
| 141 | n/a | # Interface -- reset this instance. Loses all unprocessed data |
|---|
| 142 | 1 | def reset(self): |
|---|
| 143 | 2 | self.rawdata = '' |
|---|
| 144 | 2 | self.stack = [] |
|---|
| 145 | 2 | self.nomoretags = 0 |
|---|
| 146 | 2 | self.literal = 0 |
|---|
| 147 | 2 | self.lineno = 1 |
|---|
| 148 | 2 | self.__at_start = 1 |
|---|
| 149 | 2 | self.__seen_doctype = None |
|---|
| 150 | 2 | self.__seen_starttag = 0 |
|---|
| 151 | 2 | self.__use_namespaces = 0 |
|---|
| 152 | 2 | self.__namespaces = {'xml':None} # xml is implicitly declared |
|---|
| 153 | n/a | # backward compatibility hack: if elements not overridden, |
|---|
| 154 | n/a | # fill it in ourselves |
|---|
| 155 | 2 | if self.elements is XMLParser.elements: |
|---|
| 156 | 2 | self.__fixelements() |
|---|
| 157 | n/a | |
|---|
| 158 | n/a | # For derived classes only -- enter literal mode (CDATA) till EOF |
|---|
| 159 | 1 | def setnomoretags(self): |
|---|
| 160 | 0 | self.nomoretags = self.literal = 1 |
|---|
| 161 | n/a | |
|---|
| 162 | n/a | # For derived classes only -- enter literal mode (CDATA) |
|---|
| 163 | 1 | def setliteral(self, *args): |
|---|
| 164 | 0 | self.literal = 1 |
|---|
| 165 | n/a | |
|---|
| 166 | n/a | # Interface -- feed some data to the parser. Call this as |
|---|
| 167 | n/a | # often as you want, with as little or as much text as you |
|---|
| 168 | n/a | # want (may include '\n'). (This just saves the text, all the |
|---|
| 169 | n/a | # processing is done by goahead().) |
|---|
| 170 | 1 | def feed(self, data): |
|---|
| 171 | 330 | self.rawdata = self.rawdata + data |
|---|
| 172 | 330 | self.goahead(0) |
|---|
| 173 | n/a | |
|---|
| 174 | n/a | # Interface -- handle the remaining data |
|---|
| 175 | 1 | def close(self): |
|---|
| 176 | 2 | self.goahead(1) |
|---|
| 177 | 2 | if self.__fixed: |
|---|
| 178 | 2 | self.__fixed = 0 |
|---|
| 179 | n/a | # remove self.elements so that we don't leak |
|---|
| 180 | 2 | del self.elements |
|---|
| 181 | n/a | |
|---|
| 182 | n/a | # Interface -- translate references |
|---|
def translate_references(self, data, all = 1):
    """Return *data* with character and entity references expanded.

    Character references (&#NN; and &#xHH;) are always replaced by the
    character they denote.  Entity references (&name;) are replaced by
    their definition in self.entitydefs only when *all* is true; when
    *all* is false a well-formed entity reference is left in the text
    exactly as written.  If the translate_attribute_references option is
    off, *data* is returned unchanged.

    Malformed references are reported through self.syntax_error().
    """
    if not self.__translate_attribute_references:
        return data
    i = 0
    while 1:
        res = amp.search(data, i)
        if res is None:
            return data
        s = res.start(0)
        res = ref.match(data, s)
        if res is None:
            self.syntax_error("bogus `&'")
            i = s+1
            continue
        # ``ref`` also consumes the character after the reference, so
        # data[i - 1] tells whether the reference was closed with ';'.
        i = res.end(0)
        text = res.group(1)
        terminated = data[i - 1] == ';'
        rescan = 0
        if text[0] == '#':
            if text[1] == 'x':
                text = chr(int(text[2:], 16))
            else:
                text = chr(int(text[1:]))
            if not terminated:
                self.syntax_error("`;' missing after char reference")
                i = i-1
        elif not all:
            if not terminated:
                self.syntax_error("bogus `&'")
                i = s + 1 # just past the &
            # A well-formed entity reference is skipped without being
            # rewritten (it used to lose its '&' and ';').
            continue
        elif text in self.entitydefs:
            text = self.entitydefs[text]
            rescan = 1
            if not terminated:
                # keep the character that ended the name instead of
                # dropping it together with the reference
                i = i-1
        elif not terminated:
            self.syntax_error("bogus `&'")
            i = s + 1 # just past the &
            continue
        else:
            self.syntax_error("reference to unknown entity `&%s;'" % text)
            text = '&' + text + ';'

        # when we get here, text contains the translated text and i points
        # to the end of the string that is to be replaced
        data = data[:s] + text + data[i:]
        if rescan:
            # the replacement text may itself contain references
            i = s
        else:
            i = s + len(text)
|---|
| 231 | n/a | |
|---|
| 232 | n/a | # Interface - return a dictionary of all namespaces currently valid |
|---|
| 233 | 1 | def getnamespace(self): |
|---|
| 234 | 0 | nsdict = {} |
|---|
| 235 | 0 | for t, d, nst in self.stack: |
|---|
| 236 | 0 | nsdict.update(d) |
|---|
| 237 | 0 | return nsdict |
|---|
| 238 | n/a | |
|---|
| 239 | n/a | # Internal -- handle data as far as reasonable. May leave state |
|---|
| 240 | n/a | # and data to be processed by a subsequent call. If 'end' is |
|---|
| 241 | n/a | # true, force handling all data as if followed by EOF marker. |
|---|
def goahead(self, end):
    """Process as much of self.rawdata as possible.

    Plain text is passed to handle_data(); markup is dispatched to the
    parse_* methods.  A construct that is not complete yet stops the loop
    and stays in self.rawdata for a later call.  If *end* is true the
    remaining data is handled as if followed by end of input: an
    unparsable leading character is reported and passed on as data, and
    missing elements / end tags are reported.
    """
    rawdata = self.rawdata
    i = 0
    n = len(rawdata)
    while i < n:
        if i > 0:
            self.__at_start = 0
        if self.nomoretags:
            # no markup is recognised any more: everything is data
            data = rawdata[i:n]
            self.handle_data(data)
            self.lineno = self.lineno + data.count('\n')
            i = n
            break
        # find the next ']', '&' or '<'; everything before it is data
        res = interesting.search(rawdata, i)
        if res:
            j = res.start(0)
        else:
            j = n
        if i < j:
            data = rawdata[i:j]
            if self.__at_start and space.match(data) is None:
                self.syntax_error('illegal data at start of file')
            self.__at_start = 0
            if not self.stack and space.match(data) is None:
                self.syntax_error('data not in content')
            if not self.__accept_utf8 and illegal.search(data):
                self.syntax_error('illegal character in content')
            self.handle_data(data)
            self.lineno = self.lineno + data.count('\n')
        i = j
        if i == n: break
        if rawdata[i] == '<':
            # Each parse_* call below returns the index just past the
            # construct, or a negative value if it is not complete yet.
            if starttagopen.match(rawdata, i):
                if self.literal:
                    # in literal mode the '<' is passed on as data
                    data = rawdata[i]
                    self.handle_data(data)
                    self.lineno = self.lineno + data.count('\n')
                    i = i+1
                    continue
                k = self.parse_starttag(i)
                if k < 0: break
                self.__seen_starttag = 1
                self.lineno = self.lineno + rawdata[i:k].count('\n')
                i = k
                continue
            if endtagopen.match(rawdata, i):
                k = self.parse_endtag(i)
                if k < 0: break
                self.lineno = self.lineno + rawdata[i:k].count('\n')
                i = k
                continue
            if commentopen.match(rawdata, i):
                if self.literal:
                    data = rawdata[i]
                    self.handle_data(data)
                    self.lineno = self.lineno + data.count('\n')
                    i = i+1
                    continue
                k = self.parse_comment(i)
                if k < 0: break
                self.lineno = self.lineno + rawdata[i:k].count('\n')
                i = k
                continue
            if cdataopen.match(rawdata, i):
                k = self.parse_cdata(i)
                if k < 0: break
                self.lineno = self.lineno + rawdata[i:k].count('\n')
                i = k
                continue
            res = xmldecl.match(rawdata, i)
            if res:
                if not self.__at_start:
                    self.syntax_error("<?xml?> declaration not at start of document")
                version, encoding, standalone = res.group('version',
                                                          'encoding',
                                                          'standalone')
                # the matched values still include their quotes
                if version[1:-1] != '1.0':
                    raise Error('only XML version 1.0 supported')
                if encoding: encoding = encoding[1:-1]
                if standalone: standalone = standalone[1:-1]
                self.handle_xml(encoding, standalone)
                # NOTE(review): unlike the other branches, self.lineno is
                # not advanced here for newlines inside the declaration.
                i = res.end(0)
                continue
            res = procopen.match(rawdata, i)
            if res:
                k = self.parse_proc(i)
                if k < 0: break
                self.lineno = self.lineno + rawdata[i:k].count('\n')
                i = k
                continue
            res = doctype.match(rawdata, i)
            if res:
                if self.literal:
                    data = rawdata[i]
                    self.handle_data(data)
                    self.lineno = self.lineno + data.count('\n')
                    i = i+1
                    continue
                if self.__seen_doctype:
                    self.syntax_error('multiple DOCTYPE elements')
                if self.__seen_starttag:
                    self.syntax_error('DOCTYPE not at beginning of document')
                k = self.parse_doctype(res)
                if k < 0: break
                self.__seen_doctype = res.group('name')
                if self.__map_case:
                    self.__seen_doctype = self.__seen_doctype.lower()
                self.lineno = self.lineno + rawdata[i:k].count('\n')
                i = k
                continue
        elif rawdata[i] == '&':
            if self.literal:
                data = rawdata[i]
                self.handle_data(data)
                i = i+1
                continue
            res = charref.match(rawdata, i)
            if res is not None:
                # the pattern also consumed the character after the
                # reference; step back over it if it was not a ';'
                i = res.end(0)
                if rawdata[i-1] != ';':
                    self.syntax_error("`;' missing in charref")
                    i = i-1
                if not self.stack:
                    self.syntax_error('data not in content')
                # [:-1] drops that trailing character from the group
                self.handle_charref(res.group('char')[:-1])
                self.lineno = self.lineno + res.group(0).count('\n')
                continue
            res = entityref.match(rawdata, i)
            if res is not None:
                i = res.end(0)
                if rawdata[i-1] != ';':
                    self.syntax_error("`;' missing in entityref")
                    i = i-1
                name = res.group('name')
                if self.__map_case:
                    name = name.lower()
                if name in self.entitydefs:
                    # splice the definition into the buffer and continue
                    # parsing at the start of the replacement text
                    self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
                    n = len(rawdata)
                    i = res.start(0)
                else:
                    self.unknown_entityref(name)
                self.lineno = self.lineno + res.group(0).count('\n')
                continue
        elif rawdata[i] == ']':
            if self.literal:
                data = rawdata[i]
                self.handle_data(data)
                i = i+1
                continue
            # need three characters to tell whether this is ']]>'
            if n-i < 3:
                break
            if cdataclose.match(rawdata, i):
                self.syntax_error("bogus `]]>'")
            self.handle_data(rawdata[i])
            i = i+1
            continue
        else:
            raise Error('neither < nor & ??')
        # We get here only if incomplete matches but
        # nothing else
        break
    # end while
    if i > 0:
        self.__at_start = 0
    if end and i < n:
        # At end of input the character at i cannot start anything valid:
        # report it, pass it on as data and process the rest.
        data = rawdata[i]
        self.syntax_error("bogus `%s'" % data)
        if not self.__accept_utf8 and illegal.search(data):
            self.syntax_error('illegal character in content')
        self.handle_data(data)
        self.lineno = self.lineno + data.count('\n')
        self.rawdata = rawdata[i+1:]
        return self.goahead(end)
    # keep whatever could not be processed for the next call
    self.rawdata = rawdata[i:]
    if end:
        if not self.__seen_starttag:
            self.syntax_error('no elements in file')
        if self.stack:
            self.syntax_error('missing end tags')
            while self.stack:
                self.finish_endtag(self.stack[-1][0])
|---|
| 424 | n/a | |
|---|
| 425 | n/a | # Internal -- parse comment, return length or -1 if not terminated |
|---|
def parse_comment(self, i):
    """Parse the comment that starts at self.rawdata[i].

    Returns the index just past the closing '-->', or -1 if the comment
    is not terminated in the data buffered so far.  The comment text is
    passed to self.handle_comment().  Raises Error if rawdata[i:] does not
    start with '<!--'.
    """
    rawdata = self.rawdata
    if rawdata[i:i+4] != '<!--':
        raise Error('unexpected call to parse_comment')
    res = commentclose.search(rawdata, i+4)
    if res is None:
        return -1
    start = i+4                 # first character of the comment text
    stop = res.start(0)         # index of the closing '-->'
    if doubledash.search(rawdata, start, stop):
        self.syntax_error("`--' inside comment")
    # Only look at the character before '-->' when the comment has any
    # text: for the empty comment '<!---->' that character is the last
    # dash of the opening '<!--', not part of the comment.
    if stop > start and rawdata[stop-1] == '-':
        self.syntax_error('comment cannot end in three dashes')
    if not self.__accept_utf8 and \
       illegal.search(rawdata, start, stop):
        self.syntax_error('illegal character in comment')
    self.handle_comment(rawdata[start:stop])
    return res.end(0)
|---|
| 442 | n/a | |
|---|
| 443 | n/a | # Internal -- handle DOCTYPE tag, return length or -1 if not terminated |
|---|
def parse_doctype(self, res):
    """Handle a DOCTYPE declaration.

    *res* is the match object of the ``doctype`` pattern on self.rawdata.
    Returns the index just past the closing '>', or -1 if the declaration
    is not terminated in the data buffered so far.  On success
    self.handle_doctype() is called with the name, the public and system
    identifiers (quotes removed, or None) and the text of the internal
    subset (None if there is none).
    """
    rawdata = self.rawdata
    n = len(rawdata)
    name = res.group('name')
    if self.__map_case:
        name = name.lower()
    pubid, syslit = res.group('pubid', 'syslit')
    if pubid is not None:
        pubid = pubid[1:-1]         # remove quotes
        pubid = ' '.join(pubid.split()) # normalize
    if syslit is not None: syslit = syslit[1:-1] # remove quotes
    j = k = res.end(0)
    if k >= n:
        return -1
    if rawdata[k] == '[':
        # Internal subset: scan for the matching ']'.  ``level`` counts
        # unclosed '<', ``dq``/``sq`` tell whether we are inside a
        # double- or single-quoted string, where nothing is special.
        level = 0
        k = k+1
        dq = sq = 0
        while k < n:
            c = rawdata[k]
            if not sq and c == '"':
                dq = not dq
            elif not dq and c == "'":
                sq = not sq
            elif sq or dq:
                pass
            elif level <= 0 and c == ']':
                res = endbracket.match(rawdata, k+1)
                if res is None:
                    return -1
                # rawdata[j] is the '[', rawdata[k] the ']'
                self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])
                return res.end(0)
            elif c == '<':
                level = level + 1
            elif c == '>':
                level = level - 1
                if level < 0:
                    self.syntax_error("bogus `>' in DOCTYPE")
            k = k+1
    # No internal subset, or the scan above ran out of data (k == n, in
    # which case the match below fails and -1 is returned).
    res = endbracketfind.match(rawdata, k)
    if res is None:
        return -1
    if endbracket.match(rawdata, k) is None:
        self.syntax_error('garbage in DOCTYPE')
    self.handle_doctype(name, pubid, syslit, None)
    return res.end(0)
|---|
| 490 | n/a | |
|---|
| 491 | n/a | # Internal -- handle CDATA tag, return length or -1 if not terminated |
|---|
def parse_cdata(self, i):
    """Parse the CDATA section that starts at self.rawdata[i].

    Returns the index just past the closing ']]>', or -1 if the section
    is not terminated in the data buffered so far.  The section's text is
    passed to self.handle_cdata().
    """
    rawdata = self.rawdata
    content_start = i+9         # just past '<![CDATA['
    if rawdata[i:content_start] != '<![CDATA[':
        raise Error('unexpected call to parse_cdata')
    closing = cdataclose.search(rawdata, content_start)
    if closing is None:
        return -1
    content_end = closing.start(0)
    if not (self.__accept_utf8 or
            illegal.search(rawdata, content_start, content_end) is None):
        self.syntax_error('illegal character in CDATA')
    if not self.stack:
        self.syntax_error('CDATA not in content')
    self.handle_cdata(rawdata[content_start:content_end])
    return closing.end(0)
|---|
| 506 | n/a | |
|---|
# attribute names accepted in an old-style <?xml:namespace ...?> declaration
__xml_namespace_attributes = {'ns':None, 'src':None, 'prefix':None}
# Internal -- handle a processing instruction tag
def parse_proc(self, i):
    """Parse the processing instruction that starts at self.rawdata[i].

    Returns the index just past the closing '?>', or -1 if the
    instruction is not terminated in the data buffered so far.  An
    old-style 'xml:namespace' declaration registers its prefix in the
    parser's namespace table; any other instruction is passed to
    self.handle_proc().  Raises Error if no name follows the '<?'.
    """
    rawdata = self.rawdata
    end = procclose.search(rawdata, i)
    if end is None:
        return -1
    j = end.start(0)
    if not self.__accept_utf8 and illegal.search(rawdata, i+2, j):
        self.syntax_error('illegal character in processing instruction')
    res = tagfind.match(rawdata, i+2)
    if res is None:
        raise Error('unexpected call to parse_proc')
    k = res.end(0)
    name = res.group(0)
    if self.__map_case:
        name = name.lower()
    if name == 'xml:namespace':
        self.syntax_error('old-fashioned namespace declaration')
        self.__use_namespaces = -1
        # namespace declaration
        # this must come after the <?xml?> declaration (if any)
        # and before the <!DOCTYPE> (if any).
        if self.__seen_doctype or self.__seen_starttag:
            self.syntax_error('xml:namespace declaration too late in document')
        attrdict, namespace, k = self.parse_attributes(name, k, j)
        if namespace:
            self.syntax_error('namespace declaration inside namespace declaration')
        for attrname in attrdict.keys():
            if attrname not in self.__xml_namespace_attributes:
                self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname)
        if 'ns' not in attrdict or 'prefix' not in attrdict:
            self.syntax_error('xml:namespace without required attributes')
            # Nothing can be registered without both attributes.  Skip
            # the declaration rather than fail below on the missing
            # value when syntax_error() returns.
            return end.end(0)
        prefix = attrdict['prefix']
        if ncname.match(prefix) is None:
            self.syntax_error('xml:namespace illegal prefix value')
            return end.end(0)
        if prefix in self.__namespaces:
            self.syntax_error('xml:namespace prefix not unique')
        self.__namespaces[prefix] = attrdict['ns']
    else:
        if name.lower() == 'xml':
            self.syntax_error('illegal processing instruction target name')
        self.handle_proc(name, rawdata[k:j])
    return end.end(0)
|---|
| 552 | n/a | |
|---|
| 553 | n/a | # Internal -- parse attributes between i and j |
|---|
def parse_attributes(self, tag, i, j):
    """Parse the attributes found in self.rawdata between *i* and *j*.

    Returns a tuple (attrdict, namespace, i): the ordinary attributes
    mapped to their values, the namespace declarations (xmlns and
    xmlns:prefix attributes) mapped from prefix ('' for the default) to
    their value, and the index where parsing stopped.  *tag* is not used
    in this method.
    """
    rawdata = self.rawdata
    attrdict = {}
    namespace = {}
    while i < j:
        res = attrfind.match(rawdata, i)
        if res is None:
            break
        attrname, attrvalue = res.group('name', 'value')
        if self.__map_case:
            attrname = attrname.lower()
        i = res.end(0)
        if attrvalue is None:
            # attribute without '=value': report it and use the name
            self.syntax_error("no value specified for attribute `%s'" % attrname)
            attrvalue = attrname
        elif attrvalue[:1] == "'" == attrvalue[-1:] or \
             attrvalue[:1] == '"' == attrvalue[-1:]:
            # strip the surrounding quotes
            attrvalue = attrvalue[1:-1]
        elif not self.__accept_unquoted_attributes:
            self.syntax_error("attribute `%s' value not quoted" % attrname)
        res = xmlns.match(attrname)
        if res is not None:
            # namespace declaration
            # (the local name ``ncname`` hides the module-level pattern
            # of the same name inside this method)
            ncname = res.group('ncname')
            namespace[ncname or ''] = attrvalue or None
            if not self.__use_namespaces:
                self.__use_namespaces = len(self.stack)+1
            continue
        if '<' in attrvalue:
            self.syntax_error("`<' illegal in attribute value")
        if attrname in attrdict:
            self.syntax_error("attribute `%s' specified twice" % attrname)
        # map white space characters in the value through attrtrans, then
        # expand references
        attrvalue = attrvalue.translate(attrtrans)
        attrdict[attrname] = self.translate_references(attrvalue)
    return attrdict, namespace, i
|---|
| 589 | n/a | |
|---|
| 590 | n/a | # Internal -- handle starttag, return length or -1 if not terminated |
|---|
def parse_starttag(self, i):
    """Parse the start tag that begins at self.rawdata[i].

    Returns the index just past the closing '>', or -1 if the tag is not
    terminated in the data buffered so far.  The element is pushed on
    self.stack as (tagname, namespace-dict, namespaced-tagname) and
    self.finish_starttag() is called with the namespaced tag name, the
    attribute dictionary and the start handler registered in
    self.elements (or None).  For an empty-element tag ('<tag/>')
    self.finish_endtag() is called as well.
    """
    rawdata = self.rawdata
    # i points to start of tag
    end = endbracketfind.match(rawdata, i+1)
    if end is None:
        return -1
    tag = starttagmatch.match(rawdata, i)
    if tag is None or tag.end(0) != end.end(0):
        # a '>' was found, but what precedes it is not a valid start tag
        self.syntax_error('garbage in starttag')
        return end.end(0)
    nstag = tagname = tag.group('tagname')
    if self.__map_case:
        nstag = tagname = nstag.lower()
    if not self.__seen_starttag and self.__seen_doctype and \
       tagname != self.__seen_doctype:
        self.syntax_error('starttag does not match DOCTYPE')
    if self.__seen_starttag and not self.stack:
        self.syntax_error('multiple elements on top level')
    k, j = tag.span('attrs')
    attrdict, nsdict, k = self.parse_attributes(tagname, k, j)
    self.stack.append((tagname, nsdict, nstag))
    if self.__use_namespaces:
        res = qname.match(tagname)
    else:
        res = None
    if res is not None:
        prefix, nstag = res.group('prefix', 'local')
        if prefix is None:
            prefix = ''
        # look the prefix up in the open elements; the loop runs from the
        # outermost to the innermost, so the innermost declaration wins
        ns = None
        for t, d, nst in self.stack:
            if prefix in d:
                ns = d[prefix]
        if ns is None and prefix != '':
            ns = self.__namespaces.get(prefix)
        if ns is not None:
            # namespaced names are spelled 'namespace localname'
            nstag = ns + ' ' + nstag
        elif prefix != '':
            nstag = prefix + ':' + nstag # undo split
        self.stack[-1] = tagname, nsdict, nstag
    # translate namespace of attributes
    attrnamemap = {} # map from new name to old name (used for error reporting)
    for key in attrdict.keys():
        attrnamemap[key] = key
    if self.__use_namespaces:
        nattrdict = {}
        for key, val in attrdict.items():
            okey = key
            res = qname.match(key)
            if res is not None:
                aprefix, key = res.group('prefix', 'local')
                if self.__map_case:
                    key = key.lower()
                if aprefix is not None:
                    ans = None
                    for t, d, nst in self.stack:
                        if aprefix in d:
                            ans = d[aprefix]
                    if ans is None:
                        ans = self.__namespaces.get(aprefix)
                    if ans is not None:
                        key = ans + ' ' + key
                    else:
                        key = aprefix + ':' + key
            nattrdict[key] = val
            attrnamemap[key] = okey
        attrdict = nattrdict
    # check the attributes against self.attributes[nstag], if there is
    # such an entry, and fill in its non-None values as defaults
    attributes = self.attributes.get(nstag)
    if attributes is not None:
        for key in attrdict.keys():
            if not key in attributes:
                self.syntax_error("unknown attribute `%s' in tag `%s'" % (attrnamemap[key], tagname))
        for key, val in attributes.items():
            if val is not None and not key in attrdict:
                attrdict[key] = val
    method = self.elements.get(nstag, (None, None))[0]
    self.finish_starttag(nstag, attrdict, method)
    if tag.group('slash') == '/':
        self.finish_endtag(tagname)
    return tag.end(0)
|---|
| 671 | n/a | |
|---|
| 672 | n/a | # Internal -- parse endtag |
|---|
| 673 | 1 | def parse_endtag(self, i): |
|---|
| 674 | 10 | rawdata = self.rawdata |
|---|
| 675 | 10 | end = endbracketfind.match(rawdata, i+1) |
|---|
| 676 | 10 | if end is None: |
|---|
| 677 | 9 | return -1 |
|---|
| 678 | 1 | res = tagfind.match(rawdata, i+2) |
|---|
| 679 | 1 | if res is None: |
|---|
| 680 | 0 | if self.literal: |
|---|
| 681 | 0 | self.handle_data(rawdata[i]) |
|---|
| 682 | 0 | return i+1 |
|---|
| 683 | 0 | if not self.__accept_missing_endtag_name: |
|---|
| 684 | 0 | self.syntax_error('no name specified in end tag') |
|---|
| 685 | 0 | tag = self.stack[-1][0] |
|---|
| 686 | 0 | k = i+2 |
|---|
| 687 | n/a | else: |
|---|
| 688 | 1 | tag = res.group(0) |
|---|
| 689 | 1 | if self.__map_case: |
|---|
| 690 | 0 | tag = tag.lower() |
|---|
| 691 | 1 | if self.literal: |
|---|
| 692 | 0 | if not self.stack or tag != self.stack[-1][0]: |
|---|
| 693 | 0 | self.handle_data(rawdata[i]) |
|---|
| 694 | 0 | return i+1 |
|---|
| 695 | 1 | k = res.end(0) |
|---|
| 696 | 1 | if endbracket.match(rawdata, k) is None: |
|---|
| 697 | 0 | self.syntax_error('garbage in end tag') |
|---|
| 698 | 1 | self.finish_endtag(tag) |
|---|
| 699 | 1 | return end.end(0) |
|---|
| 700 | n/a | |
|---|
| 701 | n/a | # Internal -- finish processing of start tag |
|---|
| 702 | 1 | def finish_starttag(self, tagname, attrdict, method): |
|---|
| 703 | 2 | if method is not None: |
|---|
| 704 | 0 | self.handle_starttag(tagname, method, attrdict) |
|---|
| 705 | n/a | else: |
|---|
| 706 | 2 | self.unknown_starttag(tagname, attrdict) |
|---|
| 707 | n/a | |
|---|
| 708 | n/a | # Internal -- finish processing of end tag |
|---|
| 709 | 1 | def finish_endtag(self, tag): |
|---|
| 710 | 2 | self.literal = 0 |
|---|
| 711 | 2 | if not tag: |
|---|
| 712 | 0 | self.syntax_error('name-less end tag') |
|---|
| 713 | 0 | found = len(self.stack) - 1 |
|---|
| 714 | 0 | if found < 0: |
|---|
| 715 | 0 | self.unknown_endtag(tag) |
|---|
| 716 | 0 | return |
|---|
| 717 | n/a | else: |
|---|
| 718 | 2 | found = -1 |
|---|
| 719 | 4 | for i in range(len(self.stack)): |
|---|
| 720 | 2 | if tag == self.stack[i][0]: |
|---|
| 721 | 2 | found = i |
|---|
| 722 | 2 | if found == -1: |
|---|
| 723 | 0 | self.syntax_error('unopened end tag') |
|---|
| 724 | 0 | return |
|---|
| 725 | 4 | while len(self.stack) > found: |
|---|
| 726 | 2 | if found < len(self.stack) - 1: |
|---|
| 727 | 0 | self.syntax_error('missing close tag for %s' % self.stack[-1][2]) |
|---|
| 728 | 2 | nstag = self.stack[-1][2] |
|---|
| 729 | 2 | method = self.elements.get(nstag, (None, None))[1] |
|---|
| 730 | 2 | if method is not None: |
|---|
| 731 | 0 | self.handle_endtag(nstag, method) |
|---|
| 732 | n/a | else: |
|---|
| 733 | 2 | self.unknown_endtag(nstag) |
|---|
| 734 | 2 | if self.__use_namespaces == len(self.stack): |
|---|
| 735 | 1 | self.__use_namespaces = 0 |
|---|
| 736 | 2 | del self.stack[-1] |
|---|
| 737 | n/a | |
|---|
| 738 | n/a | # Overridable -- handle xml processing instruction |
|---|
| 739 | 1 | def handle_xml(self, encoding, standalone): |
|---|
| 740 | 1 | pass |
|---|
| 741 | n/a | |
|---|
| 742 | n/a | # Overridable -- handle DOCTYPE |
|---|
| 743 | 1 | def handle_doctype(self, tag, pubid, syslit, data): |
|---|
| 744 | 1 | pass |
|---|
| 745 | n/a | |
|---|
| 746 | n/a | # Overridable -- handle start tag |
|---|
| 747 | 1 | def handle_starttag(self, tag, method, attrs): |
|---|
| 748 | 0 | method(attrs) |
|---|
| 749 | n/a | |
|---|
| 750 | n/a | # Overridable -- handle end tag |
|---|
| 751 | 1 | def handle_endtag(self, tag, method): |
|---|
| 752 | 0 | method() |
|---|
| 753 | n/a | |
|---|
| 754 | n/a | # Example -- handle character reference, no need to override |
|---|
| 755 | 1 | def handle_charref(self, name): |
|---|
| 756 | 0 | try: |
|---|
| 757 | 0 | if name[0] == 'x': |
|---|
| 758 | 0 | n = int(name[1:], 16) |
|---|
| 759 | n/a | else: |
|---|
| 760 | 0 | n = int(name) |
|---|
| 761 | 0 | except ValueError: |
|---|
| 762 | 0 | self.unknown_charref(name) |
|---|
| 763 | 0 | return |
|---|
| 764 | 0 | if not 0 <= n <= 255: |
|---|
| 765 | 0 | self.unknown_charref(name) |
|---|
| 766 | 0 | return |
|---|
| 767 | 0 | self.handle_data(chr(n)) |
|---|
| 768 | n/a | |
|---|
| 769 | n/a | # Definition of entities -- derived classes may override |
|---|
| 770 | 1 | entitydefs = {'lt': '&#60;', # must use charref |
|---|
| 771 | 1 | 'gt': '&#62;', |
|---|
| 772 | 1 | 'amp': '&#38;', # must use charref |
|---|
| 773 | 1 | 'quot': '&#34;', |
|---|
| 774 | 1 | 'apos': '&#39;', |
|---|
| 775 | n/a | } |
|---|
| 776 | n/a | |
|---|
| 777 | n/a | # Example -- handle data, should be overridden |
|---|
| 778 | 1 | def handle_data(self, data): |
|---|
| 779 | 18 | pass |
|---|
| 780 | n/a | |
|---|
| 781 | n/a | # Example -- handle cdata, could be overridden |
|---|
| 782 | 1 | def handle_cdata(self, data): |
|---|
| 783 | 0 | pass |
|---|
| 784 | n/a | |
|---|
| 785 | n/a | # Example -- handle comment, could be overridden |
|---|
| 786 | 1 | def handle_comment(self, data): |
|---|
| 787 | 1 | pass |
|---|
| 788 | n/a | |
|---|
| 789 | n/a | # Example -- handle processing instructions, could be overridden |
|---|
| 790 | 1 | def handle_proc(self, name, data): |
|---|
| 791 | 1 | pass |
|---|
| 792 | n/a | |
|---|
| 793 | n/a | # Example -- handle relatively harmless syntax errors, could be overridden |
|---|
| 794 | 1 | def syntax_error(self, message): |
|---|
| 795 | 0 | raise Error('Syntax error at line %d: %s' % (self.lineno, message)) |
|---|
| 796 | n/a | |
|---|
| 797 | n/a | # To be overridden -- handlers for unknown objects |
|---|
| 798 | 2 | def unknown_starttag(self, tag, attrs): pass |
|---|
| 799 | 3 | def unknown_endtag(self, tag): pass |
|---|
| 800 | 1 | def unknown_charref(self, ref): pass |
|---|
| 801 | 1 | def unknown_entityref(self, name): |
|---|
| 802 | 0 | self.syntax_error("reference to unknown entity `&%s;'" % name) |
|---|
| 803 | n/a | |
|---|
| 804 | n/a | |
|---|
| 805 | 2 | class TestXMLParser(XMLParser): |
|---|
| 806 | n/a | |
|---|
| 807 | 1 | def __init__(self, **kw): |
|---|
| 808 | 0 | self.testdata = "" |
|---|
| 809 | 0 | XMLParser.__init__(self, **kw) |
|---|
| 810 | n/a | |
|---|
| 811 | 1 | def handle_xml(self, encoding, standalone): |
|---|
| 812 | 0 | self.flush() |
|---|
| 813 | 0 | print 'xml: encoding =',encoding,'standalone =',standalone |
|---|
| 814 | n/a | |
|---|
| 815 | 1 | def handle_doctype(self, tag, pubid, syslit, data): |
|---|
| 816 | 0 | self.flush() |
|---|
| 817 | 0 | print 'DOCTYPE:',tag, repr(data) |
|---|
| 818 | n/a | |
|---|
| 819 | 1 | def handle_data(self, data): |
|---|
| 820 | 0 | self.testdata = self.testdata + data |
|---|
| 821 | 0 | if len(repr(self.testdata)) >= 70: |
|---|
| 822 | 0 | self.flush() |
|---|
| 823 | n/a | |
|---|
| 824 | 1 | def flush(self): |
|---|
| 825 | 0 | data = self.testdata |
|---|
| 826 | 0 | if data: |
|---|
| 827 | 0 | self.testdata = "" |
|---|
| 828 | 0 | print 'data:', repr(data) |
|---|
| 829 | n/a | |
|---|
| 830 | 1 | def handle_cdata(self, data): |
|---|
| 831 | 0 | self.flush() |
|---|
| 832 | 0 | print 'cdata:', repr(data) |
|---|
| 833 | n/a | |
|---|
| 834 | 1 | def handle_proc(self, name, data): |
|---|
| 835 | 0 | self.flush() |
|---|
| 836 | 0 | print 'processing:',name,repr(data) |
|---|
| 837 | n/a | |
|---|
| 838 | 1 | def handle_comment(self, data): |
|---|
| 839 | 0 | self.flush() |
|---|
| 840 | 0 | r = repr(data) |
|---|
| 841 | 0 | if len(r) > 68: |
|---|
| 842 | 0 | r = r[:32] + '...' + r[-32:] |
|---|
| 843 | 0 | print 'comment:', r |
|---|
| 844 | n/a | |
|---|
| 845 | 1 | def syntax_error(self, message): |
|---|
| 846 | 0 | print 'error at line %d:' % self.lineno, message |
|---|
| 847 | n/a | |
|---|
| 848 | 1 | def unknown_starttag(self, tag, attrs): |
|---|
| 849 | 0 | self.flush() |
|---|
| 850 | 0 | if not attrs: |
|---|
| 851 | 0 | print 'start tag: <' + tag + '>' |
|---|
| 852 | n/a | else: |
|---|
| 853 | 0 | print 'start tag: <' + tag, |
|---|
| 854 | 0 | for name, value in attrs.items(): |
|---|
| 855 | 0 | print name + '=' + '"' + value + '"', |
|---|
| 856 | 0 | print '>' |
|---|
| 857 | n/a | |
|---|
| 858 | 1 | def unknown_endtag(self, tag): |
|---|
| 859 | 0 | self.flush() |
|---|
| 860 | 0 | print 'end tag: </' + tag + '>' |
|---|
| 861 | n/a | |
|---|
| 862 | 1 | def unknown_entityref(self, ref): |
|---|
| 863 | 0 | self.flush() |
|---|
| 864 | 0 | print '*** unknown entity ref: &' + ref + ';' |
|---|
| 865 | n/a | |
|---|
| 866 | 1 | def unknown_charref(self, ref): |
|---|
| 867 | 0 | self.flush() |
|---|
| 868 | 0 | print '*** unknown char ref: &#' + ref + ';' |
|---|
| 869 | n/a | |
|---|
| 870 | 1 | def close(self): |
|---|
| 871 | 0 | XMLParser.close(self) |
|---|
| 872 | 0 | self.flush() |
|---|
| 873 | n/a | |
|---|
| 874 | 1 | def test(args = None): |
|---|
| 875 | 0 | import sys, getopt |
|---|
| 876 | 0 | from time import time |
|---|
| 877 | n/a | |
|---|
| 878 | 0 | if not args: |
|---|
| 879 | 0 | args = sys.argv[1:] |
|---|
| 880 | n/a | |
|---|
| 881 | 0 | opts, args = getopt.getopt(args, 'st') |
|---|
| 882 | 0 | klass = TestXMLParser |
|---|
| 883 | 0 | do_time = 0 |
|---|
| 884 | 0 | for o, a in opts: |
|---|
| 885 | 0 | if o == '-s': |
|---|
| 886 | 0 | klass = XMLParser |
|---|
| 887 | 0 | elif o == '-t': |
|---|
| 888 | 0 | do_time = 1 |
|---|
| 889 | n/a | |
|---|
| 890 | 0 | if args: |
|---|
| 891 | 0 | file = args[0] |
|---|
| 892 | n/a | else: |
|---|
| 893 | 0 | file = 'test.xml' |
|---|
| 894 | n/a | |
|---|
| 895 | 0 | if file == '-': |
|---|
| 896 | 0 | f = sys.stdin |
|---|
| 897 | n/a | else: |
|---|
| 898 | 0 | try: |
|---|
| 899 | 0 | f = open(file, 'r') |
|---|
| 900 | 0 | except IOError, msg: |
|---|
| 901 | 0 | print file, ":", msg |
|---|
| 902 | 0 | sys.exit(1) |
|---|
| 903 | n/a | |
|---|
| 904 | 0 | data = f.read() |
|---|
| 905 | 0 | if f is not sys.stdin: |
|---|
| 906 | 0 | f.close() |
|---|
| 907 | n/a | |
|---|
| 908 | 0 | x = klass() |
|---|
| 909 | 0 | t0 = time() |
|---|
| 910 | 0 | try: |
|---|
| 911 | 0 | if do_time: |
|---|
| 912 | 0 | x.feed(data) |
|---|
| 913 | 0 | x.close() |
|---|
| 914 | n/a | else: |
|---|
| 915 | 0 | for c in data: |
|---|
| 916 | 0 | x.feed(c) |
|---|
| 917 | 0 | x.close() |
|---|
| 918 | 0 | except Error, msg: |
|---|
| 919 | 0 | t1 = time() |
|---|
| 920 | 0 | print msg |
|---|
| 921 | 0 | if do_time: |
|---|
| 922 | 0 | print 'total time: %g' % (t1-t0) |
|---|
| 923 | 0 | sys.exit(1) |
|---|
| 924 | 0 | t1 = time() |
|---|
| 925 | 0 | if do_time: |
|---|
| 926 | 0 | print 'total time: %g' % (t1-t0) |
|---|
| 927 | n/a | |
|---|
| 928 | n/a | |
|---|
| 929 | 1 | if __name__ == '__main__': |
|---|
| 930 | 0 | test() |
|---|