» Core Development > Code coverage > Lib/xml/dom/expatbuilder.py

Python code coverage for Lib/xml/dom/expatbuilder.py

# | count | content
1n/a"""Facility to use the Expat parser to load a minidom instance
2n/afrom a string or file.
3n/a
4n/aThis avoids all the overhead of SAX and pulldom to gain performance.
5n/a"""
6n/a
7n/a# Warning!
8n/a#
9n/a# This module is tightly bound to the implementation details of the
10n/a# minidom DOM and can't be used with other DOM implementations. This
11n/a# is due, in part, to a lack of appropriate methods in the DOM (there is
12n/a# no way to create Entity and Notation nodes via the DOM Level 2
13n/a# interface), and for performance. The latter is the cause of some fairly
14n/a# cryptic code.
15n/a#
16n/a# Performance hacks:
17n/a#
18n/a# - .character_data_handler() has an extra case in which continuing
19n/a# data is appended to an existing Text node; this can be a
20n/a# speedup since pyexpat can break up character data into multiple
21n/a# callbacks even though we set the buffer_text attribute on the
22n/a# parser. This also gives us the advantage that we don't need a
23n/a# separate normalization pass.
24n/a#
25n/a# - Determining that a node exists is done using an identity comparison
26n/a# with None rather than a truth test; this avoids searching for and
27n/a# calling any methods on the node object if it exists. (A rather
28n/a# nice speedup is achieved this way as well!)
29n/a
30n/afrom xml.dom import xmlbuilder, minidom, Node
31n/afrom xml.dom import EMPTY_NAMESPACE, EMPTY_PREFIX, XMLNS_NAMESPACE
32n/afrom xml.parsers import expat
33n/afrom xml.dom.minidom import _append_child, _set_attribute_node
34n/afrom xml.dom.NodeFilter import NodeFilter
35n/a
36n/aTEXT_NODE = Node.TEXT_NODE
37n/aCDATA_SECTION_NODE = Node.CDATA_SECTION_NODE
38n/aDOCUMENT_NODE = Node.DOCUMENT_NODE
39n/a
40n/aFILTER_ACCEPT = xmlbuilder.DOMBuilderFilter.FILTER_ACCEPT
41n/aFILTER_REJECT = xmlbuilder.DOMBuilderFilter.FILTER_REJECT
42n/aFILTER_SKIP = xmlbuilder.DOMBuilderFilter.FILTER_SKIP
43n/aFILTER_INTERRUPT = xmlbuilder.DOMBuilderFilter.FILTER_INTERRUPT
44n/a
45n/atheDOMImplementation = minidom.getDOMImplementation()
46n/a
47n/a# Expat typename -> TypeInfo
48n/a_typeinfo_map = {
49n/a "CDATA": minidom.TypeInfo(None, "cdata"),
50n/a "ENUM": minidom.TypeInfo(None, "enumeration"),
51n/a "ENTITY": minidom.TypeInfo(None, "entity"),
52n/a "ENTITIES": minidom.TypeInfo(None, "entities"),
53n/a "ID": minidom.TypeInfo(None, "id"),
54n/a "IDREF": minidom.TypeInfo(None, "idref"),
55n/a "IDREFS": minidom.TypeInfo(None, "idrefs"),
56n/a "NMTOKEN": minidom.TypeInfo(None, "nmtoken"),
57n/a "NMTOKENS": minidom.TypeInfo(None, "nmtokens"),
58n/a }
59n/a
class ElementInfo(object):
    """Schema information for one element type, gathered from DTD events.

    ``_attr_info`` holds one list per declared attribute (see
    ``ExpatBuilder.attlist_decl_handler``); index 1 is the attribute name
    and index -2 the declared type.  ``_model`` is the content model passed
    to the element-declaration handler, or None if none was seen.
    """

    __slots__ = '_attr_info', '_model', 'tagName'

    def __init__(self, tagName, model=None):
        self.tagName = tagName
        self._attr_info = []
        self._model = model

    def __getstate__(self):
        """Return the slot values as a tuple (slots have no __dict__)."""
        return self._attr_info, self._model, self.tagName

    def __setstate__(self, state):
        """Restore the slot values from the tuple made by __getstate__."""
        self._attr_info, self._model, self.tagName = state

    def getAttributeType(self, aname):
        """Return the TypeInfo for attribute *aname*.

        Falls back to ``minidom._no_type`` for undeclared attributes.
        """
        for entry in self._attr_info:
            if entry[1] != aname:
                continue
            declared = entry[-2]
            # Enumerated types are reported as "(a|b|...)".
            if declared[0] == "(":
                return _typeinfo_map["ENUM"]
            return _typeinfo_map[declared]
        return minidom._no_type

    def getAttributeTypeNS(self, namespaceURI, localName):
        """Namespace-aware lookup; always reports ``minidom._no_type``."""
        return minidom._no_type

    def isElementContent(self):
        """Return True if the content model is neither ANY nor MIXED."""
        model = self._model
        if not model:
            return False
        text_allowing = (expat.model.XML_CTYPE_ANY,
                         expat.model.XML_CTYPE_MIXED)
        return model[0] not in text_allowing

    def isEmpty(self):
        """Return True if the element was declared EMPTY."""
        model = self._model
        if not model:
            return False
        return model[0] == expat.model.XML_CTYPE_EMPTY

    def isId(self, aname):
        """Return True if attribute *aname* was declared with type ID."""
        for entry in self._attr_info:
            if entry[1] == aname:
                return entry[-2] == "ID"
        return False

    def isIdNS(self, euri, ename, auri, aname):
        """Namespace-aware variant of isId(); keys on (auri, aname)."""
        # not sure this is meaningful
        return self.isId((auri, aname))
110n/a
111n/adef _intern(builder, s):
112n/a return builder._intern_setdefault(s, s)
113n/a
114n/adef _parse_ns_name(builder, name):
115n/a assert ' ' in name
116n/a parts = name.split(' ')
117n/a intern = builder._intern_setdefault
118n/a if len(parts) == 3:
119n/a uri, localname, prefix = parts
120n/a prefix = intern(prefix, prefix)
121n/a qname = "%s:%s" % (prefix, localname)
122n/a qname = intern(qname, qname)
123n/a localname = intern(localname, localname)
124n/a elif len(parts) == 2:
125n/a uri, localname = parts
126n/a prefix = EMPTY_PREFIX
127n/a qname = localname = intern(localname, localname)
128n/a else:
129n/a raise ValueError("Unsupported syntax: spaces in URIs not supported: %r" % name)
130n/a return intern(uri, uri), localname, prefix, qname
131n/a
132n/a
class ExpatBuilder:
    """Document builder that uses Expat to build a minidom Document.

    The builder installs its handler methods on a pyexpat parser and
    assembles the tree directly, tracking the node currently being filled
    in ``self.curNode``.  An optional DOMBuilderFilter (``options.filter``)
    is consulted through a FilterVisibilityController.
    """

    def __init__(self, options=None):
        """Create a builder; *options* defaults to ``xmlbuilder.Options()``."""
        if options is None:
            options = xmlbuilder.Options()
        self._options = options
        if self._options.filter is not None:
            self._filter = FilterVisibilityController(self._options.filter)
        else:
            self._filter = None
            # This *really* doesn't do anything in this case, so
            # override it with something fast & minimal.
            self._finish_start_element = id
        self._parser = None
        self.reset()

    def createParser(self):
        """Create a new parser object."""
        return expat.ParserCreate()

    def getParser(self):
        """Return the parser object, creating a new one if needed."""
        if not self._parser:
            self._parser = self.createParser()
            self._intern_setdefault = self._parser.intern.setdefault
            self._parser.buffer_text = True
            self._parser.ordered_attributes = True
            self._parser.specified_attributes = True
            self.install(self._parser)
        return self._parser

    def reset(self):
        """Free all data structures used during DOM construction."""
        self.document = theDOMImplementation.createDocument(
            EMPTY_NAMESPACE, None, None)
        self.curNode = self.document
        self._elem_info = self.document._elem_info
        self._cdata = False

    def install(self, parser):
        """Install the callbacks needed to build the DOM into the parser."""
        # This creates circular references!
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        parser.StartElementHandler = self.first_element_handler
        parser.EndElementHandler = self.end_element_handler
        parser.ProcessingInstructionHandler = self.pi_handler
        if self._options.entities:
            parser.EntityDeclHandler = self.entity_decl_handler
        parser.NotationDeclHandler = self.notation_decl_handler
        if self._options.comments:
            parser.CommentHandler = self.comment_handler
        if self._options.cdata_sections:
            parser.StartCdataSectionHandler = self.start_cdata_section_handler
            parser.EndCdataSectionHandler = self.end_cdata_section_handler
            parser.CharacterDataHandler = self.character_data_handler_cdata
        else:
            parser.CharacterDataHandler = self.character_data_handler
        parser.ExternalEntityRefHandler = self.external_entity_ref_handler
        parser.XmlDeclHandler = self.xml_decl_handler
        parser.ElementDeclHandler = self.element_decl_handler
        parser.AttlistDeclHandler = self.attlist_decl_handler

    def parseFile(self, file):
        """Parse a document from a file object, returning the document
        node."""
        parser = self.getParser()
        first_buffer = True
        try:
            while True:
                buffer = file.read(16*1024)
                if not buffer:
                    break
                parser.Parse(buffer, False)
                # The internal subset is only looked for in the first
                # buffer, once the document element has been seen.
                if first_buffer and self.document.documentElement:
                    self._setup_subset(buffer)
                first_buffer = False
            parser.Parse("", True)
        except ParseEscape:
            # A filter asked for parsing to stop; return what we have.
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def parseString(self, string):
        """Parse a document from a string, returning the document node."""
        parser = self.getParser()
        try:
            parser.Parse(string, True)
            self._setup_subset(string)
        except ParseEscape:
            # A filter asked for parsing to stop; return what we have.
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def _setup_subset(self, buffer):
        """Load the internal subset if there might be one."""
        if self.document.doctype:
            extractor = InternalSubsetExtractor()
            extractor.parseString(buffer)
            subset = extractor.getSubset()
            self.document.doctype.internalSubset = subset

    def start_doctype_decl_handler(self, doctypeName, systemId, publicId,
                                   has_internal_subset):
        """Create the DocumentType node and attach it to the document."""
        doctype = self.document.implementation.createDocumentType(
            doctypeName, publicId, systemId)
        doctype.ownerDocument = self.document
        _append_child(self.document, doctype)
        self.document.doctype = doctype
        if self._filter and self._filter.acceptNode(doctype) == FILTER_REJECT:
            # Undo the insertion and stop collecting declarations that
            # would have been stored on the rejected doctype.
            self.document.doctype = None
            del self.document.childNodes[-1]
            doctype = None
            self._parser.EntityDeclHandler = None
            self._parser.NotationDeclHandler = None
        if has_internal_subset:
            if doctype is not None:
                doctype.entities._seq = []
                doctype.notations._seq = []
            # Comments and PIs inside the subset are not added to the
            # tree; end_doctype_decl_handler() restores these handlers.
            self._parser.CommentHandler = None
            self._parser.ProcessingInstructionHandler = None
            self._parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

    def end_doctype_decl_handler(self):
        """Restore the handlers disabled while inside the internal subset."""
        if self._options.comments:
            self._parser.CommentHandler = self.comment_handler
        self._parser.ProcessingInstructionHandler = self.pi_handler
        if not (self._elem_info or self._filter):
            # Nothing to do at element end: replace the hook by a cheap
            # no-op callable.
            self._finish_end_element = id

    def pi_handler(self, target, data):
        """Append a ProcessingInstruction node unless the filter rejects it."""
        node = self.document.createProcessingInstruction(target, data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def character_data_handler_cdata(self, data):
        """Character data handler used when CDATA sections are preserved."""
        childNodes = self.curNode.childNodes
        if self._cdata:
            if (self._cdata_continue
                    and childNodes[-1].nodeType == CDATA_SECTION_NODE):
                # Continuation of the CDATA section already being built.
                childNodes[-1].appendData(data)
                return
            node = self.document.createCDATASection(data)
            self._cdata_continue = True
        elif childNodes and childNodes[-1].nodeType == TEXT_NODE:
            # Merge with the preceding Text node instead of adding one.
            node = childNodes[-1]
            value = node.data + data
            node.data = value
            return
        else:
            node = minidom.Text()
            node.data = data
            node.ownerDocument = self.document
        _append_child(self.curNode, node)

    def character_data_handler(self, data):
        """Character data handler used when CDATA sections are not kept."""
        childNodes = self.curNode.childNodes
        if childNodes and childNodes[-1].nodeType == TEXT_NODE:
            # Merge with the preceding Text node instead of adding one.
            node = childNodes[-1]
            node.data = node.data + data
            return
        node = minidom.Text()
        # A fresh Text node starts out empty, so plain assignment matches
        # what character_data_handler_cdata() does.
        node.data = data
        node.ownerDocument = self.document
        _append_child(self.curNode, node)

    def entity_decl_handler(self, entityName, is_parameter_entity, value,
                            base, systemId, publicId, notationName):
        """Record a general entity declaration on the doctype."""
        if is_parameter_entity:
            # we don't care about parameter entities for the DOM
            return
        if not self._options.entities:
            return
        node = self.document._create_entity(entityName, publicId,
                                            systemId, notationName)
        if value is not None:
            # internal entity
            # node *should* be readonly, but we'll cheat
            child = self.document.createTextNode(value)
            node.childNodes.append(child)
        self.document.doctype.entities._seq.append(node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.entities._seq[-1]

    def notation_decl_handler(self, notationName, base, systemId, publicId):
        """Record a notation declaration on the doctype."""
        node = self.document._create_notation(notationName, publicId, systemId)
        self.document.doctype.notations._seq.append(node)
        # Drop the notation only when the filter rejects it, matching the
        # other handlers (this used to test for FILTER_ACCEPT, which
        # discarded exactly the notations the filter wanted to keep).
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.notations._seq[-1]

    def comment_handler(self, data):
        """Append a Comment node unless the filter rejects it."""
        node = self.document.createComment(data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def start_cdata_section_handler(self):
        """Note that following character data belongs to a CDATA section."""
        self._cdata = True
        self._cdata_continue = False

    def end_cdata_section_handler(self):
        """Note that the current CDATA section has ended."""
        self._cdata = False
        self._cdata_continue = False

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Report external entity references as handled without loading."""
        return 1

    def first_element_handler(self, name, attributes):
        """Handle the document element, then switch to the regular handler."""
        if self._filter is None and not self._elem_info:
            # No filter and no DTD information: nothing to do at element
            # end, so replace the hook by a cheap no-op callable.
            self._finish_end_element = id
        self.getParser().StartElementHandler = self.start_element_handler
        self.start_element_handler(name, attributes)

    def start_element_handler(self, name, attributes):
        """Create an Element (with its attributes) under the current node.

        *attributes* is the flat [name, value, name, value, ...] list
        produced because ``ordered_attributes`` is enabled on the parser.
        """
        node = self.document.createElement(name)
        _append_child(self.curNode, node)
        self.curNode = node

        if attributes:
            for i in range(0, len(attributes), 2):
                a = minidom.Attr(attributes[i], EMPTY_NAMESPACE,
                                 None, EMPTY_PREFIX)
                value = attributes[i+1]
                a.value = value
                a.ownerDocument = self.document
                _set_attribute_node(node, a)

        if node is not self.document.documentElement:
            self._finish_start_element(node)

    def _finish_start_element(self, node):
        """Ask the filter whether *node* should be kept, skipped or rejected."""
        if self._filter:
            # To be general, we'd have to call isSameNode(), but this
            # is sufficient for minidom:
            if node is self.document.documentElement:
                return
            filt = self._filter.startContainer(node)
            if filt == FILTER_REJECT:
                # ignore this node & all descendents
                Rejecter(self)
            elif filt == FILTER_SKIP:
                # ignore this node, but make it's children become
                # children of the parent node
                Skipper(self)
            else:
                return
            self.curNode = node.parentNode
            node.parentNode.removeChild(node)
            node.unlink()

    # If this ever changes, Namespaces.end_element_handler() needs to
    # be changed to match.
    #
    def end_element_handler(self, name):
        """Close the current element and make its parent current again."""
        curNode = self.curNode
        self.curNode = curNode.parentNode
        self._finish_end_element(curNode)

    def _finish_end_element(self, curNode):
        """Post-process a completed element (whitespace and filtering)."""
        info = self._elem_info.get(curNode.tagName)
        if info:
            self._handle_white_text_nodes(curNode, info)
        if self._filter:
            if curNode is self.document.documentElement:
                return
            if self._filter.acceptNode(curNode) == FILTER_REJECT:
                self.curNode.removeChild(curNode)
                curNode.unlink()

    def _handle_white_text_nodes(self, node, info):
        """Remove whitespace-only Text children from element-content nodes."""
        if (self._options.whitespace_in_element_content
                or not info.isElementContent()):
            return

        # We have element type information and should remove ignorable
        # whitespace; identify for text nodes which contain only
        # whitespace.
        ignorable = [child for child in node.childNodes
                     if child.nodeType == TEXT_NODE
                     and not child.data.strip()]

        # Remove ignorable whitespace from the tree.
        for child in ignorable:
            node.removeChild(child)

    def element_decl_handler(self, name, model):
        """Record the content model declared for element type *name*."""
        info = self._elem_info.get(name)
        if info is None:
            self._elem_info[name] = ElementInfo(name, model)
        else:
            assert info._model is None
            info._model = model

    def attlist_decl_handler(self, elem, name, type, default, required):
        """Record one attribute declaration for element type *elem*."""
        info = self._elem_info.get(elem)
        if info is None:
            info = ElementInfo(elem)
            self._elem_info[elem] = info
        info._attr_info.append(
            [None, name, None, None, default, 0, type, required])

    def xml_decl_handler(self, version, encoding, standalone):
        """Copy the XML declaration's values onto the document."""
        self.document.version = version
        self.document.encoding = encoding
        # This is still a little ugly, thanks to the pyexpat API. ;-(
        # A negative value leaves the document's standalone flag alone.
        if standalone >= 0:
            self.document.standalone = bool(standalone)
450n/a
451n/a
# FILTER_INTERRUPT is deliberately absent: the callers test for it
# separately, in the places where it is allowed.
_ALLOWED_FILTER_RETURNS = (FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP)


class FilterVisibilityController(object):
    """Adapter around a DOMBuilderFilter that honours ``whatToShow``.

    Nodes whose type is masked out by the filter's ``whatToShow`` are
    accepted without consulting the filter; otherwise the filter's verdict
    is validated before being handed back to the builder.
    """

    __slots__ = 'filter',

    def __init__(self, filter):
        self.filter = filter

    def startContainer(self, node):
        """Return the filter's verdict for an element that just started.

        Raises ParseEscape on FILTER_INTERRUPT and ValueError when the
        filter returns anything other than an allowed constant.
        """
        mask = self._nodetype_mask[node.nodeType]
        if not (self.filter.whatToShow & mask):
            return FILTER_ACCEPT
        verdict = self.filter.startContainer(node)
        if verdict == FILTER_INTERRUPT:
            raise ParseEscape
        if verdict in _ALLOWED_FILTER_RETURNS:
            return verdict
        raise ValueError(
            "startContainer() returned illegal value: " + repr(verdict))

    def acceptNode(self, node):
        """Return the filter's verdict for a completed node.

        FILTER_SKIP is handled here by re-parenting the node's children
        and is reported to the caller as FILTER_REJECT.  Raises
        ParseEscape on FILTER_INTERRUPT and ValueError for an illegal
        return value.
        """
        mask = self._nodetype_mask[node.nodeType]
        if not (self.filter.whatToShow & mask):
            return FILTER_ACCEPT
        verdict = self.filter.acceptNode(node)
        if verdict == FILTER_INTERRUPT:
            raise ParseEscape
        if verdict == FILTER_SKIP:
            # move all child nodes to the parent, and remove this node
            new_parent = node.parentNode
            for orphan in node.childNodes[:]:
                new_parent.appendChild(orphan)
            # node is handled by the caller
            return FILTER_REJECT
        if verdict in _ALLOWED_FILTER_RETURNS:
            return verdict
        raise ValueError(
            "acceptNode() returned illegal value: " + repr(verdict))

    # DOM node type -> NodeFilter.SHOW_* bit tested against whatToShow.
    _nodetype_mask = {
        Node.ELEMENT_NODE: NodeFilter.SHOW_ELEMENT,
        Node.ATTRIBUTE_NODE: NodeFilter.SHOW_ATTRIBUTE,
        Node.TEXT_NODE: NodeFilter.SHOW_TEXT,
        Node.CDATA_SECTION_NODE: NodeFilter.SHOW_CDATA_SECTION,
        Node.ENTITY_REFERENCE_NODE: NodeFilter.SHOW_ENTITY_REFERENCE,
        Node.ENTITY_NODE: NodeFilter.SHOW_ENTITY,
        Node.PROCESSING_INSTRUCTION_NODE: NodeFilter.SHOW_PROCESSING_INSTRUCTION,
        Node.COMMENT_NODE: NodeFilter.SHOW_COMMENT,
        Node.DOCUMENT_NODE: NodeFilter.SHOW_DOCUMENT,
        Node.DOCUMENT_TYPE_NODE: NodeFilter.SHOW_DOCUMENT_TYPE,
        Node.DOCUMENT_FRAGMENT_NODE: NodeFilter.SHOW_DOCUMENT_FRAGMENT,
        Node.NOTATION_NODE: NodeFilter.SHOW_NOTATION,
    }
512n/a
513n/a
class FilterCrutch(object):
    """Base for helpers that temporarily take over element handling.

    On construction the builder parser's current start/end element
    handlers are remembered and replaced by this object's own
    ``start_element_handler`` / ``end_element_handler`` (which subclasses
    supply).
    """

    __slots__ = '_builder', '_level', '_old_start', '_old_end'

    def __init__(self, builder):
        # Depth of elements opened since this helper took over.
        self._level = 0
        self._builder = builder
        expat_parser = builder._parser
        self._old_start, self._old_end = (
            expat_parser.StartElementHandler,
            expat_parser.EndElementHandler,
        )
        expat_parser.StartElementHandler = self.start_element_handler
        expat_parser.EndElementHandler = self.end_element_handler
525n/a
class Rejecter(FilterCrutch):
    """Swallow a rejected element together with all of its content."""

    __slots__ = ()

    # Content handlers switched off while inside the rejected element.
    _SILENCED = (
        "ProcessingInstructionHandler",
        "CommentHandler",
        "CharacterDataHandler",
        "StartCdataSectionHandler",
        "EndCdataSectionHandler",
        "ExternalEntityRefHandler",
    )

    def __init__(self, builder):
        FilterCrutch.__init__(self, builder)
        expat_parser = builder._parser
        for handler_name in self._SILENCED:
            setattr(expat_parser, handler_name, None)

    def start_element_handler(self, *args):
        """Count a nested element; nothing is added to the tree."""
        self._level += 1

    def end_element_handler(self, *args):
        """Leave one nesting level, restoring handlers at the outermost."""
        if self._level != 0:
            self._level -= 1
            return
        # restore the old handlers
        expat_parser = self._builder._parser
        self._builder.install(expat_parser)
        expat_parser.StartElementHandler = self._old_start
        expat_parser.EndElementHandler = self._old_end
553n/a
class Skipper(FilterCrutch):
    """Drop a skipped element while letting its children be built."""

    __slots__ = ()

    def start_element_handler(self, *args):
        """Delegate to the saved handler, counting elements that stay open."""
        builder = self._builder
        before = builder.curNode
        self._old_start(*args)
        # The current node only changes if the new element was kept.
        if builder.curNode is not before:
            self._level += 1

    def end_element_handler(self, *args):
        """Close a nested element, or hand control back at the outermost."""
        if self._level != 0:
            self._level -= 1
            self._old_end(*args)
            return
        # We're popping back out of the node we're skipping, so we
        # shouldn't need to do anything but reset the handlers.
        expat_parser = self._builder._parser
        expat_parser.StartElementHandler = self._old_start
        expat_parser.EndElementHandler = self._old_end
        self._builder = None
573n/a
574n/a
575n/a# framework document used by the fragment builder.
576n/a# Takes a string for the doctype, subset string, and namespace attrs string.
577n/a
578n/a_FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID = \
579n/a "http://xml.python.org/entities/fragment-builder/internal"
580n/a
581n/a_FRAGMENT_BUILDER_TEMPLATE = (
582n/a '''\
583n/a<!DOCTYPE wrapper
584n/a %%s [
585n/a <!ENTITY fragment-builder-internal
586n/a SYSTEM "%s">
587n/a%%s
588n/a]>
589n/a<wrapper %%s
590n/a>&fragment-builder-internal;</wrapper>'''
591n/a % _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID)
592n/a
593n/a
class FragmentBuilder(ExpatBuilder):
    """Builder which constructs document fragments given XML source
    text and a context node.

    The context node is expected to provide information about the
    namespace declarations which are in scope at the start of the
    fragment.
    """

    def __init__(self, context, options=None):
        """Remember *context* and the document it belongs to.

        When *context* is itself a Document it is used as the original
        document; otherwise its ``ownerDocument`` is.
        """
        if context.nodeType == DOCUMENT_NODE:
            self.originalDocument = context
        else:
            self.originalDocument = context.ownerDocument
        self.context = context
        ExpatBuilder.__init__(self, options)

    def reset(self):
        """Reset the builder state and forget any fragment built so far."""
        ExpatBuilder.reset(self)
        self.fragment = None

    def parseFile(self, file):
        """Parse a document fragment from a file object, returning the
        fragment node."""
        return self.parseString(file.read())

    def parseString(self, string):
        """Parse a document fragment from a string, returning the
        fragment node."""
        self._source = string
        parser = self.getParser()
        doctype = self.originalDocument.doctype
        ident = ""
        if doctype:
            subset = doctype.internalSubset or self._getDeclarations()
            if doctype.publicId:
                ident = ('PUBLIC "%s" "%s"'
                         % (doctype.publicId, doctype.systemId))
            elif doctype.systemId:
                ident = 'SYSTEM "%s"' % doctype.systemId
        else:
            subset = ""
        nsattrs = self._getNSattrs() # get ns decls from node's ancestors
        # The source text itself is pulled in by
        # external_entity_ref_handler() when the wrapper document's
        # entity reference is reached.
        document = _FRAGMENT_BUILDER_TEMPLATE % (ident, subset, nsattrs)
        try:
            parser.Parse(document, True)
        except BaseException:
            # Leave the builder in a clean state, then propagate.
            self.reset()
            raise
        fragment = self.fragment
        self.reset()
        # NOTE(review): the used parser is kept on the builder here; the
        # original has "self._parser = None" commented out at this point.
        return fragment

    def _getDeclarations(self):
        """Re-create the internal subset from the DocumentType node.

        This is only needed if we don't already have the
        internalSubset as a string.
        """
        # Use originalDocument rather than context.ownerDocument: when the
        # context is the Document itself its ownerDocument is None.
        doctype = self.originalDocument.doctype
        s = ""
        if doctype:
            for i in range(doctype.notations.length):
                notation = doctype.notations.item(i)
                if s:
                    s = s + "\n "
                s = "%s<!NOTATION %s" % (s, notation.nodeName)
                if notation.publicId:
                    s = '%s PUBLIC "%s"\n "%s">' \
                        % (s, notation.publicId, notation.systemId)
                else:
                    s = '%s SYSTEM "%s">' % (s, notation.systemId)
            for i in range(doctype.entities.length):
                entity = doctype.entities.item(i)
                if s:
                    s = s + "\n "
                s = "%s<!ENTITY %s" % (s, entity.nodeName)
                if entity.publicId:
                    s = '%s PUBLIC "%s"\n "%s"' \
                        % (s, entity.publicId, entity.systemId)
                elif entity.systemId:
                    s = '%s SYSTEM "%s"' % (s, entity.systemId)
                else:
                    s = '%s "%s"' % (s, entity.firstChild.data)
                if entity.notationName:
                    # XML spells the notation reference of an unparsed
                    # entity "NDATA"; "NOTATION" is not valid here.
                    s = "%s NDATA %s" % (s, entity.notationName)
                s = s + ">"
        return s

    def _getNSattrs(self):
        """Return the namespace attribute text for the wrapper element.

        The plain fragment builder declares none.
        """
        return ""

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Parse the fragment source when the wrapper's entity is reached.

        Any other external entity is passed to the ExpatBuilder handler.
        """
        if systemId == _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID:
            # this entref is the one that we made to put the subtree
            # in; all of our given input is parsed in here.
            old_document = self.document
            old_cur_node = self.curNode
            parser = self._parser.ExternalEntityParserCreate(context)
            # put the real document back, parse into the fragment to return
            self.document = self.originalDocument
            self.fragment = self.document.createDocumentFragment()
            self.curNode = self.fragment
            try:
                parser.Parse(self._source, True)
            finally:
                self.curNode = old_cur_node
                self.document = old_document
                self._source = None
            return -1
        else:
            return ExpatBuilder.external_entity_ref_handler(
                self, context, base, systemId, publicId)
709n/a
710n/a
class Namespaces:
    """Mix-in class for builders; adds support for namespaces."""

    def _initNamespaces(self):
        """Start with no pending namespace declarations."""
        # list of (prefix, uri) ns declarations.  Namespace attrs are
        # constructed from this and added to the element's attrs.
        self._ns_ordered_prefixes = []

    def createParser(self):
        """Create a new namespace-handling parser."""
        ns_parser = expat.ParserCreate(namespace_separator=" ")
        ns_parser.namespace_prefixes = True
        return ns_parser

    def install(self, parser):
        """Insert the namespace-handlers onto the parser."""
        ExpatBuilder.install(self, parser)
        if not self._options.namespace_declarations:
            return
        parser.StartNamespaceDeclHandler = self.start_namespace_decl_handler

    def start_namespace_decl_handler(self, prefix, uri):
        """Push this namespace declaration on our storage."""
        self._ns_ordered_prefixes.append((prefix, uri))

    def start_element_handler(self, name, attributes):
        """Create a namespace-aware Element under the current node.

        Pending namespace declarations become ``xmlns`` attributes of the
        new element; *attributes* is the flat [name, value, ...] list
        delivered by the parser.
        """
        if ' ' in name:
            uri, localname, prefix, qname = _parse_ns_name(self, name)
        else:
            uri, localname = EMPTY_NAMESPACE, None
            prefix, qname = EMPTY_PREFIX, name
        element = minidom.Element(qname, uri, prefix, localname)
        element.ownerDocument = self.document
        _append_child(self.curNode, element)
        self.curNode = element

        pending = self._ns_ordered_prefixes
        if pending:
            for ns_prefix, ns_uri in pending:
                if ns_prefix:
                    decl = minidom.Attr(_intern(self, 'xmlns:' + ns_prefix),
                                        XMLNS_NAMESPACE, ns_prefix, "xmlns")
                else:
                    decl = minidom.Attr("xmlns", XMLNS_NAMESPACE,
                                        "xmlns", EMPTY_PREFIX)
                decl.value = ns_uri
                decl.ownerDocument = self.document
                _set_attribute_node(element, decl)
            # The declarations have been consumed by this element.
            del pending[:]

        if attributes:
            element._ensure_attributes()
            by_qname = element._attrs
            by_ns_key = element._attrsNS
            for pos in range(0, len(attributes), 2):
                aname = attributes[pos]
                value = attributes[pos + 1]
                if ' ' in aname:
                    a_uri, a_local, a_prefix, a_qname = _parse_ns_name(
                        self, aname)
                    attr = minidom.Attr(a_qname, a_uri, a_local, a_prefix)
                    by_qname[a_qname] = attr
                    by_ns_key[(a_uri, a_local)] = attr
                else:
                    attr = minidom.Attr(aname, EMPTY_NAMESPACE,
                                        aname, EMPTY_PREFIX)
                    by_qname[aname] = attr
                    by_ns_key[(EMPTY_NAMESPACE, aname)] = attr
                attr.ownerDocument = self.document
                attr.value = value
                attr.ownerElement = element

    if __debug__:
        # This only adds some asserts to the original
        # end_element_handler(), so we only define this when -O is not
        # used.  If changing one, be sure to check the other to see if
        # it needs to be changed as well.
        #
        def end_element_handler(self, name):
            """Close the current element after checking it matches *name*."""
            closing = self.curNode
            if ' ' in name:
                uri, localname, prefix, qname = _parse_ns_name(self, name)
                assert (closing.namespaceURI == uri
                        and closing.localName == localname
                        and closing.prefix == prefix), \
                    "element stack messed up! (namespace)"
            else:
                assert closing.nodeName == name, \
                    "element stack messed up - bad nodeName"
                assert closing.namespaceURI == EMPTY_NAMESPACE, \
                    "element stack messed up - bad namespaceURI"
            self.curNode = closing.parentNode
            self._finish_end_element(closing)
804n/a
805n/a
class ExpatBuilderNS(Namespaces, ExpatBuilder):
    """Document builder that supports namespaces."""

    def reset(self):
        """Reset the document state and the pending namespace list."""
        # Namespaces defines no reset(), so this reaches
        # ExpatBuilder.reset().
        super().reset()
        self._initNamespaces()
812n/a
813n/a
class FragmentBuilderNS(Namespaces, FragmentBuilder):
    """Fragment builder that supports namespaces."""

    def reset(self):
        """Reset the fragment state and the pending namespace list."""
        # Namespaces defines no reset(), so this reaches
        # FragmentBuilder.reset().
        super().reset()
        self._initNamespaces()

    def _getNSattrs(self):
        """Return string of namespace attributes from this element and
        ancestors."""
        # XXX This needs to be re-written to walk the ancestors of the
        # context to build up the namespace information from
        # declarations, elements, and attributes found in context.
        # Otherwise we have to store a bunch more data on the DOM
        # (though that *might* be more reliable -- not clear).
        declarations = []
        seen_prefixes = []
        node = self.context
        while node:
            if hasattr(node, '_ns_prefix_uri'):
                for prefix, uri in node._ns_prefix_uri.items():
                    # Nearer declarations win: a prefix is emitted once.
                    if prefix in seen_prefixes:
                        continue
                    seen_prefixes.append(prefix)
                    declname = ("xmlns:" + prefix) if prefix else "xmlns"
                    declarations.append(" %s='%s'" % (declname, uri))
            node = node.parentNode
        # Each declaration starts with a space; later ones go on new lines.
        return "\n".join(declarations)
849n/a
850n/a
class ParseEscape(Exception):
    """Exception raised to short-circuit parsing.

    Raised by InternalSubsetExtractor once it has what it needs, and by
    FilterVisibilityController when a filter returns FILTER_INTERRUPT.
    """
854n/a
class InternalSubsetExtractor(ExpatBuilder):
    """XML processor which can rip out the internal document type subset."""

    # None until an internal subset has been seen; a list of text pieces
    # while it is being collected; the final string afterwards.
    subset = None

    def getSubset(self):
        """Return the internal subset as a string."""
        return self.subset

    def parseFile(self, file):
        """Scan *file* for the internal subset; returns nothing."""
        try:
            ExpatBuilder.parseFile(self, file)
        except ParseEscape:
            # Raised by our own handlers once scanning can stop.
            return

    def parseString(self, string):
        """Scan *string* for the internal subset; returns nothing."""
        try:
            ExpatBuilder.parseString(self, string)
        except ParseEscape:
            # Raised by our own handlers once scanning can stop.
            return

    def install(self, parser):
        """Install only the two handlers needed to locate the subset."""
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        parser.StartElementHandler = self.start_element_handler

    def start_doctype_decl_handler(self, name, publicId, systemId,
                                   has_internal_subset):
        """Start collecting subset text, or stop if there is no subset."""
        if not has_internal_subset:
            raise ParseEscape()
        expat_parser = self.getParser()
        pieces = []
        self.subset = pieces
        # Everything not claimed by another handler is collected verbatim.
        expat_parser.DefaultHandler = pieces.append
        expat_parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

    def end_doctype_decl_handler(self):
        """Join the collected pieces, normalise newlines, stop parsing."""
        text = ''.join(self.subset)
        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')
        self.subset = text
        raise ParseEscape()

    def start_element_handler(self, name, attrs):
        """Reaching the document element means there is nothing to find."""
        raise ParseEscape()
897n/a
898n/a
def parse(file, namespaces=True):
    """Parse a document, returning the resulting Document node.

    'file' may be either a file name or an open file object.
    """
    builder_type = ExpatBuilderNS if namespaces else ExpatBuilder
    builder = builder_type()

    if not isinstance(file, str):
        # Already a file-like object; the caller keeps ownership of it.
        return builder.parseFile(file)
    with open(file, 'rb') as stream:
        return builder.parseFile(stream)
915n/a
916n/a
def parseString(string, namespaces=True):
    """Parse a document from a string, returning the resulting
    Document node.
    """
    builder_type = ExpatBuilderNS if namespaces else ExpatBuilder
    return builder_type().parseString(string)
926n/a
927n/a
def parseFragment(file, context, namespaces=True):
    """Parse a fragment of a document, given the context from which it
    was originally extracted.  context should be the parent of the
    node(s) which are in the fragment.

    'file' may be either a file name or an open file object.
    """
    builder_type = FragmentBuilderNS if namespaces else FragmentBuilder
    builder = builder_type(context)

    if not isinstance(file, str):
        # Already a file-like object; the caller keeps ownership of it.
        return builder.parseFile(file)
    with open(file, 'rb') as stream:
        return builder.parseFile(stream)
946n/a
947n/a
def parseFragmentString(string, context, namespaces=True):
    """Parse a fragment of a document from a string, given the context
    from which it was originally extracted.  context should be the
    parent of the node(s) which are in the fragment.
    """
    builder_type = FragmentBuilderNS if namespaces else FragmentBuilder
    return builder_type(context).parseString(string)
958n/a
959n/a
def makeBuilder(options):
    """Create a builder based on an Options object."""
    # options.namespaces selects the namespace-aware builder.
    builder_type = ExpatBuilderNS if options.namespaces else ExpatBuilder
    return builder_type(options)