
Python code coverage for Lib/codecs.py

#      count     content
1n/a""" codecs -- Python Codec Registry, API and helpers.
2n/a
3n/a
4n/aWritten by Marc-Andre Lemburg (mal@lemburg.com).
5n/a
6n/a(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7n/a
8n/a"""#"
9n/a
10n/aimport builtins, sys
11n/a
12n/a### Registry and builtin stateless codec functions
13n/a
14n/atry:
15n/a from _codecs import *
16n/aexcept ImportError as why:
17n/a raise SystemError('Failed to load the builtin codecs: %s' % why)
18n/a
19n/a__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
20n/a "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
21n/a "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
22n/a "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
23n/a "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
24n/a "StreamReader", "StreamWriter",
25n/a "StreamReaderWriter", "StreamRecoder",
26n/a "getencoder", "getdecoder", "getincrementalencoder",
27n/a "getincrementaldecoder", "getreader", "getwriter",
28n/a "encode", "decode", "iterencode", "iterdecode",
29n/a "strict_errors", "ignore_errors", "replace_errors",
30n/a "xmlcharrefreplace_errors",
31n/a "backslashreplace_errors", "namereplace_errors",
32n/a "register_error", "lookup_error"]
33n/a
34n/a### Constants
35n/a
36n/a#
37n/a# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
38n/a# and its possible byte string values
39n/a# for UTF8/UTF16/UTF32 output and little/big endian machines
40n/a#
41n/a
42n/a# UTF-8
43n/aBOM_UTF8 = b'\xef\xbb\xbf'
44n/a
45n/a# UTF-16, little endian
46n/aBOM_LE = BOM_UTF16_LE = b'\xff\xfe'
47n/a
48n/a# UTF-16, big endian
49n/aBOM_BE = BOM_UTF16_BE = b'\xfe\xff'
50n/a
51n/a# UTF-32, little endian
52n/aBOM_UTF32_LE = b'\xff\xfe\x00\x00'
53n/a
54n/a# UTF-32, big endian
55n/aBOM_UTF32_BE = b'\x00\x00\xfe\xff'
56n/a
57n/aif sys.byteorder == 'little':
58n/a
59n/a # UTF-16, native endianness
60n/a BOM = BOM_UTF16 = BOM_UTF16_LE
61n/a
62n/a # UTF-32, native endianness
63n/a BOM_UTF32 = BOM_UTF32_LE
64n/a
65n/aelse:
66n/a
67n/a # UTF-16, native endianness
68n/a BOM = BOM_UTF16 = BOM_UTF16_BE
69n/a
70n/a # UTF-32, native endianness
71n/a BOM_UTF32 = BOM_UTF32_BE
72n/a
73n/a# Old broken names (don't use in new code)
74n/aBOM32_LE = BOM_UTF16_LE
75n/aBOM32_BE = BOM_UTF16_BE
76n/aBOM64_LE = BOM_UTF32_LE
77n/aBOM64_BE = BOM_UTF32_BE
78n/a
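An illustrative aside (editor's sketch, not part of Lib/codecs.py): the BOM constants above can be used to sniff the encoding of raw bytes; the helper name sniff_bom is made up. Note that BOM_UTF32_LE starts with the same two bytes as BOM_UTF16_LE, so the longer marks must be checked first.

    import codecs

    def sniff_bom(data):
        # Return a best-guess encoding name based on a leading BOM, else None.
        for bom, name in [(codecs.BOM_UTF32_LE, "utf-32-le"),
                          (codecs.BOM_UTF32_BE, "utf-32-be"),
                          (codecs.BOM_UTF8, "utf-8-sig"),
                          (codecs.BOM_UTF16_LE, "utf-16-le"),
                          (codecs.BOM_UTF16_BE, "utf-16-be")]:
            if data.startswith(bom):
                return name
        return None

    sniff_bom(codecs.BOM_UTF8 + b"hello")    # -> 'utf-8-sig'
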
79n/a
80n/a### Codec base classes (defining the API)
81n/a
82n/aclass CodecInfo(tuple):
83n/a """Codec details when looking up the codec registry"""
84n/a
85n/a # Private API to allow Python 3.4 to blacklist the known non-Unicode
86n/a # codecs in the standard library. A more general mechanism to
87n/a # reliably distinguish test encodings from other codecs will hopefully
88n/a # be defined for Python 3.5
89n/a #
90n/a # See http://bugs.python.org/issue19619
91n/a _is_text_encoding = True # Assume codecs are text encodings by default
92n/a
93n/a def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
94n/a incrementalencoder=None, incrementaldecoder=None, name=None,
95n/a *, _is_text_encoding=None):
96n/a self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
97n/a self.name = name
98n/a self.encode = encode
99n/a self.decode = decode
100n/a self.incrementalencoder = incrementalencoder
101n/a self.incrementaldecoder = incrementaldecoder
102n/a self.streamwriter = streamwriter
103n/a self.streamreader = streamreader
104n/a if _is_text_encoding is not None:
105n/a self._is_text_encoding = _is_text_encoding
106n/a return self
107n/a
108n/a def __repr__(self):
109n/a return "<%s.%s object for encoding %s at %#x>" % \
110n/a (self.__class__.__module__, self.__class__.__qualname__,
111n/a self.name, id(self))
112n/a
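A minimal sketch (editor's example, not part of this file) of how CodecInfo objects are obtained and used, assuming the standard "utf-8" codec is registered:

    import codecs

    info = codecs.lookup("utf-8")        # returns a CodecInfo instance
    info.name                            # 'utf-8'
    info.encode("häst")                  # (b'h\xc3\xa4st', 4) -- (output, length consumed)
    info.decode(b"h\xc3\xa4st")          # ('häst', 5)
    Reader = info.streamreader           # StreamReader factory for this codec
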
113n/aclass Codec:
114n/a
115n/a """ Defines the interface for stateless encoders/decoders.
116n/a
117n/a The .encode()/.decode() methods may use different error
118n/a handling schemes by providing the errors argument. These
119n/a string values are predefined:
120n/a
121n/a 'strict' - raise a ValueError (or a subclass)
122n/a 'ignore' - ignore the character and continue with the next
123n/a 'replace' - replace with a suitable replacement character;
124n/a Python will use the official U+FFFD REPLACEMENT
125n/a CHARACTER for the builtin Unicode codecs on
126n/a decoding and '?' on encoding.
127n/a 'surrogateescape' - replace with private code points U+DCnn.
128n/a 'xmlcharrefreplace' - Replace with the appropriate XML
129n/a character reference (only for encoding).
130n/a 'backslashreplace' - Replace with backslashed escape sequences.
131n/a 'namereplace' - Replace with \\N{...} escape sequences
132n/a (only for encoding).
133n/a
134n/a The set of allowed values can be extended via register_error.
135n/a
136n/a """
137n/a def encode(self, input, errors='strict'):
138n/a
139n/a """ Encodes the object input and returns a tuple (output
140n/a object, length consumed).
141n/a
142n/a errors defines the error handling to apply. It defaults to
143n/a 'strict' handling.
144n/a
145n/a The method may not store state in the Codec instance. Use
146n/a StreamWriter for codecs which have to keep state in order to
147n/a make encoding efficient.
148n/a
149n/a The encoder must be able to handle zero length input and
150n/a return an empty object of the output object type in this
151n/a situation.
152n/a
153n/a """
154n/a raise NotImplementedError
155n/a
156n/a def decode(self, input, errors='strict'):
157n/a
158n/a """ Decodes the object input and returns a tuple (output
159n/a object, length consumed).
160n/a
161n/a input must be an object which provides the bf_getreadbuf
162n/a buffer slot. Python strings, buffer objects and memory
163n/a mapped files are examples of objects providing this slot.
164n/a
165n/a errors defines the error handling to apply. It defaults to
166n/a 'strict' handling.
167n/a
168n/a The method may not store state in the Codec instance. Use
169n/a StreamReader for codecs which have to keep state in order to
170n/a make decoding efficient.
171n/a
172n/a The decoder must be able to handle zero length input and
173n/a return an empty object of the output object type in this
174n/a situation.
175n/a
176n/a """
177n/a raise NotImplementedError
178n/a
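The error handling schemes listed above can be exercised through the stateless encoder that the registry returns (editor's sketch, using the standard "ascii" codec):

    import codecs

    encode = codecs.getencoder("ascii")
    encode("5µm", "replace")             # (b'5?m', 3)
    encode("5µm", "xmlcharrefreplace")   # (b'5&#181;m', 3)
    encode("5µm", "backslashreplace")    # (b'5\\xb5m', 3)
    encode("5µm", "namereplace")         # (b'5\\N{MICRO SIGN}m', 3)
    # encode("5µm", "strict")            # would raise UnicodeEncodeError
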
179n/aclass IncrementalEncoder(object):
180n/a """
181n/a An IncrementalEncoder encodes an input in multiple steps. The input can
182n/a be passed piece by piece to the encode() method. The IncrementalEncoder
183n/a remembers the state of the encoding process between calls to encode().
184n/a """
185n/a def __init__(self, errors='strict'):
186n/a """
187n/a Creates an IncrementalEncoder instance.
188n/a
189n/a The IncrementalEncoder may use different error handling schemes by
190n/a providing the errors keyword argument. See the module docstring
191n/a for a list of possible values.
192n/a """
193n/a self.errors = errors
194n/a self.buffer = ""
195n/a
196n/a def encode(self, input, final=False):
197n/a """
198n/a Encodes input and returns the resulting object.
199n/a """
200n/a raise NotImplementedError
201n/a
202n/a def reset(self):
203n/a """
204n/a Resets the encoder to the initial state.
205n/a """
206n/a
207n/a def getstate(self):
208n/a """
209n/a Return the current state of the encoder.
210n/a """
211n/a return 0
212n/a
213n/a def setstate(self, state):
214n/a """
215n/a Set the current state of the encoder. state must have been
216n/a returned by getstate().
217n/a """
218n/a
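Illustrative use of an incremental encoder (editor's sketch, assuming the standard "utf-8" codec): the input is fed piece by piece and flushed with final=True.

    import codecs

    enc = codecs.getincrementalencoder("utf-8")()
    chunks = [enc.encode(part) for part in ("Gr", "ü", "ße")]
    chunks.append(enc.encode("", final=True))    # flush any pending state
    b"".join(chunks)                             # b'Gr\xc3\xbc\xc3\x9fe'
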
219n/aclass BufferedIncrementalEncoder(IncrementalEncoder):
220n/a """
221n/a This subclass of IncrementalEncoder can be used as the baseclass for an
222n/a incremental encoder if the encoder must keep some of the output in a
223n/a buffer between calls to encode().
224n/a """
225n/a def __init__(self, errors='strict'):
226n/a IncrementalEncoder.__init__(self, errors)
227n/a # unencoded input that is kept between calls to encode()
228n/a self.buffer = ""
229n/a
230n/a def _buffer_encode(self, input, errors, final):
231n/a # Override this method in subclasses: it must encode input
232n/a # and return an (output, length consumed) tuple
233n/a raise NotImplementedError
234n/a
235n/a def encode(self, input, final=False):
236n/a # encode input (taking the buffer into account)
237n/a data = self.buffer + input
238n/a (result, consumed) = self._buffer_encode(data, self.errors, final)
239n/a # keep unencoded input until the next call
240n/a self.buffer = data[consumed:]
241n/a return result
242n/a
243n/a def reset(self):
244n/a IncrementalEncoder.reset(self)
245n/a self.buffer = ""
246n/a
247n/a def getstate(self):
248n/a return self.buffer or 0
249n/a
250n/a def setstate(self, state):
251n/a self.buffer = state or ""
252n/a
253n/aclass IncrementalDecoder(object):
254n/a """
255n/a An IncrementalDecoder decodes an input in multiple steps. The input can
256n/a be passed piece by piece to the decode() method. The IncrementalDecoder
257n/a remembers the state of the decoding process between calls to decode().
258n/a """
259n/a def __init__(self, errors='strict'):
260n/a """
261n/a Create an IncrementalDecoder instance.
262n/a
263n/a The IncrementalDecoder may use different error handling schemes by
264n/a providing the errors keyword argument. See the module docstring
265n/a for a list of possible values.
266n/a """
267n/a self.errors = errors
268n/a
269n/a def decode(self, input, final=False):
270n/a """
271n/a Decode input and returns the resulting object.
272n/a """
273n/a raise NotImplementedError
274n/a
275n/a def reset(self):
276n/a """
277n/a Reset the decoder to the initial state.
278n/a """
279n/a
280n/a def getstate(self):
281n/a """
282n/a Return the current state of the decoder.
283n/a
284n/a This must be a (buffered_input, additional_state_info) tuple.
285n/a buffered_input must be a bytes object containing bytes that
286n/a were passed to decode() that have not yet been converted.
287n/a additional_state_info must be a non-negative integer
288n/a representing the state of the decoder WITHOUT yet having
289n/a processed the contents of buffered_input. In the initial state
290n/a and after reset(), getstate() must return (b"", 0).
291n/a """
292n/a return (b"", 0)
293n/a
294n/a def setstate(self, state):
295n/a """
296n/a Set the current state of the decoder.
297n/a
298n/a state must have been returned by getstate(). The effect of
299n/a setstate((b"", 0)) must be equivalent to reset().
300n/a """
301n/a
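An incremental decoder keeps incomplete byte sequences between calls, so a multi-byte character may be split across chunks (editor's sketch using the standard "utf-8" codec):

    import codecs

    dec = codecs.getincrementaldecoder("utf-8")()
    dec.decode(b"Gr\xc3")                        # 'Gr' -- the lone \xc3 is buffered
    dec.getstate()                               # (b'\xc3', 0)
    dec.decode(b"\xbc\xc3\x9fe", final=True)     # 'üße'
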
302n/aclass BufferedIncrementalDecoder(IncrementalDecoder):
303n/a """
304n/a This subclass of IncrementalDecoder can be used as the baseclass for an
305n/a incremental decoder if the decoder must be able to handle incomplete
306n/a byte sequences.
307n/a """
308n/a def __init__(self, errors='strict'):
309n/a IncrementalDecoder.__init__(self, errors)
310n/a # undecoded input that is kept between calls to decode()
311n/a self.buffer = b""
312n/a
313n/a def _buffer_decode(self, input, errors, final):
314n/a # Override this method in subclasses: it must decode input
315n/a # and return an (output, length consumed) tuple
316n/a raise NotImplementedError
317n/a
318n/a def decode(self, input, final=False):
319n/a # decode input (taking the buffer into account)
320n/a data = self.buffer + input
321n/a (result, consumed) = self._buffer_decode(data, self.errors, final)
322n/a # keep undecoded input until the next call
323n/a self.buffer = data[consumed:]
324n/a return result
325n/a
326n/a def reset(self):
327n/a IncrementalDecoder.reset(self)
328n/a self.buffer = b""
329n/a
330n/a def getstate(self):
331n/a # additional state info is always 0
332n/a return (self.buffer, 0)
333n/a
334n/a def setstate(self, state):
335n/a # ignore additional state info
336n/a self.buffer = state[0]
337n/a
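The buffering contract can be illustrated with a toy subclass (editor's sketch; PairDecoder is made up and not a real codec): _buffer_decode() consumes only complete two-byte pairs, and anything left over stays in self.buffer for the next call.

    import codecs

    class PairDecoder(codecs.BufferedIncrementalDecoder):
        # Toy decoder: each complete 2-byte pair becomes one character
        # (second byte ignored); a trailing odd byte stays buffered.
        def _buffer_decode(self, input, errors, final):
            usable = len(input) - (len(input) % 2)
            output = "".join(chr(input[i]) for i in range(0, usable, 2))
            return output, usable

    d = PairDecoder()
    d.decode(b"A.")                  # 'A'
    d.decode(b"B")                   # ''  -- incomplete pair kept in d.buffer
    d.decode(b".C.", final=True)     # 'BC'
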
338n/a#
339n/a# The StreamWriter and StreamReader class provide generic working
340n/a# interfaces which can be used to implement new encoding submodules
341n/a# very easily. See encodings/utf_8.py for an example on how this is
342n/a# done.
343n/a#
344n/a
345n/aclass StreamWriter(Codec):
346n/a
347n/a def __init__(self, stream, errors='strict'):
348n/a
349n/a """ Creates a StreamWriter instance.
350n/a
351n/a stream must be a file-like object open for writing.
352n/a
353n/a The StreamWriter may use different error handling
354n/a schemes by providing the errors keyword argument. These
355n/a parameters are predefined:
356n/a
357n/a 'strict' - raise a ValueError (or a subclass)
358n/a 'ignore' - ignore the character and continue with the next
359n/a 'replace' - replace with a suitable replacement character
360n/a 'xmlcharrefreplace' - Replace with the appropriate XML
361n/a character reference.
362n/a 'backslashreplace' - Replace with backslashed escape
363n/a sequences.
364n/a 'namereplace' - Replace with \\N{...} escape sequences.
365n/a
366n/a The set of allowed parameter values can be extended via
367n/a register_error.
368n/a """
369n/a self.stream = stream
370n/a self.errors = errors
371n/a
372n/a def write(self, object):
373n/a
374n/a """ Writes the object's contents encoded to self.stream.
375n/a """
376n/a data, consumed = self.encode(object, self.errors)
377n/a self.stream.write(data)
378n/a
379n/a def writelines(self, list):
380n/a
381n/a """ Writes the concatenated list of strings to the stream
382n/a using .write().
383n/a """
384n/a self.write(''.join(list))
385n/a
386n/a def reset(self):
387n/a
388n/a """ Flushes and resets the codec buffers used for keeping state.
389n/a
390n/a Calling this method should ensure that the data on the
391n/a output is put into a clean state, that allows appending
392n/a of new fresh data without having to rescan the whole
393n/a stream to recover state.
394n/a
395n/a """
396n/a pass
397n/a
398n/a def seek(self, offset, whence=0):
399n/a self.stream.seek(offset, whence)
400n/a if whence == 0 and offset == 0:
401n/a self.reset()
402n/a
403n/a def __getattr__(self, name,
404n/a getattr=getattr):
405n/a
406n/a """ Inherit all other methods from the underlying stream.
407n/a """
408n/a return getattr(self.stream, name)
409n/a
410n/a def __enter__(self):
411n/a return self
412n/a
413n/a def __exit__(self, type, value, tb):
414n/a self.stream.close()
415n/a
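A StreamWriter wraps a byte stream and accepts text (editor's sketch using io.BytesIO and the standard "utf-8" writer):

    import codecs
    import io

    raw = io.BytesIO()
    writer = codecs.getwriter("utf-8")(raw)
    writer.write("Grüße, ")
    writer.writelines(["Welt", "!"])
    raw.getvalue()                   # b'Gr\xc3\xbc\xc3\x9fe, Welt!'
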
416n/a###
417n/a
418n/aclass StreamReader(Codec):
419n/a
420n/a charbuffertype = str
421n/a
422n/a def __init__(self, stream, errors='strict'):
423n/a
424n/a """ Creates a StreamReader instance.
425n/a
426n/a stream must be a file-like object open for reading.
427n/a
428n/a The StreamReader may use different error handling
429n/a schemes by providing the errors keyword argument. These
430n/a parameters are predefined:
431n/a
432n/a 'strict' - raise a ValueError (or a subclass)
433n/a 'ignore' - ignore the character and continue with the next
434n/a 'replace' - replace with a suitable replacement character
435n/a 'backslashreplace' - Replace with backslashed escape sequences.
436n/a
437n/a The set of allowed parameter values can be extended via
438n/a register_error.
439n/a """
440n/a self.stream = stream
441n/a self.errors = errors
442n/a self.bytebuffer = b""
443n/a self._empty_charbuffer = self.charbuffertype()
444n/a self.charbuffer = self._empty_charbuffer
445n/a self.linebuffer = None
446n/a
447n/a def decode(self, input, errors='strict'):
448n/a raise NotImplementedError
449n/a
450n/a def read(self, size=-1, chars=-1, firstline=False):
451n/a
452n/a """ Decodes data from the stream self.stream and returns the
453n/a resulting object.
454n/a
455n/a chars indicates the number of decoded code points or bytes to
456n/a return. read() will never return more data than requested,
457n/a but it might return less, if there is not enough available.
458n/a
459n/a size indicates the approximate maximum number of decoded
460n/a bytes or code points to read for decoding. The decoder
461n/a can modify this setting as appropriate. The default value
462n/a -1 indicates to read and decode as much as possible. size
463n/a is intended to prevent having to decode huge files in one
464n/a step.
465n/a
466n/a If firstline is true, and a UnicodeDecodeError happens
467n/a after the first line terminator in the input, only the first line
468n/a will be returned; the rest of the input will be kept until the
469n/a next call to read().
470n/a
471n/a The method should use a greedy read strategy, meaning that
472n/a it should read as much data as is allowed within the
473n/a definition of the encoding and the given size, e.g. if
474n/a optional encoding endings or state markers are available
475n/a on the stream, these should be read too.
476n/a """
477n/a # If we have lines cached, first merge them back into characters
478n/a if self.linebuffer:
479n/a self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
480n/a self.linebuffer = None
481n/a
482n/a # read until we get the required number of characters (if available)
483n/a while True:
484n/a # can the request be satisfied from the character buffer?
485n/a if chars >= 0:
486n/a if len(self.charbuffer) >= chars:
487n/a break
488n/a elif size >= 0:
489n/a if len(self.charbuffer) >= size:
490n/a break
491n/a # we need more data
492n/a if size < 0:
493n/a newdata = self.stream.read()
494n/a else:
495n/a newdata = self.stream.read(size)
496n/a # decode bytes (those remaining from the last call included)
497n/a data = self.bytebuffer + newdata
498n/a if not data:
499n/a break
500n/a try:
501n/a newchars, decodedbytes = self.decode(data, self.errors)
502n/a except UnicodeDecodeError as exc:
503n/a if firstline:
504n/a newchars, decodedbytes = \
505n/a self.decode(data[:exc.start], self.errors)
506n/a lines = newchars.splitlines(keepends=True)
507n/a if len(lines)<=1:
508n/a raise
509n/a else:
510n/a raise
511n/a # keep undecoded bytes until the next call
512n/a self.bytebuffer = data[decodedbytes:]
513n/a # put new characters in the character buffer
514n/a self.charbuffer += newchars
515n/a # there was no data available
516n/a if not newdata:
517n/a break
518n/a if chars < 0:
519n/a # Return everything we've got
520n/a result = self.charbuffer
521n/a self.charbuffer = self._empty_charbuffer
522n/a else:
523n/a # Return the first chars characters
524n/a result = self.charbuffer[:chars]
525n/a self.charbuffer = self.charbuffer[chars:]
526n/a return result
527n/a
528n/a def readline(self, size=None, keepends=True):
529n/a
530n/a """ Read one line from the input stream and return the
531n/a decoded data.
532n/a
533n/a size, if given, is passed as size argument to the
534n/a read() method.
535n/a
536n/a """
537n/a # If we have lines cached from an earlier read, return
538n/a # them unconditionally
539n/a if self.linebuffer:
540n/a line = self.linebuffer[0]
541n/a del self.linebuffer[0]
542n/a if len(self.linebuffer) == 1:
543n/a # revert to charbuffer mode; we might need more data
544n/a # next time
545n/a self.charbuffer = self.linebuffer[0]
546n/a self.linebuffer = None
547n/a if not keepends:
548n/a line = line.splitlines(keepends=False)[0]
549n/a return line
550n/a
551n/a readsize = size or 72
552n/a line = self._empty_charbuffer
553n/a # If size is given, we call read() only once
554n/a while True:
555n/a data = self.read(readsize, firstline=True)
556n/a if data:
557n/a # If we're at a "\r" read one extra character (which might
558n/a # be a "\n") to get a proper line ending. If the stream is
559n/a # temporarily exhausted we return the wrong line ending.
560n/a if (isinstance(data, str) and data.endswith("\r")) or \
561n/a (isinstance(data, bytes) and data.endswith(b"\r")):
562n/a data += self.read(size=1, chars=1)
563n/a
564n/a line += data
565n/a lines = line.splitlines(keepends=True)
566n/a if lines:
567n/a if len(lines) > 1:
568n/a # More than one line result; the first line is a full line
569n/a # to return
570n/a line = lines[0]
571n/a del lines[0]
572n/a if len(lines) > 1:
573n/a # cache the remaining lines
574n/a lines[-1] += self.charbuffer
575n/a self.linebuffer = lines
576n/a self.charbuffer = None
577n/a else:
578n/a # only one remaining line, put it back into charbuffer
579n/a self.charbuffer = lines[0] + self.charbuffer
580n/a if not keepends:
581n/a line = line.splitlines(keepends=False)[0]
582n/a break
583n/a line0withend = lines[0]
584n/a line0withoutend = lines[0].splitlines(keepends=False)[0]
585n/a if line0withend != line0withoutend: # We really have a line end
586n/a # Put the rest back together and keep it until the next call
587n/a self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
588n/a self.charbuffer
589n/a if keepends:
590n/a line = line0withend
591n/a else:
592n/a line = line0withoutend
593n/a break
594n/a # we didn't get anything or this was our only try
595n/a if not data or size is not None:
596n/a if line and not keepends:
597n/a line = line.splitlines(keepends=False)[0]
598n/a break
599n/a if readsize < 8000:
600n/a readsize *= 2
601n/a return line
602n/a
603n/a def readlines(self, sizehint=None, keepends=True):
604n/a
605n/a """ Read all lines available on the input stream
606n/a and return them as a list.
607n/a
608n/a Line breaks are implemented using the codec's decoder
609n/a method and are included in the list entries.
610n/a
611n/a sizehint, if given, is ignored since there is no efficient
612n/a way of finding the true end-of-line.
613n/a
614n/a """
615n/a data = self.read()
616n/a return data.splitlines(keepends)
617n/a
618n/a def reset(self):
619n/a
620n/a """ Resets the codec buffers used for keeping state.
621n/a
622n/a Note that no stream repositioning should take place.
623n/a This method is primarily intended to be able to recover
624n/a from decoding errors.
625n/a
626n/a """
627n/a self.bytebuffer = b""
628n/a self.charbuffer = self._empty_charbuffer
629n/a self.linebuffer = None
630n/a
631n/a def seek(self, offset, whence=0):
632n/a """ Set the input stream's current position.
633n/a
634n/a Resets the codec buffers used for keeping state.
635n/a """
636n/a self.stream.seek(offset, whence)
637n/a self.reset()
638n/a
639n/a def __next__(self):
640n/a
641n/a """ Return the next decoded line from the input stream."""
642n/a line = self.readline()
643n/a if line:
644n/a return line
645n/a raise StopIteration
646n/a
647n/a def __iter__(self):
648n/a return self
649n/a
650n/a def __getattr__(self, name,
651n/a getattr=getattr):
652n/a
653n/a """ Inherit all other methods from the underlying stream.
654n/a """
655n/a return getattr(self.stream, name)
656n/a
657n/a def __enter__(self):
658n/a return self
659n/a
660n/a def __exit__(self, type, value, tb):
661n/a self.stream.close()
662n/a
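A StreamReader does the reverse, decoding bytes read from the underlying stream and supporting line-oriented access (editor's sketch):

    import codecs
    import io

    raw = io.BytesIO("erste Zeile\nzweite Zeile\n".encode("utf-8"))
    reader = codecs.getreader("utf-8")(raw)
    reader.readline()                # 'erste Zeile\n'
    list(reader)                     # ['zweite Zeile\n'] -- StreamReader is iterable
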
663n/a###
664n/a
665n/aclass StreamReaderWriter:
666n/a
667n/a """ StreamReaderWriter instances allow wrapping streams which
668n/a work in both read and write modes.
669n/a
670n/a The design is such that one can use the factory functions
671n/a returned by the codec.lookup() function to construct the
672n/a instance.
673n/a
674n/a """
675n/a # Optional attributes set by the file wrappers below
676n/a encoding = 'unknown'
677n/a
678n/a def __init__(self, stream, Reader, Writer, errors='strict'):
679n/a
680n/a """ Creates a StreamReaderWriter instance.
681n/a
682n/a stream must be a Stream-like object.
683n/a
684n/a Reader, Writer must be factory functions or classes
685n/a providing the StreamReader, StreamWriter interface resp.
686n/a
687n/a Error handling is done in the same way as defined for the
688n/a StreamWriter/Readers.
689n/a
690n/a """
691n/a self.stream = stream
692n/a self.reader = Reader(stream, errors)
693n/a self.writer = Writer(stream, errors)
694n/a self.errors = errors
695n/a
696n/a def read(self, size=-1):
697n/a
698n/a return self.reader.read(size)
699n/a
700n/a def readline(self, size=None):
701n/a
702n/a return self.reader.readline(size)
703n/a
704n/a def readlines(self, sizehint=None):
705n/a
706n/a return self.reader.readlines(sizehint)
707n/a
708n/a def __next__(self):
709n/a
710n/a """ Return the next decoded line from the input stream."""
711n/a return next(self.reader)
712n/a
713n/a def __iter__(self):
714n/a return self
715n/a
716n/a def write(self, data):
717n/a
718n/a return self.writer.write(data)
719n/a
720n/a def writelines(self, list):
721n/a
722n/a return self.writer.writelines(list)
723n/a
724n/a def reset(self):
725n/a
726n/a self.reader.reset()
727n/a self.writer.reset()
728n/a
729n/a def seek(self, offset, whence=0):
730n/a self.stream.seek(offset, whence)
731n/a self.reader.reset()
732n/a if whence == 0 and offset == 0:
733n/a self.writer.reset()
734n/a
735n/a def __getattr__(self, name,
736n/a getattr=getattr):
737n/a
738n/a """ Inherit all other methods from the underlying stream.
739n/a """
740n/a return getattr(self.stream, name)
741n/a
742n/a # these are needed to make "with codecs.open(...)" work properly
743n/a
744n/a def __enter__(self):
745n/a return self
746n/a
747n/a def __exit__(self, type, value, tb):
748n/a self.stream.close()
749n/a
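StreamReaderWriter instances are usually produced by codecs.open() below, but one can also be built directly from the factories of a CodecInfo (editor's sketch):

    import codecs
    import io

    info = codecs.lookup("utf-8")
    raw = io.BytesIO()
    srw = codecs.StreamReaderWriter(raw, info.streamreader, info.streamwriter)
    srw.write("äöü")
    srw.seek(0)
    srw.read()                       # 'äöü'
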
750n/a###
751n/a
752n/aclass StreamRecoder:
753n/a
754n/a """ StreamRecoder instances translate data from one encoding to another.
755n/a
756n/a They use the complete set of APIs returned by the
757n/a codecs.lookup() function to implement their task.
758n/a
759n/a Data written to the StreamRecoder is first decoded into an
760n/a intermediate format (depending on the "decode" codec) and then
761n/a written to the underlying stream using an instance of the provided
762n/a Writer class.
763n/a
764n/a In the other direction, data is read from the underlying stream using
765n/a a Reader instance and then encoded and returned to the caller.
766n/a
767n/a """
768n/a # Optional attributes set by the file wrappers below
769n/a data_encoding = 'unknown'
770n/a file_encoding = 'unknown'
771n/a
772n/a def __init__(self, stream, encode, decode, Reader, Writer,
773n/a errors='strict'):
774n/a
775n/a """ Creates a StreamRecoder instance which implements a two-way
776n/a conversion: encode and decode work on the frontend (the
777n/a data visible to .read() and .write()) while Reader and Writer
778n/a work on the backend (the data in stream).
779n/a
780n/a You can use these objects to do transparent
781n/a transcodings from e.g. latin-1 to utf-8 and back.
782n/a
783n/a stream must be a file-like object.
784n/a
785n/a encode and decode must adhere to the Codec interface; Reader and
786n/a Writer must be factory functions or classes providing the
787n/a StreamReader and StreamWriter interfaces resp.
788n/a
789n/a Error handling is done in the same way as defined for the
790n/a StreamWriter/Readers.
791n/a
792n/a """
793n/a self.stream = stream
794n/a self.encode = encode
795n/a self.decode = decode
796n/a self.reader = Reader(stream, errors)
797n/a self.writer = Writer(stream, errors)
798n/a self.errors = errors
799n/a
800n/a def read(self, size=-1):
801n/a
802n/a data = self.reader.read(size)
803n/a data, bytesencoded = self.encode(data, self.errors)
804n/a return data
805n/a
806n/a def readline(self, size=None):
807n/a
808n/a if size is None:
809n/a data = self.reader.readline()
810n/a else:
811n/a data = self.reader.readline(size)
812n/a data, bytesencoded = self.encode(data, self.errors)
813n/a return data
814n/a
815n/a def readlines(self, sizehint=None):
816n/a
817n/a data = self.reader.read()
818n/a data, bytesencoded = self.encode(data, self.errors)
819n/a return data.splitlines(keepends=True)
820n/a
821n/a def __next__(self):
822n/a
823n/a """ Return the next decoded line from the input stream."""
824n/a data = next(self.reader)
825n/a data, bytesencoded = self.encode(data, self.errors)
826n/a return data
827n/a
828n/a def __iter__(self):
829n/a return self
830n/a
831n/a def write(self, data):
832n/a
833n/a data, bytesdecoded = self.decode(data, self.errors)
834n/a return self.writer.write(data)
835n/a
836n/a def writelines(self, list):
837n/a
838n/a data = ''.join(list)
839n/a data, bytesdecoded = self.decode(data, self.errors)
840n/a return self.writer.write(data)
841n/a
842n/a def reset(self):
843n/a
844n/a self.reader.reset()
845n/a self.writer.reset()
846n/a
847n/a def __getattr__(self, name,
848n/a getattr=getattr):
849n/a
850n/a """ Inherit all other methods from the underlying stream.
851n/a """
852n/a return getattr(self.stream, name)
853n/a
854n/a def __enter__(self):
855n/a return self
856n/a
857n/a def __exit__(self, type, value, tb):
858n/a self.stream.close()
859n/a
860n/a### Shortcuts
861n/a
862n/adef open(filename, mode='r', encoding=None, errors='strict', buffering=1):
863n/a
864n/a """ Open an encoded file using the given mode and return
865n/a a wrapped version providing transparent encoding/decoding.
866n/a
867n/a Note: The wrapped version will only accept the object format
868n/a defined by the codecs, i.e. Unicode objects for most builtin
869n/a codecs. Output is also codec dependent and will usually be
870n/a Unicode as well.
871n/a
872n/a Underlying encoded files are always opened in binary mode.
873n/a The default file mode is 'r', meaning to open the file in read mode.
874n/a
875n/a encoding specifies the encoding which is to be used for the
876n/a file.
877n/a
878n/a errors may be given to define the error handling. It defaults
879n/a to 'strict' which causes ValueErrors to be raised in case an
880n/a encoding error occurs.
881n/a
882n/a buffering has the same meaning as for the builtin open() API.
883n/a It defaults to line buffered.
884n/a
885n/a The returned wrapped file object provides an extra attribute
886n/a .encoding which allows querying the used encoding. This
887n/a attribute is only available if an encoding was specified as
888n/a parameter.
889n/a
890n/a """
891n/a if encoding is not None and \
892n/a 'b' not in mode:
893n/a # Force opening of the file in binary mode
894n/a mode = mode + 'b'
895n/a file = builtins.open(filename, mode, buffering)
896n/a if encoding is None:
897n/a return file
898n/a info = lookup(encoding)
899n/a srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
900n/a # Add attributes to simplify introspection
901n/a srw.encoding = encoding
902n/a return srw
903n/a
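A typical round trip through codecs.open() (editor's sketch; the temporary file path is made up on the fly):

    import codecs
    import os
    import tempfile

    path = os.path.join(tempfile.mkdtemp(), "demo.txt")
    with codecs.open(path, "w", encoding="utf-8") as f:
        f.write("naïve café\n")
    with codecs.open(path, "r", encoding="utf-8") as f:
        f.read()                     # 'naïve café\n'
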
904n/adef EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
905n/a
906n/a """ Return a wrapped version of file which provides transparent
907n/a encoding translation.
908n/a
909n/a Data written to the wrapped file is decoded according
910n/a to the given data_encoding and then encoded to the underlying
911n/a file using file_encoding. The intermediate data type
912n/a will usually be Unicode but depends on the specified codecs.
913n/a
914n/a Bytes read from the file are decoded using file_encoding and then
915n/a passed back to the caller encoded using data_encoding.
916n/a
917n/a If file_encoding is not given, it defaults to data_encoding.
918n/a
919n/a errors may be given to define the error handling. It defaults
920n/a to 'strict' which causes ValueErrors to be raised in case an
921n/a encoding error occurs.
922n/a
923n/a The returned wrapped file object provides two extra attributes
924n/a .data_encoding and .file_encoding which reflect the given
925n/a parameters of the same name. The attributes can be used for
926n/a introspection by Python programs.
927n/a
928n/a """
929n/a if file_encoding is None:
930n/a file_encoding = data_encoding
931n/a data_info = lookup(data_encoding)
932n/a file_info = lookup(file_encoding)
933n/a sr = StreamRecoder(file, data_info.encode, data_info.decode,
934n/a file_info.streamreader, file_info.streamwriter, errors)
935n/a # Add attributes to simplify introspection
936n/a sr.data_encoding = data_encoding
937n/a sr.file_encoding = file_encoding
938n/a return sr
939n/a
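EncodedFile() in action, transcoding between Latin-1 on the caller side and UTF-8 in the underlying stream (editor's sketch with an in-memory stream):

    import codecs
    import io

    raw = io.BytesIO()
    ef = codecs.EncodedFile(raw, data_encoding="latin-1", file_encoding="utf-8")
    ef.write("café".encode("latin-1"))   # the caller hands in Latin-1 bytes
    raw.getvalue()                       # b'caf\xc3\xa9' -- stored as UTF-8
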
940n/a### Helpers for codec lookup
941n/a
942n/adef getencoder(encoding):
943n/a
944n/a """ Look up the codec for the given encoding and return
945n/a its encoder function.
946n/a
947n/a Raises a LookupError in case the encoding cannot be found.
948n/a
949n/a """
950n/a return lookup(encoding).encode
951n/a
952n/adef getdecoder(encoding):
953n/a
954n/a """ Look up the codec for the given encoding and return
955n/a its decoder function.
956n/a
957n/a Raises a LookupError in case the encoding cannot be found.
958n/a
959n/a """
960n/a return lookup(encoding).decode
961n/a
962n/adef getincrementalencoder(encoding):
963n/a
964n/a """ Look up the codec for the given encoding and return
965n/a its IncrementalEncoder class or factory function.
966n/a
967n/a Raises a LookupError in case the encoding cannot be found
968n/a or the codec doesn't provide an incremental encoder.
969n/a
970n/a """
971n/a encoder = lookup(encoding).incrementalencoder
972n/a if encoder is None:
973n/a raise LookupError(encoding)
974n/a return encoder
975n/a
976n/adef getincrementaldecoder(encoding):
977n/a
978n/a """ Look up the codec for the given encoding and return
979n/a its IncrementalDecoder class or factory function.
980n/a
981n/a Raises a LookupError in case the encoding cannot be found
982n/a or the codec doesn't provide an incremental decoder.
983n/a
984n/a """
985n/a decoder = lookup(encoding).incrementaldecoder
986n/a if decoder is None:
987n/a raise LookupError(encoding)
988n/a return decoder
989n/a
990n/adef getreader(encoding):
991n/a
992n/a """ Look up the codec for the given encoding and return
993n/a its StreamReader class or factory function.
994n/a
995n/a Raises a LookupError in case the encoding cannot be found.
996n/a
997n/a """
998n/a return lookup(encoding).streamreader
999n/a
1000n/adef getwriter(encoding):
1001n/a
1002n/a """ Look up the codec for the given encoding and return
1003n/a its StreamWriter class or factory function.
1004n/a
1005n/a Raises a LookupError in case the encoding cannot be found.
1006n/a
1007n/a """
1008n/a return lookup(encoding).streamwriter
1009n/a
1010n/adef iterencode(iterator, encoding, errors='strict', **kwargs):
1011n/a """
1012n/a Encoding iterator.
1013n/a
1014n/a Encodes the input strings from the iterator using an IncrementalEncoder.
1015n/a
1016n/a errors and kwargs are passed through to the IncrementalEncoder
1017n/a constructor.
1018n/a """
1019n/a encoder = getincrementalencoder(encoding)(errors, **kwargs)
1020n/a for input in iterator:
1021n/a output = encoder.encode(input)
1022n/a if output:
1023n/a yield output
1024n/a output = encoder.encode("", True)
1025n/a if output:
1026n/a yield output
1027n/a
1028n/adef iterdecode(iterator, encoding, errors='strict', **kwargs):
1029n/a """
1030n/a Decoding iterator.
1031n/a
1032n/a Decodes the input strings from the iterator using an IncrementalDecoder.
1033n/a
1034n/a errors and kwargs are passed through to the IncrementalDecoder
1035n/a constructor.
1036n/a """
1037n/a decoder = getincrementaldecoder(encoding)(errors, **kwargs)
1038n/a for input in iterator:
1039n/a output = decoder.decode(input)
1040n/a if output:
1041n/a yield output
1042n/a output = decoder.decode(b"", True)
1043n/a if output:
1044n/a yield output
1045n/a
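iterencode() and iterdecode() wrap the incremental codecs for whole iterables (editor's sketch):

    import codecs

    parts = ["Gr", "ü", "ße"]
    encoded = list(codecs.iterencode(parts, "utf-8"))
    b"".join(encoded)                             # b'Gr\xc3\xbc\xc3\x9fe'
    "".join(codecs.iterdecode(encoded, "utf-8"))  # 'Grüße'
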
1046n/a### Helpers for charmap-based codecs
1047n/a
1048n/adef make_identity_dict(rng):
1049n/a
1050n/a """ make_identity_dict(rng) -> dict
1051n/a
1052n/a Return a dictionary where elements of the rng sequence are
1053n/a mapped to themselves.
1054n/a
1055n/a """
1056n/a return {i:i for i in rng}
1057n/a
1058n/adef make_encoding_map(decoding_map):
1059n/a
1060n/a """ Creates an encoding map from a decoding map.
1061n/a
1062n/a If a target mapping in the decoding map occurs multiple
1063n/a times, then that target is mapped to None (undefined mapping),
1064n/a causing an exception when encountered by the charmap codec
1065n/a during translation.
1066n/a
1067n/a One example where this happens is cp875.py, which decodes
1068n/a multiple characters to \\u001a.
1069n/a
1070n/a """
1071n/a m = {}
1072n/a for k,v in decoding_map.items():
1073n/a if not v in m:
1074n/a m[v] = k
1075n/a else:
1076n/a m[v] = None
1077n/a return m
1078n/a
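A small illustration of the duplicate-target rule in make_encoding_map() (editor's sketch with a made-up decoding map):

    import codecs

    decoding_map = {0x00: 0x0041, 0x01: 0x0042, 0x02: 0x0042}   # two bytes decode to 'B'
    codecs.make_encoding_map(decoding_map)
    # {65: 0, 66: None} -- the ambiguous target 0x0042 maps to None
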
1079n/a### error handlers
1080n/a
1081n/atry:
1082n/a strict_errors = lookup_error("strict")
1083n/a ignore_errors = lookup_error("ignore")
1084n/a replace_errors = lookup_error("replace")
1085n/a xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
1086n/a backslashreplace_errors = lookup_error("backslashreplace")
1087n/a namereplace_errors = lookup_error("namereplace")
1088n/aexcept LookupError:
1089n/a # In --disable-unicode builds, these error handlers are missing
1090n/a strict_errors = None
1091n/a ignore_errors = None
1092n/a replace_errors = None
1093n/a xmlcharrefreplace_errors = None
1094n/a backslashreplace_errors = None
1095n/a namereplace_errors = None
1096n/a
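Custom handlers can be added alongside the built-in ones via register_error() (editor's sketch; the handler name "underscore" is made up):

    import codecs

    def underscore_handler(exc):
        # Replace the offending span with one underscore per character/byte
        # and resume right after it.
        return "_" * (exc.end - exc.start), exc.end

    codecs.register_error("underscore", underscore_handler)
    "5µm".encode("ascii", errors="underscore")   # b'5_m'
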
1097n/a# Tell modulefinder that using codecs probably needs the encodings
1098n/a# package
1099n/a_false = 0
1100n/aif _false:
1101n/a import encodings
1102n/a
1103n/a### Tests
1104n/a
1105n/aif __name__ == '__main__':
1106n/a
1107n/a # Make stdout translate Latin-1 output into UTF-8 output
1108n/a sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
1109n/a
1110n/a # Have stdin translate Latin-1 input into UTF-8 input
1111n/a sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')