Python code coverage for Lib/codecs.py

#	count	content
1	n/a	""" codecs -- Python Codec Registry, API and helpers.
2	n/a
3	n/a
4	n/a	Written by Marc-Andre Lemburg (mal@lemburg.com).
5	n/a
6	n/a	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7	n/a
8	n/a	"""#"
9	n/a
10	n/a	import builtins, sys
11	n/a
12	n/a	### Registry and builtin stateless codec functions
13	n/a
14	n/a	try:
15	n/a	from _codecs import *
16	n/a	except ImportError as why:
17	n/a	raise SystemError('Failed to load the builtin codecs: %s' % why)
18	n/a
# Names exported by "from codecs import *".
# NOTE(review): several entries (e.g. "register", "lookup") are not defined
# in the visible part of this file; presumably they are provided by the
# "from _codecs import *" statement above -- confirm before editing the list.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "backslashreplace_errors", "namereplace_errors",
           "register_error", "lookup_error"]
33	n/a
34	n/a	### Constants
35	n/a
36	n/a	#
37	n/a	# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
38	n/a	# and its possible byte string values
39	n/a	# for UTF8/UTF16/UTF32 output and little/big endian machines
40	n/a	#
41	n/a
# Explicit byte order marks for each encoding / endianness combination.
BOM_UTF8 = b'\xef\xbb\xbf'

BOM_UTF16_LE = b'\xff\xfe'
BOM_UTF16_BE = b'\xfe\xff'

BOM_UTF32_LE = b'\xff\xfe\x00\x00'
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# Short aliases for the UTF-16 marks.
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# Native-endianness marks, selected once from the byte order of the
# machine running the interpreter.
_NATIVE_IS_LITTLE = sys.byteorder == 'little'
BOM_UTF16 = BOM_UTF16_LE if _NATIVE_IS_LITTLE else BOM_UTF16_BE
BOM_UTF32 = BOM_UTF32_LE if _NATIVE_IS_LITTLE else BOM_UTF32_BE
BOM = BOM_UTF16
del _NATIVE_IS_LITTLE

# Old broken names (don't use in new code): the "32" names hold the
# UTF-16 marks and the "64" names hold the UTF-32 marks.
BOM32_LE, BOM32_BE = BOM_UTF16_LE, BOM_UTF16_BE
BOM64_LE, BOM64_BE = BOM_UTF32_LE, BOM_UTF32_BE
78	n/a
79	n/a
80	n/a	### Codec base classes (defining the API)
81	n/a
class CodecInfo(tuple):
    """Codec details returned when looking up the codec registry.

    An instance is the 4-tuple (encode, decode, streamreader, streamwriter)
    and also exposes those items, the incremental encoder/decoder
    factories and the codec name as attributes.
    """

    # Private API to allow Python 3.4 to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True  # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        # Only the four classic entries take part in the tuple value.
        items = (encode, decode, streamreader, streamwriter)
        info = tuple.__new__(cls, items)
        info.name = name
        info.encode = encode
        info.decode = decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        # Shadow the class-level default only when a value was given.
        if _is_text_encoding is not None:
            info._is_text_encoding = _is_text_encoding
        return info

    def __repr__(self):
        klass = self.__class__
        details = (klass.__module__, klass.__qualname__, self.name, id(self))
        return "<%s.%s object for encoding %s at %#x>" % details
112	n/a
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument that
        selects the error handling scheme. The following string
        values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private code points U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences.
         'namereplace'       - Replace with \\N{...} escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return the tuple
            (output object, length consumed).

            errors selects the error handling; the default is
            'strict'.

            Implementations may not keep state on the Codec instance;
            codecs that need state to encode efficiently should use
            StreamWriter instead.

            Zero length input must be accepted and must yield an
            empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return the tuple
            (output object, length consumed).

            errors selects the error handling; the default is
            'strict'.

            Implementations may not keep state on the Codec instance;
            codecs that need state to decode efficiently should use
            StreamReader instead.

            Zero length input must be accepted and must yield an
            empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError
178	n/a
class IncrementalEncoder:
    """
    Base class for encoders that process their input in several steps.

    Pieces of input are handed to encode() one after another; the encoder
    is expected to remember whatever state it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme and is stored on the
        instance, together with an empty string buffer.
        """
        self.errors, self.buffer = errors, ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be overridden; the base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to the initial state (a no-op in this base class).
        """
        return None

    def getstate(self):
        """
        Return the current state of the encoder (always 0 in this base class).
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate(). The base class ignores it.
        """
        return None
218	n/a
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that cannot always consume
    all of their input: whatever _buffer_encode() leaves unconsumed is
    kept in self.buffer and prepended to the input of the next encode()
    call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input that has not been encoded yet
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return the tuple
        # (output, length consumed).
        raise NotImplementedError

    def encode(self, input, final=False):
        # Prepend the leftovers of the previous call.
        pending = self.buffer + input
        output, used = self._buffer_encode(pending, self.errors, final)
        # Whatever was not consumed waits for the next call.
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # An empty buffer is reported as 0.
        leftover = self.buffer
        return leftover if leftover else 0

    def setstate(self, state):
        # Any false state (such as 0) stands for an empty buffer.
        self.buffer = state if state else ""
252	n/a
class IncrementalDecoder:
    """
    Base class for decoders that process their input in several steps.

    Pieces of input are handed to decode() one after another; the decoder
    is expected to remember whatever state it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme and is stored on the
        instance.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be overridden; the base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state (a no-op in this base class).
        """
        return None

    def getstate(self):
        """
        Return the current state of the decoder.

        The state is a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object holding the bytes that were
        passed to decode() but have not been converted yet.
        additional_state_info must be a non-negative integer describing
        the decoder state WITHOUT the contents of buffered_input having
        been processed. In the initial state and after reset() the
        result must be (b"", 0), which is what this base class returns.
        """
        initial_state = (b"", 0)
        return initial_state

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate(). Calling
        setstate((b"", 0)) must have the same effect as reset().
        The base class ignores the argument.
        """
        return None
301	n/a
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that have to cope with
    incomplete byte sequences: whatever _buffer_decode() leaves
    unconsumed is kept in self.buffer and prepended to the input of the
    next decode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input that has not been decoded yet
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return the tuple
        # (output, length consumed).
        raise NotImplementedError

    def decode(self, input, final=False):
        # Prepend the leftovers of the previous call.
        pending = self.buffer + input
        output, used = self._buffer_decode(pending, self.errors, final)
        # Whatever was not consumed waits for the next call.
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # The additional state info is always 0.
        leftover = self.buffer
        return (leftover, 0)

    def setstate(self, state):
        # Only the buffered input is restored; the rest is ignored.
        buffered_input = state[0]
        self.buffer = buffered_input
337	n/a
338	n/a	#
339	n/a	# The StreamWriter and StreamReader classes provide generic working
340	n/a	# interfaces which can be used to implement new encoding submodules
341	n/a	# very easily. See encodings/utf_8.py for an example on how this is
342	n/a	# done.
343	n/a	#
344	n/a
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Create a StreamWriter instance.

            stream must be a file-like object open for writing.

            errors selects the error handling scheme used when
            encoding. These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences.
             'namereplace'       - Replace with \\N{...} escape sequences.

            Further values can be made available via register_error.
        """
        self.stream, self.errors = stream, errors

    def write(self, object):

        """ Encode object with self.encode() and write the result to
            self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the given strings and pass the result to .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            After this call the output should be in a clean state
            that allows appending fresh data without rescanning the
            whole stream to recover state.

            The base implementation does nothing.

        """
        return None

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        # Only a rewind to the very beginning resets the codec state.
        if not (whence == 0 and offset == 0):
            return
        self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every other attribute to the underlying stream.
        """
        target = self.stream
        return getattr(target, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the wrapped stream.
        self.stream.close()
415	n/a
416	n/a	###
417	n/a
class StreamReader(Codec):

    """ Generic decoding reader on top of a file-like object.

        Subclasses supply decode(); this class adds buffering plus
        read(), readline(), readlines(), iteration and context manager
        support. Attributes not found here are looked up on the wrapped
        stream (see __getattr__).
    """

    # Type used to create the empty character buffer in __init__().
    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'backslashreplace' - Replace with backslashed escape sequences;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # raw bytes read from the stream that could not be decoded yet
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        # decoded data that has not been handed to the caller yet
        self.charbuffer = self._empty_charbuffer
        # list of complete lines cached by readline(), or None; while it
        # is in use readline() sets self.charbuffer to None
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Must be overridden by subclasses; read() expects a
        # (decoded object, number of bytes consumed) tuple.
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of decoded code points or bytes to
            return. read() will never return more data than requested,
            but it might return less, if there is not enough available.

            size indicates the approximate maximum number of decoded
            bytes or code points to read for decoding. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                # size is handed to the stream unchanged
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            if not data:
                break
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # Retry with only the bytes before the error position
                    # and keep the result if it holds more than one line.
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            # NOTE(review): when only size is given this can be more than
            # size items, since the whole character buffer is returned.
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

            If keepends is false, the line ending is stripped from the
            returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        # Without an explicit size start with 72 and double it below
        # while no complete line has been found.
        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            # grow the read size for the next attempt
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries (unless
            keepends is false).

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        # an empty result from readline() ends the iteration
        raise StopIteration

    def __iter__(self):
        # The reader is its own iterator; see __next__().
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the wrapped stream.
        self.stream.close()
662	n/a
663	n/a	###
664	n/a
class StreamReaderWriter:

    """ Wrapper for streams that are used for both reading and
        writing.

        The constructor takes reader and writer factories, so the
        factory functions returned by the codec.lookup() function can
        be passed in directly.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Create a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            resp.; each is called with (stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    # --- reading: delegated to self.reader ---

    def read(self, size=-1):
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):
        reader = self.reader
        return reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        reader = self.reader
        return next(reader)

    def __iter__(self):
        return self

    # --- writing: delegated to self.writer ---

    def write(self, data):
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):
        writer = self.writer
        return writer.writelines(list)

    def reset(self):
        # Reader first, then writer.
        for part in (self.reader, self.writer):
            part.reset()

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        # The reader buffers are invalid after any repositioning ...
        self.reader.reset()
        # ... the writer is only reset on a rewind to the very start.
        if not (whence == 0 and offset == 0):
            return
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every other attribute to the underlying stream.
        """
        target = self.stream
        return getattr(target, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the wrapped stream.
        self.stream.close()
749	n/a
750	n/a	###
751	n/a
class StreamRecoder:

    """ StreamRecoder instances translate data from one encoding to another.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (depending on the "decode" codec) and then
        written to the underlying stream using an instance of the provided
        Writer class.

        In the other direction, data is read from the underlying stream using
        a Reader instance and then encoded and returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data visible to .read() and .write()) while Reader and Writer
            work on the backend (the data in stream).

            You can use these objects to do transparent
            transcodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface; Reader and
            Writer must be factory functions or classes providing the
            StreamReader and StreamWriter interfaces resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the data encoded
            with the frontend encode function.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it encoded
            with the frontend encode function.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything via the backend reader, encode it with the
            frontend encode function and return the list of lines
            (line endings included). sizehint is ignored.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next line from the input stream, encoded with
            the frontend encode function."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decode function and write the
            result through the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the given frontend lines, decode them with the
            frontend decode function and write the result through the
            backend writer.
        """
        lines = tuple(list)
        # Join with an empty object of the same type as the lines.  The
        # frontend data is normally bytes, so joining with '' (as was
        # done previously) raised TypeError; str lines keep working.
        # An empty list is treated as empty bytes.
        separator = lines[0][:0] if lines else b''
        data = separator.join(lines)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the state of both the backend reader and writer. """
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Reposition the underlying stream and drop stale codec state.

            Without this method seek() would fall through __getattr__
            to the raw stream and leave previously buffered data in the
            reader.  Mirrors StreamReaderWriter.seek(): the reader is
            always reset, the writer only on a rewind to the start.
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the wrapped stream.
        self.stream.close()
859	n/a
860	n/a	### Shortcuts
861	n/a
def open(filename, mode='r', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the underlying file is always opened
        in binary mode ('b' is appended to mode when missing).
        The default file mode is 'r', meaning to open the file in read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to 1.
        NOTE(review): the builtin open() documents buffering=1 (line
        buffering) as usable in text mode only, while the file is
        opened in binary mode here whenever an encoding is given.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        Raises LookupError if the encoding is unknown; the file opened
        for it is closed again before the error propagates.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # Do not leak the already opened file when the codec lookup or
        # the construction of the wrapper fails.
        file.close()
        raise
903	n/a
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that transparently translates
        between two encodings.

        Data written to the wrapper is decoded using data_encoding
        and then encoded to the underlying file using file_encoding.
        The intermediate data type will usually be Unicode but
        depends on the specified codecs.

        Bytes read from the file are decoded using file_encoding and
        handed back to the caller encoded using data_encoding.

        file_encoding defaults to data_encoding when it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapper carries two extra attributes,
        .data_encoding and .file_encoding, holding the encodings that
        are in effect; they are meant for introspection.

    """
    backend_encoding = data_encoding if file_encoding is None else file_encoding
    frontend = lookup(data_encoding)
    backend = lookup(backend_encoding)
    recoder = StreamRecoder(
        file,
        frontend.encode, frontend.decode,
        backend.streamreader, backend.streamwriter,
        errors,
    )
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = backend_encoding
    return recoder
939	n/a
940	n/a	### Helpers for codec lookup
941	n/a
def getencoder(encoding):
    """ Look up the codec registered for encoding and return its
        stateless encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
951	n/a
def getdecoder(encoding):
    """ Look up the codec registered for encoding and return its
        stateless decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
961	n/a
def getincrementalencoder(encoding):
    """ Look up the codec registered for encoding and return its
        IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    codec_info = lookup(encoding)
    factory = codec_info.incrementalencoder
    if factory is not None:
        return factory
    # The codec exists but offers no incremental encoder.
    raise LookupError(encoding)
975	n/a
def getincrementaldecoder(encoding):
    """ Look up the codec registered for encoding and return its
        IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    codec_info = lookup(encoding)
    factory = codec_info.incrementaldecoder
    if factory is not None:
        return factory
    # The codec exists but offers no incremental decoder.
    raise LookupError(encoding)
989	n/a
def getreader(encoding):
    """ Look up the codec registered for encoding and return its
        StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
999	n/a
def getwriter(encoding):
    """ Look up the codec registered for encoding and return its
        StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
1009	n/a
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds each item produced by iterator to an IncrementalEncoder for
    encoding and yields every non-empty chunk of encoded output.  After
    the iterator is exhausted the encoder is flushed with a final call
    and any remaining output is yielded as well.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if encoded:
            yield encoded
    # Final call: lets the encoder emit whatever it is still holding.
    trailing = encoder.encode("", True)
    if trailing:
        yield trailing
1027	n/a
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds each item produced by iterator to an IncrementalDecoder for
    decoding and yields every non-empty chunk of decoded output.  After
    the iterator is exhausted the decoder is flushed with a final call
    (empty bytes input) and any remaining output is yielded as well.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if decoded:
            yield decoded
    # Final call: lets the decoder deal with any input it still buffers.
    trailing = decoder.decode(b"", True)
    if trailing:
        yield trailing
1045	n/a
1046	n/a	### Helpers for charmap-based codecs
1047	n/a
def make_identity_dict(rng):
    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng sequence
        is both a key and the value stored under that key.

    """
    return {element: element for element in rng}
1057	n/a
def make_encoding_map(decoding_map):
    """ Invert a decoding map to obtain an encoding map.

        Every (key, target) pair of decoding_map becomes a
        (target, key) pair in the result.  If a target occurs more than
        once in the decoding map, the inverse is ambiguous and that
        target is mapped to None (undefined mapping) instead, causing
        an exception when encountered by the charmap codec during
        translation.

        One example where this happens is cp875.py, which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # A target seen before has no unique inverse: mark it undefined.
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
1078	n/a
1079	n/a	### error handlers
1080	n/a
# Bind the standard error handlers to module-level names.  Either all
# six lookups succeed, or every name ends up bound to None.
try:
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors,
     namereplace_errors) = [
        lookup_error(handler_name)
        for handler_name in (
            "strict",
            "ignore",
            "replace",
            "xmlcharrefreplace",
            "backslashreplace",
            "namereplace",
        )
    ]
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
    namereplace_errors = None
1096	n/a
# Tell modulefinder that using codecs probably needs the encodings
# package.  The import below is guarded by a condition that is always
# false, so it is never executed at run time; it exists only so that
# an import statement for "encodings" appears in this module.
# NOTE(review): presumably the guard is a variable rather than a literal
# 0/False so that the compiler cannot discard the dead branch -- confirm
# before simplifying this.
_false = 0
if _false:
    import encodings
1102	n/a
1103	n/a	### Tests
1104	n/a
if __name__ == '__main__':
    # Ad-hoc demo, run only when this module is executed as a script:
    # both standard streams are replaced by EncodedFile wrappers.
    # NOTE(review): EncodedFile wrappers exchange bytes with the
    # underlying file object; confirm that sys.stdout / sys.stdin accept
    # and produce bytes here (wrapping their binary buffers may be what
    # is intended).

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')