1 | n/a | """ codecs -- Python Codec Registry, API and helpers. |
---|
2 | n/a | |
---|
3 | n/a | |
---|
4 | n/a | Written by Marc-Andre Lemburg (mal@lemburg.com). |
---|
5 | n/a | |
---|
6 | n/a | (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. |
---|
7 | n/a | |
---|
8 | n/a | """#" |
---|
9 | n/a | |
---|
10 | n/a | import builtins, sys |
---|
11 | n/a | |
---|
12 | n/a | ### Registry and builtin stateless codec functions |
---|
13 | n/a | |
---|
# The registry and the builtin stateless codec functions live in the
# _codecs module; without it this module cannot work at all.
try:
    from _codecs import *
except ImportError as why:
    raise SystemError('Failed to load the builtin codecs: %s' % why)

# Public names exported by "from codecs import *".
__all__ = [
    # registry access and file helpers
    "register", "lookup", "open", "EncodedFile",
    # byte order marks (including the old, misnamed aliases)
    "BOM", "BOM_BE", "BOM_LE",
    "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
    "BOM_UTF8",
    "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
    "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
    # codec base classes
    "CodecInfo", "Codec",
    "IncrementalEncoder", "IncrementalDecoder",
    "StreamReader", "StreamWriter",
    "StreamReaderWriter", "StreamRecoder",
    # lookup helpers
    "getencoder", "getdecoder",
    "getincrementalencoder", "getincrementaldecoder",
    "getreader", "getwriter",
    # one-shot and iterative conversion helpers
    "encode", "decode", "iterencode", "iterdecode",
    # error handlers and their registry
    "strict_errors", "ignore_errors", "replace_errors",
    "xmlcharrefreplace_errors",
    "backslashreplace_errors", "namereplace_errors",
    "register_error", "lookup_error",
]
---|
33 | n/a | |
---|
34 | n/a | ### Constants |
---|
35 | n/a | |
---|
36 | n/a | # |
---|
37 | n/a | # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) |
---|
38 | n/a | # and its possible byte string values |
---|
39 | n/a | # for UTF8/UTF16/UTF32 output and little/big endian machines |
---|
40 | n/a | # |
---|
41 | n/a | |
---|
# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian and big endian
BOM_UTF16_LE = b'\xff\xfe'
BOM_UTF16_BE = b'\xfe\xff'
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32, little endian and big endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# Native-endianness variants, selected from the byte order of the
# machine the interpreter is running on.
_native_is_little = sys.byteorder == 'little'
BOM_UTF16 = BOM_UTF16_LE if _native_is_little else BOM_UTF16_BE
BOM = BOM_UTF16
BOM_UTF32 = BOM_UTF32_LE if _native_is_little else BOM_UTF32_BE
del _native_is_little

# Old broken names (don't use in new code): the "32"/"64" suffixes
# actually refer to the UTF-16/UTF-32 marks.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
---|
78 | n/a | |
---|
79 | n/a | |
---|
80 | n/a | ### Codec base classes (defining the API) |
---|
81 | n/a | |
---|
class CodecInfo(tuple):
    """Codec details when looking up the codec registry.

    An instance is the 4-tuple (encode, decode, streamreader,
    streamwriter); every constructor argument is additionally stored
    as an attribute of the same name.
    """

    # Private API to allow Python 3.4 to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True  # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        # stateless entry points
        info.encode = encode
        info.decode = decode
        # incremental factories
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        # stream factories
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        # Only shadow the class-level default when a value was given.
        if _is_text_encoding is not None:
            info._is_text_encoding = _is_text_encoding
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at %#x>" % (
            klass.__module__, klass.__qualname__, self.name, id(self))
---|
112 | n/a | |
---|
class Codec:
    """Interface definition for stateless encoders/decoders.

    Both .encode() and .decode() take an errors argument that selects
    the error handling scheme.  The predefined values are:

     'strict'            - raise a ValueError error (or a subclass)
     'ignore'            - ignore the character and continue with the next
     'replace'           - replace with a suitable replacement character;
                           Python will use the official U+FFFD REPLACEMENT
                           CHARACTER for the builtin Unicode codecs on
                           decoding and '?' on encoding.
     'surrogateescape'   - replace with private code points U+DCnn.
     'xmlcharrefreplace' - replace with the appropriate XML
                           character reference (only for encoding).
     'backslashreplace'  - replace with backslashed escape sequences.
     'namereplace'       - replace with \\N{...} escape sequences
                           (only for encoding).

    Further values can be made available via register_error.
    """

    def encode(self, input, errors='strict'):
        """Encode input and return a tuple (output object, length consumed).

        errors selects the error handling to apply; the default is
        'strict'.

        Implementations may not store state in the Codec instance.
        Codecs that need state to encode efficiently should use
        StreamWriter instead.

        Zero length input must be accepted; the result is then an
        empty object of the output object type.
        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):
        """Decode input and return a tuple (output object, length consumed).

        input must be an object which provides a readable buffer;
        byte strings, buffer objects and memory mapped files are
        examples of such objects.

        errors selects the error handling to apply; the default is
        'strict'.

        Implementations may not store state in the Codec instance.
        Codecs that need state to decode efficiently should use
        StreamReader instead.

        Zero length input must be accepted; the result is then an
        empty object of the output object type.
        """
        raise NotImplementedError
---|
178 | n/a | |
---|
class IncrementalEncoder:
    """
    Base class for encoders that process their input in several steps.

    The input is handed piece by piece to encode(); the encoder keeps
    the state of the encoding process between those calls.
    """

    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme; see the module
        docstring for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be overridden by subclasses.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the encoder.

        The base implementation always returns 0.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder.

        state must have been returned by getstate().  The base
        implementation ignores it.
        """
---|
218 | n/a | |
---|
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that cannot always
    consume all of their input at once.

    Input that _buffer_encode() did not consume is kept in self.buffer
    and prepended to the input of the next encode() call.
    """

    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def encode(self, input, final=False):
        # Prepend whatever the previous call left unencoded.
        pending = self.buffer + input
        output, used = self._buffer_encode(pending, self.errors, final)
        # Anything beyond the consumed prefix waits for the next call.
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # An empty buffer is reported as 0.
        return self.buffer if self.buffer else 0

    def setstate(self, state):
        # A falsy state (such as 0) means "no buffered input".
        self.buffer = state if state else ""
---|
252 | n/a | |
---|
class IncrementalDecoder:
    """
    Base class for decoders that process their input in several steps.

    The input is handed piece by piece to decode(); the decoder keeps
    the state of the decoding process between those calls.
    """

    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme; see the module
        docstring for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be overridden by subclasses.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        The result must be a (buffered_input, additional_state_info)
        tuple.  buffered_input must be a bytes object holding the bytes
        that were passed to decode() but have not been converted yet.
        additional_state_info must be a non-negative integer describing
        the decoder state WITHOUT yet having processed the contents of
        buffered_input.  In the initial state and after reset(),
        getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().  The base
        implementation ignores state.
        """
---|
301 | n/a | |
---|
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that must be able to
    handle incomplete byte sequences.

    Input that _buffer_decode() did not consume is kept in self.buffer
    and prepended to the input of the next decode() call.
    """

    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def decode(self, input, final=False):
        # Prepend whatever the previous call left undecoded.
        pending = self.buffer + input
        output, used = self._buffer_decode(pending, self.errors, final)
        # Anything beyond the consumed prefix waits for the next call.
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # The additional state info is always 0 for this class.
        return (self.buffer, 0)

    def setstate(self, state):
        # Only the buffered input is restored; the additional state
        # info in state[1] is ignored.
        self.buffer = state[0]
---|
337 | n/a | |
---|
338 | n/a | # |
---|
339 | n/a | # The StreamWriter and StreamReader classes provide generic working |
---|
340 | n/a | # interfaces which can be used to implement new encoding submodules |
---|
341 | n/a | # very easily. See encodings/utf_8.py for an example on how this is |
---|
342 | n/a | # done. |
---|
343 | n/a | # |
---|
344 | n/a | |
---|
class StreamWriter(Codec):
    """Codec that encodes objects and writes the result to a stream.

    Subclasses supply encode(); every attribute not defined here is
    looked up on the wrapped stream.
    """

    def __init__(self, stream, errors='strict'):
        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing.

            errors selects the error handling scheme used when
            encoding.  These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences.
             'namereplace'       - Replace with \\N{...} escape sequences.

            Further values can be made available via register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):
        """ Encodes object with self.errors and writes the encoded
            data to self.stream.
        """
        # encode() returns (data, length consumed); only the data is used.
        encoded, _ = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):
        """ Joins the strings in list and writes the result to the
            stream using .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):
        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation keeps no state and does nothing.
        """
        pass

    def seek(self, offset, whence=0):
        """ Repositions the underlying stream; seeking to the very
            start additionally resets the codec state.
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):
        """ Inherit all other methods from the underlying stream.
        """
        target = self.stream
        return getattr(target, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the wrapped stream.
        self.stream.close()
---|
415 | n/a | |
---|
416 | n/a | ### |
---|
417 | n/a | |
---|
class StreamReader(Codec):
    """Codec that reads from a stream and returns decoded data.

    Subclasses supply decode().  Undecoded bytes, decoded characters
    and already split lines are buffered between calls; every
    attribute not defined here is looked up on the wrapped stream.
    """

    # Type of the decoded output; used to build the empty buffer value.
    charbuffertype = str

    def __init__(self, stream, errors='strict'):
        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading.

            errors selects the error handling scheme used when
            decoding.  These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'backslashreplace' - Replace with backslashed escape sequences;

            Further values can be made available via register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream that could not be decoded yet
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        # decoded data that has not been returned to the caller yet
        self.charbuffer = self._empty_charbuffer
        # list of already split lines (used by readline()), or None
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):
        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of decoded code points or bytes to
            return. read() will never return more data than requested,
            but it might return less, if there is not enough available.
            If chars is negative, size is used as the limit instead, so
            that read(n) behaves like the read(n) of other file-like
            objects.

            size indicates the approximate maximum number of decoded
            bytes or code points to read for decoding. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        if chars < 0:
            # For compatibility with other read() methods that take a
            # single argument: without this, read(n) would hand back the
            # whole character buffer, which can be much longer than n
            # after an earlier readline().
            chars = size

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0 and len(self.charbuffer) >= chars:
                break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            if not data:
                break
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # Decode only the part before the error; this is
                    # acceptable when it contains at least one complete
                    # line, otherwise the error is propagated.
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines) <= 1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):
        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method; read() is then called only once.

            If keepends is false, the line terminator is stripped
            from the returned line.
        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend:  # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            # No complete line yet: retry with a larger read size.
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):
        """ Read all lines available on the input stream
            and return them as a list.

            The decoded data is split with its splitlines() method;
            line terminators are kept in the entries unless keepends
            is false.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.
        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):
        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.
        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):
        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):
        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the wrapped stream.
        self.stream.close()
---|
662 | n/a | |
---|
663 | n/a | ### |
---|
664 | n/a | |
---|
class StreamReaderWriter:
    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

        Reading is delegated to a reader object and writing to a
        writer object, both created on the same stream; every
        attribute not defined here is looked up on the stream itself.
    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):
        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.
        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    # --- reading: forwarded to self.reader ---

    def read(self, size=-1):
        decoded = self.reader.read(size)
        return decoded

    def readline(self, size=None):
        line = self.reader.readline(size)
        return line

    def readlines(self, sizehint=None):
        lines = self.reader.readlines(sizehint)
        return lines

    def __next__(self):
        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    # --- writing: forwarded to self.writer ---

    def write(self, data):
        outcome = self.writer.write(data)
        return outcome

    def writelines(self, list):
        outcome = self.writer.writelines(list)
        return outcome

    # --- state handling ---

    def reset(self):
        # Reader first, then writer.
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        # The reader's buffers are stale after any repositioning; the
        # writer is only reset when seeking to the very start.
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):
        """ Inherit all other methods from the underlying stream.
        """
        target = self.stream
        return getattr(target, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the wrapped stream.
        self.stream.close()
---|
749 | n/a | |
---|
750 | n/a | ### |
---|
751 | n/a | |
---|
class StreamRecoder:

    """ StreamRecoder instances translate data from one encoding to another.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (depending on the "decode" codec) and then
        written to the underlying stream using an instance of the provided
        Writer class.

        In the other direction, data is read from the underlying stream using
        a Reader instance and then encoded and returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data visible to .read() and .write()) while Reader and Writer
            work on the backend (the data in stream).

            You can use these objects to do transparent
            transcodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface; Reader and
            Writer must be factory functions or classes providing the
            StreamReader and StreamWriter interfaces resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the data re-encoded
            with the frontend encoder.
        """
        data = self.reader.read(size)
        data, _ = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it re-encoded
            with the frontend encoder.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, _ = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything the backend reader has left and return it as
            a list of re-encoded lines (line ends are kept).

            sizehint is accepted for interface compatibility but ignored.
        """
        data = self.reader.read()
        data, _ = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next line from the input stream, re-encoded with
            the frontend encoder.
        """
        data = next(self.reader)
        data, _ = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decoder and pass the result to
            the backend writer.
        """
        data, _ = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the given lines, decode them with the frontend
            decoder and pass the result to the backend writer.
        """
        lines = tuple(list)
        # Frontend data is normally bytes, so it has to be joined with a
        # bytes separator; joining with '' raised TypeError for bytes
        # lines.  Text lines are still joined as text so that callers
        # which relied on the previous behaviour keep working.
        if lines and isinstance(lines[0], str):
            data = ''.join(lines)
        else:
            data = b''.join(lines)
        data, _ = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the backend reader first, then the backend writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Reposition the underlying stream and drop buffered codec state.

            Without this method a seek() fell through __getattr__ to the
            raw stream, so data already buffered by the reader was still
            returned after the seek.  The reader is always reset; the
            writer is only reset when rewinding to the very beginning.

            Returns whatever the underlying stream's seek() returns.
        """
        position = self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()
        return position

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
---|
859 | n/a | |
---|
860 | n/a | ### Shortcuts |
---|
861 | n/a | |
---|
def open(filename, mode='r', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Underlying encoded files are always opened in binary mode.
        The default file mode is 'r', meaning to open the file in read mode.

        encoding specifies the encoding which is to be used for the
        file.  If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        Raises a LookupError in case the encoding cannot be found; the
        file that was already opened is closed before the error
        propagates.

    """
    if encoding is not None and 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # The file is already open at this point; do not leak the handle
        # when the codec lookup or the wrapper construction fails.
        file.close()
        raise
---|
903 | n/a | |
---|
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that transparently translates
        between two encodings.

        Data written to the wrapped file is decoded according to
        data_encoding and then written to the underlying file using
        file_encoding.  The intermediate data type will usually be
        Unicode but depends on the specified codecs.

        Bytes read from the file are decoded using file_encoding and
        handed back to the caller encoded using data_encoding.

        file_encoding defaults to data_encoding when it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned object carries two extra attributes,
        .data_encoding and .file_encoding, which reflect the encodings
        in use and can be used for introspection.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(
        file,
        frontend.encode, frontend.decode,
        backend.streamreader, backend.streamwriter,
        errors,
    )
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
---|
939 | n/a | |
---|
940 | n/a | ### Helpers for codec lookup |
---|
941 | n/a | |
---|
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
---|
951 | n/a | |
---|
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
---|
961 | n/a | |
---|
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental encoder registered.
    raise LookupError(encoding)
---|
975 | n/a | |
---|
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental decoder registered.
    raise LookupError(encoding)
---|
989 | n/a | |
---|
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
---|
999 | n/a | |
---|
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
---|
1009 | n/a | |
---|
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds every string produced by the iterator through one
    IncrementalEncoder and yields each non-empty piece of output.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if encoded:
            yield encoded
    # Final call with an empty string flushes whatever the encoder
    # is still holding back.
    tail = encoder.encode("", True)
    if tail:
        yield tail
---|
1027 | n/a | |
---|
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds every chunk produced by the iterator through one
    IncrementalDecoder and yields each non-empty piece of output.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if decoded:
            yield decoded
    # Final call with empty input flushes whatever the decoder is
    # still holding back.
    tail = decoder.decode(b"", True)
    if tail:
        yield tail
---|
1045 | n/a | |
---|
1046 | n/a | ### Helpers for charmap-based codecs |
---|
1047 | n/a | |
---|
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng sequence
        is both key and value.

    """
    identity = {}
    for item in rng:
        identity[item] = item
    return identity
---|
1057 | n/a | |
---|
def make_encoding_map(decoding_map):

    """ Creates an encoding map by inverting a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \\u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # A target seen before is ambiguous, so it is marked undefined.
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
---|
1078 | n/a | |
---|
1079 | n/a | ### error handlers |
---|
1080 | n/a | |
---|
try:
    # Resolve all standard handlers in one go; nothing is bound unless
    # every lookup succeeds.
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors,
     namereplace_errors) = (
        lookup_error("strict"),
        lookup_error("ignore"),
        lookup_error("replace"),
        lookup_error("xmlcharrefreplace"),
        lookup_error("backslashreplace"),
        lookup_error("namereplace"),
    )
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
    namereplace_errors = None

# Tell modulefinder that using codecs probably needs the encodings
# package.  The import below is never executed.
_false = 0
if _false:
    import encodings
---|
1102 | n/a | |
---|
1103 | n/a | ### Tests |
---|
1104 | n/a | |
---|
1105 | n/a | if __name__ == '__main__': |
---|
1106 | n/a | |
---|
1107 | n/a | # Make stdout translate Latin-1 output into UTF-8 output |
---|
1108 | n/a | sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8') |
---|
1109 | n/a | |
---|
1110 | n/a | # Have stdin translate Latin-1 input into UTF-8 input |
---|
1111 | n/a | sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1') |
---|