» Core Development > Code coverage > Python/codecs.c

Python code coverage for Python/codecs.c

# | count | content
1n/a/* ------------------------------------------------------------------------
2n/a
3n/a Python Codec Registry and support functions
4n/a
5n/aWritten by Marc-Andre Lemburg (mal@lemburg.com).
6n/a
7n/aCopyright (c) Corporation for National Research Initiatives.
8n/a
9n/a ------------------------------------------------------------------------ */
10n/a
11n/a#include "Python.h"
12n/a#include "ucnhash.h"
13n/a#include <ctype.h>
14n/a
/* Lowercase hexadecimal digits, indexed by nibble value (0-15).  The
   escape-generating error handlers in this file index it with
   expressions such as Py_hexdigits[(c>>4)&0xf]. */
15n/aconst char *Py_hexdigits = "0123456789abcdef";
16n/a
17n/a/* --- Codec Registry ----------------------------------------------------- */
18n/a
19n/a/* Import the standard encodings package which will register the first
20n/a codec search function.
21n/a
22n/a This is done in a lazy way so that the Unicode implementation does
23n/a not downgrade startup time of scripts not needing it.
24n/a
25n/a ImportErrors are silently ignored by this function. Only one try is
26n/a made.
27n/a
28n/a*/
29n/a
30n/astatic int _PyCodecRegistry_Init(void); /* Forward */
31n/a
32n/aint PyCodec_Register(PyObject *search_function)
33n/a{
34n/a PyInterpreterState *interp = PyThreadState_GET()->interp;
35n/a if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
36n/a goto onError;
37n/a if (search_function == NULL) {
38n/a PyErr_BadArgument();
39n/a goto onError;
40n/a }
41n/a if (!PyCallable_Check(search_function)) {
42n/a PyErr_SetString(PyExc_TypeError, "argument must be callable");
43n/a goto onError;
44n/a }
45n/a return PyList_Append(interp->codec_search_path, search_function);
46n/a
47n/a onError:
48n/a return -1;
49n/a}
50n/a
51n/a/* Convert a string to a normalized Python string: all characters are
52n/a converted to lower case, spaces are replaced with underscores. */
53n/a
54n/astatic
55n/aPyObject *normalizestring(const char *string)
56n/a{
57n/a size_t i;
58n/a size_t len = strlen(string);
59n/a char *p;
60n/a PyObject *v;
61n/a
62n/a if (len > PY_SSIZE_T_MAX) {
63n/a PyErr_SetString(PyExc_OverflowError, "string is too large");
64n/a return NULL;
65n/a }
66n/a
67n/a p = PyMem_Malloc(len + 1);
68n/a if (p == NULL)
69n/a return PyErr_NoMemory();
70n/a for (i = 0; i < len; i++) {
71n/a char ch = string[i];
72n/a if (ch == ' ')
73n/a ch = '-';
74n/a else
75n/a ch = Py_TOLOWER(Py_CHARMASK(ch));
76n/a p[i] = ch;
77n/a }
78n/a p[i] = '\0';
79n/a v = PyUnicode_FromString(p);
80n/a if (v == NULL)
81n/a return NULL;
82n/a PyMem_Free(p);
83n/a return v;
84n/a}
85n/a
86n/a/* Lookup the given encoding and return a tuple providing the codec
87n/a facilities.
88n/a
89n/a The encoding string is looked up converted to all lower-case
90n/a characters. This makes encodings looked up through this mechanism
91n/a effectively case-insensitive.
92n/a
93n/a If no codec is found, a LookupError is set and NULL returned.
94n/a
95n/a As side effect, this tries to load the encodings package, if not
96n/a yet done. This is part of the lazy load strategy for the encodings
97n/a package.
98n/a
99n/a*/
100n/a
101n/aPyObject *_PyCodec_Lookup(const char *encoding)
102n/a{
103n/a PyInterpreterState *interp;
104n/a PyObject *result, *args = NULL, *v;
105n/a Py_ssize_t i, len;
106n/a
107n/a if (encoding == NULL) {
108n/a PyErr_BadArgument();
109n/a goto onError;
110n/a }
111n/a
112n/a interp = PyThreadState_GET()->interp;
113n/a if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
114n/a goto onError;
115n/a
116n/a /* Convert the encoding to a normalized Python string: all
117n/a characters are converted to lower case, spaces and hyphens are
118n/a replaced with underscores. */
119n/a v = normalizestring(encoding);
120n/a if (v == NULL)
121n/a goto onError;
122n/a PyUnicode_InternInPlace(&v);
123n/a
124n/a /* First, try to lookup the name in the registry dictionary */
125n/a result = PyDict_GetItem(interp->codec_search_cache, v);
126n/a if (result != NULL) {
127n/a Py_INCREF(result);
128n/a Py_DECREF(v);
129n/a return result;
130n/a }
131n/a
132n/a /* Next, scan the search functions in order of registration */
133n/a args = PyTuple_New(1);
134n/a if (args == NULL)
135n/a goto onError;
136n/a PyTuple_SET_ITEM(args,0,v);
137n/a
138n/a len = PyList_Size(interp->codec_search_path);
139n/a if (len < 0)
140n/a goto onError;
141n/a if (len == 0) {
142n/a PyErr_SetString(PyExc_LookupError,
143n/a "no codec search functions registered: "
144n/a "can't find encoding");
145n/a goto onError;
146n/a }
147n/a
148n/a for (i = 0; i < len; i++) {
149n/a PyObject *func;
150n/a
151n/a func = PyList_GetItem(interp->codec_search_path, i);
152n/a if (func == NULL)
153n/a goto onError;
154n/a result = PyEval_CallObject(func, args);
155n/a if (result == NULL)
156n/a goto onError;
157n/a if (result == Py_None) {
158n/a Py_DECREF(result);
159n/a continue;
160n/a }
161n/a if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
162n/a PyErr_SetString(PyExc_TypeError,
163n/a "codec search functions must return 4-tuples");
164n/a Py_DECREF(result);
165n/a goto onError;
166n/a }
167n/a break;
168n/a }
169n/a if (i == len) {
170n/a /* XXX Perhaps we should cache misses too ? */
171n/a PyErr_Format(PyExc_LookupError,
172n/a "unknown encoding: %s", encoding);
173n/a goto onError;
174n/a }
175n/a
176n/a /* Cache and return the result */
177n/a if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
178n/a Py_DECREF(result);
179n/a goto onError;
180n/a }
181n/a Py_DECREF(args);
182n/a return result;
183n/a
184n/a onError:
185n/a Py_XDECREF(args);
186n/a return NULL;
187n/a}
188n/a
189n/aint _PyCodec_Forget(const char *encoding)
190n/a{
191n/a PyInterpreterState *interp;
192n/a PyObject *v;
193n/a int result;
194n/a
195n/a interp = PyThreadState_GET()->interp;
196n/a if (interp->codec_search_path == NULL) {
197n/a return -1;
198n/a }
199n/a
200n/a /* Convert the encoding to a normalized Python string: all
201n/a characters are converted to lower case, spaces and hyphens are
202n/a replaced with underscores. */
203n/a v = normalizestring(encoding);
204n/a if (v == NULL) {
205n/a return -1;
206n/a }
207n/a
208n/a /* Drop the named codec from the internal cache */
209n/a result = PyDict_DelItem(interp->codec_search_cache, v);
210n/a Py_DECREF(v);
211n/a
212n/a return result;
213n/a}
214n/a
215n/a/* Codec registry encoding check API. */
216n/a
217n/aint PyCodec_KnownEncoding(const char *encoding)
218n/a{
219n/a PyObject *codecs;
220n/a
221n/a codecs = _PyCodec_Lookup(encoding);
222n/a if (!codecs) {
223n/a PyErr_Clear();
224n/a return 0;
225n/a }
226n/a else {
227n/a Py_DECREF(codecs);
228n/a return 1;
229n/a }
230n/a}
231n/a
232n/astatic
233n/aPyObject *args_tuple(PyObject *object,
234n/a const char *errors)
235n/a{
236n/a PyObject *args;
237n/a
238n/a args = PyTuple_New(1 + (errors != NULL));
239n/a if (args == NULL)
240n/a return NULL;
241n/a Py_INCREF(object);
242n/a PyTuple_SET_ITEM(args,0,object);
243n/a if (errors) {
244n/a PyObject *v;
245n/a
246n/a v = PyUnicode_FromString(errors);
247n/a if (v == NULL) {
248n/a Py_DECREF(args);
249n/a return NULL;
250n/a }
251n/a PyTuple_SET_ITEM(args, 1, v);
252n/a }
253n/a return args;
254n/a}
255n/a
256n/a/* Helper function to get a codec item */
257n/a
258n/astatic
259n/aPyObject *codec_getitem(const char *encoding, int index)
260n/a{
261n/a PyObject *codecs;
262n/a PyObject *v;
263n/a
264n/a codecs = _PyCodec_Lookup(encoding);
265n/a if (codecs == NULL)
266n/a return NULL;
267n/a v = PyTuple_GET_ITEM(codecs, index);
268n/a Py_DECREF(codecs);
269n/a Py_INCREF(v);
270n/a return v;
271n/a}
272n/a
273n/a/* Helper functions to create an incremental codec. */
274n/astatic
275n/aPyObject *codec_makeincrementalcodec(PyObject *codec_info,
276n/a const char *errors,
277n/a const char *attrname)
278n/a{
279n/a PyObject *ret, *inccodec;
280n/a
281n/a inccodec = PyObject_GetAttrString(codec_info, attrname);
282n/a if (inccodec == NULL)
283n/a return NULL;
284n/a if (errors)
285n/a ret = PyObject_CallFunction(inccodec, "s", errors);
286n/a else
287n/a ret = _PyObject_CallNoArg(inccodec);
288n/a Py_DECREF(inccodec);
289n/a return ret;
290n/a}
291n/a
292n/astatic
293n/aPyObject *codec_getincrementalcodec(const char *encoding,
294n/a const char *errors,
295n/a const char *attrname)
296n/a{
297n/a PyObject *codec_info, *ret;
298n/a
299n/a codec_info = _PyCodec_Lookup(encoding);
300n/a if (codec_info == NULL)
301n/a return NULL;
302n/a ret = codec_makeincrementalcodec(codec_info, errors, attrname);
303n/a Py_DECREF(codec_info);
304n/a return ret;
305n/a}
306n/a
307n/a/* Helper function to create a stream codec. */
308n/a
309n/astatic
310n/aPyObject *codec_getstreamcodec(const char *encoding,
311n/a PyObject *stream,
312n/a const char *errors,
313n/a const int index)
314n/a{
315n/a PyObject *codecs, *streamcodec, *codeccls;
316n/a
317n/a codecs = _PyCodec_Lookup(encoding);
318n/a if (codecs == NULL)
319n/a return NULL;
320n/a
321n/a codeccls = PyTuple_GET_ITEM(codecs, index);
322n/a if (errors != NULL)
323n/a streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
324n/a else
325n/a streamcodec = PyObject_CallFunctionObjArgs(codeccls, stream, NULL);
326n/a Py_DECREF(codecs);
327n/a return streamcodec;
328n/a}
329n/a
330n/a/* Helpers to work with the result of _PyCodec_Lookup
331n/a
332n/a */
333n/aPyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
334n/a const char *errors)
335n/a{
336n/a return codec_makeincrementalcodec(codec_info, errors,
337n/a "incrementaldecoder");
338n/a}
339n/a
340n/aPyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
341n/a const char *errors)
342n/a{
343n/a return codec_makeincrementalcodec(codec_info, errors,
344n/a "incrementalencoder");
345n/a}
346n/a
347n/a
348n/a/* Convenience APIs to query the Codec registry.
349n/a
350n/a All APIs return a codec object with incremented refcount.
351n/a
352n/a */
353n/a
354n/aPyObject *PyCodec_Encoder(const char *encoding)
355n/a{
356n/a return codec_getitem(encoding, 0);
357n/a}
358n/a
359n/aPyObject *PyCodec_Decoder(const char *encoding)
360n/a{
361n/a return codec_getitem(encoding, 1);
362n/a}
363n/a
364n/aPyObject *PyCodec_IncrementalEncoder(const char *encoding,
365n/a const char *errors)
366n/a{
367n/a return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
368n/a}
369n/a
370n/aPyObject *PyCodec_IncrementalDecoder(const char *encoding,
371n/a const char *errors)
372n/a{
373n/a return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
374n/a}
375n/a
376n/aPyObject *PyCodec_StreamReader(const char *encoding,
377n/a PyObject *stream,
378n/a const char *errors)
379n/a{
380n/a return codec_getstreamcodec(encoding, stream, errors, 2);
381n/a}
382n/a
383n/aPyObject *PyCodec_StreamWriter(const char *encoding,
384n/a PyObject *stream,
385n/a const char *errors)
386n/a{
387n/a return codec_getstreamcodec(encoding, stream, errors, 3);
388n/a}
389n/a
/* Annotate the currently active exception with the codec operation and
 * encoding name that triggered it.
 *
 * The work is delegated to _PyErr_TrySetFromCause(), which replaces the
 * active exception with a suitably updated clone if it can and leaves
 * the original exception alone otherwise, so the type of the exception
 * raised does not change.
 */
static void
wrap_codec_error(const char *operation,
                 const char *encoding)
{
    _PyErr_TrySetFromCause("%s with '%s' codec failed", operation, encoding);
}
405n/a
406n/a/* Encode an object (e.g. a Unicode object) using the given encoding
407n/a and return the resulting encoded object (usually a Python string).
408n/a
409n/a errors is passed to the encoder factory as argument if non-NULL. */
410n/a
411n/astatic PyObject *
412n/a_PyCodec_EncodeInternal(PyObject *object,
413n/a PyObject *encoder,
414n/a const char *encoding,
415n/a const char *errors)
416n/a{
417n/a PyObject *args = NULL, *result = NULL;
418n/a PyObject *v = NULL;
419n/a
420n/a args = args_tuple(object, errors);
421n/a if (args == NULL)
422n/a goto onError;
423n/a
424n/a result = PyEval_CallObject(encoder, args);
425n/a if (result == NULL) {
426n/a wrap_codec_error("encoding", encoding);
427n/a goto onError;
428n/a }
429n/a
430n/a if (!PyTuple_Check(result) ||
431n/a PyTuple_GET_SIZE(result) != 2) {
432n/a PyErr_SetString(PyExc_TypeError,
433n/a "encoder must return a tuple (object, integer)");
434n/a goto onError;
435n/a }
436n/a v = PyTuple_GET_ITEM(result,0);
437n/a Py_INCREF(v);
438n/a /* We don't check or use the second (integer) entry. */
439n/a
440n/a Py_DECREF(args);
441n/a Py_DECREF(encoder);
442n/a Py_DECREF(result);
443n/a return v;
444n/a
445n/a onError:
446n/a Py_XDECREF(result);
447n/a Py_XDECREF(args);
448n/a Py_XDECREF(encoder);
449n/a return NULL;
450n/a}
451n/a
452n/a/* Decode an object (usually a Python string) using the given encoding
453n/a and return an equivalent object (e.g. a Unicode object).
454n/a
455n/a errors is passed to the decoder factory as argument if non-NULL. */
456n/a
457n/astatic PyObject *
458n/a_PyCodec_DecodeInternal(PyObject *object,
459n/a PyObject *decoder,
460n/a const char *encoding,
461n/a const char *errors)
462n/a{
463n/a PyObject *args = NULL, *result = NULL;
464n/a PyObject *v;
465n/a
466n/a args = args_tuple(object, errors);
467n/a if (args == NULL)
468n/a goto onError;
469n/a
470n/a result = PyEval_CallObject(decoder,args);
471n/a if (result == NULL) {
472n/a wrap_codec_error("decoding", encoding);
473n/a goto onError;
474n/a }
475n/a if (!PyTuple_Check(result) ||
476n/a PyTuple_GET_SIZE(result) != 2) {
477n/a PyErr_SetString(PyExc_TypeError,
478n/a "decoder must return a tuple (object,integer)");
479n/a goto onError;
480n/a }
481n/a v = PyTuple_GET_ITEM(result,0);
482n/a Py_INCREF(v);
483n/a /* We don't check or use the second (integer) entry. */
484n/a
485n/a Py_DECREF(args);
486n/a Py_DECREF(decoder);
487n/a Py_DECREF(result);
488n/a return v;
489n/a
490n/a onError:
491n/a Py_XDECREF(args);
492n/a Py_XDECREF(decoder);
493n/a Py_XDECREF(result);
494n/a return NULL;
495n/a}
496n/a
497n/a/* Generic encoding/decoding API */
498n/aPyObject *PyCodec_Encode(PyObject *object,
499n/a const char *encoding,
500n/a const char *errors)
501n/a{
502n/a PyObject *encoder;
503n/a
504n/a encoder = PyCodec_Encoder(encoding);
505n/a if (encoder == NULL)
506n/a return NULL;
507n/a
508n/a return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
509n/a}
510n/a
511n/aPyObject *PyCodec_Decode(PyObject *object,
512n/a const char *encoding,
513n/a const char *errors)
514n/a{
515n/a PyObject *decoder;
516n/a
517n/a decoder = PyCodec_Decoder(encoding);
518n/a if (decoder == NULL)
519n/a return NULL;
520n/a
521n/a return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
522n/a}
523n/a
524n/a/* Text encoding/decoding API */
525n/aPyObject * _PyCodec_LookupTextEncoding(const char *encoding,
526n/a const char *alternate_command)
527n/a{
528n/a _Py_IDENTIFIER(_is_text_encoding);
529n/a PyObject *codec;
530n/a PyObject *attr;
531n/a int is_text_codec;
532n/a
533n/a codec = _PyCodec_Lookup(encoding);
534n/a if (codec == NULL)
535n/a return NULL;
536n/a
537n/a /* Backwards compatibility: assume any raw tuple describes a text
538n/a * encoding, and the same for anything lacking the private
539n/a * attribute.
540n/a */
541n/a if (!PyTuple_CheckExact(codec)) {
542n/a attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);
543n/a if (attr == NULL) {
544n/a if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
545n/a PyErr_Clear();
546n/a } else {
547n/a Py_DECREF(codec);
548n/a return NULL;
549n/a }
550n/a } else {
551n/a is_text_codec = PyObject_IsTrue(attr);
552n/a Py_DECREF(attr);
553n/a if (is_text_codec <= 0) {
554n/a Py_DECREF(codec);
555n/a if (!is_text_codec)
556n/a PyErr_Format(PyExc_LookupError,
557n/a "'%.400s' is not a text encoding; "
558n/a "use %s to handle arbitrary codecs",
559n/a encoding, alternate_command);
560n/a return NULL;
561n/a }
562n/a }
563n/a }
564n/a
565n/a /* This appears to be a valid text encoding */
566n/a return codec;
567n/a}
568n/a
569n/a
570n/astatic
571n/aPyObject *codec_getitem_checked(const char *encoding,
572n/a const char *alternate_command,
573n/a int index)
574n/a{
575n/a PyObject *codec;
576n/a PyObject *v;
577n/a
578n/a codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
579n/a if (codec == NULL)
580n/a return NULL;
581n/a
582n/a v = PyTuple_GET_ITEM(codec, index);
583n/a Py_INCREF(v);
584n/a Py_DECREF(codec);
585n/a return v;
586n/a}
587n/a
588n/astatic PyObject * _PyCodec_TextEncoder(const char *encoding)
589n/a{
590n/a return codec_getitem_checked(encoding, "codecs.encode()", 0);
591n/a}
592n/a
593n/astatic PyObject * _PyCodec_TextDecoder(const char *encoding)
594n/a{
595n/a return codec_getitem_checked(encoding, "codecs.decode()", 1);
596n/a}
597n/a
598n/aPyObject *_PyCodec_EncodeText(PyObject *object,
599n/a const char *encoding,
600n/a const char *errors)
601n/a{
602n/a PyObject *encoder;
603n/a
604n/a encoder = _PyCodec_TextEncoder(encoding);
605n/a if (encoder == NULL)
606n/a return NULL;
607n/a
608n/a return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
609n/a}
610n/a
611n/aPyObject *_PyCodec_DecodeText(PyObject *object,
612n/a const char *encoding,
613n/a const char *errors)
614n/a{
615n/a PyObject *decoder;
616n/a
617n/a decoder = _PyCodec_TextDecoder(encoding);
618n/a if (decoder == NULL)
619n/a return NULL;
620n/a
621n/a return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
622n/a}
623n/a
624n/a/* Register the error handling callback function error under the name
625n/a name. This function will be called by the codec when it encounters
626n/a an unencodable characters/undecodable bytes and doesn't know the
627n/a callback name, when name is specified as the error parameter
628n/a in the call to the encode/decode function.
629n/a Return 0 on success, -1 on error */
630n/aint PyCodec_RegisterError(const char *name, PyObject *error)
631n/a{
632n/a PyInterpreterState *interp = PyThreadState_GET()->interp;
633n/a if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
634n/a return -1;
635n/a if (!PyCallable_Check(error)) {
636n/a PyErr_SetString(PyExc_TypeError, "handler must be callable");
637n/a return -1;
638n/a }
639n/a return PyDict_SetItemString(interp->codec_error_registry,
640n/a name, error);
641n/a}
642n/a
643n/a/* Lookup the error handling callback function registered under the
644n/a name error. As a special case NULL can be passed, in which case
645n/a the error handling callback for strict encoding will be returned. */
646n/aPyObject *PyCodec_LookupError(const char *name)
647n/a{
648n/a PyObject *handler = NULL;
649n/a
650n/a PyInterpreterState *interp = PyThreadState_GET()->interp;
651n/a if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
652n/a return NULL;
653n/a
654n/a if (name==NULL)
655n/a name = "strict";
656n/a handler = PyDict_GetItemString(interp->codec_error_registry, name);
657n/a if (!handler)
658n/a PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
659n/a else
660n/a Py_INCREF(handler);
661n/a return handler;
662n/a}
663n/a
664n/astatic void wrong_exception_type(PyObject *exc)
665n/a{
666n/a PyErr_Format(PyExc_TypeError,
667n/a "don't know how to handle %.200s in error callback",
668n/a exc->ob_type->tp_name);
669n/a}
670n/a
671n/aPyObject *PyCodec_StrictErrors(PyObject *exc)
672n/a{
673n/a if (PyExceptionInstance_Check(exc))
674n/a PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
675n/a else
676n/a PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
677n/a return NULL;
678n/a}
679n/a
680n/a
681n/aPyObject *PyCodec_IgnoreErrors(PyObject *exc)
682n/a{
683n/a Py_ssize_t end;
684n/a
685n/a if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
686n/a if (PyUnicodeEncodeError_GetEnd(exc, &end))
687n/a return NULL;
688n/a }
689n/a else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
690n/a if (PyUnicodeDecodeError_GetEnd(exc, &end))
691n/a return NULL;
692n/a }
693n/a else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
694n/a if (PyUnicodeTranslateError_GetEnd(exc, &end))
695n/a return NULL;
696n/a }
697n/a else {
698n/a wrong_exception_type(exc);
699n/a return NULL;
700n/a }
701n/a return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
702n/a}
703n/a
704n/a
705n/aPyObject *PyCodec_ReplaceErrors(PyObject *exc)
706n/a{
707n/a Py_ssize_t start, end, i, len;
708n/a
709n/a if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
710n/a PyObject *res;
711n/a int kind;
712n/a void *data;
713n/a if (PyUnicodeEncodeError_GetStart(exc, &start))
714n/a return NULL;
715n/a if (PyUnicodeEncodeError_GetEnd(exc, &end))
716n/a return NULL;
717n/a len = end - start;
718n/a res = PyUnicode_New(len, '?');
719n/a if (res == NULL)
720n/a return NULL;
721n/a kind = PyUnicode_KIND(res);
722n/a data = PyUnicode_DATA(res);
723n/a for (i = 0; i < len; ++i)
724n/a PyUnicode_WRITE(kind, data, i, '?');
725n/a assert(_PyUnicode_CheckConsistency(res, 1));
726n/a return Py_BuildValue("(Nn)", res, end);
727n/a }
728n/a else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
729n/a if (PyUnicodeDecodeError_GetEnd(exc, &end))
730n/a return NULL;
731n/a return Py_BuildValue("(Cn)",
732n/a (int)Py_UNICODE_REPLACEMENT_CHARACTER,
733n/a end);
734n/a }
735n/a else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
736n/a PyObject *res;
737n/a int kind;
738n/a void *data;
739n/a if (PyUnicodeTranslateError_GetStart(exc, &start))
740n/a return NULL;
741n/a if (PyUnicodeTranslateError_GetEnd(exc, &end))
742n/a return NULL;
743n/a len = end - start;
744n/a res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
745n/a if (res == NULL)
746n/a return NULL;
747n/a kind = PyUnicode_KIND(res);
748n/a data = PyUnicode_DATA(res);
749n/a for (i=0; i < len; i++)
750n/a PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
751n/a assert(_PyUnicode_CheckConsistency(res, 1));
752n/a return Py_BuildValue("(Nn)", res, end);
753n/a }
754n/a else {
755n/a wrong_exception_type(exc);
756n/a return NULL;
757n/a }
758n/a}
759n/a
760n/aPyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
761n/a{
762n/a if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
763n/a PyObject *restuple;
764n/a PyObject *object;
765n/a Py_ssize_t i;
766n/a Py_ssize_t start;
767n/a Py_ssize_t end;
768n/a PyObject *res;
769n/a unsigned char *outp;
770n/a Py_ssize_t ressize;
771n/a Py_UCS4 ch;
772n/a if (PyUnicodeEncodeError_GetStart(exc, &start))
773n/a return NULL;
774n/a if (PyUnicodeEncodeError_GetEnd(exc, &end))
775n/a return NULL;
776n/a if (!(object = PyUnicodeEncodeError_GetObject(exc)))
777n/a return NULL;
778n/a if (end - start > PY_SSIZE_T_MAX / (2+7+1))
779n/a end = start + PY_SSIZE_T_MAX / (2+7+1);
780n/a for (i = start, ressize = 0; i < end; ++i) {
781n/a /* object is guaranteed to be "ready" */
782n/a ch = PyUnicode_READ_CHAR(object, i);
783n/a if (ch<10)
784n/a ressize += 2+1+1;
785n/a else if (ch<100)
786n/a ressize += 2+2+1;
787n/a else if (ch<1000)
788n/a ressize += 2+3+1;
789n/a else if (ch<10000)
790n/a ressize += 2+4+1;
791n/a else if (ch<100000)
792n/a ressize += 2+5+1;
793n/a else if (ch<1000000)
794n/a ressize += 2+6+1;
795n/a else
796n/a ressize += 2+7+1;
797n/a }
798n/a /* allocate replacement */
799n/a res = PyUnicode_New(ressize, 127);
800n/a if (res == NULL) {
801n/a Py_DECREF(object);
802n/a return NULL;
803n/a }
804n/a outp = PyUnicode_1BYTE_DATA(res);
805n/a /* generate replacement */
806n/a for (i = start; i < end; ++i) {
807n/a int digits;
808n/a int base;
809n/a ch = PyUnicode_READ_CHAR(object, i);
810n/a *outp++ = '&';
811n/a *outp++ = '#';
812n/a if (ch<10) {
813n/a digits = 1;
814n/a base = 1;
815n/a }
816n/a else if (ch<100) {
817n/a digits = 2;
818n/a base = 10;
819n/a }
820n/a else if (ch<1000) {
821n/a digits = 3;
822n/a base = 100;
823n/a }
824n/a else if (ch<10000) {
825n/a digits = 4;
826n/a base = 1000;
827n/a }
828n/a else if (ch<100000) {
829n/a digits = 5;
830n/a base = 10000;
831n/a }
832n/a else if (ch<1000000) {
833n/a digits = 6;
834n/a base = 100000;
835n/a }
836n/a else {
837n/a digits = 7;
838n/a base = 1000000;
839n/a }
840n/a while (digits-->0) {
841n/a *outp++ = '0' + ch/base;
842n/a ch %= base;
843n/a base /= 10;
844n/a }
845n/a *outp++ = ';';
846n/a }
847n/a assert(_PyUnicode_CheckConsistency(res, 1));
848n/a restuple = Py_BuildValue("(Nn)", res, end);
849n/a Py_DECREF(object);
850n/a return restuple;
851n/a }
852n/a else {
853n/a wrong_exception_type(exc);
854n/a return NULL;
855n/a }
856n/a}
857n/a
858n/aPyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
859n/a{
860n/a PyObject *object;
861n/a Py_ssize_t i;
862n/a Py_ssize_t start;
863n/a Py_ssize_t end;
864n/a PyObject *res;
865n/a unsigned char *outp;
866n/a int ressize;
867n/a Py_UCS4 c;
868n/a
869n/a if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
870n/a const unsigned char *p;
871n/a if (PyUnicodeDecodeError_GetStart(exc, &start))
872n/a return NULL;
873n/a if (PyUnicodeDecodeError_GetEnd(exc, &end))
874n/a return NULL;
875n/a if (!(object = PyUnicodeDecodeError_GetObject(exc)))
876n/a return NULL;
877n/a p = (const unsigned char*)PyBytes_AS_STRING(object);
878n/a res = PyUnicode_New(4 * (end - start), 127);
879n/a if (res == NULL) {
880n/a Py_DECREF(object);
881n/a return NULL;
882n/a }
883n/a outp = PyUnicode_1BYTE_DATA(res);
884n/a for (i = start; i < end; i++, outp += 4) {
885n/a unsigned char c = p[i];
886n/a outp[0] = '\\';
887n/a outp[1] = 'x';
888n/a outp[2] = Py_hexdigits[(c>>4)&0xf];
889n/a outp[3] = Py_hexdigits[c&0xf];
890n/a }
891n/a
892n/a assert(_PyUnicode_CheckConsistency(res, 1));
893n/a Py_DECREF(object);
894n/a return Py_BuildValue("(Nn)", res, end);
895n/a }
896n/a if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
897n/a if (PyUnicodeEncodeError_GetStart(exc, &start))
898n/a return NULL;
899n/a if (PyUnicodeEncodeError_GetEnd(exc, &end))
900n/a return NULL;
901n/a if (!(object = PyUnicodeEncodeError_GetObject(exc)))
902n/a return NULL;
903n/a }
904n/a else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
905n/a if (PyUnicodeTranslateError_GetStart(exc, &start))
906n/a return NULL;
907n/a if (PyUnicodeTranslateError_GetEnd(exc, &end))
908n/a return NULL;
909n/a if (!(object = PyUnicodeTranslateError_GetObject(exc)))
910n/a return NULL;
911n/a }
912n/a else {
913n/a wrong_exception_type(exc);
914n/a return NULL;
915n/a }
916n/a
917n/a if (end - start > PY_SSIZE_T_MAX / (1+1+8))
918n/a end = start + PY_SSIZE_T_MAX / (1+1+8);
919n/a for (i = start, ressize = 0; i < end; ++i) {
920n/a /* object is guaranteed to be "ready" */
921n/a c = PyUnicode_READ_CHAR(object, i);
922n/a if (c >= 0x10000) {
923n/a ressize += 1+1+8;
924n/a }
925n/a else if (c >= 0x100) {
926n/a ressize += 1+1+4;
927n/a }
928n/a else
929n/a ressize += 1+1+2;
930n/a }
931n/a res = PyUnicode_New(ressize, 127);
932n/a if (res == NULL) {
933n/a Py_DECREF(object);
934n/a return NULL;
935n/a }
936n/a outp = PyUnicode_1BYTE_DATA(res);
937n/a for (i = start; i < end; ++i) {
938n/a c = PyUnicode_READ_CHAR(object, i);
939n/a *outp++ = '\\';
940n/a if (c >= 0x00010000) {
941n/a *outp++ = 'U';
942n/a *outp++ = Py_hexdigits[(c>>28)&0xf];
943n/a *outp++ = Py_hexdigits[(c>>24)&0xf];
944n/a *outp++ = Py_hexdigits[(c>>20)&0xf];
945n/a *outp++ = Py_hexdigits[(c>>16)&0xf];
946n/a *outp++ = Py_hexdigits[(c>>12)&0xf];
947n/a *outp++ = Py_hexdigits[(c>>8)&0xf];
948n/a }
949n/a else if (c >= 0x100) {
950n/a *outp++ = 'u';
951n/a *outp++ = Py_hexdigits[(c>>12)&0xf];
952n/a *outp++ = Py_hexdigits[(c>>8)&0xf];
953n/a }
954n/a else
955n/a *outp++ = 'x';
956n/a *outp++ = Py_hexdigits[(c>>4)&0xf];
957n/a *outp++ = Py_hexdigits[c&0xf];
958n/a }
959n/a
960n/a assert(_PyUnicode_CheckConsistency(res, 1));
961n/a Py_DECREF(object);
962n/a return Py_BuildValue("(Nn)", res, end);
963n/a}
964n/a
965n/astatic _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
966n/a
967n/aPyObject *PyCodec_NameReplaceErrors(PyObject *exc)
968n/a{
969n/a if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
970n/a PyObject *restuple;
971n/a PyObject *object;
972n/a Py_ssize_t i;
973n/a Py_ssize_t start;
974n/a Py_ssize_t end;
975n/a PyObject *res;
976n/a unsigned char *outp;
977n/a Py_ssize_t ressize;
978n/a int replsize;
979n/a Py_UCS4 c;
980n/a char buffer[256]; /* NAME_MAXLEN */
981n/a if (PyUnicodeEncodeError_GetStart(exc, &start))
982n/a return NULL;
983n/a if (PyUnicodeEncodeError_GetEnd(exc, &end))
984n/a return NULL;
985n/a if (!(object = PyUnicodeEncodeError_GetObject(exc)))
986n/a return NULL;
987n/a if (!ucnhash_CAPI) {
988n/a /* load the unicode data module */
989n/a ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
990n/a PyUnicodeData_CAPSULE_NAME, 1);
991n/a if (!ucnhash_CAPI)
992n/a return NULL;
993n/a }
994n/a for (i = start, ressize = 0; i < end; ++i) {
995n/a /* object is guaranteed to be "ready" */
996n/a c = PyUnicode_READ_CHAR(object, i);
997n/a if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
998n/a replsize = 1+1+1+(int)strlen(buffer)+1;
999n/a }
1000n/a else if (c >= 0x10000) {
1001n/a replsize = 1+1+8;
1002n/a }
1003n/a else if (c >= 0x100) {
1004n/a replsize = 1+1+4;
1005n/a }
1006n/a else
1007n/a replsize = 1+1+2;
1008n/a if (ressize > PY_SSIZE_T_MAX - replsize)
1009n/a break;
1010n/a ressize += replsize;
1011n/a }
1012n/a end = i;
1013n/a res = PyUnicode_New(ressize, 127);
1014n/a if (res==NULL)
1015n/a return NULL;
1016n/a for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1017n/a i < end; ++i) {
1018n/a c = PyUnicode_READ_CHAR(object, i);
1019n/a *outp++ = '\\';
1020n/a if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
1021n/a *outp++ = 'N';
1022n/a *outp++ = '{';
1023n/a strcpy((char *)outp, buffer);
1024n/a outp += strlen(buffer);
1025n/a *outp++ = '}';
1026n/a continue;
1027n/a }
1028n/a if (c >= 0x00010000) {
1029n/a *outp++ = 'U';
1030n/a *outp++ = Py_hexdigits[(c>>28)&0xf];
1031n/a *outp++ = Py_hexdigits[(c>>24)&0xf];
1032n/a *outp++ = Py_hexdigits[(c>>20)&0xf];
1033n/a *outp++ = Py_hexdigits[(c>>16)&0xf];
1034n/a *outp++ = Py_hexdigits[(c>>12)&0xf];
1035n/a *outp++ = Py_hexdigits[(c>>8)&0xf];
1036n/a }
1037n/a else if (c >= 0x100) {
1038n/a *outp++ = 'u';
1039n/a *outp++ = Py_hexdigits[(c>>12)&0xf];
1040n/a *outp++ = Py_hexdigits[(c>>8)&0xf];
1041n/a }
1042n/a else
1043n/a *outp++ = 'x';
1044n/a *outp++ = Py_hexdigits[(c>>4)&0xf];
1045n/a *outp++ = Py_hexdigits[c&0xf];
1046n/a }
1047n/a
1048n/a assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1049n/a assert(_PyUnicode_CheckConsistency(res, 1));
1050n/a restuple = Py_BuildValue("(Nn)", res, end);
1051n/a Py_DECREF(object);
1052n/a return restuple;
1053n/a }
1054n/a else {
1055n/a wrong_exception_type(exc);
1056n/a return NULL;
1057n/a }
1058n/a}
1059n/a
/* Result codes of get_standard_encoding(). */
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Classify what follows "utf16"/"utf32" in an encoding name.

   An empty suffix selects native.  Otherwise one optional '-' or '_'
   is skipped and the remainder must be exactly "be" or "le" (any case)
   to select big or little.  Everything else yields ENC_UNKNOWN. */
static int
utf_byteorder_suffix(const char *suffix, int native, int big, int little)
{
    if (*suffix == '\0')
        return native;
    if (*suffix == '-' || *suffix == '_')
        suffix++;
    /* Test suffix[0] before suffix[1]: after skipping a separator the
       remainder may be empty (e.g. "utf-16-"), and suffix[1] would then
       lie beyond the terminating NUL. */
    if (suffix[0] != '\0' &&
        Py_TOLOWER(suffix[1]) == 'e' && suffix[2] == '\0') {
        if (Py_TOLOWER(suffix[0]) == 'b')
            return big;
        if (Py_TOLOWER(suffix[0]) == 'l')
            return little;
    }
    return ENC_UNKNOWN;
}

/* Map an encoding name to one of the ENC_* codes.

   Recognized names are "utf" (any case), an optional '-' or '_', then
   "8", "16" or "32"; for 16 and 32 an optional byte-order suffix is
   accepted (see utf_byteorder_suffix()), and without a suffix the
   choice depends on WORDS_BIGENDIAN.  The exact string "CP_UTF8" is
   also accepted as UTF-8.

   *bytelength is set to 3 for UTF-8, 2 for the "16" family and 4 for
   the "32" family.  It is also set when a "16"/"32" name has an
   unrecognized suffix and ENC_UNKNOWN is returned; in all other
   ENC_UNKNOWN cases it is left untouched. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
#ifdef WORDS_BIGENDIAN
    const int native16 = ENC_UTF16BE;
    const int native32 = ENC_UTF32BE;
#else
    const int native16 = ENC_UTF16LE;
    const int native32 = ENC_UTF32LE;
#endif

    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            *bytelength = 2;
            return utf_byteorder_suffix(encoding + 2, native16,
                                        ENC_UTF16BE, ENC_UTF16LE);
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            *bytelength = 4;
            return utf_byteorder_suffix(encoding + 2, native32,
                                        ENC_UTF32BE, ENC_UTF32LE);
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1125n/a
1126n/a/* This handler is declared static until someone demonstrates
1127n/a a need to call it directly. */
1128n/astatic PyObject *
1129n/aPyCodec_SurrogatePassErrors(PyObject *exc)
1130n/a{
1131n/a PyObject *restuple;
1132n/a PyObject *object;
1133n/a PyObject *encode;
1134n/a const char *encoding;
1135n/a int code;
1136n/a int bytelength;
1137n/a Py_ssize_t i;
1138n/a Py_ssize_t start;
1139n/a Py_ssize_t end;
1140n/a PyObject *res;
1141n/a
1142n/a if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1143n/a unsigned char *outp;
1144n/a if (PyUnicodeEncodeError_GetStart(exc, &start))
1145n/a return NULL;
1146n/a if (PyUnicodeEncodeError_GetEnd(exc, &end))
1147n/a return NULL;
1148n/a if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1149n/a return NULL;
1150n/a if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1151n/a Py_DECREF(object);
1152n/a return NULL;
1153n/a }
1154n/a if (!(encoding = PyUnicode_AsUTF8(encode))) {
1155n/a Py_DECREF(object);
1156n/a Py_DECREF(encode);
1157n/a return NULL;
1158n/a }
1159n/a code = get_standard_encoding(encoding, &bytelength);
1160n/a Py_DECREF(encode);
1161n/a if (code == ENC_UNKNOWN) {
1162n/a /* Not supported, fail with original exception */
1163n/a PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1164n/a Py_DECREF(object);
1165n/a return NULL;
1166n/a }
1167n/a
1168n/a if (end - start > PY_SSIZE_T_MAX / bytelength)
1169n/a end = start + PY_SSIZE_T_MAX / bytelength;
1170n/a res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1171n/a if (!res) {
1172n/a Py_DECREF(object);
1173n/a return NULL;
1174n/a }
1175n/a outp = (unsigned char*)PyBytes_AsString(res);
1176n/a for (i = start; i < end; i++) {
1177n/a /* object is guaranteed to be "ready" */
1178n/a Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1179n/a if (!Py_UNICODE_IS_SURROGATE(ch)) {
1180n/a /* Not a surrogate, fail with original exception */
1181n/a PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1182n/a Py_DECREF(res);
1183n/a Py_DECREF(object);
1184n/a return NULL;
1185n/a }
1186n/a switch (code) {
1187n/a case ENC_UTF8:
1188n/a *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1189n/a *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1190n/a *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1191n/a break;
1192n/a case ENC_UTF16LE:
1193n/a *outp++ = (unsigned char) ch;
1194n/a *outp++ = (unsigned char)(ch >> 8);
1195n/a break;
1196n/a case ENC_UTF16BE:
1197n/a *outp++ = (unsigned char)(ch >> 8);
1198n/a *outp++ = (unsigned char) ch;
1199n/a break;
1200n/a case ENC_UTF32LE:
1201n/a *outp++ = (unsigned char) ch;
1202n/a *outp++ = (unsigned char)(ch >> 8);
1203n/a *outp++ = (unsigned char)(ch >> 16);
1204n/a *outp++ = (unsigned char)(ch >> 24);
1205n/a break;
1206n/a case ENC_UTF32BE:
1207n/a *outp++ = (unsigned char)(ch >> 24);
1208n/a *outp++ = (unsigned char)(ch >> 16);
1209n/a *outp++ = (unsigned char)(ch >> 8);
1210n/a *outp++ = (unsigned char) ch;
1211n/a break;
1212n/a }
1213n/a }
1214n/a restuple = Py_BuildValue("(On)", res, end);
1215n/a Py_DECREF(res);
1216n/a Py_DECREF(object);
1217n/a return restuple;
1218n/a }
1219n/a else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1220n/a const unsigned char *p;
1221n/a Py_UCS4 ch = 0;
1222n/a if (PyUnicodeDecodeError_GetStart(exc, &start))
1223n/a return NULL;
1224n/a if (PyUnicodeDecodeError_GetEnd(exc, &end))
1225n/a return NULL;
1226n/a if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1227n/a return NULL;
1228n/a p = (const unsigned char*)PyBytes_AS_STRING(object);
1229n/a if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1230n/a Py_DECREF(object);
1231n/a return NULL;
1232n/a }
1233n/a if (!(encoding = PyUnicode_AsUTF8(encode))) {
1234n/a Py_DECREF(object);
1235n/a Py_DECREF(encode);
1236n/a return NULL;
1237n/a }
1238n/a code = get_standard_encoding(encoding, &bytelength);
1239n/a Py_DECREF(encode);
1240n/a if (code == ENC_UNKNOWN) {
1241n/a /* Not supported, fail with original exception */
1242n/a PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1243n/a Py_DECREF(object);
1244n/a return NULL;
1245n/a }
1246n/a
1247n/a /* Try decoding a single surrogate character. If
1248n/a there are more, let the codec call us again. */
1249n/a p += start;
1250n/a if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1251n/a switch (code) {
1252n/a case ENC_UTF8:
1253n/a if ((p[0] & 0xf0) == 0xe0 &&
1254n/a (p[1] & 0xc0) == 0x80 &&
1255n/a (p[2] & 0xc0) == 0x80) {
1256n/a /* it's a three-byte code */
1257n/a ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1258n/a }
1259n/a break;
1260n/a case ENC_UTF16LE:
1261n/a ch = p[1] << 8 | p[0];
1262n/a break;
1263n/a case ENC_UTF16BE:
1264n/a ch = p[0] << 8 | p[1];
1265n/a break;
1266n/a case ENC_UTF32LE:
1267n/a ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1268n/a break;
1269n/a case ENC_UTF32BE:
1270n/a ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1271n/a break;
1272n/a }
1273n/a }
1274n/a
1275n/a Py_DECREF(object);
1276n/a if (!Py_UNICODE_IS_SURROGATE(ch)) {
1277n/a /* it's not a surrogate - fail */
1278n/a PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1279n/a return NULL;
1280n/a }
1281n/a res = PyUnicode_FromOrdinal(ch);
1282n/a if (res == NULL)
1283n/a return NULL;
1284n/a return Py_BuildValue("(Nn)", res, start + bytelength);
1285n/a }
1286n/a else {
1287n/a wrong_exception_type(exc);
1288n/a return NULL;
1289n/a }
1290n/a}
1291n/a
1292n/astatic PyObject *
1293n/aPyCodec_SurrogateEscapeErrors(PyObject *exc)
1294n/a{
1295n/a PyObject *restuple;
1296n/a PyObject *object;
1297n/a Py_ssize_t i;
1298n/a Py_ssize_t start;
1299n/a Py_ssize_t end;
1300n/a PyObject *res;
1301n/a
1302n/a if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1303n/a char *outp;
1304n/a if (PyUnicodeEncodeError_GetStart(exc, &start))
1305n/a return NULL;
1306n/a if (PyUnicodeEncodeError_GetEnd(exc, &end))
1307n/a return NULL;
1308n/a if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1309n/a return NULL;
1310n/a res = PyBytes_FromStringAndSize(NULL, end-start);
1311n/a if (!res) {
1312n/a Py_DECREF(object);
1313n/a return NULL;
1314n/a }
1315n/a outp = PyBytes_AsString(res);
1316n/a for (i = start; i < end; i++) {
1317n/a /* object is guaranteed to be "ready" */
1318n/a Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1319n/a if (ch < 0xdc80 || ch > 0xdcff) {
1320n/a /* Not a UTF-8b surrogate, fail with original exception */
1321n/a PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1322n/a Py_DECREF(res);
1323n/a Py_DECREF(object);
1324n/a return NULL;
1325n/a }
1326n/a *outp++ = ch - 0xdc00;
1327n/a }
1328n/a restuple = Py_BuildValue("(On)", res, end);
1329n/a Py_DECREF(res);
1330n/a Py_DECREF(object);
1331n/a return restuple;
1332n/a }
1333n/a else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1334n/a PyObject *str;
1335n/a const unsigned char *p;
1336n/a Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1337n/a int consumed = 0;
1338n/a if (PyUnicodeDecodeError_GetStart(exc, &start))
1339n/a return NULL;
1340n/a if (PyUnicodeDecodeError_GetEnd(exc, &end))
1341n/a return NULL;
1342n/a if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1343n/a return NULL;
1344n/a p = (const unsigned char*)PyBytes_AS_STRING(object);
1345n/a while (consumed < 4 && consumed < end-start) {
1346n/a /* Refuse to escape ASCII bytes. */
1347n/a if (p[start+consumed] < 128)
1348n/a break;
1349n/a ch[consumed] = 0xdc00 + p[start+consumed];
1350n/a consumed++;
1351n/a }
1352n/a Py_DECREF(object);
1353n/a if (!consumed) {
1354n/a /* codec complained about ASCII byte. */
1355n/a PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1356n/a return NULL;
1357n/a }
1358n/a str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1359n/a if (str == NULL)
1360n/a return NULL;
1361n/a return Py_BuildValue("(Nn)", str, start+consumed);
1362n/a }
1363n/a else {
1364n/a wrong_exception_type(exc);
1365n/a return NULL;
1366n/a }
1367n/a}
1368n/a
1369n/a
1370n/astatic PyObject *strict_errors(PyObject *self, PyObject *exc)
1371n/a{
1372n/a return PyCodec_StrictErrors(exc);
1373n/a}
1374n/a
1375n/a
1376n/astatic PyObject *ignore_errors(PyObject *self, PyObject *exc)
1377n/a{
1378n/a return PyCodec_IgnoreErrors(exc);
1379n/a}
1380n/a
1381n/a
1382n/astatic PyObject *replace_errors(PyObject *self, PyObject *exc)
1383n/a{
1384n/a return PyCodec_ReplaceErrors(exc);
1385n/a}
1386n/a
1387n/a
1388n/astatic PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1389n/a{
1390n/a return PyCodec_XMLCharRefReplaceErrors(exc);
1391n/a}
1392n/a
1393n/a
1394n/astatic PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1395n/a{
1396n/a return PyCodec_BackslashReplaceErrors(exc);
1397n/a}
1398n/a
1399n/astatic PyObject *namereplace_errors(PyObject *self, PyObject *exc)
1400n/a{
1401n/a return PyCodec_NameReplaceErrors(exc);
1402n/a}
1403n/a
1404n/astatic PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
1405n/a{
1406n/a return PyCodec_SurrogatePassErrors(exc);
1407n/a}
1408n/a
1409n/astatic PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
1410n/a{
1411n/a return PyCodec_SurrogateEscapeErrors(exc);
1412n/a}
1413n/a
1414n/astatic int _PyCodecRegistry_Init(void)
1415n/a{
1416n/a static struct {
1417n/a char *name;
1418n/a PyMethodDef def;
1419n/a } methods[] =
1420n/a {
1421n/a {
1422n/a "strict",
1423n/a {
1424n/a "strict_errors",
1425n/a strict_errors,
1426n/a METH_O,
1427n/a PyDoc_STR("Implements the 'strict' error handling, which "
1428n/a "raises a UnicodeError on coding errors.")
1429n/a }
1430n/a },
1431n/a {
1432n/a "ignore",
1433n/a {
1434n/a "ignore_errors",
1435n/a ignore_errors,
1436n/a METH_O,
1437n/a PyDoc_STR("Implements the 'ignore' error handling, which "
1438n/a "ignores malformed data and continues.")
1439n/a }
1440n/a },
1441n/a {
1442n/a "replace",
1443n/a {
1444n/a "replace_errors",
1445n/a replace_errors,
1446n/a METH_O,
1447n/a PyDoc_STR("Implements the 'replace' error handling, which "
1448n/a "replaces malformed data with a replacement marker.")
1449n/a }
1450n/a },
1451n/a {
1452n/a "xmlcharrefreplace",
1453n/a {
1454n/a "xmlcharrefreplace_errors",
1455n/a xmlcharrefreplace_errors,
1456n/a METH_O,
1457n/a PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1458n/a "which replaces an unencodable character with the "
1459n/a "appropriate XML character reference.")
1460n/a }
1461n/a },
1462n/a {
1463n/a "backslashreplace",
1464n/a {
1465n/a "backslashreplace_errors",
1466n/a backslashreplace_errors,
1467n/a METH_O,
1468n/a PyDoc_STR("Implements the 'backslashreplace' error handling, "
1469n/a "which replaces malformed data with a backslashed "
1470n/a "escape sequence.")
1471n/a }
1472n/a },
1473n/a {
1474n/a "namereplace",
1475n/a {
1476n/a "namereplace_errors",
1477n/a namereplace_errors,
1478n/a METH_O,
1479n/a PyDoc_STR("Implements the 'namereplace' error handling, "
1480n/a "which replaces an unencodable character with a "
1481n/a "\\N{...} escape sequence.")
1482n/a }
1483n/a },
1484n/a {
1485n/a "surrogatepass",
1486n/a {
1487n/a "surrogatepass",
1488n/a surrogatepass_errors,
1489n/a METH_O
1490n/a }
1491n/a },
1492n/a {
1493n/a "surrogateescape",
1494n/a {
1495n/a "surrogateescape",
1496n/a surrogateescape_errors,
1497n/a METH_O
1498n/a }
1499n/a }
1500n/a };
1501n/a
1502n/a PyInterpreterState *interp = PyThreadState_GET()->interp;
1503n/a PyObject *mod;
1504n/a unsigned i;
1505n/a
1506n/a if (interp->codec_search_path != NULL)
1507n/a return 0;
1508n/a
1509n/a interp->codec_search_path = PyList_New(0);
1510n/a interp->codec_search_cache = PyDict_New();
1511n/a interp->codec_error_registry = PyDict_New();
1512n/a
1513n/a if (interp->codec_error_registry) {
1514n/a for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1515n/a PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1516n/a int res;
1517n/a if (!func)
1518n/a Py_FatalError("can't initialize codec error registry");
1519n/a res = PyCodec_RegisterError(methods[i].name, func);
1520n/a Py_DECREF(func);
1521n/a if (res)
1522n/a Py_FatalError("can't initialize codec error registry");
1523n/a }
1524n/a }
1525n/a
1526n/a if (interp->codec_search_path == NULL ||
1527n/a interp->codec_search_cache == NULL ||
1528n/a interp->codec_error_registry == NULL)
1529n/a Py_FatalError("can't initialize codec registry");
1530n/a
1531n/a mod = PyImport_ImportModuleNoBlock("encodings");
1532n/a if (mod == NULL) {
1533n/a return -1;
1534n/a }
1535n/a Py_DECREF(mod);
1536n/a interp->codecs_initialized = 1;
1537n/a return 0;
1538n/a}