» Core Development > Code coverage > Objects/unicodeobject.c

Python code coverage for Objects/unicodeobject.c

# count content
1n/a/*
2n/a
3n/aUnicode implementation based on original code by Fredrik Lundh,
4n/amodified by Marc-Andre Lemburg <mal@lemburg.com>.
5n/a
6n/aMajor speed upgrades to the method implementations at the Reykjavik
7n/aNeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8n/a
9n/aCopyright (c) Corporation for National Research Initiatives.
10n/a
11n/a--------------------------------------------------------------------
12n/aThe original string type implementation is:
13n/a
14n/a Copyright (c) 1999 by Secret Labs AB
15n/a Copyright (c) 1999 by Fredrik Lundh
16n/a
17n/aBy obtaining, using, and/or copying this software and/or its
18n/aassociated documentation, you agree that you have read, understood,
19n/aand will comply with the following terms and conditions:
20n/a
21n/aPermission to use, copy, modify, and distribute this software and its
22n/aassociated documentation for any purpose and without fee is hereby
23n/agranted, provided that the above copyright notice appears in all
24n/acopies, and that both that copyright notice and this permission notice
25n/aappear in supporting documentation, and that the name of Secret Labs
26n/aAB or the author not be used in advertising or publicity pertaining to
27n/adistribution of the software without specific, written prior
28n/apermission.
29n/a
30n/aSECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31n/aTHIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32n/aFITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33n/aANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34n/aWHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35n/aACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36n/aOF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37n/a--------------------------------------------------------------------
38n/a
39n/a*/
40n/a
41n/a#define PY_SSIZE_T_CLEAN
42n/a#include "Python.h"
43n/a#include "ucnhash.h"
44n/a#include "bytes_methods.h"
45n/a#include "stringlib/eq.h"
46n/a
47n/a#ifdef MS_WINDOWS
48n/a#include <windows.h>
49n/a#endif
50n/a
51n/a/*[clinic input]
52n/aclass str "PyObject *" "&PyUnicode_Type"
53n/a[clinic start generated code]*/
54n/a/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
55n/a
56n/a/*[python input]
57n/aclass Py_UCS4_converter(CConverter):
58n/a type = 'Py_UCS4'
59n/a converter = 'convert_uc'
60n/a
61n/a def converter_init(self):
62n/a if self.default is not unspecified:
63n/a self.c_default = ascii(self.default)
64n/a if len(self.c_default) > 4 or self.c_default[0] != "'":
65n/a self.c_default = hex(ord(self.default))
66n/a
67n/a[python start generated code]*/
68n/a/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
69n/a
70n/a/* --- Globals ------------------------------------------------------------
71n/a
72n/aNOTE: In the interpreter's initialization phase, some globals are currently
73n/a initialized dynamically as needed. In the process Unicode objects may
74n/a be created before the Unicode type is ready.
75n/a
76n/a*/
77n/a
78n/a
79n/a#ifdef __cplusplus
80n/aextern "C" {
81n/a#endif
82n/a
/* Largest valid Unicode code point: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Object check used by the assert()s in this file: debug builds run the
   structural consistency checker (without content inspection), other
   builds only test the type. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
91n/a
/* Field accessors for string objects.

   Macros that expand to a bare member access perform no checking and can
   be used as assignment targets; the others assert on the object before
   reading it. */

/* --- unchecked member accesses ---------------------------------------- */
#define _PyUnicode_UTF8(u) \
    (((PyCompactUnicodeObject*)(u))->utf8)
#define _PyUnicode_UTF8_LENGTH(u) \
    (((PyCompactUnicodeObject*)(u))->utf8_length)
#define _PyUnicode_WSTR(u) \
    (((PyASCIIObject*)(u))->wstr)
#define _PyUnicode_WSTR_LENGTH(u) \
    (((PyCompactUnicodeObject*)(u))->wstr_length)
#define _PyUnicode_LENGTH(u) \
    (((PyASCIIObject *)(u))->length)
#define _PyUnicode_STATE(u) \
    (((PyASCIIObject *)(u))->state)
#define _PyUnicode_HASH(u) \
    (((PyASCIIObject *)(u))->hash)
#define _PyUnicode_DATA_ANY(u) \
    (((PyUnicodeObject*)(u))->data.any)

/* --- checked reads ----------------------------------------------------- */

/* UTF-8 pointer of a ready string: for a compact ASCII object this is the
   memory directly following the PyASCIIObject header, otherwise the utf8
   member. */
#define PyUnicode_UTF8(u) \
    (assert(_PyUnicode_CHECK(u)), \
     assert(PyUnicode_IS_READY(u)), \
     PyUnicode_IS_COMPACT_ASCII(u) ? \
         ((char*)((PyASCIIObject*)(u) + 1)) : \
         _PyUnicode_UTF8(u))

/* UTF-8 length of a ready string: the character length for a compact
   ASCII object, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(u) \
    (assert(_PyUnicode_CHECK(u)), \
     assert(PyUnicode_IS_READY(u)), \
     PyUnicode_IS_COMPACT_ASCII(u) ? \
         ((PyASCIIObject*)(u))->length : \
         _PyUnicode_UTF8_LENGTH(u))

/* state.kind, after asserting the object check. */
#define _PyUnicode_KIND(u) \
    (assert(_PyUnicode_CHECK(u)), \
     ((PyASCIIObject *)(u))->state.kind)

/* length, after asserting the object check. */
#define _PyUnicode_GET_LENGTH(u) \
    (assert(_PyUnicode_CHECK(u)), \
     ((PyASCIIObject *)(u))->length)
126n/a
/* File-local replacement for the public PyUnicode_READY: evaluates to 0 when
   the string is already ready, otherwise to the result of
   _PyUnicode_Ready().  The object check is asserted first. */
#undef PyUnicode_READY
#define PyUnicode_READY(u) \
    (assert(_PyUnicode_CHECK(u)), \
     (!PyUnicode_IS_READY(u) ? _PyUnicode_Ready(u) : 0))
133n/a
/* Non-zero when the utf8 member points at the string's own data buffer.
   Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(u) \
    (assert(_PyUnicode_CHECK(u)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(u)), \
     (PyUnicode_DATA(u) == _PyUnicode_UTF8(u)))
/* Non-zero when the wstr member points at the string's own data buffer.

   The previous definition read `_PyUnicode_WSTR(unicode)`, i.e. it ignored
   its own parameter and silently captured whatever variable named
   `unicode` was in scope at the expansion site.  It now uses `op`
   consistently, so it works for any argument expression. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
141n/a
/* Non-zero if the object has a UTF-8 buffer of its own: it is not compact
   ASCII, utf8 is set, and utf8 is not the data buffer itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(u) \
    ((!PyUnicode_IS_COMPACT_ASCII(u) \
      && _PyUnicode_UTF8(u) != NULL \
      && PyUnicode_DATA(u) != _PyUnicode_UTF8(u)))

/* Non-zero if the object has a wstr buffer of its own: wstr is set and
   either the string is not ready yet or wstr is not the data buffer.
   The readiness test comes first so PyUnicode_DATA() is only evaluated on
   ready strings. */
#define _PyUnicode_HAS_WSTR_MEMORY(u) \
    ((_PyUnicode_WSTR(u) != NULL && \
      (!PyUnicode_IS_READY(u) || \
       PyUnicode_DATA(u) != _PyUnicode_WSTR(u))))
155n/a
/* Element-wise widening/narrowing copy between character arrays.

   from_type / to_type : element type names of source and destination.
   begin, end          : source range, pointers to from_type.
   to                  : destination, pointer to to_type; must have room
                         for (end - begin) elements.

   Each element is converted with a plain cast.  The bulk of the range is
   copied four elements per iteration, the remainder one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        to_type *_to = (to_type *)(to);                              \
        const from_type *_iter = (from_type *)(begin);               \
        const from_type *_end = (from_type *)(end);                  \
        Py_ssize_t n = (_end) - (_iter);                             \
        const from_type *_unrolled_end =                             \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);                       \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) {      \
            _to[0] = (to_type) _iter[0];                             \
            _to[1] = (to_type) _iter[1];                             \
            _to[2] = (to_type) _iter[2];                             \
            _to[3] = (to_type) _iter[3];                             \
        }                                                            \
        for (; _iter < (_end); ++_iter, ++_to)                       \
            *_to = (to_type) *_iter;                                 \
    } while (0)
179n/a
/* Platform-tuned overallocation factor.  Going by the figures below it is
   meant as a divisor: 2 gives 50% extra, 4 gives 25% extra. */
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
187n/a
188n/a/* This dictionary holds all interned unicode strings. Note that references
189n/a to strings in this dictionary are *not* counted in the string's ob_refcnt.
190n/a When the interned string reaches a refcnt of 0 the string deallocation
191n/a function will delete the reference from this dictionary.
192n/a
193n/a Another way to look at this is that to say that the actual reference
194n/a count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
195n/a*/
196n/astatic PyObject *interned = NULL;
197n/a
198n/a/* The empty Unicode object is shared to improve performance. */
199n/astatic PyObject *unicode_empty = NULL;
200n/a
/* Take a new reference to the shared empty string, creating it first if it
   does not exist yet.  If creation fails, unicode_empty stays NULL and no
   reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                  \
    do {                                                            \
        if (unicode_empty == NULL) {                                \
            unicode_empty = PyUnicode_New(0, 0);                    \
            if (unicode_empty != NULL) {                            \
                Py_INCREF(unicode_empty);                           \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                                       \
        }                                                           \
        else {                                                      \
            Py_INCREF(unicode_empty);                               \
        }                                                           \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function (returns NULL if the string could not be created). */
#define _Py_RETURN_UNICODE_EMPTY()                                  \
    do {                                                            \
        _Py_INCREF_UNICODE_EMPTY();                                 \
        return unicode_empty;                                       \
    } while (0)
219n/a
220n/a/* Forward declaration */
221n/astatic inline int
222n/a_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
223n/a
224n/a/* List of static strings. */
225n/astatic _Py_Identifier *static_strings = NULL;
226n/a
227n/a/* Single character Unicode strings in the Latin-1 range are being
228n/a shared as well. */
229n/astatic PyObject *unicode_latin1[256] = {NULL};
230n/a
/* Lookup table for the most frequent whitespace characters: entry c is 1
   when the ASCII code point c is whitespace, 0 otherwise.  The table has
   exactly 128 entries; everything not listed is 0. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1      /* SPACE */
};
261n/a
262n/a/* forward */
263n/astatic PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
264n/astatic PyObject* get_latin1_char(unsigned char ch);
265n/astatic int unicode_modifiable(PyObject *unicode);
266n/a
267n/a
268n/astatic PyObject *
269n/a_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
270n/astatic PyObject *
271n/a_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
272n/astatic PyObject *
273n/a_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
274n/a
275n/astatic PyObject *
276n/aunicode_encode_call_errorhandler(const char *errors,
277n/a PyObject **errorHandler,const char *encoding, const char *reason,
278n/a PyObject *unicode, PyObject **exceptionObject,
279n/a Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
280n/a
281n/astatic void
282n/araise_encode_exception(PyObject **exceptionObject,
283n/a const char *encoding,
284n/a PyObject *unicode,
285n/a Py_ssize_t startpos, Py_ssize_t endpos,
286n/a const char *reason);
287n/a
/* Lookup table for line breaks in the ASCII range: entry c is 1 when the
   code point c is a line break, 0 otherwise.  The table has exactly 128
   entries; everything not listed is 0. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1      /* RECORD SEPARATOR */
};
315n/a
316n/astatic int convert_uc(PyObject *obj, void *addr);
317n/a
318n/a#include "clinic/unicodeobject.c.h"
319n/a
/* Codec error handlers recognised by name. */
typedef enum {
    _Py_ERROR_UNKNOWN = 0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error-handler name to its enum value.

   errors: handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL or "strict", the matching constant for
   the other built-in names, and _Py_ERROR_OTHER for any other string.
   The comparison is exact (strcmp). */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        { "strict",            _Py_ERROR_STRICT },
        { "surrogateescape",   _Py_ERROR_SURROGATEESCAPE },
        { "replace",           _Py_ERROR_REPLACE },
        { "ignore",            _Py_ERROR_IGNORE },
        { "backslashreplace",  _Py_ERROR_BACKSLASHREPLACE },
        { "surrogatepass",     _Py_ERROR_SURROGATEPASS },
        { "xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE },
    };
    size_t idx;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
358n/a
359n/a/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
360n/a This function is kept for backward compatibility with the old API. */
361n/aPy_UNICODE
362n/aPyUnicode_GetMax(void)
363n/a{
364n/a#ifdef Py_UNICODE_WIDE
365n/a return 0x10FFFF;
366n/a#else
367n/a /* This is actually an illegal character, so it should
368n/a not be passed to unichr. */
369n/a return 0xFFFF;
370n/a#endif
371n/a}
372n/a
#ifdef Py_DEBUG
/* Debug-only structural self-check of a string object.

   op            : object to verify; must be a unicode object.
   check_content : when non-zero (and the string is not in the wchar-only
                   state) also scan the characters to verify that the
                   narrowest possible kind is used and that the buffer is
                   NUL-terminated.

   Every violated invariant trips an assert(); the function itself always
   returns 1 so that it can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *base;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    if (base->state.ascii == 1 && base->state.compact == 1) {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(base->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *cobj = (PyCompactUnicodeObject *)op;
        void *buf;
        /* The kind whose element size equals sizeof(wchar_t). */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (base->state.compact == 1) {
            /* compact, non-ASCII: characters follow the header */
            buf = cobj + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 1);
            assert(cobj->utf8 != buf);
        }
        else {
            PyUnicodeObject *legacy = (PyUnicodeObject *)op;

            buf = legacy->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready: only the wstr buffer exists */
                assert(base->length == 0);
                assert(base->hash == -1);
                assert(base->state.compact == 0);
                assert(base->state.ascii == 0);
                assert(base->state.ready == 0);
                assert(base->state.interned == SSTATE_NOT_INTERNED);
                assert(base->wstr != NULL);
                assert(buf == NULL);
                assert(cobj->utf8 == NULL);
            }
            else {
                /* ready, separately allocated data */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(base->state.compact == 0);
                assert(base->state.ready == 1);
                assert(buf != NULL);
                if (base->state.ascii) {
                    assert(cobj->utf8 == buf);
                    assert(cobj->utf8_length == base->length);
                }
                else {
                    assert(cobj->utf8 != buf);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                assert(base->wstr == buf);
                assert(cobj->wstr_length == base->length);
            }
            else {
                assert(base->wstr != buf);
            }
        }

        if (cobj->utf8 == NULL) {
            assert(cobj->utf8_length == 0);
        }
        if (base->wstr == NULL) {
            assert(cobj->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_UCS4 highest = 0;
        void *chars = PyUnicode_DATA(base);
        Py_ssize_t pos;

        for (pos = 0; pos < base->length; pos++) {
            Py_UCS4 cp = PyUnicode_READ(kind, chars, pos);
            if (cp > highest) {
                highest = cp;
            }
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (base->state.ascii == 0) {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            else {
                assert(highest < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
        }
        else {
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
        }
        /* terminating NUL */
        assert(PyUnicode_READ(kind, chars, base->length) == 0);
    }
    return 1;
}
#endif
488n/a
489n/astatic PyObject*
490n/aunicode_result_wchar(PyObject *unicode)
491n/a{
492n/a#ifndef Py_DEBUG
493n/a Py_ssize_t len;
494n/a
495n/a len = _PyUnicode_WSTR_LENGTH(unicode);
496n/a if (len == 0) {
497n/a Py_DECREF(unicode);
498n/a _Py_RETURN_UNICODE_EMPTY();
499n/a }
500n/a
501n/a if (len == 1) {
502n/a wchar_t ch = _PyUnicode_WSTR(unicode)[0];
503n/a if ((Py_UCS4)ch < 256) {
504n/a PyObject *latin1_char = get_latin1_char((unsigned char)ch);
505n/a Py_DECREF(unicode);
506n/a return latin1_char;
507n/a }
508n/a }
509n/a
510n/a if (_PyUnicode_Ready(unicode) < 0) {
511n/a Py_DECREF(unicode);
512n/a return NULL;
513n/a }
514n/a#else
515n/a assert(Py_REFCNT(unicode) == 1);
516n/a
517n/a /* don't make the result ready in debug mode to ensure that the caller
518n/a makes the string ready before using it */
519n/a assert(_PyUnicode_CheckConsistency(unicode, 1));
520n/a#endif
521n/a return unicode;
522n/a}
523n/a
524n/astatic PyObject*
525n/aunicode_result_ready(PyObject *unicode)
526n/a{
527n/a Py_ssize_t length;
528n/a
529n/a length = PyUnicode_GET_LENGTH(unicode);
530n/a if (length == 0) {
531n/a if (unicode != unicode_empty) {
532n/a Py_DECREF(unicode);
533n/a _Py_RETURN_UNICODE_EMPTY();
534n/a }
535n/a return unicode_empty;
536n/a }
537n/a
538n/a if (length == 1) {
539n/a void *data = PyUnicode_DATA(unicode);
540n/a int kind = PyUnicode_KIND(unicode);
541n/a Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
542n/a if (ch < 256) {
543n/a PyObject *latin1_char = unicode_latin1[ch];
544n/a if (latin1_char != NULL) {
545n/a if (unicode != latin1_char) {
546n/a Py_INCREF(latin1_char);
547n/a Py_DECREF(unicode);
548n/a }
549n/a return latin1_char;
550n/a }
551n/a else {
552n/a assert(_PyUnicode_CheckConsistency(unicode, 1));
553n/a Py_INCREF(unicode);
554n/a unicode_latin1[ch] = unicode;
555n/a return unicode;
556n/a }
557n/a }
558n/a }
559n/a
560n/a assert(_PyUnicode_CheckConsistency(unicode, 1));
561n/a return unicode;
562n/a}
563n/a
564n/astatic PyObject*
565n/aunicode_result(PyObject *unicode)
566n/a{
567n/a assert(_PyUnicode_CHECK(unicode));
568n/a if (PyUnicode_IS_READY(unicode))
569n/a return unicode_result_ready(unicode);
570n/a else
571n/a return unicode_result_wchar(unicode);
572n/a}
573n/a
574n/astatic PyObject*
575n/aunicode_result_unchanged(PyObject *unicode)
576n/a{
577n/a if (PyUnicode_CheckExact(unicode)) {
578n/a if (PyUnicode_READY(unicode) == -1)
579n/a return NULL;
580n/a Py_INCREF(unicode);
581n/a return unicode;
582n/a }
583n/a else
584n/a /* Subtype -- return genuine unicode string with the same value. */
585n/a return _PyUnicode_Copy(unicode);
586n/a}
587n/a
588n/a/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
589n/a ASCII, Latin1, UTF-8, etc. */
590n/astatic char*
591n/abackslashreplace(_PyBytesWriter *writer, char *str,
592n/a PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
593n/a{
594n/a Py_ssize_t size, i;
595n/a Py_UCS4 ch;
596n/a enum PyUnicode_Kind kind;
597n/a void *data;
598n/a
599n/a assert(PyUnicode_IS_READY(unicode));
600n/a kind = PyUnicode_KIND(unicode);
601n/a data = PyUnicode_DATA(unicode);
602n/a
603n/a size = 0;
604n/a /* determine replacement size */
605n/a for (i = collstart; i < collend; ++i) {
606n/a Py_ssize_t incr;
607n/a
608n/a ch = PyUnicode_READ(kind, data, i);
609n/a if (ch < 0x100)
610n/a incr = 2+2;
611n/a else if (ch < 0x10000)
612n/a incr = 2+4;
613n/a else {
614n/a assert(ch <= MAX_UNICODE);
615n/a incr = 2+8;
616n/a }
617n/a if (size > PY_SSIZE_T_MAX - incr) {
618n/a PyErr_SetString(PyExc_OverflowError,
619n/a "encoded result is too long for a Python string");
620n/a return NULL;
621n/a }
622n/a size += incr;
623n/a }
624n/a
625n/a str = _PyBytesWriter_Prepare(writer, str, size);
626n/a if (str == NULL)
627n/a return NULL;
628n/a
629n/a /* generate replacement */
630n/a for (i = collstart; i < collend; ++i) {
631n/a ch = PyUnicode_READ(kind, data, i);
632n/a *str++ = '\\';
633n/a if (ch >= 0x00010000) {
634n/a *str++ = 'U';
635n/a *str++ = Py_hexdigits[(ch>>28)&0xf];
636n/a *str++ = Py_hexdigits[(ch>>24)&0xf];
637n/a *str++ = Py_hexdigits[(ch>>20)&0xf];
638n/a *str++ = Py_hexdigits[(ch>>16)&0xf];
639n/a *str++ = Py_hexdigits[(ch>>12)&0xf];
640n/a *str++ = Py_hexdigits[(ch>>8)&0xf];
641n/a }
642n/a else if (ch >= 0x100) {
643n/a *str++ = 'u';
644n/a *str++ = Py_hexdigits[(ch>>12)&0xf];
645n/a *str++ = Py_hexdigits[(ch>>8)&0xf];
646n/a }
647n/a else
648n/a *str++ = 'x';
649n/a *str++ = Py_hexdigits[(ch>>4)&0xf];
650n/a *str++ = Py_hexdigits[ch&0xf];
651n/a }
652n/a return str;
653n/a}
654n/a
655n/a/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
656n/a ASCII, Latin1, UTF-8, etc. */
657n/astatic char*
658n/axmlcharrefreplace(_PyBytesWriter *writer, char *str,
659n/a PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
660n/a{
661n/a Py_ssize_t size, i;
662n/a Py_UCS4 ch;
663n/a enum PyUnicode_Kind kind;
664n/a void *data;
665n/a
666n/a assert(PyUnicode_IS_READY(unicode));
667n/a kind = PyUnicode_KIND(unicode);
668n/a data = PyUnicode_DATA(unicode);
669n/a
670n/a size = 0;
671n/a /* determine replacement size */
672n/a for (i = collstart; i < collend; ++i) {
673n/a Py_ssize_t incr;
674n/a
675n/a ch = PyUnicode_READ(kind, data, i);
676n/a if (ch < 10)
677n/a incr = 2+1+1;
678n/a else if (ch < 100)
679n/a incr = 2+2+1;
680n/a else if (ch < 1000)
681n/a incr = 2+3+1;
682n/a else if (ch < 10000)
683n/a incr = 2+4+1;
684n/a else if (ch < 100000)
685n/a incr = 2+5+1;
686n/a else if (ch < 1000000)
687n/a incr = 2+6+1;
688n/a else {
689n/a assert(ch <= MAX_UNICODE);
690n/a incr = 2+7+1;
691n/a }
692n/a if (size > PY_SSIZE_T_MAX - incr) {
693n/a PyErr_SetString(PyExc_OverflowError,
694n/a "encoded result is too long for a Python string");
695n/a return NULL;
696n/a }
697n/a size += incr;
698n/a }
699n/a
700n/a str = _PyBytesWriter_Prepare(writer, str, size);
701n/a if (str == NULL)
702n/a return NULL;
703n/a
704n/a /* generate replacement */
705n/a for (i = collstart; i < collend; ++i) {
706n/a str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
707n/a }
708n/a return str;
709n/a}
710n/a
711n/a/* --- Bloom Filters ----------------------------------------------------- */
712n/a
713n/a/* stuff to implement simple "bloom filters" for Unicode characters.
714n/a to keep things simple, we use a single bitmask, using the least 5
715n/a bits from each unicode characters as the bit index. */
716n/a
717n/a/* the linebreak mask is set up by Unicode_Init below */
718n/a
719n/a#if LONG_BIT >= 128
720n/a#define BLOOM_WIDTH 128
721n/a#elif LONG_BIT >= 64
722n/a#define BLOOM_WIDTH 64
723n/a#elif LONG_BIT >= 32
724n/a#define BLOOM_WIDTH 32
725n/a#else
726n/a#error "LONG_BIT is smaller than 32"
727n/a#endif
728n/a
729n/a#define BLOOM_MASK unsigned long
730n/a
731n/astatic BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
732n/a
/* Non-zero if the bit selected by `ch` is set in `mask`.  A zero result
   means the character was not added to the mask; a non-zero result only
   means it may have been.

   `mask` is parenthesized so that expression arguments (for example
   `a | b`) are grouped as written; the previous definition expanded it
   bare, which let the `&` bind to only part of such an argument. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line-break test: table lookup below 128, otherwise the bloom mask as a
   cheap pre-filter in front of Py_UNICODE_ISLINEBREAK().  Note that `ch`
   is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
738n/a
739n/astatic inline BLOOM_MASK
740n/amake_bloom_mask(int kind, void* ptr, Py_ssize_t len)
741n/a{
742n/a#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
743n/a do { \
744n/a TYPE *data = (TYPE *)PTR; \
745n/a TYPE *end = data + LEN; \
746n/a Py_UCS4 ch; \
747n/a for (; data != end; data++) { \
748n/a ch = *data; \
749n/a MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
750n/a } \
751n/a break; \
752n/a } while (0)
753n/a
754n/a /* calculate simple bloom-style bitmask for a given unicode string */
755n/a
756n/a BLOOM_MASK mask;
757n/a
758n/a mask = 0;
759n/a switch (kind) {
760n/a case PyUnicode_1BYTE_KIND:
761n/a BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
762n/a break;
763n/a case PyUnicode_2BYTE_KIND:
764n/a BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
765n/a break;
766n/a case PyUnicode_4BYTE_KIND:
767n/a BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
768n/a break;
769n/a default:
770n/a assert(0);
771n/a }
772n/a return mask;
773n/a
774n/a#undef BLOOM_UPDATE
775n/a}
776n/a
777n/astatic int
778n/aensure_unicode(PyObject *obj)
779n/a{
780n/a if (!PyUnicode_Check(obj)) {
781n/a PyErr_Format(PyExc_TypeError,
782n/a "must be str, not %.100s",
783n/a Py_TYPE(obj)->tp_name);
784n/a return -1;
785n/a }
786n/a return PyUnicode_READY(obj);
787n/a}
788n/a
789n/a/* Compilation of templated routines */
790n/a
791n/a#include "stringlib/asciilib.h"
792n/a#include "stringlib/fastsearch.h"
793n/a#include "stringlib/partition.h"
794n/a#include "stringlib/split.h"
795n/a#include "stringlib/count.h"
796n/a#include "stringlib/find.h"
797n/a#include "stringlib/find_max_char.h"
798n/a#include "stringlib/localeutil.h"
799n/a#include "stringlib/undef.h"
800n/a
801n/a#include "stringlib/ucs1lib.h"
802n/a#include "stringlib/fastsearch.h"
803n/a#include "stringlib/partition.h"
804n/a#include "stringlib/split.h"
805n/a#include "stringlib/count.h"
806n/a#include "stringlib/find.h"
807n/a#include "stringlib/replace.h"
808n/a#include "stringlib/find_max_char.h"
809n/a#include "stringlib/localeutil.h"
810n/a#include "stringlib/undef.h"
811n/a
812n/a#include "stringlib/ucs2lib.h"
813n/a#include "stringlib/fastsearch.h"
814n/a#include "stringlib/partition.h"
815n/a#include "stringlib/split.h"
816n/a#include "stringlib/count.h"
817n/a#include "stringlib/find.h"
818n/a#include "stringlib/replace.h"
819n/a#include "stringlib/find_max_char.h"
820n/a#include "stringlib/localeutil.h"
821n/a#include "stringlib/undef.h"
822n/a
823n/a#include "stringlib/ucs4lib.h"
824n/a#include "stringlib/fastsearch.h"
825n/a#include "stringlib/partition.h"
826n/a#include "stringlib/split.h"
827n/a#include "stringlib/count.h"
828n/a#include "stringlib/find.h"
829n/a#include "stringlib/replace.h"
830n/a#include "stringlib/find_max_char.h"
831n/a#include "stringlib/localeutil.h"
832n/a#include "stringlib/undef.h"
833n/a
834n/a#include "stringlib/unicodedefs.h"
835n/a#include "stringlib/fastsearch.h"
836n/a#include "stringlib/count.h"
837n/a#include "stringlib/find.h"
838n/a#include "stringlib/undef.h"
839n/a
840n/a/* --- Unicode Object ----------------------------------------------------- */
841n/a
842n/astatic PyObject *
843n/afixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
844n/a
845n/astatic inline Py_ssize_t
846n/afindchar(const void *s, int kind,
847n/a Py_ssize_t size, Py_UCS4 ch,
848n/a int direction)
849n/a{
850n/a switch (kind) {
851n/a case PyUnicode_1BYTE_KIND:
852n/a if ((Py_UCS1) ch != ch)
853n/a return -1;
854n/a if (direction > 0)
855n/a return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
856n/a else
857n/a return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
858n/a case PyUnicode_2BYTE_KIND:
859n/a if ((Py_UCS2) ch != ch)
860n/a return -1;
861n/a if (direction > 0)
862n/a return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
863n/a else
864n/a return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
865n/a case PyUnicode_4BYTE_KIND:
866n/a if (direction > 0)
867n/a return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
868n/a else
869n/a return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
870n/a default:
871n/a assert(0);
872n/a return -1;
873n/a }
874n/a}
875n/a
#ifdef Py_DEBUG
/* Debug aid: overwrite the characters added by a resize with 0xFF bytes so
   that code reading them before they are initialized is noticed early.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and U+FFFFFFFF
   is not a valid Unicode character.

   unicode    : string that has just been resized.
   old_length : its length (in characters) before the resize.  Nothing is
                done unless the string grew. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
894n/a
895n/astatic PyObject*
896n/aresize_compact(PyObject *unicode, Py_ssize_t length)
897n/a{
898n/a Py_ssize_t char_size;
899n/a Py_ssize_t struct_size;
900n/a Py_ssize_t new_size;
901n/a int share_wstr;
902n/a PyObject *new_unicode;
903n/a#ifdef Py_DEBUG
904n/a Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
905n/a#endif
906n/a
907n/a assert(unicode_modifiable(unicode));
908n/a assert(PyUnicode_IS_READY(unicode));
909n/a assert(PyUnicode_IS_COMPACT(unicode));
910n/a
911n/a char_size = PyUnicode_KIND(unicode);
912n/a if (PyUnicode_IS_ASCII(unicode))
913n/a struct_size = sizeof(PyASCIIObject);
914n/a else
915n/a struct_size = sizeof(PyCompactUnicodeObject);
916n/a share_wstr = _PyUnicode_SHARE_WSTR(unicode);
917n/a
918n/a if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
919n/a PyErr_NoMemory();
920n/a return NULL;
921n/a }
922n/a new_size = (struct_size + (length + 1) * char_size);
923n/a
924n/a if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
925n/a PyObject_DEL(_PyUnicode_UTF8(unicode));
926n/a _PyUnicode_UTF8(unicode) = NULL;
927n/a _PyUnicode_UTF8_LENGTH(unicode) = 0;
928n/a }
929n/a _Py_DEC_REFTOTAL;
930n/a _Py_ForgetReference(unicode);
931n/a
932n/a new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
933n/a if (new_unicode == NULL) {
934n/a _Py_NewReference(unicode);
935n/a PyErr_NoMemory();
936n/a return NULL;
937n/a }
938n/a unicode = new_unicode;
939n/a _Py_NewReference(unicode);
940n/a
941n/a _PyUnicode_LENGTH(unicode) = length;
942n/a if (share_wstr) {
943n/a _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
944n/a if (!PyUnicode_IS_ASCII(unicode))
945n/a _PyUnicode_WSTR_LENGTH(unicode) = length;
946n/a }
947n/a else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
948n/a PyObject_DEL(_PyUnicode_WSTR(unicode));
949n/a _PyUnicode_WSTR(unicode) = NULL;
950n/a if (!PyUnicode_IS_ASCII(unicode))
951n/a _PyUnicode_WSTR_LENGTH(unicode) = 0;
952n/a }
953n/a#ifdef Py_DEBUG
954n/a unicode_fill_invalid(unicode, old_length);
955n/a#endif
956n/a PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
957n/a length, 0);
958n/a assert(_PyUnicode_CheckConsistency(unicode, 0));
959n/a return unicode;
960n/a}
961n/a
962n/astatic int
963n/aresize_inplace(PyObject *unicode, Py_ssize_t length)
964n/a{
965n/a wchar_t *wstr;
966n/a Py_ssize_t new_size;
967n/a assert(!PyUnicode_IS_COMPACT(unicode));
968n/a assert(Py_REFCNT(unicode) == 1);
969n/a
970n/a if (PyUnicode_IS_READY(unicode)) {
971n/a Py_ssize_t char_size;
972n/a int share_wstr, share_utf8;
973n/a void *data;
974n/a#ifdef Py_DEBUG
975n/a Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
976n/a#endif
977n/a
978n/a data = _PyUnicode_DATA_ANY(unicode);
979n/a char_size = PyUnicode_KIND(unicode);
980n/a share_wstr = _PyUnicode_SHARE_WSTR(unicode);
981n/a share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
982n/a
983n/a if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
984n/a PyErr_NoMemory();
985n/a return -1;
986n/a }
987n/a new_size = (length + 1) * char_size;
988n/a
989n/a if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
990n/a {
991n/a PyObject_DEL(_PyUnicode_UTF8(unicode));
992n/a _PyUnicode_UTF8(unicode) = NULL;
993n/a _PyUnicode_UTF8_LENGTH(unicode) = 0;
994n/a }
995n/a
996n/a data = (PyObject *)PyObject_REALLOC(data, new_size);
997n/a if (data == NULL) {
998n/a PyErr_NoMemory();
999n/a return -1;
1000n/a }
1001n/a _PyUnicode_DATA_ANY(unicode) = data;
1002n/a if (share_wstr) {
1003n/a _PyUnicode_WSTR(unicode) = data;
1004n/a _PyUnicode_WSTR_LENGTH(unicode) = length;
1005n/a }
1006n/a if (share_utf8) {
1007n/a _PyUnicode_UTF8(unicode) = data;
1008n/a _PyUnicode_UTF8_LENGTH(unicode) = length;
1009n/a }
1010n/a _PyUnicode_LENGTH(unicode) = length;
1011n/a PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1012n/a#ifdef Py_DEBUG
1013n/a unicode_fill_invalid(unicode, old_length);
1014n/a#endif
1015n/a if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
1016n/a assert(_PyUnicode_CheckConsistency(unicode, 0));
1017n/a return 0;
1018n/a }
1019n/a }
1020n/a assert(_PyUnicode_WSTR(unicode) != NULL);
1021n/a
1022n/a /* check for integer overflow */
1023n/a if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1024n/a PyErr_NoMemory();
1025n/a return -1;
1026n/a }
1027n/a new_size = sizeof(wchar_t) * (length + 1);
1028n/a wstr = _PyUnicode_WSTR(unicode);
1029n/a wstr = PyObject_REALLOC(wstr, new_size);
1030n/a if (!wstr) {
1031n/a PyErr_NoMemory();
1032n/a return -1;
1033n/a }
1034n/a _PyUnicode_WSTR(unicode) = wstr;
1035n/a _PyUnicode_WSTR(unicode)[length] = 0;
1036n/a _PyUnicode_WSTR_LENGTH(unicode) = length;
1037n/a assert(_PyUnicode_CheckConsistency(unicode, 0));
1038n/a return 0;
1039n/a}
1040n/a
1041n/astatic PyObject*
1042n/aresize_copy(PyObject *unicode, Py_ssize_t length)
1043n/a{
1044n/a Py_ssize_t copy_length;
1045n/a if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1046n/a PyObject *copy;
1047n/a
1048n/a assert(PyUnicode_IS_READY(unicode));
1049n/a
1050n/a copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1051n/a if (copy == NULL)
1052n/a return NULL;
1053n/a
1054n/a copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1055n/a _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1056n/a return copy;
1057n/a }
1058n/a else {
1059n/a PyObject *w;
1060n/a
1061n/a w = (PyObject*)_PyUnicode_New(length);
1062n/a if (w == NULL)
1063n/a return NULL;
1064n/a copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1065n/a copy_length = Py_MIN(copy_length, length);
1066n/a memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1067n/a copy_length * sizeof(wchar_t));
1068n/a return w;
1069n/a }
1070n/a}
1071n/a
1072n/a/* We allocate one more byte to make sure the string is
1073n/a Ux0000 terminated; some code (e.g. new_identifier)
1074n/a relies on that.
1075n/a
1076n/a XXX This allocator could further be enhanced by assuring that the
1077n/a free list never reduces its size below 1.
1078n/a
1079n/a*/
1080n/a
1081n/astatic PyUnicodeObject *
1082n/a_PyUnicode_New(Py_ssize_t length)
1083n/a{
1084n/a PyUnicodeObject *unicode;
1085n/a size_t new_size;
1086n/a
1087n/a /* Optimization for empty strings */
1088n/a if (length == 0 && unicode_empty != NULL) {
1089n/a Py_INCREF(unicode_empty);
1090n/a return (PyUnicodeObject*)unicode_empty;
1091n/a }
1092n/a
1093n/a /* Ensure we won't overflow the size. */
1094n/a if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1095n/a return (PyUnicodeObject *)PyErr_NoMemory();
1096n/a }
1097n/a if (length < 0) {
1098n/a PyErr_SetString(PyExc_SystemError,
1099n/a "Negative size passed to _PyUnicode_New");
1100n/a return NULL;
1101n/a }
1102n/a
1103n/a unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1104n/a if (unicode == NULL)
1105n/a return NULL;
1106n/a new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1107n/a
1108n/a _PyUnicode_WSTR_LENGTH(unicode) = length;
1109n/a _PyUnicode_HASH(unicode) = -1;
1110n/a _PyUnicode_STATE(unicode).interned = 0;
1111n/a _PyUnicode_STATE(unicode).kind = 0;
1112n/a _PyUnicode_STATE(unicode).compact = 0;
1113n/a _PyUnicode_STATE(unicode).ready = 0;
1114n/a _PyUnicode_STATE(unicode).ascii = 0;
1115n/a _PyUnicode_DATA_ANY(unicode) = NULL;
1116n/a _PyUnicode_LENGTH(unicode) = 0;
1117n/a _PyUnicode_UTF8(unicode) = NULL;
1118n/a _PyUnicode_UTF8_LENGTH(unicode) = 0;
1119n/a
1120n/a _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1121n/a if (!_PyUnicode_WSTR(unicode)) {
1122n/a Py_DECREF(unicode);
1123n/a PyErr_NoMemory();
1124n/a return NULL;
1125n/a }
1126n/a
1127n/a /* Initialize the first element to guard against cases where
1128n/a * the caller fails before initializing str -- unicode_resize()
1129n/a * reads str[0], and the Keep-Alive optimization can keep memory
1130n/a * allocated for str alive across a call to unicode_dealloc(unicode).
1131n/a * We don't want unicode_resize to read uninitialized memory in
1132n/a * that case.
1133n/a */
1134n/a _PyUnicode_WSTR(unicode)[0] = 0;
1135n/a _PyUnicode_WSTR(unicode)[length] = 0;
1136n/a
1137n/a assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1138n/a return unicode;
1139n/a}
1140n/a
1141n/astatic const char*
1142n/aunicode_kind_name(PyObject *unicode)
1143n/a{
1144n/a /* don't check consistency: unicode_kind_name() is called from
1145n/a _PyUnicode_Dump() */
1146n/a if (!PyUnicode_IS_COMPACT(unicode))
1147n/a {
1148n/a if (!PyUnicode_IS_READY(unicode))
1149n/a return "wstr";
1150n/a switch (PyUnicode_KIND(unicode))
1151n/a {
1152n/a case PyUnicode_1BYTE_KIND:
1153n/a if (PyUnicode_IS_ASCII(unicode))
1154n/a return "legacy ascii";
1155n/a else
1156n/a return "legacy latin1";
1157n/a case PyUnicode_2BYTE_KIND:
1158n/a return "legacy UCS2";
1159n/a case PyUnicode_4BYTE_KIND:
1160n/a return "legacy UCS4";
1161n/a default:
1162n/a return "<legacy invalid kind>";
1163n/a }
1164n/a }
1165n/a assert(PyUnicode_IS_READY(unicode));
1166n/a switch (PyUnicode_KIND(unicode)) {
1167n/a case PyUnicode_1BYTE_KIND:
1168n/a if (PyUnicode_IS_ASCII(unicode))
1169n/a return "ascii";
1170n/a else
1171n/a return "latin1";
1172n/a case PyUnicode_2BYTE_KIND:
1173n/a return "UCS2";
1174n/a case PyUnicode_4BYTE_KIND:
1175n/a return "UCS4";
1176n/a default:
1177n/a return "<invalid compact kind>";
1178n/a }
1179n/a}
1180n/a
1181n/a#ifdef Py_DEBUG
1182n/a/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}
1186n/a
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}
1190n/avoid *_PyUnicode_data(void *unicode){
1191n/a printf("obj %p\n", unicode);
1192n/a printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1193n/a printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1194n/a printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1195n/a printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1196n/a printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1197n/a return PyUnicode_DATA(unicode);
1198n/a}
1199n/a
1200n/avoid
1201n/a_PyUnicode_Dump(PyObject *op)
1202n/a{
1203n/a PyASCIIObject *ascii = (PyASCIIObject *)op;
1204n/a PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1205n/a PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1206n/a void *data;
1207n/a
1208n/a if (ascii->state.compact)
1209n/a {
1210n/a if (ascii->state.ascii)
1211n/a data = (ascii + 1);
1212n/a else
1213n/a data = (compact + 1);
1214n/a }
1215n/a else
1216n/a data = unicode->data.any;
1217n/a printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1218n/a unicode_kind_name(op), ascii->length);
1219n/a
1220n/a if (ascii->wstr == data)
1221n/a printf("shared ");
1222n/a printf("wstr=%p", ascii->wstr);
1223n/a
1224n/a if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1225n/a printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1226n/a if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1227n/a printf("shared ");
1228n/a printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1229n/a compact->utf8, compact->utf8_length);
1230n/a }
1231n/a printf(", data=%p\n", data);
1232n/a}
1233n/a#endif
1234n/a
1235n/aPyObject *
1236n/aPyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1237n/a{
1238n/a PyObject *obj;
1239n/a PyCompactUnicodeObject *unicode;
1240n/a void *data;
1241n/a enum PyUnicode_Kind kind;
1242n/a int is_sharing, is_ascii;
1243n/a Py_ssize_t char_size;
1244n/a Py_ssize_t struct_size;
1245n/a
1246n/a /* Optimization for empty strings */
1247n/a if (size == 0 && unicode_empty != NULL) {
1248n/a Py_INCREF(unicode_empty);
1249n/a return unicode_empty;
1250n/a }
1251n/a
1252n/a is_ascii = 0;
1253n/a is_sharing = 0;
1254n/a struct_size = sizeof(PyCompactUnicodeObject);
1255n/a if (maxchar < 128) {
1256n/a kind = PyUnicode_1BYTE_KIND;
1257n/a char_size = 1;
1258n/a is_ascii = 1;
1259n/a struct_size = sizeof(PyASCIIObject);
1260n/a }
1261n/a else if (maxchar < 256) {
1262n/a kind = PyUnicode_1BYTE_KIND;
1263n/a char_size = 1;
1264n/a }
1265n/a else if (maxchar < 65536) {
1266n/a kind = PyUnicode_2BYTE_KIND;
1267n/a char_size = 2;
1268n/a if (sizeof(wchar_t) == 2)
1269n/a is_sharing = 1;
1270n/a }
1271n/a else {
1272n/a if (maxchar > MAX_UNICODE) {
1273n/a PyErr_SetString(PyExc_SystemError,
1274n/a "invalid maximum character passed to PyUnicode_New");
1275n/a return NULL;
1276n/a }
1277n/a kind = PyUnicode_4BYTE_KIND;
1278n/a char_size = 4;
1279n/a if (sizeof(wchar_t) == 4)
1280n/a is_sharing = 1;
1281n/a }
1282n/a
1283n/a /* Ensure we won't overflow the size. */
1284n/a if (size < 0) {
1285n/a PyErr_SetString(PyExc_SystemError,
1286n/a "Negative size passed to PyUnicode_New");
1287n/a return NULL;
1288n/a }
1289n/a if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1290n/a return PyErr_NoMemory();
1291n/a
1292n/a /* Duplicated allocation code from _PyObject_New() instead of a call to
1293n/a * PyObject_New() so we are able to allocate space for the object and
1294n/a * it's data buffer.
1295n/a */
1296n/a obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1297n/a if (obj == NULL)
1298n/a return PyErr_NoMemory();
1299n/a obj = PyObject_INIT(obj, &PyUnicode_Type);
1300n/a if (obj == NULL)
1301n/a return NULL;
1302n/a
1303n/a unicode = (PyCompactUnicodeObject *)obj;
1304n/a if (is_ascii)
1305n/a data = ((PyASCIIObject*)obj) + 1;
1306n/a else
1307n/a data = unicode + 1;
1308n/a _PyUnicode_LENGTH(unicode) = size;
1309n/a _PyUnicode_HASH(unicode) = -1;
1310n/a _PyUnicode_STATE(unicode).interned = 0;
1311n/a _PyUnicode_STATE(unicode).kind = kind;
1312n/a _PyUnicode_STATE(unicode).compact = 1;
1313n/a _PyUnicode_STATE(unicode).ready = 1;
1314n/a _PyUnicode_STATE(unicode).ascii = is_ascii;
1315n/a if (is_ascii) {
1316n/a ((char*)data)[size] = 0;
1317n/a _PyUnicode_WSTR(unicode) = NULL;
1318n/a }
1319n/a else if (kind == PyUnicode_1BYTE_KIND) {
1320n/a ((char*)data)[size] = 0;
1321n/a _PyUnicode_WSTR(unicode) = NULL;
1322n/a _PyUnicode_WSTR_LENGTH(unicode) = 0;
1323n/a unicode->utf8 = NULL;
1324n/a unicode->utf8_length = 0;
1325n/a }
1326n/a else {
1327n/a unicode->utf8 = NULL;
1328n/a unicode->utf8_length = 0;
1329n/a if (kind == PyUnicode_2BYTE_KIND)
1330n/a ((Py_UCS2*)data)[size] = 0;
1331n/a else /* kind == PyUnicode_4BYTE_KIND */
1332n/a ((Py_UCS4*)data)[size] = 0;
1333n/a if (is_sharing) {
1334n/a _PyUnicode_WSTR_LENGTH(unicode) = size;
1335n/a _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1336n/a }
1337n/a else {
1338n/a _PyUnicode_WSTR_LENGTH(unicode) = 0;
1339n/a _PyUnicode_WSTR(unicode) = NULL;
1340n/a }
1341n/a }
1342n/a#ifdef Py_DEBUG
1343n/a unicode_fill_invalid((PyObject*)unicode, 0);
1344n/a#endif
1345n/a assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1346n/a return obj;
1347n/a}
1348n/a
1349n/a#if SIZEOF_WCHAR_T == 2
1350n/a/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1351n/a will decode surrogate pairs, the other conversions are implemented as macros
1352n/a for efficiency.
1353n/a
1354n/a This function assumes that unicode can hold one more code point than wstr
1355n/a characters for a terminating null character. */
1356n/astatic void
1357n/aunicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1358n/a PyObject *unicode)
1359n/a{
1360n/a const wchar_t *iter;
1361n/a Py_UCS4 *ucs4_out;
1362n/a
1363n/a assert(unicode != NULL);
1364n/a assert(_PyUnicode_CHECK(unicode));
1365n/a assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1366n/a ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1367n/a
1368n/a for (iter = begin; iter < end; ) {
1369n/a assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1370n/a _PyUnicode_GET_LENGTH(unicode)));
1371n/a if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1372n/a && (iter+1) < end
1373n/a && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1374n/a {
1375n/a *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1376n/a iter += 2;
1377n/a }
1378n/a else {
1379n/a *ucs4_out++ = *iter;
1380n/a iter++;
1381n/a }
1382n/a }
1383n/a assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1384n/a _PyUnicode_GET_LENGTH(unicode)));
1385n/a
1386n/a}
1387n/a#endif
1388n/a
1389n/astatic int
1390n/aunicode_check_modifiable(PyObject *unicode)
1391n/a{
1392n/a if (!unicode_modifiable(unicode)) {
1393n/a PyErr_SetString(PyExc_SystemError,
1394n/a "Cannot modify a string currently used");
1395n/a return -1;
1396n/a }
1397n/a return 0;
1398n/a}
1399n/a
1400n/astatic int
1401n/a_copy_characters(PyObject *to, Py_ssize_t to_start,
1402n/a PyObject *from, Py_ssize_t from_start,
1403n/a Py_ssize_t how_many, int check_maxchar)
1404n/a{
1405n/a unsigned int from_kind, to_kind;
1406n/a void *from_data, *to_data;
1407n/a
1408n/a assert(0 <= how_many);
1409n/a assert(0 <= from_start);
1410n/a assert(0 <= to_start);
1411n/a assert(PyUnicode_Check(from));
1412n/a assert(PyUnicode_IS_READY(from));
1413n/a assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1414n/a
1415n/a assert(PyUnicode_Check(to));
1416n/a assert(PyUnicode_IS_READY(to));
1417n/a assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1418n/a
1419n/a if (how_many == 0)
1420n/a return 0;
1421n/a
1422n/a from_kind = PyUnicode_KIND(from);
1423n/a from_data = PyUnicode_DATA(from);
1424n/a to_kind = PyUnicode_KIND(to);
1425n/a to_data = PyUnicode_DATA(to);
1426n/a
1427n/a#ifdef Py_DEBUG
1428n/a if (!check_maxchar
1429n/a && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1430n/a {
1431n/a const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1432n/a Py_UCS4 ch;
1433n/a Py_ssize_t i;
1434n/a for (i=0; i < how_many; i++) {
1435n/a ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1436n/a assert(ch <= to_maxchar);
1437n/a }
1438n/a }
1439n/a#endif
1440n/a
1441n/a if (from_kind == to_kind) {
1442n/a if (check_maxchar
1443n/a && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1444n/a {
1445n/a /* Writing Latin-1 characters into an ASCII string requires to
1446n/a check that all written characters are pure ASCII */
1447n/a Py_UCS4 max_char;
1448n/a max_char = ucs1lib_find_max_char(from_data,
1449n/a (Py_UCS1*)from_data + how_many);
1450n/a if (max_char >= 128)
1451n/a return -1;
1452n/a }
1453n/a memcpy((char*)to_data + to_kind * to_start,
1454n/a (char*)from_data + from_kind * from_start,
1455n/a to_kind * how_many);
1456n/a }
1457n/a else if (from_kind == PyUnicode_1BYTE_KIND
1458n/a && to_kind == PyUnicode_2BYTE_KIND)
1459n/a {
1460n/a _PyUnicode_CONVERT_BYTES(
1461n/a Py_UCS1, Py_UCS2,
1462n/a PyUnicode_1BYTE_DATA(from) + from_start,
1463n/a PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1464n/a PyUnicode_2BYTE_DATA(to) + to_start
1465n/a );
1466n/a }
1467n/a else if (from_kind == PyUnicode_1BYTE_KIND
1468n/a && to_kind == PyUnicode_4BYTE_KIND)
1469n/a {
1470n/a _PyUnicode_CONVERT_BYTES(
1471n/a Py_UCS1, Py_UCS4,
1472n/a PyUnicode_1BYTE_DATA(from) + from_start,
1473n/a PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1474n/a PyUnicode_4BYTE_DATA(to) + to_start
1475n/a );
1476n/a }
1477n/a else if (from_kind == PyUnicode_2BYTE_KIND
1478n/a && to_kind == PyUnicode_4BYTE_KIND)
1479n/a {
1480n/a _PyUnicode_CONVERT_BYTES(
1481n/a Py_UCS2, Py_UCS4,
1482n/a PyUnicode_2BYTE_DATA(from) + from_start,
1483n/a PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1484n/a PyUnicode_4BYTE_DATA(to) + to_start
1485n/a );
1486n/a }
1487n/a else {
1488n/a assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1489n/a
1490n/a if (!check_maxchar) {
1491n/a if (from_kind == PyUnicode_2BYTE_KIND
1492n/a && to_kind == PyUnicode_1BYTE_KIND)
1493n/a {
1494n/a _PyUnicode_CONVERT_BYTES(
1495n/a Py_UCS2, Py_UCS1,
1496n/a PyUnicode_2BYTE_DATA(from) + from_start,
1497n/a PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1498n/a PyUnicode_1BYTE_DATA(to) + to_start
1499n/a );
1500n/a }
1501n/a else if (from_kind == PyUnicode_4BYTE_KIND
1502n/a && to_kind == PyUnicode_1BYTE_KIND)
1503n/a {
1504n/a _PyUnicode_CONVERT_BYTES(
1505n/a Py_UCS4, Py_UCS1,
1506n/a PyUnicode_4BYTE_DATA(from) + from_start,
1507n/a PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1508n/a PyUnicode_1BYTE_DATA(to) + to_start
1509n/a );
1510n/a }
1511n/a else if (from_kind == PyUnicode_4BYTE_KIND
1512n/a && to_kind == PyUnicode_2BYTE_KIND)
1513n/a {
1514n/a _PyUnicode_CONVERT_BYTES(
1515n/a Py_UCS4, Py_UCS2,
1516n/a PyUnicode_4BYTE_DATA(from) + from_start,
1517n/a PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1518n/a PyUnicode_2BYTE_DATA(to) + to_start
1519n/a );
1520n/a }
1521n/a else {
1522n/a assert(0);
1523n/a return -1;
1524n/a }
1525n/a }
1526n/a else {
1527n/a const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1528n/a Py_UCS4 ch;
1529n/a Py_ssize_t i;
1530n/a
1531n/a for (i=0; i < how_many; i++) {
1532n/a ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1533n/a if (ch > to_maxchar)
1534n/a return -1;
1535n/a PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1536n/a }
1537n/a }
1538n/a }
1539n/a return 0;
1540n/a}
1541n/a
1542n/avoid
1543n/a_PyUnicode_FastCopyCharacters(
1544n/a PyObject *to, Py_ssize_t to_start,
1545n/a PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1546n/a{
1547n/a (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1548n/a}
1549n/a
1550n/aPy_ssize_t
1551n/aPyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1552n/a PyObject *from, Py_ssize_t from_start,
1553n/a Py_ssize_t how_many)
1554n/a{
1555n/a int err;
1556n/a
1557n/a if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1558n/a PyErr_BadInternalCall();
1559n/a return -1;
1560n/a }
1561n/a
1562n/a if (PyUnicode_READY(from) == -1)
1563n/a return -1;
1564n/a if (PyUnicode_READY(to) == -1)
1565n/a return -1;
1566n/a
1567n/a if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1568n/a PyErr_SetString(PyExc_IndexError, "string index out of range");
1569n/a return -1;
1570n/a }
1571n/a if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1572n/a PyErr_SetString(PyExc_IndexError, "string index out of range");
1573n/a return -1;
1574n/a }
1575n/a if (how_many < 0) {
1576n/a PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1577n/a return -1;
1578n/a }
1579n/a how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1580n/a if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1581n/a PyErr_Format(PyExc_SystemError,
1582n/a "Cannot write %zi characters at %zi "
1583n/a "in a string of %zi characters",
1584n/a how_many, to_start, PyUnicode_GET_LENGTH(to));
1585n/a return -1;
1586n/a }
1587n/a
1588n/a if (how_many == 0)
1589n/a return 0;
1590n/a
1591n/a if (unicode_check_modifiable(to))
1592n/a return -1;
1593n/a
1594n/a err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1595n/a if (err) {
1596n/a PyErr_Format(PyExc_SystemError,
1597n/a "Cannot copy %s characters "
1598n/a "into a string of %s characters",
1599n/a unicode_kind_name(from),
1600n/a unicode_kind_name(to));
1601n/a return -1;
1602n/a }
1603n/a return how_many;
1604n/a}
1605n/a
1606n/a/* Find the maximum code point and count the number of surrogate pairs so a
1607n/a correct string length can be computed before converting a string to UCS4.
1608n/a This function counts single surrogates as a character and not as a pair.
1609n/a
1610n/a Return 0 on success, or -1 on error. */
1611n/astatic int
1612n/afind_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1613n/a Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1614n/a{
1615n/a const wchar_t *iter;
1616n/a Py_UCS4 ch;
1617n/a
1618n/a assert(num_surrogates != NULL && maxchar != NULL);
1619n/a *num_surrogates = 0;
1620n/a *maxchar = 0;
1621n/a
1622n/a for (iter = begin; iter < end; ) {
1623n/a#if SIZEOF_WCHAR_T == 2
1624n/a if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1625n/a && (iter+1) < end
1626n/a && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1627n/a {
1628n/a ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1629n/a ++(*num_surrogates);
1630n/a iter += 2;
1631n/a }
1632n/a else
1633n/a#endif
1634n/a {
1635n/a ch = *iter;
1636n/a iter++;
1637n/a }
1638n/a if (ch > *maxchar) {
1639n/a *maxchar = ch;
1640n/a if (*maxchar > MAX_UNICODE) {
1641n/a PyErr_Format(PyExc_ValueError,
1642n/a "character U+%x is not in range [U+0000; U+10ffff]",
1643n/a ch);
1644n/a return -1;
1645n/a }
1646n/a }
1647n/a }
1648n/a return 0;
1649n/a}
1650n/a
1651n/aint
1652n/a_PyUnicode_Ready(PyObject *unicode)
1653n/a{
1654n/a wchar_t *end;
1655n/a Py_UCS4 maxchar = 0;
1656n/a Py_ssize_t num_surrogates;
1657n/a#if SIZEOF_WCHAR_T == 2
1658n/a Py_ssize_t length_wo_surrogates;
1659n/a#endif
1660n/a
1661n/a /* _PyUnicode_Ready() is only intended for old-style API usage where
1662n/a strings were created using _PyObject_New() and where no canonical
1663n/a representation (the str field) has been set yet aka strings
1664n/a which are not yet ready. */
1665n/a assert(_PyUnicode_CHECK(unicode));
1666n/a assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1667n/a assert(_PyUnicode_WSTR(unicode) != NULL);
1668n/a assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1669n/a assert(_PyUnicode_UTF8(unicode) == NULL);
1670n/a /* Actually, it should neither be interned nor be anything else: */
1671n/a assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1672n/a
1673n/a end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1674n/a if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1675n/a &maxchar, &num_surrogates) == -1)
1676n/a return -1;
1677n/a
1678n/a if (maxchar < 256) {
1679n/a _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1680n/a if (!_PyUnicode_DATA_ANY(unicode)) {
1681n/a PyErr_NoMemory();
1682n/a return -1;
1683n/a }
1684n/a _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1685n/a _PyUnicode_WSTR(unicode), end,
1686n/a PyUnicode_1BYTE_DATA(unicode));
1687n/a PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1688n/a _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1689n/a _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1690n/a if (maxchar < 128) {
1691n/a _PyUnicode_STATE(unicode).ascii = 1;
1692n/a _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1693n/a _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1694n/a }
1695n/a else {
1696n/a _PyUnicode_STATE(unicode).ascii = 0;
1697n/a _PyUnicode_UTF8(unicode) = NULL;
1698n/a _PyUnicode_UTF8_LENGTH(unicode) = 0;
1699n/a }
1700n/a PyObject_FREE(_PyUnicode_WSTR(unicode));
1701n/a _PyUnicode_WSTR(unicode) = NULL;
1702n/a _PyUnicode_WSTR_LENGTH(unicode) = 0;
1703n/a }
1704n/a /* In this case we might have to convert down from 4-byte native
1705n/a wchar_t to 2-byte unicode. */
1706n/a else if (maxchar < 65536) {
1707n/a assert(num_surrogates == 0 &&
1708n/a "FindMaxCharAndNumSurrogatePairs() messed up");
1709n/a
1710n/a#if SIZEOF_WCHAR_T == 2
1711n/a /* We can share representations and are done. */
1712n/a _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1713n/a PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1714n/a _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1715n/a _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1716n/a _PyUnicode_UTF8(unicode) = NULL;
1717n/a _PyUnicode_UTF8_LENGTH(unicode) = 0;
1718n/a#else
1719n/a /* sizeof(wchar_t) == 4 */
1720n/a _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1721n/a 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1722n/a if (!_PyUnicode_DATA_ANY(unicode)) {
1723n/a PyErr_NoMemory();
1724n/a return -1;
1725n/a }
1726n/a _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1727n/a _PyUnicode_WSTR(unicode), end,
1728n/a PyUnicode_2BYTE_DATA(unicode));
1729n/a PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1730n/a _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1731n/a _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1732n/a _PyUnicode_UTF8(unicode) = NULL;
1733n/a _PyUnicode_UTF8_LENGTH(unicode) = 0;
1734n/a PyObject_FREE(_PyUnicode_WSTR(unicode));
1735n/a _PyUnicode_WSTR(unicode) = NULL;
1736n/a _PyUnicode_WSTR_LENGTH(unicode) = 0;
1737n/a#endif
1738n/a }
1739n/a /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1740n/a else {
1741n/a#if SIZEOF_WCHAR_T == 2
1742n/a /* in case the native representation is 2-bytes, we need to allocate a
1743n/a new normalized 4-byte version. */
1744n/a length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1745n/a if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1746n/a PyErr_NoMemory();
1747n/a return -1;
1748n/a }
1749n/a _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1750n/a if (!_PyUnicode_DATA_ANY(unicode)) {
1751n/a PyErr_NoMemory();
1752n/a return -1;
1753n/a }
1754n/a _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1755n/a _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1756n/a _PyUnicode_UTF8(unicode) = NULL;
1757n/a _PyUnicode_UTF8_LENGTH(unicode) = 0;
1758n/a /* unicode_convert_wchar_to_ucs4() requires a ready string */
1759n/a _PyUnicode_STATE(unicode).ready = 1;
1760n/a unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1761n/a PyObject_FREE(_PyUnicode_WSTR(unicode));
1762n/a _PyUnicode_WSTR(unicode) = NULL;
1763n/a _PyUnicode_WSTR_LENGTH(unicode) = 0;
1764n/a#else
1765n/a assert(num_surrogates == 0);
1766n/a
1767n/a _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1768n/a _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1769n/a _PyUnicode_UTF8(unicode) = NULL;
1770n/a _PyUnicode_UTF8_LENGTH(unicode) = 0;
1771n/a _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1772n/a#endif
1773n/a PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1774n/a }
1775n/a _PyUnicode_STATE(unicode).ready = 1;
1776n/a assert(_PyUnicode_CheckConsistency(unicode, 1));
1777n/a return 0;
1778n/a}
1779n/a
1780n/astatic void
1781n/aunicode_dealloc(PyObject *unicode)
1782n/a{
1783n/a switch (PyUnicode_CHECK_INTERNED(unicode)) {
1784n/a case SSTATE_NOT_INTERNED:
1785n/a break;
1786n/a
1787n/a case SSTATE_INTERNED_MORTAL:
1788n/a /* revive dead object temporarily for DelItem */
1789n/a Py_REFCNT(unicode) = 3;
1790n/a if (PyDict_DelItem(interned, unicode) != 0)
1791n/a Py_FatalError(
1792n/a "deletion of interned string failed");
1793n/a break;
1794n/a
1795n/a case SSTATE_INTERNED_IMMORTAL:
1796n/a Py_FatalError("Immortal interned string died.");
1797n/a
1798n/a default:
1799n/a Py_FatalError("Inconsistent interned string state.");
1800n/a }
1801n/a
1802n/a if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1803n/a PyObject_DEL(_PyUnicode_WSTR(unicode));
1804n/a if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1805n/a PyObject_DEL(_PyUnicode_UTF8(unicode));
1806n/a if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1807n/a PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1808n/a
1809n/a Py_TYPE(unicode)->tp_free(unicode);
1810n/a}
1811n/a
1812n/a#ifdef Py_DEBUG
1813n/astatic int
1814n/aunicode_is_singleton(PyObject *unicode)
1815n/a{
1816n/a PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1817n/a if (unicode == unicode_empty)
1818n/a return 1;
1819n/a if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1820n/a {
1821n/a Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1822n/a if (ch < 256 && unicode_latin1[ch] == unicode)
1823n/a return 1;
1824n/a }
1825n/a return 0;
1826n/a}
1827n/a#endif
1828n/a
1829n/astatic int
1830n/aunicode_modifiable(PyObject *unicode)
1831n/a{
1832n/a assert(_PyUnicode_CHECK(unicode));
1833n/a if (Py_REFCNT(unicode) != 1)
1834n/a return 0;
1835n/a if (_PyUnicode_HASH(unicode) != -1)
1836n/a return 0;
1837n/a if (PyUnicode_CHECK_INTERNED(unicode))
1838n/a return 0;
1839n/a if (!PyUnicode_CheckExact(unicode))
1840n/a return 0;
1841n/a#ifdef Py_DEBUG
1842n/a /* singleton refcount is greater than 1 */
1843n/a assert(!unicode_is_singleton(unicode));
1844n/a#endif
1845n/a return 1;
1846n/a}
1847n/a
1848n/astatic int
1849n/aunicode_resize(PyObject **p_unicode, Py_ssize_t length)
1850n/a{
1851n/a PyObject *unicode;
1852n/a Py_ssize_t old_length;
1853n/a
1854n/a assert(p_unicode != NULL);
1855n/a unicode = *p_unicode;
1856n/a
1857n/a assert(unicode != NULL);
1858n/a assert(PyUnicode_Check(unicode));
1859n/a assert(0 <= length);
1860n/a
1861n/a if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1862n/a old_length = PyUnicode_WSTR_LENGTH(unicode);
1863n/a else
1864n/a old_length = PyUnicode_GET_LENGTH(unicode);
1865n/a if (old_length == length)
1866n/a return 0;
1867n/a
1868n/a if (length == 0) {
1869n/a _Py_INCREF_UNICODE_EMPTY();
1870n/a if (!unicode_empty)
1871n/a return -1;
1872n/a Py_SETREF(*p_unicode, unicode_empty);
1873n/a return 0;
1874n/a }
1875n/a
1876n/a if (!unicode_modifiable(unicode)) {
1877n/a PyObject *copy = resize_copy(unicode, length);
1878n/a if (copy == NULL)
1879n/a return -1;
1880n/a Py_SETREF(*p_unicode, copy);
1881n/a return 0;
1882n/a }
1883n/a
1884n/a if (PyUnicode_IS_COMPACT(unicode)) {
1885n/a PyObject *new_unicode = resize_compact(unicode, length);
1886n/a if (new_unicode == NULL)
1887n/a return -1;
1888n/a *p_unicode = new_unicode;
1889n/a return 0;
1890n/a }
1891n/a return resize_inplace(unicode, length);
1892n/a}
1893n/a
1894n/aint
1895n/aPyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1896n/a{
1897n/a PyObject *unicode;
1898n/a if (p_unicode == NULL) {
1899n/a PyErr_BadInternalCall();
1900n/a return -1;
1901n/a }
1902n/a unicode = *p_unicode;
1903n/a if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1904n/a {
1905n/a PyErr_BadInternalCall();
1906n/a return -1;
1907n/a }
1908n/a return unicode_resize(p_unicode, length);
1909n/a}
1910n/a
1911n/a/* Copy an ASCII or latin1 char* string into a Python Unicode string.
1912n/a
1913n/a WARNING: The function doesn't copy the terminating null character and
1914n/a doesn't check the maximum character (may write a latin1 character in an
1915n/a ASCII string). */
1916n/astatic void
1917n/aunicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1918n/a const char *str, Py_ssize_t len)
1919n/a{
1920n/a enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1921n/a void *data = PyUnicode_DATA(unicode);
1922n/a const char *end = str + len;
1923n/a
1924n/a switch (kind) {
1925n/a case PyUnicode_1BYTE_KIND: {
1926n/a assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1927n/a#ifdef Py_DEBUG
1928n/a if (PyUnicode_IS_ASCII(unicode)) {
1929n/a Py_UCS4 maxchar = ucs1lib_find_max_char(
1930n/a (const Py_UCS1*)str,
1931n/a (const Py_UCS1*)str + len);
1932n/a assert(maxchar < 128);
1933n/a }
1934n/a#endif
1935n/a memcpy((char *) data + index, str, len);
1936n/a break;
1937n/a }
1938n/a case PyUnicode_2BYTE_KIND: {
1939n/a Py_UCS2 *start = (Py_UCS2 *)data + index;
1940n/a Py_UCS2 *ucs2 = start;
1941n/a assert(index <= PyUnicode_GET_LENGTH(unicode));
1942n/a
1943n/a for (; str < end; ++ucs2, ++str)
1944n/a *ucs2 = (Py_UCS2)*str;
1945n/a
1946n/a assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1947n/a break;
1948n/a }
1949n/a default: {
1950n/a Py_UCS4 *start = (Py_UCS4 *)data + index;
1951n/a Py_UCS4 *ucs4 = start;
1952n/a assert(kind == PyUnicode_4BYTE_KIND);
1953n/a assert(index <= PyUnicode_GET_LENGTH(unicode));
1954n/a
1955n/a for (; str < end; ++ucs4, ++str)
1956n/a *ucs4 = (Py_UCS4)*str;
1957n/a
1958n/a assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1959n/a }
1960n/a }
1961n/a}
1962n/a
1963n/astatic PyObject*
1964n/aget_latin1_char(unsigned char ch)
1965n/a{
1966n/a PyObject *unicode = unicode_latin1[ch];
1967n/a if (!unicode) {
1968n/a unicode = PyUnicode_New(1, ch);
1969n/a if (!unicode)
1970n/a return NULL;
1971n/a PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1972n/a assert(_PyUnicode_CheckConsistency(unicode, 1));
1973n/a unicode_latin1[ch] = unicode;
1974n/a }
1975n/a Py_INCREF(unicode);
1976n/a return unicode;
1977n/a}
1978n/a
1979n/astatic PyObject*
1980n/aunicode_char(Py_UCS4 ch)
1981n/a{
1982n/a PyObject *unicode;
1983n/a
1984n/a assert(ch <= MAX_UNICODE);
1985n/a
1986n/a if (ch < 256)
1987n/a return get_latin1_char(ch);
1988n/a
1989n/a unicode = PyUnicode_New(1, ch);
1990n/a if (unicode == NULL)
1991n/a return NULL;
1992n/a
1993n/a assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1994n/a if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1995n/a PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1996n/a } else {
1997n/a assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1998n/a PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1999n/a }
2000n/a assert(_PyUnicode_CheckConsistency(unicode, 1));
2001n/a return unicode;
2002n/a}
2003n/a
2004n/aPyObject *
2005n/aPyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2006n/a{
2007n/a if (u == NULL)
2008n/a return (PyObject*)_PyUnicode_New(size);
2009n/a
2010n/a if (size < 0) {
2011n/a PyErr_BadInternalCall();
2012n/a return NULL;
2013n/a }
2014n/a
2015n/a return PyUnicode_FromWideChar(u, size);
2016n/a}
2017n/a
2018n/aPyObject *
2019n/aPyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2020n/a{
2021n/a PyObject *unicode;
2022n/a Py_UCS4 maxchar = 0;
2023n/a Py_ssize_t num_surrogates;
2024n/a
2025n/a if (u == NULL && size != 0) {
2026n/a PyErr_BadInternalCall();
2027n/a return NULL;
2028n/a }
2029n/a
2030n/a if (size == -1) {
2031n/a size = wcslen(u);
2032n/a }
2033n/a
2034n/a /* If the Unicode data is known at construction time, we can apply
2035n/a some optimizations which share commonly used objects. */
2036n/a
2037n/a /* Optimization for empty strings */
2038n/a if (size == 0)
2039n/a _Py_RETURN_UNICODE_EMPTY();
2040n/a
2041n/a /* Single character Unicode objects in the Latin-1 range are
2042n/a shared when using this constructor */
2043n/a if (size == 1 && (Py_UCS4)*u < 256)
2044n/a return get_latin1_char((unsigned char)*u);
2045n/a
2046n/a /* If not empty and not single character, copy the Unicode data
2047n/a into the new object */
2048n/a if (find_maxchar_surrogates(u, u + size,
2049n/a &maxchar, &num_surrogates) == -1)
2050n/a return NULL;
2051n/a
2052n/a unicode = PyUnicode_New(size - num_surrogates, maxchar);
2053n/a if (!unicode)
2054n/a return NULL;
2055n/a
2056n/a switch (PyUnicode_KIND(unicode)) {
2057n/a case PyUnicode_1BYTE_KIND:
2058n/a _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2059n/a u, u + size, PyUnicode_1BYTE_DATA(unicode));
2060n/a break;
2061n/a case PyUnicode_2BYTE_KIND:
2062n/a#if Py_UNICODE_SIZE == 2
2063n/a memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2064n/a#else
2065n/a _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2066n/a u, u + size, PyUnicode_2BYTE_DATA(unicode));
2067n/a#endif
2068n/a break;
2069n/a case PyUnicode_4BYTE_KIND:
2070n/a#if SIZEOF_WCHAR_T == 2
2071n/a /* This is the only case which has to process surrogates, thus
2072n/a a simple copy loop is not enough and we need a function. */
2073n/a unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2074n/a#else
2075n/a assert(num_surrogates == 0);
2076n/a memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2077n/a#endif
2078n/a break;
2079n/a default:
2080n/a assert(0 && "Impossible state");
2081n/a }
2082n/a
2083n/a return unicode_result(unicode);
2084n/a}
2085n/a
2086n/aPyObject *
2087n/aPyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2088n/a{
2089n/a if (size < 0) {
2090n/a PyErr_SetString(PyExc_SystemError,
2091n/a "Negative size passed to PyUnicode_FromStringAndSize");
2092n/a return NULL;
2093n/a }
2094n/a if (u != NULL)
2095n/a return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2096n/a else
2097n/a return (PyObject *)_PyUnicode_New(size);
2098n/a}
2099n/a
2100n/aPyObject *
2101n/aPyUnicode_FromString(const char *u)
2102n/a{
2103n/a size_t size = strlen(u);
2104n/a if (size > PY_SSIZE_T_MAX) {
2105n/a PyErr_SetString(PyExc_OverflowError, "input too long");
2106n/a return NULL;
2107n/a }
2108n/a return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2109n/a}
2110n/a
2111n/aPyObject *
2112n/a_PyUnicode_FromId(_Py_Identifier *id)
2113n/a{
2114n/a if (!id->object) {
2115n/a id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2116n/a strlen(id->string),
2117n/a NULL, NULL);
2118n/a if (!id->object)
2119n/a return NULL;
2120n/a PyUnicode_InternInPlace(&id->object);
2121n/a assert(!id->next);
2122n/a id->next = static_strings;
2123n/a static_strings = id;
2124n/a }
2125n/a return id->object;
2126n/a}
2127n/a
2128n/avoid
2129n/a_PyUnicode_ClearStaticStrings()
2130n/a{
2131n/a _Py_Identifier *tmp, *s = static_strings;
2132n/a while (s) {
2133n/a Py_CLEAR(s->object);
2134n/a tmp = s->next;
2135n/a s->next = NULL;
2136n/a s = tmp;
2137n/a }
2138n/a static_strings = NULL;
2139n/a}
2140n/a
2141n/a/* Internal function, doesn't check maximum character */
2142n/a
2143n/aPyObject*
2144n/a_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2145n/a{
2146n/a const unsigned char *s = (const unsigned char *)buffer;
2147n/a PyObject *unicode;
2148n/a if (size == 1) {
2149n/a#ifdef Py_DEBUG
2150n/a assert((unsigned char)s[0] < 128);
2151n/a#endif
2152n/a return get_latin1_char(s[0]);
2153n/a }
2154n/a unicode = PyUnicode_New(size, 127);
2155n/a if (!unicode)
2156n/a return NULL;
2157n/a memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2158n/a assert(_PyUnicode_CheckConsistency(unicode, 1));
2159n/a return unicode;
2160n/a}
2161n/a
2162n/astatic Py_UCS4
2163n/akind_maxchar_limit(unsigned int kind)
2164n/a{
2165n/a switch (kind) {
2166n/a case PyUnicode_1BYTE_KIND:
2167n/a return 0x80;
2168n/a case PyUnicode_2BYTE_KIND:
2169n/a return 0x100;
2170n/a case PyUnicode_4BYTE_KIND:
2171n/a return 0x10000;
2172n/a default:
2173n/a assert(0 && "invalid kind");
2174n/a return MAX_UNICODE;
2175n/a }
2176n/a}
2177n/a
2178n/astatic inline Py_UCS4
2179n/aalign_maxchar(Py_UCS4 maxchar)
2180n/a{
2181n/a if (maxchar <= 127)
2182n/a return 127;
2183n/a else if (maxchar <= 255)
2184n/a return 255;
2185n/a else if (maxchar <= 65535)
2186n/a return 65535;
2187n/a else
2188n/a return MAX_UNICODE;
2189n/a}
2190n/a
2191n/astatic PyObject*
2192n/a_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2193n/a{
2194n/a PyObject *res;
2195n/a unsigned char max_char;
2196n/a
2197n/a if (size == 0)
2198n/a _Py_RETURN_UNICODE_EMPTY();
2199n/a assert(size > 0);
2200n/a if (size == 1)
2201n/a return get_latin1_char(u[0]);
2202n/a
2203n/a max_char = ucs1lib_find_max_char(u, u + size);
2204n/a res = PyUnicode_New(size, max_char);
2205n/a if (!res)
2206n/a return NULL;
2207n/a memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2208n/a assert(_PyUnicode_CheckConsistency(res, 1));
2209n/a return res;
2210n/a}
2211n/a
2212n/astatic PyObject*
2213n/a_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2214n/a{
2215n/a PyObject *res;
2216n/a Py_UCS2 max_char;
2217n/a
2218n/a if (size == 0)
2219n/a _Py_RETURN_UNICODE_EMPTY();
2220n/a assert(size > 0);
2221n/a if (size == 1)
2222n/a return unicode_char(u[0]);
2223n/a
2224n/a max_char = ucs2lib_find_max_char(u, u + size);
2225n/a res = PyUnicode_New(size, max_char);
2226n/a if (!res)
2227n/a return NULL;
2228n/a if (max_char >= 256)
2229n/a memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2230n/a else {
2231n/a _PyUnicode_CONVERT_BYTES(
2232n/a Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2233n/a }
2234n/a assert(_PyUnicode_CheckConsistency(res, 1));
2235n/a return res;
2236n/a}
2237n/a
2238n/astatic PyObject*
2239n/a_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2240n/a{
2241n/a PyObject *res;
2242n/a Py_UCS4 max_char;
2243n/a
2244n/a if (size == 0)
2245n/a _Py_RETURN_UNICODE_EMPTY();
2246n/a assert(size > 0);
2247n/a if (size == 1)
2248n/a return unicode_char(u[0]);
2249n/a
2250n/a max_char = ucs4lib_find_max_char(u, u + size);
2251n/a res = PyUnicode_New(size, max_char);
2252n/a if (!res)
2253n/a return NULL;
2254n/a if (max_char < 256)
2255n/a _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2256n/a PyUnicode_1BYTE_DATA(res));
2257n/a else if (max_char < 0x10000)
2258n/a _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2259n/a PyUnicode_2BYTE_DATA(res));
2260n/a else
2261n/a memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2262n/a assert(_PyUnicode_CheckConsistency(res, 1));
2263n/a return res;
2264n/a}
2265n/a
2266n/aPyObject*
2267n/aPyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2268n/a{
2269n/a if (size < 0) {
2270n/a PyErr_SetString(PyExc_ValueError, "size must be positive");
2271n/a return NULL;
2272n/a }
2273n/a switch (kind) {
2274n/a case PyUnicode_1BYTE_KIND:
2275n/a return _PyUnicode_FromUCS1(buffer, size);
2276n/a case PyUnicode_2BYTE_KIND:
2277n/a return _PyUnicode_FromUCS2(buffer, size);
2278n/a case PyUnicode_4BYTE_KIND:
2279n/a return _PyUnicode_FromUCS4(buffer, size);
2280n/a default:
2281n/a PyErr_SetString(PyExc_SystemError, "invalid kind");
2282n/a return NULL;
2283n/a }
2284n/a}
2285n/a
2286n/aPy_UCS4
2287n/a_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2288n/a{
2289n/a enum PyUnicode_Kind kind;
2290n/a void *startptr, *endptr;
2291n/a
2292n/a assert(PyUnicode_IS_READY(unicode));
2293n/a assert(0 <= start);
2294n/a assert(end <= PyUnicode_GET_LENGTH(unicode));
2295n/a assert(start <= end);
2296n/a
2297n/a if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2298n/a return PyUnicode_MAX_CHAR_VALUE(unicode);
2299n/a
2300n/a if (start == end)
2301n/a return 127;
2302n/a
2303n/a if (PyUnicode_IS_ASCII(unicode))
2304n/a return 127;
2305n/a
2306n/a kind = PyUnicode_KIND(unicode);
2307n/a startptr = PyUnicode_DATA(unicode);
2308n/a endptr = (char *)startptr + end * kind;
2309n/a startptr = (char *)startptr + start * kind;
2310n/a switch(kind) {
2311n/a case PyUnicode_1BYTE_KIND:
2312n/a return ucs1lib_find_max_char(startptr, endptr);
2313n/a case PyUnicode_2BYTE_KIND:
2314n/a return ucs2lib_find_max_char(startptr, endptr);
2315n/a case PyUnicode_4BYTE_KIND:
2316n/a return ucs4lib_find_max_char(startptr, endptr);
2317n/a default:
2318n/a assert(0);
2319n/a return 0;
2320n/a }
2321n/a}
2322n/a
2323n/a/* Ensure that a string uses the most efficient storage, if it is not the
2324n/a case: create a new string with of the right kind. Write NULL into *p_unicode
2325n/a on error. */
2326n/astatic void
2327n/aunicode_adjust_maxchar(PyObject **p_unicode)
2328n/a{
2329n/a PyObject *unicode, *copy;
2330n/a Py_UCS4 max_char;
2331n/a Py_ssize_t len;
2332n/a unsigned int kind;
2333n/a
2334n/a assert(p_unicode != NULL);
2335n/a unicode = *p_unicode;
2336n/a assert(PyUnicode_IS_READY(unicode));
2337n/a if (PyUnicode_IS_ASCII(unicode))
2338n/a return;
2339n/a
2340n/a len = PyUnicode_GET_LENGTH(unicode);
2341n/a kind = PyUnicode_KIND(unicode);
2342n/a if (kind == PyUnicode_1BYTE_KIND) {
2343n/a const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2344n/a max_char = ucs1lib_find_max_char(u, u + len);
2345n/a if (max_char >= 128)
2346n/a return;
2347n/a }
2348n/a else if (kind == PyUnicode_2BYTE_KIND) {
2349n/a const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2350n/a max_char = ucs2lib_find_max_char(u, u + len);
2351n/a if (max_char >= 256)
2352n/a return;
2353n/a }
2354n/a else {
2355n/a const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2356n/a assert(kind == PyUnicode_4BYTE_KIND);
2357n/a max_char = ucs4lib_find_max_char(u, u + len);
2358n/a if (max_char >= 0x10000)
2359n/a return;
2360n/a }
2361n/a copy = PyUnicode_New(len, max_char);
2362n/a if (copy != NULL)
2363n/a _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2364n/a Py_DECREF(unicode);
2365n/a *p_unicode = copy;
2366n/a}
2367n/a
2368n/aPyObject*
2369n/a_PyUnicode_Copy(PyObject *unicode)
2370n/a{
2371n/a Py_ssize_t length;
2372n/a PyObject *copy;
2373n/a
2374n/a if (!PyUnicode_Check(unicode)) {
2375n/a PyErr_BadInternalCall();
2376n/a return NULL;
2377n/a }
2378n/a if (PyUnicode_READY(unicode) == -1)
2379n/a return NULL;
2380n/a
2381n/a length = PyUnicode_GET_LENGTH(unicode);
2382n/a copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2383n/a if (!copy)
2384n/a return NULL;
2385n/a assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2386n/a
2387n/a memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2388n/a length * PyUnicode_KIND(unicode));
2389n/a assert(_PyUnicode_CheckConsistency(copy, 1));
2390n/a return copy;
2391n/a}
2392n/a
2393n/a
2394n/a/* Widen Unicode objects to larger buffers. Don't write terminating null
2395n/a character. Return NULL on error. */
2396n/a
2397n/avoid*
2398n/a_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2399n/a{
2400n/a Py_ssize_t len;
2401n/a void *result;
2402n/a unsigned int skind;
2403n/a
2404n/a if (PyUnicode_READY(s) == -1)
2405n/a return NULL;
2406n/a
2407n/a len = PyUnicode_GET_LENGTH(s);
2408n/a skind = PyUnicode_KIND(s);
2409n/a if (skind >= kind) {
2410n/a PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2411n/a return NULL;
2412n/a }
2413n/a switch (kind) {
2414n/a case PyUnicode_2BYTE_KIND:
2415n/a result = PyMem_New(Py_UCS2, len);
2416n/a if (!result)
2417n/a return PyErr_NoMemory();
2418n/a assert(skind == PyUnicode_1BYTE_KIND);
2419n/a _PyUnicode_CONVERT_BYTES(
2420n/a Py_UCS1, Py_UCS2,
2421n/a PyUnicode_1BYTE_DATA(s),
2422n/a PyUnicode_1BYTE_DATA(s) + len,
2423n/a result);
2424n/a return result;
2425n/a case PyUnicode_4BYTE_KIND:
2426n/a result = PyMem_New(Py_UCS4, len);
2427n/a if (!result)
2428n/a return PyErr_NoMemory();
2429n/a if (skind == PyUnicode_2BYTE_KIND) {
2430n/a _PyUnicode_CONVERT_BYTES(
2431n/a Py_UCS2, Py_UCS4,
2432n/a PyUnicode_2BYTE_DATA(s),
2433n/a PyUnicode_2BYTE_DATA(s) + len,
2434n/a result);
2435n/a }
2436n/a else {
2437n/a assert(skind == PyUnicode_1BYTE_KIND);
2438n/a _PyUnicode_CONVERT_BYTES(
2439n/a Py_UCS1, Py_UCS4,
2440n/a PyUnicode_1BYTE_DATA(s),
2441n/a PyUnicode_1BYTE_DATA(s) + len,
2442n/a result);
2443n/a }
2444n/a return result;
2445n/a default:
2446n/a break;
2447n/a }
2448n/a PyErr_SetString(PyExc_SystemError, "invalid kind");
2449n/a return NULL;
2450n/a}
2451n/a
2452n/astatic Py_UCS4*
2453n/aas_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2454n/a int copy_null)
2455n/a{
2456n/a int kind;
2457n/a void *data;
2458n/a Py_ssize_t len, targetlen;
2459n/a if (PyUnicode_READY(string) == -1)
2460n/a return NULL;
2461n/a kind = PyUnicode_KIND(string);
2462n/a data = PyUnicode_DATA(string);
2463n/a len = PyUnicode_GET_LENGTH(string);
2464n/a targetlen = len;
2465n/a if (copy_null)
2466n/a targetlen++;
2467n/a if (!target) {
2468n/a target = PyMem_New(Py_UCS4, targetlen);
2469n/a if (!target) {
2470n/a PyErr_NoMemory();
2471n/a return NULL;
2472n/a }
2473n/a }
2474n/a else {
2475n/a if (targetsize < targetlen) {
2476n/a PyErr_Format(PyExc_SystemError,
2477n/a "string is longer than the buffer");
2478n/a if (copy_null && 0 < targetsize)
2479n/a target[0] = 0;
2480n/a return NULL;
2481n/a }
2482n/a }
2483n/a if (kind == PyUnicode_1BYTE_KIND) {
2484n/a Py_UCS1 *start = (Py_UCS1 *) data;
2485n/a _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2486n/a }
2487n/a else if (kind == PyUnicode_2BYTE_KIND) {
2488n/a Py_UCS2 *start = (Py_UCS2 *) data;
2489n/a _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2490n/a }
2491n/a else {
2492n/a assert(kind == PyUnicode_4BYTE_KIND);
2493n/a memcpy(target, data, len * sizeof(Py_UCS4));
2494n/a }
2495n/a if (copy_null)
2496n/a target[len] = 0;
2497n/a return target;
2498n/a}
2499n/a
2500n/aPy_UCS4*
2501n/aPyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2502n/a int copy_null)
2503n/a{
2504n/a if (target == NULL || targetsize < 0) {
2505n/a PyErr_BadInternalCall();
2506n/a return NULL;
2507n/a }
2508n/a return as_ucs4(string, target, targetsize, copy_null);
2509n/a}
2510n/a
2511n/aPy_UCS4*
2512n/aPyUnicode_AsUCS4Copy(PyObject *string)
2513n/a{
2514n/a return as_ucs4(string, NULL, 0, 1);
2515n/a}
2516n/a
/* Maximum number of characters required for the output of %lld or %p.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1
   for the sign.  53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
2521n/a
2522n/astatic int
2523n/aunicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2524n/a Py_ssize_t width, Py_ssize_t precision)
2525n/a{
2526n/a Py_ssize_t length, fill, arglen;
2527n/a Py_UCS4 maxchar;
2528n/a
2529n/a if (PyUnicode_READY(str) == -1)
2530n/a return -1;
2531n/a
2532n/a length = PyUnicode_GET_LENGTH(str);
2533n/a if ((precision == -1 || precision >= length)
2534n/a && width <= length)
2535n/a return _PyUnicodeWriter_WriteStr(writer, str);
2536n/a
2537n/a if (precision != -1)
2538n/a length = Py_MIN(precision, length);
2539n/a
2540n/a arglen = Py_MAX(length, width);
2541n/a if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2542n/a maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2543n/a else
2544n/a maxchar = writer->maxchar;
2545n/a
2546n/a if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2547n/a return -1;
2548n/a
2549n/a if (width > length) {
2550n/a fill = width - length;
2551n/a if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2552n/a return -1;
2553n/a writer->pos += fill;
2554n/a }
2555n/a
2556n/a _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2557n/a str, 0, length);
2558n/a writer->pos += length;
2559n/a return 0;
2560n/a}
2561n/a
2562n/astatic int
2563n/aunicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2564n/a Py_ssize_t width, Py_ssize_t precision)
2565n/a{
2566n/a /* UTF-8 */
2567n/a Py_ssize_t length;
2568n/a PyObject *unicode;
2569n/a int res;
2570n/a
2571n/a length = strlen(str);
2572n/a if (precision != -1)
2573n/a length = Py_MIN(length, precision);
2574n/a unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2575n/a if (unicode == NULL)
2576n/a return -1;
2577n/a
2578n/a res = unicode_fromformat_write_str(writer, unicode, width, -1);
2579n/a Py_DECREF(unicode);
2580n/a return res;
2581n/a}
2582n/a
2583n/astatic const char*
2584n/aunicode_fromformat_arg(_PyUnicodeWriter *writer,
2585n/a const char *f, va_list *vargs)
2586n/a{
2587n/a const char *p;
2588n/a Py_ssize_t len;
2589n/a int zeropad;
2590n/a Py_ssize_t width;
2591n/a Py_ssize_t precision;
2592n/a int longflag;
2593n/a int longlongflag;
2594n/a int size_tflag;
2595n/a Py_ssize_t fill;
2596n/a
2597n/a p = f;
2598n/a f++;
2599n/a zeropad = 0;
2600n/a if (*f == '0') {
2601n/a zeropad = 1;
2602n/a f++;
2603n/a }
2604n/a
2605n/a /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2606n/a width = -1;
2607n/a if (Py_ISDIGIT((unsigned)*f)) {
2608n/a width = *f - '0';
2609n/a f++;
2610n/a while (Py_ISDIGIT((unsigned)*f)) {
2611n/a if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2612n/a PyErr_SetString(PyExc_ValueError,
2613n/a "width too big");
2614n/a return NULL;
2615n/a }
2616n/a width = (width * 10) + (*f - '0');
2617n/a f++;
2618n/a }
2619n/a }
2620n/a precision = -1;
2621n/a if (*f == '.') {
2622n/a f++;
2623n/a if (Py_ISDIGIT((unsigned)*f)) {
2624n/a precision = (*f - '0');
2625n/a f++;
2626n/a while (Py_ISDIGIT((unsigned)*f)) {
2627n/a if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2628n/a PyErr_SetString(PyExc_ValueError,
2629n/a "precision too big");
2630n/a return NULL;
2631n/a }
2632n/a precision = (precision * 10) + (*f - '0');
2633n/a f++;
2634n/a }
2635n/a }
2636n/a if (*f == '%') {
2637n/a /* "%.3%s" => f points to "3" */
2638n/a f--;
2639n/a }
2640n/a }
2641n/a if (*f == '\0') {
2642n/a /* bogus format "%.123" => go backward, f points to "3" */
2643n/a f--;
2644n/a }
2645n/a
2646n/a /* Handle %ld, %lu, %lld and %llu. */
2647n/a longflag = 0;
2648n/a longlongflag = 0;
2649n/a size_tflag = 0;
2650n/a if (*f == 'l') {
2651n/a if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2652n/a longflag = 1;
2653n/a ++f;
2654n/a }
2655n/a else if (f[1] == 'l' &&
2656n/a (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2657n/a longlongflag = 1;
2658n/a f += 2;
2659n/a }
2660n/a }
2661n/a /* handle the size_t flag. */
2662n/a else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2663n/a size_tflag = 1;
2664n/a ++f;
2665n/a }
2666n/a
2667n/a if (f[1] == '\0')
2668n/a writer->overallocate = 0;
2669n/a
2670n/a switch (*f) {
2671n/a case 'c':
2672n/a {
2673n/a int ordinal = va_arg(*vargs, int);
2674n/a if (ordinal < 0 || ordinal > MAX_UNICODE) {
2675n/a PyErr_SetString(PyExc_OverflowError,
2676n/a "character argument not in range(0x110000)");
2677n/a return NULL;
2678n/a }
2679n/a if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2680n/a return NULL;
2681n/a break;
2682n/a }
2683n/a
2684n/a case 'i':
2685n/a case 'd':
2686n/a case 'u':
2687n/a case 'x':
2688n/a {
2689n/a /* used by sprintf */
2690n/a char buffer[MAX_LONG_LONG_CHARS];
2691n/a Py_ssize_t arglen;
2692n/a
2693n/a if (*f == 'u') {
2694n/a if (longflag)
2695n/a len = sprintf(buffer, "%lu",
2696n/a va_arg(*vargs, unsigned long));
2697n/a else if (longlongflag)
2698n/a len = sprintf(buffer, "%llu",
2699n/a va_arg(*vargs, unsigned long long));
2700n/a else if (size_tflag)
2701n/a len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
2702n/a va_arg(*vargs, size_t));
2703n/a else
2704n/a len = sprintf(buffer, "%u",
2705n/a va_arg(*vargs, unsigned int));
2706n/a }
2707n/a else if (*f == 'x') {
2708n/a len = sprintf(buffer, "%x", va_arg(*vargs, int));
2709n/a }
2710n/a else {
2711n/a if (longflag)
2712n/a len = sprintf(buffer, "%li",
2713n/a va_arg(*vargs, long));
2714n/a else if (longlongflag)
2715n/a len = sprintf(buffer, "%lli",
2716n/a va_arg(*vargs, long long));
2717n/a else if (size_tflag)
2718n/a len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
2719n/a va_arg(*vargs, Py_ssize_t));
2720n/a else
2721n/a len = sprintf(buffer, "%i",
2722n/a va_arg(*vargs, int));
2723n/a }
2724n/a assert(len >= 0);
2725n/a
2726n/a if (precision < len)
2727n/a precision = len;
2728n/a
2729n/a arglen = Py_MAX(precision, width);
2730n/a if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2731n/a return NULL;
2732n/a
2733n/a if (width > precision) {
2734n/a Py_UCS4 fillchar;
2735n/a fill = width - precision;
2736n/a fillchar = zeropad?'0':' ';
2737n/a if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2738n/a return NULL;
2739n/a writer->pos += fill;
2740n/a }
2741n/a if (precision > len) {
2742n/a fill = precision - len;
2743n/a if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2744n/a return NULL;
2745n/a writer->pos += fill;
2746n/a }
2747n/a
2748n/a if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2749n/a return NULL;
2750n/a break;
2751n/a }
2752n/a
2753n/a case 'p':
2754n/a {
2755n/a char number[MAX_LONG_LONG_CHARS];
2756n/a
2757n/a len = sprintf(number, "%p", va_arg(*vargs, void*));
2758n/a assert(len >= 0);
2759n/a
2760n/a /* %p is ill-defined: ensure leading 0x. */
2761n/a if (number[1] == 'X')
2762n/a number[1] = 'x';
2763n/a else if (number[1] != 'x') {
2764n/a memmove(number + 2, number,
2765n/a strlen(number) + 1);
2766n/a number[0] = '0';
2767n/a number[1] = 'x';
2768n/a len += 2;
2769n/a }
2770n/a
2771n/a if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2772n/a return NULL;
2773n/a break;
2774n/a }
2775n/a
2776n/a case 's':
2777n/a {
2778n/a /* UTF-8 */
2779n/a const char *s = va_arg(*vargs, const char*);
2780n/a if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
2781n/a return NULL;
2782n/a break;
2783n/a }
2784n/a
2785n/a case 'U':
2786n/a {
2787n/a PyObject *obj = va_arg(*vargs, PyObject *);
2788n/a assert(obj && _PyUnicode_CHECK(obj));
2789n/a
2790n/a if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2791n/a return NULL;
2792n/a break;
2793n/a }
2794n/a
2795n/a case 'V':
2796n/a {
2797n/a PyObject *obj = va_arg(*vargs, PyObject *);
2798n/a const char *str = va_arg(*vargs, const char *);
2799n/a if (obj) {
2800n/a assert(_PyUnicode_CHECK(obj));
2801n/a if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2802n/a return NULL;
2803n/a }
2804n/a else {
2805n/a assert(str != NULL);
2806n/a if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
2807n/a return NULL;
2808n/a }
2809n/a break;
2810n/a }
2811n/a
2812n/a case 'S':
2813n/a {
2814n/a PyObject *obj = va_arg(*vargs, PyObject *);
2815n/a PyObject *str;
2816n/a assert(obj);
2817n/a str = PyObject_Str(obj);
2818n/a if (!str)
2819n/a return NULL;
2820n/a if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
2821n/a Py_DECREF(str);
2822n/a return NULL;
2823n/a }
2824n/a Py_DECREF(str);
2825n/a break;
2826n/a }
2827n/a
2828n/a case 'R':
2829n/a {
2830n/a PyObject *obj = va_arg(*vargs, PyObject *);
2831n/a PyObject *repr;
2832n/a assert(obj);
2833n/a repr = PyObject_Repr(obj);
2834n/a if (!repr)
2835n/a return NULL;
2836n/a if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
2837n/a Py_DECREF(repr);
2838n/a return NULL;
2839n/a }
2840n/a Py_DECREF(repr);
2841n/a break;
2842n/a }
2843n/a
2844n/a case 'A':
2845n/a {
2846n/a PyObject *obj = va_arg(*vargs, PyObject *);
2847n/a PyObject *ascii;
2848n/a assert(obj);
2849n/a ascii = PyObject_ASCII(obj);
2850n/a if (!ascii)
2851n/a return NULL;
2852n/a if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
2853n/a Py_DECREF(ascii);
2854n/a return NULL;
2855n/a }
2856n/a Py_DECREF(ascii);
2857n/a break;
2858n/a }
2859n/a
2860n/a case '%':
2861n/a if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2862n/a return NULL;
2863n/a break;
2864n/a
2865n/a default:
2866n/a /* if we stumble upon an unknown formatting code, copy the rest
2867n/a of the format string to the output string. (we cannot just
2868n/a skip the code, since there's no way to know what's in the
2869n/a argument list) */
2870n/a len = strlen(p);
2871n/a if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
2872n/a return NULL;
2873n/a f = p+len;
2874n/a return f;
2875n/a }
2876n/a
2877n/a f++;
2878n/a return f;
2879n/a}
2880n/a
2881n/aPyObject *
2882n/aPyUnicode_FromFormatV(const char *format, va_list vargs)
2883n/a{
2884n/a va_list vargs2;
2885n/a const char *f;
2886n/a _PyUnicodeWriter writer;
2887n/a
2888n/a _PyUnicodeWriter_Init(&writer);
2889n/a writer.min_length = strlen(format) + 100;
2890n/a writer.overallocate = 1;
2891n/a
2892n/a // Copy varags to be able to pass a reference to a subfunction.
2893n/a va_copy(vargs2, vargs);
2894n/a
2895n/a for (f = format; *f; ) {
2896n/a if (*f == '%') {
2897n/a f = unicode_fromformat_arg(&writer, f, &vargs2);
2898n/a if (f == NULL)
2899n/a goto fail;
2900n/a }
2901n/a else {
2902n/a const char *p;
2903n/a Py_ssize_t len;
2904n/a
2905n/a p = f;
2906n/a do
2907n/a {
2908n/a if ((unsigned char)*p > 127) {
2909n/a PyErr_Format(PyExc_ValueError,
2910n/a "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2911n/a "string, got a non-ASCII byte: 0x%02x",
2912n/a (unsigned char)*p);
2913n/a goto fail;
2914n/a }
2915n/a p++;
2916n/a }
2917n/a while (*p != '\0' && *p != '%');
2918n/a len = p - f;
2919n/a
2920n/a if (*p == '\0')
2921n/a writer.overallocate = 0;
2922n/a
2923n/a if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2924n/a goto fail;
2925n/a
2926n/a f = p;
2927n/a }
2928n/a }
2929n/a va_end(vargs2);
2930n/a return _PyUnicodeWriter_Finish(&writer);
2931n/a
2932n/a fail:
2933n/a va_end(vargs2);
2934n/a _PyUnicodeWriter_Dealloc(&writer);
2935n/a return NULL;
2936n/a}
2937n/a
2938n/aPyObject *
2939n/aPyUnicode_FromFormat(const char *format, ...)
2940n/a{
2941n/a PyObject* ret;
2942n/a va_list vargs;
2943n/a
2944n/a#ifdef HAVE_STDARG_PROTOTYPES
2945n/a va_start(vargs, format);
2946n/a#else
2947n/a va_start(vargs);
2948n/a#endif
2949n/a ret = PyUnicode_FromFormatV(format, vargs);
2950n/a va_end(vargs);
2951n/a return ret;
2952n/a}
2953n/a
2954n/a#ifdef HAVE_WCHAR_H
2955n/a
2956n/a/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2957n/a convert a Unicode object to a wide character string.
2958n/a
2959n/a - If w is NULL: return the number of wide characters (including the null
2960n/a character) required to convert the unicode object. Ignore size argument.
2961n/a
2962n/a - Otherwise: return the number of wide characters (excluding the null
2963n/a character) written into w. Write at most size wide characters (including
2964n/a the null character). */
2965n/astatic Py_ssize_t
2966n/aunicode_aswidechar(PyObject *unicode,
2967n/a wchar_t *w,
2968n/a Py_ssize_t size)
2969n/a{
2970n/a Py_ssize_t res;
2971n/a const wchar_t *wstr;
2972n/a
2973n/a wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2974n/a if (wstr == NULL)
2975n/a return -1;
2976n/a
2977n/a if (w != NULL) {
2978n/a if (size > res)
2979n/a size = res + 1;
2980n/a else
2981n/a res = size;
2982n/a memcpy(w, wstr, size * sizeof(wchar_t));
2983n/a return res;
2984n/a }
2985n/a else
2986n/a return res + 1;
2987n/a}
2988n/a
2989n/aPy_ssize_t
2990n/aPyUnicode_AsWideChar(PyObject *unicode,
2991n/a wchar_t *w,
2992n/a Py_ssize_t size)
2993n/a{
2994n/a if (unicode == NULL) {
2995n/a PyErr_BadInternalCall();
2996n/a return -1;
2997n/a }
2998n/a return unicode_aswidechar(unicode, w, size);
2999n/a}
3000n/a
3001n/awchar_t*
3002n/aPyUnicode_AsWideCharString(PyObject *unicode,
3003n/a Py_ssize_t *size)
3004n/a{
3005n/a wchar_t* buffer;
3006n/a Py_ssize_t buflen;
3007n/a
3008n/a if (unicode == NULL) {
3009n/a PyErr_BadInternalCall();
3010n/a return NULL;
3011n/a }
3012n/a
3013n/a buflen = unicode_aswidechar(unicode, NULL, 0);
3014n/a if (buflen == -1)
3015n/a return NULL;
3016n/a buffer = PyMem_NEW(wchar_t, buflen);
3017n/a if (buffer == NULL) {
3018n/a PyErr_NoMemory();
3019n/a return NULL;
3020n/a }
3021n/a buflen = unicode_aswidechar(unicode, buffer, buflen);
3022n/a if (buflen == -1) {
3023n/a PyMem_FREE(buffer);
3024n/a return NULL;
3025n/a }
3026n/a if (size != NULL)
3027n/a *size = buflen;
3028n/a return buffer;
3029n/a}
3030n/a
3031n/a#endif /* HAVE_WCHAR_H */
3032n/a
3033n/aPyObject *
3034n/aPyUnicode_FromOrdinal(int ordinal)
3035n/a{
3036n/a if (ordinal < 0 || ordinal > MAX_UNICODE) {
3037n/a PyErr_SetString(PyExc_ValueError,
3038n/a "chr() arg not in range(0x110000)");
3039n/a return NULL;
3040n/a }
3041n/a
3042n/a return unicode_char((Py_UCS4)ordinal);
3043n/a}
3044n/a
3045n/aPyObject *
3046n/aPyUnicode_FromObject(PyObject *obj)
3047n/a{
3048n/a /* XXX Perhaps we should make this API an alias of
3049n/a PyObject_Str() instead ?! */
3050n/a if (PyUnicode_CheckExact(obj)) {
3051n/a if (PyUnicode_READY(obj) == -1)
3052n/a return NULL;
3053n/a Py_INCREF(obj);
3054n/a return obj;
3055n/a }
3056n/a if (PyUnicode_Check(obj)) {
3057n/a /* For a Unicode subtype that's not a Unicode object,
3058n/a return a true Unicode object with the same data. */
3059n/a return _PyUnicode_Copy(obj);
3060n/a }
3061n/a PyErr_Format(PyExc_TypeError,
3062n/a "Can't convert '%.100s' object to str implicitly",
3063n/a Py_TYPE(obj)->tp_name);
3064n/a return NULL;
3065n/a}
3066n/a
3067n/aPyObject *
3068n/aPyUnicode_FromEncodedObject(PyObject *obj,
3069n/a const char *encoding,
3070n/a const char *errors)
3071n/a{
3072n/a Py_buffer buffer;
3073n/a PyObject *v;
3074n/a
3075n/a if (obj == NULL) {
3076n/a PyErr_BadInternalCall();
3077n/a return NULL;
3078n/a }
3079n/a
3080n/a /* Decoding bytes objects is the most common case and should be fast */
3081n/a if (PyBytes_Check(obj)) {
3082n/a if (PyBytes_GET_SIZE(obj) == 0)
3083n/a _Py_RETURN_UNICODE_EMPTY();
3084n/a v = PyUnicode_Decode(
3085n/a PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3086n/a encoding, errors);
3087n/a return v;
3088n/a }
3089n/a
3090n/a if (PyUnicode_Check(obj)) {
3091n/a PyErr_SetString(PyExc_TypeError,
3092n/a "decoding str is not supported");
3093n/a return NULL;
3094n/a }
3095n/a
3096n/a /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3097n/a if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3098n/a PyErr_Format(PyExc_TypeError,
3099n/a "decoding to str: need a bytes-like object, %.80s found",
3100n/a Py_TYPE(obj)->tp_name);
3101n/a return NULL;
3102n/a }
3103n/a
3104n/a if (buffer.len == 0) {
3105n/a PyBuffer_Release(&buffer);
3106n/a _Py_RETURN_UNICODE_EMPTY();
3107n/a }
3108n/a
3109n/a v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3110n/a PyBuffer_Release(&buffer);
3111n/a return v;
3112n/a}
3113n/a
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied (lowercased) into `lower`; every
   run of other characters between two copied characters becomes a single
   '_', and leading/trailing runs are dropped.

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last slot of the output buffer, reserved for the terminating NUL. */
    char *const dst_last = &lower[lower_len - 1];
    int pending_separator = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the punctuation; emit it lazily as one '_'. */
            pending_separator = 1;
            continue;
        }

        if (pending_separator && dst != lower) {
            if (dst == dst_last) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_separator = 0;

        if (dst == dst_last) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }
    *dst = '\0';
    return 1;
}
3162n/a
3163n/aPyObject *
3164n/aPyUnicode_Decode(const char *s,
3165n/a Py_ssize_t size,
3166n/a const char *encoding,
3167n/a const char *errors)
3168n/a{
3169n/a PyObject *buffer = NULL, *unicode;
3170n/a Py_buffer info;
3171n/a char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3172n/a
3173n/a if (encoding == NULL) {
3174n/a return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3175n/a }
3176n/a
3177n/a /* Shortcuts for common default encodings */
3178n/a if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3179n/a char *lower = buflower;
3180n/a
3181n/a /* Fast paths */
3182n/a if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3183n/a lower += 3;
3184n/a if (*lower == '_') {
3185n/a /* Match "utf8" and "utf_8" */
3186n/a lower++;
3187n/a }
3188n/a
3189n/a if (lower[0] == '8' && lower[1] == 0) {
3190n/a return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3191n/a }
3192n/a else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3193n/a return PyUnicode_DecodeUTF16(s, size, errors, 0);
3194n/a }
3195n/a else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3196n/a return PyUnicode_DecodeUTF32(s, size, errors, 0);
3197n/a }
3198n/a }
3199n/a else {
3200n/a if (strcmp(lower, "ascii") == 0
3201n/a || strcmp(lower, "us_ascii") == 0) {
3202n/a return PyUnicode_DecodeASCII(s, size, errors);
3203n/a }
3204n/a #ifdef MS_WINDOWS
3205n/a else if (strcmp(lower, "mbcs") == 0) {
3206n/a return PyUnicode_DecodeMBCS(s, size, errors);
3207n/a }
3208n/a #endif
3209n/a else if (strcmp(lower, "latin1") == 0
3210n/a || strcmp(lower, "latin_1") == 0
3211n/a || strcmp(lower, "iso_8859_1") == 0
3212n/a || strcmp(lower, "iso8859_1") == 0) {
3213n/a return PyUnicode_DecodeLatin1(s, size, errors);
3214n/a }
3215n/a }
3216n/a }
3217n/a
3218n/a /* Decode via the codec registry */
3219n/a buffer = NULL;
3220n/a if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3221n/a goto onError;
3222n/a buffer = PyMemoryView_FromBuffer(&info);
3223n/a if (buffer == NULL)
3224n/a goto onError;
3225n/a unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3226n/a if (unicode == NULL)
3227n/a goto onError;
3228n/a if (!PyUnicode_Check(unicode)) {
3229n/a PyErr_Format(PyExc_TypeError,
3230n/a "'%.400s' decoder returned '%.400s' instead of 'str'; "
3231n/a "use codecs.decode() to decode to arbitrary types",
3232n/a encoding,
3233n/a Py_TYPE(unicode)->tp_name);
3234n/a Py_DECREF(unicode);
3235n/a goto onError;
3236n/a }
3237n/a Py_DECREF(buffer);
3238n/a return unicode_result(unicode);
3239n/a
3240n/a onError:
3241n/a Py_XDECREF(buffer);
3242n/a return NULL;
3243n/a}
3244n/a
3245n/aPyObject *
3246n/aPyUnicode_AsDecodedObject(PyObject *unicode,
3247n/a const char *encoding,
3248n/a const char *errors)
3249n/a{
3250n/a if (!PyUnicode_Check(unicode)) {
3251n/a PyErr_BadArgument();
3252n/a return NULL;
3253n/a }
3254n/a
3255n/a if (PyErr_WarnEx(PyExc_DeprecationWarning,
3256n/a "PyUnicode_AsDecodedObject() is deprecated; "
3257n/a "use PyCodec_Decode() to decode from str", 1) < 0)
3258n/a return NULL;
3259n/a
3260n/a if (encoding == NULL)
3261n/a encoding = PyUnicode_GetDefaultEncoding();
3262n/a
3263n/a /* Decode via the codec registry */
3264n/a return PyCodec_Decode(unicode, encoding, errors);
3265n/a}
3266n/a
3267n/aPyObject *
3268n/aPyUnicode_AsDecodedUnicode(PyObject *unicode,
3269n/a const char *encoding,
3270n/a const char *errors)
3271n/a{
3272n/a PyObject *v;
3273n/a
3274n/a if (!PyUnicode_Check(unicode)) {
3275n/a PyErr_BadArgument();
3276n/a goto onError;
3277n/a }
3278n/a
3279n/a if (PyErr_WarnEx(PyExc_DeprecationWarning,
3280n/a "PyUnicode_AsDecodedUnicode() is deprecated; "
3281n/a "use PyCodec_Decode() to decode from str to str", 1) < 0)
3282n/a return NULL;
3283n/a
3284n/a if (encoding == NULL)
3285n/a encoding = PyUnicode_GetDefaultEncoding();
3286n/a
3287n/a /* Decode via the codec registry */
3288n/a v = PyCodec_Decode(unicode, encoding, errors);
3289n/a if (v == NULL)
3290n/a goto onError;
3291n/a if (!PyUnicode_Check(v)) {
3292n/a PyErr_Format(PyExc_TypeError,
3293n/a "'%.400s' decoder returned '%.400s' instead of 'str'; "
3294n/a "use codecs.decode() to decode to arbitrary types",
3295n/a encoding,
3296n/a Py_TYPE(unicode)->tp_name);
3297n/a Py_DECREF(v);
3298n/a goto onError;
3299n/a }
3300n/a return unicode_result(v);
3301n/a
3302n/a onError:
3303n/a return NULL;
3304n/a}
3305n/a
3306n/aPyObject *
3307n/aPyUnicode_Encode(const Py_UNICODE *s,
3308n/a Py_ssize_t size,
3309n/a const char *encoding,
3310n/a const char *errors)
3311n/a{
3312n/a PyObject *v, *unicode;
3313n/a
3314n/a unicode = PyUnicode_FromWideChar(s, size);
3315n/a if (unicode == NULL)
3316n/a return NULL;
3317n/a v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3318n/a Py_DECREF(unicode);
3319n/a return v;
3320n/a}
3321n/a
3322n/aPyObject *
3323n/aPyUnicode_AsEncodedObject(PyObject *unicode,
3324n/a const char *encoding,
3325n/a const char *errors)
3326n/a{
3327n/a PyObject *v;
3328n/a
3329n/a if (!PyUnicode_Check(unicode)) {
3330n/a PyErr_BadArgument();
3331n/a goto onError;
3332n/a }
3333n/a
3334n/a if (PyErr_WarnEx(PyExc_DeprecationWarning,
3335n/a "PyUnicode_AsEncodedObject() is deprecated; "
3336n/a "use PyUnicode_AsEncodedString() to encode from str to bytes "
3337n/a "or PyCodec_Encode() for generic encoding", 1) < 0)
3338n/a return NULL;
3339n/a
3340n/a if (encoding == NULL)
3341n/a encoding = PyUnicode_GetDefaultEncoding();
3342n/a
3343n/a /* Encode via the codec registry */
3344n/a v = PyCodec_Encode(unicode, encoding, errors);
3345n/a if (v == NULL)
3346n/a goto onError;
3347n/a return v;
3348n/a
3349n/a onError:
3350n/a return NULL;
3351n/a}
3352n/a
/* Locate the first wide character of `wstr` that wcstombs() cannot convert
   in the current locale, by converting one character (or, with a 2-byte
   wchar_t, one surrogate pair) at a time.
   Returns its offset in wchar_t units, or 0 if every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];            /* surrogate pair + terminator */
#else
    wchar_t unit[2];            /* one character + terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *const begin = wstr;

#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif
    while (*wstr != L'\0') {
        const wchar_t *const current = wstr;
        size_t converted;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0])
            && Py_UNICODE_IS_LOW_SURROGATE(wstr[1]))
        {
            unit[0] = wstr[0];
            unit[1] = wstr[1];
            wstr += 2;
        }
        else {
            unit[0] = *wstr;
            unit[1] = 0;
            wstr++;
        }
#else
        unit[0] = *wstr;
        wstr++;
#endif
        converted = wcstombs(scratch, unit, sizeof(scratch));
        if (converted == (size_t)-1) {
            return current - begin;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3399n/a
3400n/astatic int
3401n/alocale_error_handler(const char *errors, int *surrogateescape)
3402n/a{
3403n/a _Py_error_handler error_handler = get_error_handler(errors);
3404n/a switch (error_handler)
3405n/a {
3406n/a case _Py_ERROR_STRICT:
3407n/a *surrogateescape = 0;
3408n/a return 0;
3409n/a case _Py_ERROR_SURROGATEESCAPE:
3410n/a *surrogateescape = 1;
3411n/a return 0;
3412n/a default:
3413n/a PyErr_Format(PyExc_ValueError,
3414n/a "only 'strict' and 'surrogateescape' error handlers "
3415n/a "are supported, not '%s'",
3416n/a errors);
3417n/a return -1;
3418n/a }
3419n/a}
3420n/a
3421n/aPyObject *
3422n/aPyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3423n/a{
3424n/a Py_ssize_t wlen, wlen2;
3425n/a wchar_t *wstr;
3426n/a char *errmsg;
3427n/a PyObject *bytes, *reason, *exc;
3428n/a size_t error_pos, errlen;
3429n/a int surrogateescape;
3430n/a
3431n/a if (locale_error_handler(errors, &surrogateescape) < 0)
3432n/a return NULL;
3433n/a
3434n/a wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3435n/a if (wstr == NULL)
3436n/a return NULL;
3437n/a
3438n/a wlen2 = wcslen(wstr);
3439n/a if (wlen2 != wlen) {
3440n/a PyMem_Free(wstr);
3441n/a PyErr_SetString(PyExc_ValueError, "embedded null character");
3442n/a return NULL;
3443n/a }
3444n/a
3445n/a if (surrogateescape) {
3446n/a /* "surrogateescape" error handler */
3447n/a char *str;
3448n/a
3449n/a str = Py_EncodeLocale(wstr, &error_pos);
3450n/a if (str == NULL) {
3451n/a if (error_pos == (size_t)-1) {
3452n/a PyErr_NoMemory();
3453n/a PyMem_Free(wstr);
3454n/a return NULL;
3455n/a }
3456n/a else {
3457n/a goto encode_error;
3458n/a }
3459n/a }
3460n/a PyMem_Free(wstr);
3461n/a
3462n/a bytes = PyBytes_FromString(str);
3463n/a PyMem_Free(str);
3464n/a }
3465n/a else {
3466n/a /* strict mode */
3467n/a size_t len, len2;
3468n/a
3469n/a len = wcstombs(NULL, wstr, 0);
3470n/a if (len == (size_t)-1) {
3471n/a error_pos = (size_t)-1;
3472n/a goto encode_error;
3473n/a }
3474n/a
3475n/a bytes = PyBytes_FromStringAndSize(NULL, len);
3476n/a if (bytes == NULL) {
3477n/a PyMem_Free(wstr);
3478n/a return NULL;
3479n/a }
3480n/a
3481n/a len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3482n/a if (len2 == (size_t)-1 || len2 > len) {
3483n/a Py_DECREF(bytes);
3484n/a error_pos = (size_t)-1;
3485n/a goto encode_error;
3486n/a }
3487n/a PyMem_Free(wstr);
3488n/a }
3489n/a return bytes;
3490n/a
3491n/aencode_error:
3492n/a errmsg = strerror(errno);
3493n/a assert(errmsg != NULL);
3494n/a
3495n/a if (error_pos == (size_t)-1)
3496n/a error_pos = wcstombs_errorpos(wstr);
3497n/a
3498n/a PyMem_Free(wstr);
3499n/a
3500n/a wstr = Py_DecodeLocale(errmsg, &errlen);
3501n/a if (wstr != NULL) {
3502n/a reason = PyUnicode_FromWideChar(wstr, errlen);
3503n/a PyMem_RawFree(wstr);
3504n/a } else {
3505n/a errmsg = NULL;
3506n/a }
3507n/a
3508n/a if (errmsg == NULL)
3509n/a reason = PyUnicode_FromString(
3510n/a "wcstombs() encountered an unencodable "
3511n/a "wide character");
3512n/a if (reason == NULL)
3513n/a return NULL;
3514n/a
3515n/a exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3516n/a "locale", unicode,
3517n/a (Py_ssize_t)error_pos,
3518n/a (Py_ssize_t)(error_pos+1),
3519n/a reason);
3520n/a Py_DECREF(reason);
3521n/a if (exc != NULL) {
3522n/a PyCodec_StrictErrors(exc);
3523n/a Py_DECREF(exc);
3524n/a }
3525n/a return NULL;
3526n/a}
3527n/a
3528n/aPyObject *
3529n/aPyUnicode_EncodeFSDefault(PyObject *unicode)
3530n/a{
3531n/a#if defined(__APPLE__)
3532n/a return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
3533n/a#else
3534n/a PyInterpreterState *interp = PyThreadState_GET()->interp;
3535n/a /* Bootstrap check: if the filesystem codec is implemented in Python, we
3536n/a cannot use it to encode and decode filenames before it is loaded. Load
3537n/a the Python codec requires to encode at least its own filename. Use the C
3538n/a version of the locale codec until the codec registry is initialized and
3539n/a the Python codec is loaded.
3540n/a
3541n/a Py_FileSystemDefaultEncoding is shared between all interpreters, we
3542n/a cannot only rely on it: check also interp->fscodec_initialized for
3543n/a subinterpreters. */
3544n/a if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3545n/a return PyUnicode_AsEncodedString(unicode,
3546n/a Py_FileSystemDefaultEncoding,
3547n/a Py_FileSystemDefaultEncodeErrors);
3548n/a }
3549n/a else {
3550n/a return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
3551n/a }
3552n/a#endif
3553n/a}
3554n/a
3555n/aPyObject *
3556n/aPyUnicode_AsEncodedString(PyObject *unicode,
3557n/a const char *encoding,
3558n/a const char *errors)
3559n/a{
3560n/a PyObject *v;
3561n/a char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
3562n/a
3563n/a if (!PyUnicode_Check(unicode)) {
3564n/a PyErr_BadArgument();
3565n/a return NULL;
3566n/a }
3567n/a
3568n/a if (encoding == NULL) {
3569n/a return _PyUnicode_AsUTF8String(unicode, errors);
3570n/a }
3571n/a
3572n/a /* Shortcuts for common default encodings */
3573n/a if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3574n/a char *lower = buflower;
3575n/a
3576n/a /* Fast paths */
3577n/a if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3578n/a lower += 3;
3579n/a if (*lower == '_') {
3580n/a /* Match "utf8" and "utf_8" */
3581n/a lower++;
3582n/a }
3583n/a
3584n/a if (lower[0] == '8' && lower[1] == 0) {
3585n/a return _PyUnicode_AsUTF8String(unicode, errors);
3586n/a }
3587n/a else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3588n/a return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3589n/a }
3590n/a else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3591n/a return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3592n/a }
3593n/a }
3594n/a else {
3595n/a if (strcmp(lower, "ascii") == 0
3596n/a || strcmp(lower, "us_ascii") == 0) {
3597n/a return _PyUnicode_AsASCIIString(unicode, errors);
3598n/a }
3599n/a#ifdef MS_WINDOWS
3600n/a else if (strcmp(lower, "mbcs") == 0) {
3601n/a return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3602n/a }
3603n/a#endif
3604n/a else if (strcmp(lower, "latin1") == 0 ||
3605n/a strcmp(lower, "latin_1") == 0 ||
3606n/a strcmp(lower, "iso_8859_1") == 0 ||
3607n/a strcmp(lower, "iso8859_1") == 0) {
3608n/a return _PyUnicode_AsLatin1String(unicode, errors);
3609n/a }
3610n/a }
3611n/a }
3612n/a
3613n/a /* Encode via the codec registry */
3614n/a v = _PyCodec_EncodeText(unicode, encoding, errors);
3615n/a if (v == NULL)
3616n/a return NULL;
3617n/a
3618n/a /* The normal path */
3619n/a if (PyBytes_Check(v))
3620n/a return v;
3621n/a
3622n/a /* If the codec returns a buffer, raise a warning and convert to bytes */
3623n/a if (PyByteArray_Check(v)) {
3624n/a int error;
3625n/a PyObject *b;
3626n/a
3627n/a error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3628n/a "encoder %s returned bytearray instead of bytes; "
3629n/a "use codecs.encode() to encode to arbitrary types",
3630n/a encoding);
3631n/a if (error) {
3632n/a Py_DECREF(v);
3633n/a return NULL;
3634n/a }
3635n/a
3636n/a b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3637n/a Py_DECREF(v);
3638n/a return b;
3639n/a }
3640n/a
3641n/a PyErr_Format(PyExc_TypeError,
3642n/a "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3643n/a "use codecs.encode() to encode to arbitrary types",
3644n/a encoding,
3645n/a Py_TYPE(v)->tp_name);
3646n/a Py_DECREF(v);
3647n/a return NULL;
3648n/a}
3649n/a
3650n/aPyObject *
3651n/aPyUnicode_AsEncodedUnicode(PyObject *unicode,
3652n/a const char *encoding,
3653n/a const char *errors)
3654n/a{
3655n/a PyObject *v;
3656n/a
3657n/a if (!PyUnicode_Check(unicode)) {
3658n/a PyErr_BadArgument();
3659n/a goto onError;
3660n/a }
3661n/a
3662n/a if (PyErr_WarnEx(PyExc_DeprecationWarning,
3663n/a "PyUnicode_AsEncodedUnicode() is deprecated; "
3664n/a "use PyCodec_Encode() to encode from str to str", 1) < 0)
3665n/a return NULL;
3666n/a
3667n/a if (encoding == NULL)
3668n/a encoding = PyUnicode_GetDefaultEncoding();
3669n/a
3670n/a /* Encode via the codec registry */
3671n/a v = PyCodec_Encode(unicode, encoding, errors);
3672n/a if (v == NULL)
3673n/a goto onError;
3674n/a if (!PyUnicode_Check(v)) {
3675n/a PyErr_Format(PyExc_TypeError,
3676n/a "'%.400s' encoder returned '%.400s' instead of 'str'; "
3677n/a "use codecs.encode() to encode to arbitrary types",
3678n/a encoding,
3679n/a Py_TYPE(v)->tp_name);
3680n/a Py_DECREF(v);
3681n/a goto onError;
3682n/a }
3683n/a return v;
3684n/a
3685n/a onError:
3686n/a return NULL;
3687n/a}
3688n/a
/* Locate the first byte sequence in the `len` bytes at `str` that mbrtowc()
   rejects (invalid or incomplete) in the current locale.
   Returns its byte offset, or 0 when nothing is found or when mbrtowc() is
   not available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (len != 0) {
        size_t consumed = mbrtowc(&wc, cursor, len, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        len -= consumed;
    }
    /* failed to find the undecodable byte sequence */
#endif
    return 0;
}
3719n/a
3720n/aPyObject*
3721n/aPyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3722n/a const char *errors)
3723n/a{
3724n/a wchar_t smallbuf[256];
3725n/a size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3726n/a wchar_t *wstr;
3727n/a size_t wlen, wlen2;
3728n/a PyObject *unicode;
3729n/a int surrogateescape;
3730n/a size_t error_pos, errlen;
3731n/a char *errmsg;
3732n/a PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */
3733n/a
3734n/a if (locale_error_handler(errors, &surrogateescape) < 0)
3735n/a return NULL;
3736n/a
3737n/a if (str[len] != '\0' || (size_t)len != strlen(str)) {
3738n/a PyErr_SetString(PyExc_ValueError, "embedded null byte");
3739n/a return NULL;
3740n/a }
3741n/a
3742n/a if (surrogateescape) {
3743n/a /* "surrogateescape" error handler */
3744n/a wstr = Py_DecodeLocale(str, &wlen);
3745n/a if (wstr == NULL) {
3746n/a if (wlen == (size_t)-1)
3747n/a PyErr_NoMemory();
3748n/a else
3749n/a PyErr_SetFromErrno(PyExc_OSError);
3750n/a return NULL;
3751n/a }
3752n/a
3753n/a unicode = PyUnicode_FromWideChar(wstr, wlen);
3754n/a PyMem_RawFree(wstr);
3755n/a }
3756n/a else {
3757n/a /* strict mode */
3758n/a#ifndef HAVE_BROKEN_MBSTOWCS
3759n/a wlen = mbstowcs(NULL, str, 0);
3760n/a#else
3761n/a wlen = len;
3762n/a#endif
3763n/a if (wlen == (size_t)-1)
3764n/a goto decode_error;
3765n/a if (wlen+1 <= smallbuf_len) {
3766n/a wstr = smallbuf;
3767n/a }
3768n/a else {
3769n/a wstr = PyMem_New(wchar_t, wlen+1);
3770n/a if (!wstr)
3771n/a return PyErr_NoMemory();
3772n/a }
3773n/a
3774n/a wlen2 = mbstowcs(wstr, str, wlen+1);
3775n/a if (wlen2 == (size_t)-1) {
3776n/a if (wstr != smallbuf)
3777n/a PyMem_Free(wstr);
3778n/a goto decode_error;
3779n/a }
3780n/a#ifdef HAVE_BROKEN_MBSTOWCS
3781n/a assert(wlen2 == wlen);
3782n/a#endif
3783n/a unicode = PyUnicode_FromWideChar(wstr, wlen2);
3784n/a if (wstr != smallbuf)
3785n/a PyMem_Free(wstr);
3786n/a }
3787n/a return unicode;
3788n/a
3789n/adecode_error:
3790n/a errmsg = strerror(errno);
3791n/a assert(errmsg != NULL);
3792n/a
3793n/a error_pos = mbstowcs_errorpos(str, len);
3794n/a wstr = Py_DecodeLocale(errmsg, &errlen);
3795n/a if (wstr != NULL) {
3796n/a reason = PyUnicode_FromWideChar(wstr, errlen);
3797n/a PyMem_RawFree(wstr);
3798n/a }
3799n/a
3800n/a if (reason == NULL)
3801n/a reason = PyUnicode_FromString(
3802n/a "mbstowcs() encountered an invalid multibyte sequence");
3803n/a if (reason == NULL)
3804n/a return NULL;
3805n/a
3806n/a exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3807n/a "locale", str, len,
3808n/a (Py_ssize_t)error_pos,
3809n/a (Py_ssize_t)(error_pos+1),
3810n/a reason);
3811n/a Py_DECREF(reason);
3812n/a if (exc != NULL) {
3813n/a PyCodec_StrictErrors(exc);
3814n/a Py_DECREF(exc);
3815n/a }
3816n/a return NULL;
3817n/a}
3818n/a
3819n/aPyObject*
3820n/aPyUnicode_DecodeLocale(const char *str, const char *errors)
3821n/a{
3822n/a Py_ssize_t size = (Py_ssize_t)strlen(str);
3823n/a return PyUnicode_DecodeLocaleAndSize(str, size, errors);
3824n/a}
3825n/a
3826n/a
3827n/aPyObject*
3828n/aPyUnicode_DecodeFSDefault(const char *s) {
3829n/a Py_ssize_t size = (Py_ssize_t)strlen(s);
3830n/a return PyUnicode_DecodeFSDefaultAndSize(s, size);
3831n/a}
3832n/a
3833n/aPyObject*
3834n/aPyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3835n/a{
3836n/a#if defined(__APPLE__)
3837n/a return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
3838n/a#else
3839n/a PyInterpreterState *interp = PyThreadState_GET()->interp;
3840n/a /* Bootstrap check: if the filesystem codec is implemented in Python, we
3841n/a cannot use it to encode and decode filenames before it is loaded. Load
3842n/a the Python codec requires to encode at least its own filename. Use the C
3843n/a version of the locale codec until the codec registry is initialized and
3844n/a the Python codec is loaded.
3845n/a
3846n/a Py_FileSystemDefaultEncoding is shared between all interpreters, we
3847n/a cannot only rely on it: check also interp->fscodec_initialized for
3848n/a subinterpreters. */
3849n/a if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3850n/a return PyUnicode_Decode(s, size,
3851n/a Py_FileSystemDefaultEncoding,
3852n/a Py_FileSystemDefaultEncodeErrors);
3853n/a }
3854n/a else {
3855n/a return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
3856n/a }
3857n/a#endif
3858n/a}
3859n/a
3860n/a
3861n/aint
3862n/aPyUnicode_FSConverter(PyObject* arg, void* addr)
3863n/a{
3864n/a PyObject *path = NULL;
3865n/a PyObject *output = NULL;
3866n/a Py_ssize_t size;
3867n/a void *data;
3868n/a if (arg == NULL) {
3869n/a Py_DECREF(*(PyObject**)addr);
3870n/a *(PyObject**)addr = NULL;
3871n/a return 1;
3872n/a }
3873n/a path = PyOS_FSPath(arg);
3874n/a if (path == NULL) {
3875n/a return 0;
3876n/a }
3877n/a if (PyBytes_Check(path)) {
3878n/a output = path;
3879n/a }
3880n/a else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3881n/a output = PyUnicode_EncodeFSDefault(path);
3882n/a Py_DECREF(path);
3883n/a if (!output) {
3884n/a return 0;
3885n/a }
3886n/a assert(PyBytes_Check(output));
3887n/a }
3888n/a
3889n/a size = PyBytes_GET_SIZE(output);
3890n/a data = PyBytes_AS_STRING(output);
3891n/a if ((size_t)size != strlen(data)) {
3892n/a PyErr_SetString(PyExc_ValueError, "embedded null byte");
3893n/a Py_DECREF(output);
3894n/a return 0;
3895n/a }
3896n/a *(PyObject**)addr = output;
3897n/a return Py_CLEANUP_SUPPORTED;
3898n/a}
3899n/a
3900n/a
3901n/aint
3902n/aPyUnicode_FSDecoder(PyObject* arg, void* addr)
3903n/a{
3904n/a int is_buffer = 0;
3905n/a PyObject *path = NULL;
3906n/a PyObject *output = NULL;
3907n/a if (arg == NULL) {
3908n/a Py_DECREF(*(PyObject**)addr);
3909n/a return 1;
3910n/a }
3911n/a
3912n/a is_buffer = PyObject_CheckBuffer(arg);
3913n/a if (!is_buffer) {
3914n/a path = PyOS_FSPath(arg);
3915n/a if (path == NULL) {
3916n/a return 0;
3917n/a }
3918n/a }
3919n/a else {
3920n/a path = arg;
3921n/a Py_INCREF(arg);
3922n/a }
3923n/a
3924n/a if (PyUnicode_Check(path)) {
3925n/a if (PyUnicode_READY(path) == -1) {
3926n/a Py_DECREF(path);
3927n/a return 0;
3928n/a }
3929n/a output = path;
3930n/a }
3931n/a else if (PyBytes_Check(path) || is_buffer) {
3932n/a PyObject *path_bytes = NULL;
3933n/a
3934n/a if (!PyBytes_Check(path) &&
3935n/a PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3936n/a "path should be string, bytes, or os.PathLike, not %.200s",
3937n/a Py_TYPE(arg)->tp_name)) {
3938n/a Py_DECREF(path);
3939n/a return 0;
3940n/a }
3941n/a path_bytes = PyBytes_FromObject(path);
3942n/a Py_DECREF(path);
3943n/a if (!path_bytes) {
3944n/a return 0;
3945n/a }
3946n/a output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3947n/a PyBytes_GET_SIZE(path_bytes));
3948n/a Py_DECREF(path_bytes);
3949n/a if (!output) {
3950n/a return 0;
3951n/a }
3952n/a }
3953n/a else {
3954n/a PyErr_Format(PyExc_TypeError,
3955n/a "path should be string, bytes, or os.PathLike, not %.200s",
3956n/a Py_TYPE(arg)->tp_name);
3957n/a Py_DECREF(path);
3958n/a return 0;
3959n/a }
3960n/a if (PyUnicode_READY(output) == -1) {
3961n/a Py_DECREF(output);
3962n/a return 0;
3963n/a }
3964n/a if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3965n/a PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3966n/a PyErr_SetString(PyExc_ValueError, "embedded null character");
3967n/a Py_DECREF(output);
3968n/a return 0;
3969n/a }
3970n/a *(PyObject**)addr = output;
3971n/a return Py_CLEANUP_SUPPORTED;
3972n/a}
3973n/a
3974n/a
3975n/aconst char *
3976n/aPyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3977n/a{
3978n/a PyObject *bytes;
3979n/a
3980n/a if (!PyUnicode_Check(unicode)) {
3981n/a PyErr_BadArgument();
3982n/a return NULL;
3983n/a }
3984n/a if (PyUnicode_READY(unicode) == -1)
3985n/a return NULL;
3986n/a
3987n/a if (PyUnicode_UTF8(unicode) == NULL) {
3988n/a assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3989n/a bytes = _PyUnicode_AsUTF8String(unicode, NULL);
3990n/a if (bytes == NULL)
3991n/a return NULL;
3992n/a _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3993n/a if (_PyUnicode_UTF8(unicode) == NULL) {
3994n/a PyErr_NoMemory();
3995n/a Py_DECREF(bytes);
3996n/a return NULL;
3997n/a }
3998n/a _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3999n/a memcpy(_PyUnicode_UTF8(unicode),
4000n/a PyBytes_AS_STRING(bytes),
4001n/a _PyUnicode_UTF8_LENGTH(unicode) + 1);
4002n/a Py_DECREF(bytes);
4003n/a }
4004n/a
4005n/a if (psize)
4006n/a *psize = PyUnicode_UTF8_LENGTH(unicode);
4007n/a return PyUnicode_UTF8(unicode);
4008n/a}
4009n/a
4010n/aconst char *
4011n/aPyUnicode_AsUTF8(PyObject *unicode)
4012n/a{
4013n/a return PyUnicode_AsUTF8AndSize(unicode, NULL);
4014n/a}
4015n/a
4016n/aPy_UNICODE *
4017n/aPyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4018n/a{
4019n/a const unsigned char *one_byte;
4020n/a#if SIZEOF_WCHAR_T == 4
4021n/a const Py_UCS2 *two_bytes;
4022n/a#else
4023n/a const Py_UCS4 *four_bytes;
4024n/a const Py_UCS4 *ucs4_end;
4025n/a Py_ssize_t num_surrogates;
4026n/a#endif
4027n/a wchar_t *w;
4028n/a wchar_t *wchar_end;
4029n/a
4030n/a if (!PyUnicode_Check(unicode)) {
4031n/a PyErr_BadArgument();
4032n/a return NULL;
4033n/a }
4034n/a if (_PyUnicode_WSTR(unicode) == NULL) {
4035n/a /* Non-ASCII compact unicode object */
4036n/a assert(_PyUnicode_KIND(unicode) != 0);
4037n/a assert(PyUnicode_IS_READY(unicode));
4038n/a
4039n/a if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
4040n/a#if SIZEOF_WCHAR_T == 2
4041n/a four_bytes = PyUnicode_4BYTE_DATA(unicode);
4042n/a ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
4043n/a num_surrogates = 0;
4044n/a
4045n/a for (; four_bytes < ucs4_end; ++four_bytes) {
4046n/a if (*four_bytes > 0xFFFF)
4047n/a ++num_surrogates;
4048n/a }
4049n/a
4050n/a _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4051n/a sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4052n/a if (!_PyUnicode_WSTR(unicode)) {
4053n/a PyErr_NoMemory();
4054n/a return NULL;
4055n/a }
4056n/a _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
4057n/a
4058n/a w = _PyUnicode_WSTR(unicode);
4059n/a wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4060n/a four_bytes = PyUnicode_4BYTE_DATA(unicode);
4061n/a for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4062n/a if (*four_bytes > 0xFFFF) {
4063n/a assert(*four_bytes <= MAX_UNICODE);
4064n/a /* encode surrogate pair in this case */
4065n/a *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4066n/a *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
4067n/a }
4068n/a else
4069n/a *w = *four_bytes;
4070n/a
4071n/a if (w > wchar_end) {
4072n/a assert(0 && "Miscalculated string end");
4073n/a }
4074n/a }
4075n/a *w = 0;
4076n/a#else
4077n/a /* sizeof(wchar_t) == 4 */
4078n/a Py_FatalError("Impossible unicode object state, wstr and str "
4079n/a "should share memory already.");
4080n/a return NULL;
4081n/a#endif
4082n/a }
4083n/a else {
4084n/a if ((size_t)_PyUnicode_LENGTH(unicode) >
4085n/a PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4086n/a PyErr_NoMemory();
4087n/a return NULL;
4088n/a }
4089n/a _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4090n/a (_PyUnicode_LENGTH(unicode) + 1));
4091n/a if (!_PyUnicode_WSTR(unicode)) {
4092n/a PyErr_NoMemory();
4093n/a return NULL;
4094n/a }
4095n/a if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4096n/a _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4097n/a w = _PyUnicode_WSTR(unicode);
4098n/a wchar_end = w + _PyUnicode_LENGTH(unicode);
4099n/a
4100n/a if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4101n/a one_byte = PyUnicode_1BYTE_DATA(unicode);
4102n/a for (; w < wchar_end; ++one_byte, ++w)
4103n/a *w = *one_byte;
4104n/a /* null-terminate the wstr */
4105n/a *w = 0;
4106n/a }
4107n/a else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
4108n/a#if SIZEOF_WCHAR_T == 4
4109n/a two_bytes = PyUnicode_2BYTE_DATA(unicode);
4110n/a for (; w < wchar_end; ++two_bytes, ++w)
4111n/a *w = *two_bytes;
4112n/a /* null-terminate the wstr */
4113n/a *w = 0;
4114n/a#else
4115n/a /* sizeof(wchar_t) == 2 */
4116n/a PyObject_FREE(_PyUnicode_WSTR(unicode));
4117n/a _PyUnicode_WSTR(unicode) = NULL;
4118n/a Py_FatalError("Impossible unicode object state, wstr "
4119n/a "and str should share memory already.");
4120n/a return NULL;
4121n/a#endif
4122n/a }
4123n/a else {
4124n/a assert(0 && "This should never happen.");
4125n/a }
4126n/a }
4127n/a }
4128n/a if (size != NULL)
4129n/a *size = PyUnicode_WSTR_LENGTH(unicode);
4130n/a return _PyUnicode_WSTR(unicode);
4131n/a}
4132n/a
4133n/aPy_UNICODE *
4134n/aPyUnicode_AsUnicode(PyObject *unicode)
4135n/a{
4136n/a return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4137n/a}
4138n/a
4139n/a
4140n/aPy_ssize_t
4141n/aPyUnicode_GetSize(PyObject *unicode)
4142n/a{
4143n/a if (!PyUnicode_Check(unicode)) {
4144n/a PyErr_BadArgument();
4145n/a goto onError;
4146n/a }
4147n/a if (_PyUnicode_WSTR(unicode) == NULL) {
4148n/a if (PyUnicode_AsUnicode(unicode) == NULL)
4149n/a goto onError;
4150n/a }
4151n/a return PyUnicode_WSTR_LENGTH(unicode);
4152n/a
4153n/a onError:
4154n/a return -1;
4155n/a}
4156n/a
4157n/aPy_ssize_t
4158n/aPyUnicode_GetLength(PyObject *unicode)
4159n/a{
4160n/a if (!PyUnicode_Check(unicode)) {
4161n/a PyErr_BadArgument();
4162n/a return -1;
4163n/a }
4164n/a if (PyUnicode_READY(unicode) == -1)
4165n/a return -1;
4166n/a return PyUnicode_GET_LENGTH(unicode);
4167n/a}
4168n/a
4169n/aPy_UCS4
4170n/aPyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4171n/a{
4172n/a void *data;
4173n/a int kind;
4174n/a
4175n/a if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4176n/a PyErr_BadArgument();
4177n/a return (Py_UCS4)-1;
4178n/a }
4179n/a if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4180n/a PyErr_SetString(PyExc_IndexError, "string index out of range");
4181n/a return (Py_UCS4)-1;
4182n/a }
4183n/a data = PyUnicode_DATA(unicode);
4184n/a kind = PyUnicode_KIND(unicode);
4185n/a return PyUnicode_READ(kind, data, index);
4186n/a}
4187n/a
4188n/aint
4189n/aPyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4190n/a{
4191n/a if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4192n/a PyErr_BadArgument();
4193n/a return -1;
4194n/a }
4195n/a assert(PyUnicode_IS_READY(unicode));
4196n/a if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4197n/a PyErr_SetString(PyExc_IndexError, "string index out of range");
4198n/a return -1;
4199n/a }
4200n/a if (unicode_check_modifiable(unicode))
4201n/a return -1;
4202n/a if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4203n/a PyErr_SetString(PyExc_ValueError, "character out of range");
4204n/a return -1;
4205n/a }
4206n/a PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4207n/a index, ch);
4208n/a return 0;
4209n/a}
4210n/a
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage duration. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4216n/a
4217n/a/* create or adjust a UnicodeDecodeError */
4218n/astatic void
4219n/amake_decode_exception(PyObject **exceptionObject,
4220n/a const char *encoding,
4221n/a const char *input, Py_ssize_t length,
4222n/a Py_ssize_t startpos, Py_ssize_t endpos,
4223n/a const char *reason)
4224n/a{
4225n/a if (*exceptionObject == NULL) {
4226n/a *exceptionObject = PyUnicodeDecodeError_Create(
4227n/a encoding, input, length, startpos, endpos, reason);
4228n/a }
4229n/a else {
4230n/a if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4231n/a goto onError;
4232n/a if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4233n/a goto onError;
4234n/a if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4235n/a goto onError;
4236n/a }
4237n/a return;
4238n/a
4239n/aonError:
4240n/a Py_CLEAR(*exceptionObject);
4241n/a}
4242n/a
#ifdef MS_WINDOWS
/* Error handling callback helper for decoders producing a wchar_t string:
   build the arguments, call the callback and check its result; if no
   exception occurred, copy the replacement to the output and adjust the
   various state variables.

   errors, errorHandler   -- handler name and cached handler object; the
                             handler is looked up on first use.
   encoding, reason       -- used to create or update *exceptionObject.
   input, inend           -- input buffer bounds; refreshed from the
                             exception's object after the callback ran.
   startinpos, endinpos   -- offending input range; *endinpos is set to the
                             position returned by the handler.
   inptr                  -- set to the input position where decoding resumes.
   output, outpos         -- str object in wchar representation and the
                             current write position; the object is resized
                             when the replacement does not fit.

   Return 0 on success, -1 on error.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
                          encoding,
                          *input, *inend - *input,
                          *startinpos, *endinpos,
                          reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[3] skips the "Un;" format prefix and leaves the message. */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow at least geometrically when doubling cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy by explicit length: the replacement may contain embedded NUL
       characters, and wcsncpy() would stop at the first one and zero-fill
       the remainder instead of copying the characters that follow it. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           (size_t)repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
4355n/a
4356n/astatic int
4357n/aunicode_decode_call_errorhandler_writer(
4358n/a const char *errors, PyObject **errorHandler,
4359n/a const char *encoding, const char *reason,
4360n/a const char **input, const char **inend, Py_ssize_t *startinpos,
4361n/a Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4362n/a _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4363n/a{
4364n/a static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4365n/a
4366n/a PyObject *restuple = NULL;
4367n/a PyObject *repunicode = NULL;
4368n/a Py_ssize_t insize;
4369n/a Py_ssize_t newpos;
4370n/a Py_ssize_t replen;
4371n/a PyObject *inputobj = NULL;
4372n/a
4373n/a if (*errorHandler == NULL) {
4374n/a *errorHandler = PyCodec_LookupError(errors);
4375n/a if (*errorHandler == NULL)
4376n/a goto onError;
4377n/a }
4378n/a
4379n/a make_decode_exception(exceptionObject,
4380n/a encoding,
4381n/a *input, *inend - *input,
4382n/a *startinpos, *endinpos,
4383n/a reason);
4384n/a if (*exceptionObject == NULL)
4385n/a goto onError;
4386n/a
4387n/a restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4388n/a if (restuple == NULL)
4389n/a goto onError;
4390n/a if (!PyTuple_Check(restuple)) {
4391n/a PyErr_SetString(PyExc_TypeError, &argparse[3]);
4392n/a goto onError;
4393n/a }
4394n/a if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4395n/a goto onError;
4396n/a
4397n/a /* Copy back the bytes variables, which might have been modified by the
4398n/a callback */
4399n/a inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4400n/a if (!inputobj)
4401n/a goto onError;
4402n/a *input = PyBytes_AS_STRING(inputobj);
4403n/a insize = PyBytes_GET_SIZE(inputobj);
4404n/a *inend = *input + insize;
4405n/a /* we can DECREF safely, as the exception has another reference,
4406n/a so the object won't go away. */
4407n/a Py_DECREF(inputobj);
4408n/a
4409n/a if (newpos<0)
4410n/a newpos = insize+newpos;
4411n/a if (newpos<0 || newpos>insize) {
4412n/a PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4413n/a goto onError;
4414n/a }
4415n/a
4416n/a replen = PyUnicode_GET_LENGTH(repunicode);
4417n/a if (replen > 1) {
4418n/a writer->min_length += replen - 1;
4419n/a writer->overallocate = 1;
4420n/a if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4421n/a PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4422n/a goto onError;
4423n/a }
4424n/a if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4425n/a goto onError;
4426n/a
4427n/a *endinpos = newpos;
4428n/a *inptr = *input + newpos;
4429n/a
4430n/a /* we made it! */
4431n/a Py_DECREF(restuple);
4432n/a return 0;
4433n/a
4434n/a onError:
4435n/a Py_XDECREF(restuple);
4436n/a return -1;
4437n/a}
4438n/a
4439n/a/* --- UTF-7 Codec -------------------------------------------------------- */
4440n/a
4441n/a/* See RFC2152 for details. We encode conservatively and decode liberally. */
4442n/a
/* Three simple macros defining base-64.  They are function-like macros, so
   every argument is evaluated more than once: pass side-effect-free
   expressions only. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only lookup data, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 for the table to be consulted at all; NUL and
 * anything >= 128 are never encoded directly. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
4519n/a
4520n/aPyObject *
4521n/aPyUnicode_DecodeUTF7(const char *s,
4522n/a Py_ssize_t size,
4523n/a const char *errors)
4524n/a{
4525n/a return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4526n/a}
4527n/a
4528n/a/* The decoder. The only state we preserve is our read position,
4529n/a * i.e. how many characters we have consumed. So if we end in the
4530n/a * middle of a shift sequence we have to back off the read position
4531n/a * and the output to the beginning of the sequence, otherwise we lose
4532n/a * all the shift state (seen bits, number of bits seen, high
4533n/a * surrogate). */
4534n/a
/* Decode `size` bytes of UTF-7 at `s` into a new str object.
 *
 * errors   -- error handler name, forwarded to
 *             unicode_decode_call_errorhandler_writer() on malformed input.
 * consumed -- if NULL, input ending inside an inconsistent shift sequence is
 *             reported as "unterminated shift sequence".  If non-NULL, an
 *             open shift sequence at the end is not an error: *consumed is
 *             set to the position of the '+' that opened it and the output
 *             written since then is dropped; otherwise *consumed receives
 *             the number of bytes read (s - starts).
 *
 * Returns the decoded object, or NULL (after releasing the writer) when an
 * error path is taken.
 */
4535n/aPyObject *
4536n/aPyUnicode_DecodeUTF7Stateful(const char *s,
4537n/a Py_ssize_t size,
4538n/a const char *errors,
4539n/a Py_ssize_t *consumed)
4540n/a{
4541n/a const char *starts = s;
4542n/a Py_ssize_t startinpos;
4543n/a Py_ssize_t endinpos;
4544n/a const char *e;
4545n/a _PyUnicodeWriter writer;
4546n/a const char *errmsg = "";
4547n/a int inShift = 0;
4548n/a Py_ssize_t shiftOutStart;
4549n/a unsigned int base64bits = 0;
4550n/a unsigned long base64buffer = 0;
4551n/a Py_UCS4 surrogate = 0;
4552n/a PyObject *errorHandler = NULL;
4553n/a PyObject *exc = NULL;
4554n/a
4555n/a if (size == 0) {
4556n/a if (consumed)
4557n/a *consumed = 0;
4558n/a _Py_RETURN_UNICODE_EMPTY();
4559n/a }
4560n/a
4561n/a /* Start off assuming it's all ASCII. Widen later as necessary. */
4562n/a _PyUnicodeWriter_Init(&writer);
4563n/a writer.min_length = size;
4564n/a
4565n/a shiftOutStart = 0;
4566n/a e = s + size;
4567n/a
4568n/a while (s < e) {
4569n/a Py_UCS4 ch;
/* `restart` is also jumped to from the end-of-input handling below, when
   the error handler moved `s` back to a position before `e`. */
4570n/a restart:
4571n/a ch = (unsigned char) *s;
4572n/a
4573n/a if (inShift) { /* in a base-64 section */
4574n/a if (IS_BASE64(ch)) { /* consume a base-64 character */
4575n/a base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4576n/a base64bits += 6;
4577n/a s++;
4578n/a if (base64bits >= 16) {
4579n/a /* we have enough bits for a UTF-16 value */
4580n/a Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4581n/a base64bits -= 16;
4582n/a base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4583n/a assert(outCh <= 0xffff);
4584n/a if (surrogate) {
4585n/a /* expecting a second surrogate */
4586n/a if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4587n/a Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4588n/a if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4589n/a goto onError;
4590n/a surrogate = 0;
4591n/a continue;
4592n/a }
4593n/a else {
/* The pending high surrogate has no partner: emit it on its own and fall
   through to handle outCh normally. */
4594n/a if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4595n/a goto onError;
4596n/a surrogate = 0;
4597n/a }
4598n/a }
4599n/a if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4600n/a /* first surrogate */
4601n/a surrogate = outCh;
4602n/a }
4603n/a else {
4604n/a if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4605n/a goto onError;
4606n/a }
4607n/a }
4608n/a }
4609n/a else { /* now leaving a base-64 section */
4610n/a inShift = 0;
4611n/a if (base64bits > 0) { /* left-over bits */
4612n/a if (base64bits >= 6) {
4613n/a /* We've seen at least one base-64 character */
4614n/a s++;
4615n/a errmsg = "partial character in shift sequence";
4616n/a goto utf7Error;
4617n/a }
4618n/a else {
4619n/a /* Some bits remain; they should be zero */
4620n/a if (base64buffer != 0) {
4621n/a s++;
4622n/a errmsg = "non-zero padding bits in shift sequence";
4623n/a goto utf7Error;
4624n/a }
4625n/a }
4626n/a }
/* A lone high surrogate is only written out when the terminating byte is
   one that decodes as itself; in every case the pending surrogate is
   forgotten afterwards. */
4627n/a if (surrogate && DECODE_DIRECT(ch)) {
4628n/a if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4629n/a goto onError;
4630n/a }
4631n/a surrogate = 0;
4632n/a if (ch == '-') {
4633n/a /* '-' is absorbed; other terminating
4634n/a characters are preserved */
4635n/a s++;
4636n/a }
4637n/a }
4638n/a }
4639n/a else if ( ch == '+' ) {
/* Remember where the (potential) shift sequence starts: this position is
   used as the error start and as the *consumed value when input ends
   inside the sequence. */
4640n/a startinpos = s-starts;
4641n/a s++; /* consume '+' */
4642n/a if (s < e && *s == '-') { /* '+-' encodes '+' */
4643n/a s++;
4644n/a if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4645n/a goto onError;
4646n/a }
4647n/a else { /* begin base64-encoded section */
4648n/a inShift = 1;
4649n/a surrogate = 0;
4650n/a shiftOutStart = writer.pos;
4651n/a base64bits = 0;
4652n/a base64buffer = 0;
4653n/a }
4654n/a }
4655n/a else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4656n/a s++;
4657n/a if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4658n/a goto onError;
4659n/a }
4660n/a else {
4661n/a startinpos = s-starts;
4662n/a s++;
4663n/a errmsg = "unexpected special character";
4664n/a goto utf7Error;
4665n/a }
4666n/a continue;
/* Shared error exit of the loop body: the offending range ends at the
   current read position.  The helper may replace starts/e and repositions
   s; decoding then continues with the next loop iteration. */
4667n/autf7Error:
4668n/a endinpos = s-starts;
4669n/a if (unicode_decode_call_errorhandler_writer(
4670n/a errors, &errorHandler,
4671n/a "utf7", errmsg,
4672n/a &starts, &e, &startinpos, &endinpos, &exc, &s,
4673n/a &writer))
4674n/a goto onError;
4675n/a }
4676n/a
4677n/a /* end of string */
4678n/a
4679n/a if (inShift && !consumed) { /* in shift sequence, no more to follow */
4680n/a /* if we're in an inconsistent state, that's an error */
4681n/a inShift = 0;
4682n/a if (surrogate ||
4683n/a (base64bits >= 6) ||
4684n/a (base64bits > 0 && base64buffer != 0)) {
4685n/a endinpos = size;
4686n/a if (unicode_decode_call_errorhandler_writer(
4687n/a errors, &errorHandler,
4688n/a "utf7", "unterminated shift sequence",
4689n/a &starts, &e, &startinpos, &endinpos, &exc, &s,
4690n/a &writer))
4691n/a goto onError;
/* The handler may have asked to resume before the end of the input;
   re-enter the decoding loop in that case. */
4692n/a if (s < e)
4693n/a goto restart;
4694n/a }
4695n/a }
4696n/a
4697n/a /* return state */
4698n/a if (consumed) {
4699n/a if (inShift) {
4700n/a *consumed = startinpos;
/* Output produced since the shift began is discarded.  When something was
   written and the writer holds non-ASCII data, the result is built
   directly from the first shiftOutStart characters instead of going through
   _PyUnicodeWriter_Finish().
   NOTE(review): presumably because the dropped characters may have widened
   the buffer beyond what the kept prefix needs -- confirm. */
4701n/a if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4702n/a PyObject *result = PyUnicode_FromKindAndData(
4703n/a writer.kind, writer.data, shiftOutStart);
4704n/a Py_XDECREF(errorHandler);
4705n/a Py_XDECREF(exc);
4706n/a _PyUnicodeWriter_Dealloc(&writer);
4707n/a return result;
4708n/a }
4709n/a writer.pos = shiftOutStart; /* back off output */
4710n/a }
4711n/a else {
4712n/a *consumed = s-starts;
4713n/a }
4714n/a }
4715n/a
4716n/a Py_XDECREF(errorHandler);
4717n/a Py_XDECREF(exc);
4718n/a return _PyUnicodeWriter_Finish(&writer);
4719n/a
4720n/a onError:
4721n/a Py_XDECREF(errorHandler);
4722n/a Py_XDECREF(exc);
4723n/a _PyUnicodeWriter_Dealloc(&writer);
4724n/a return NULL;
4725n/a}
4726n/a
4727n/a
4728n/aPyObject *
4729n/a_PyUnicode_EncodeUTF7(PyObject *str,
4730n/a int base64SetO,
4731n/a int base64WhiteSpace,
4732n/a const char *errors)
4733n/a{
4734n/a int kind;
4735n/a void *data;
4736n/a Py_ssize_t len;
4737n/a PyObject *v;
4738n/a int inShift = 0;
4739n/a Py_ssize_t i;
4740n/a unsigned int base64bits = 0;
4741n/a unsigned long base64buffer = 0;
4742n/a char * out;
4743n/a char * start;
4744n/a
4745n/a if (PyUnicode_READY(str) == -1)
4746n/a return NULL;
4747n/a kind = PyUnicode_KIND(str);
4748n/a data = PyUnicode_DATA(str);
4749n/a len = PyUnicode_GET_LENGTH(str);
4750n/a
4751n/a if (len == 0)
4752n/a return PyBytes_FromStringAndSize(NULL, 0);
4753n/a
4754n/a /* It might be possible to tighten this worst case */
4755n/a if (len > PY_SSIZE_T_MAX / 8)
4756n/a return PyErr_NoMemory();
4757n/a v = PyBytes_FromStringAndSize(NULL, len * 8);
4758n/a if (v == NULL)
4759n/a return NULL;
4760n/a
4761n/a start = out = PyBytes_AS_STRING(v);
4762n/a for (i = 0; i < len; ++i) {
4763n/a Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4764n/a
4765n/a if (inShift) {
4766n/a if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4767n/a /* shifting out */
4768n/a if (base64bits) { /* output remaining bits */
4769n/a *out++ = TO_BASE64(base64buffer << (6-base64bits));
4770n/a base64buffer = 0;
4771n/a base64bits = 0;
4772n/a }
4773n/a inShift = 0;
4774n/a /* Characters not in the BASE64 set implicitly unshift the sequence
4775n/a so no '-' is required, except if the character is itself a '-' */
4776n/a if (IS_BASE64(ch) || ch == '-') {
4777n/a *out++ = '-';
4778n/a }
4779n/a *out++ = (char) ch;
4780n/a }
4781n/a else {
4782n/a goto encode_char;
4783n/a }
4784n/a }
4785n/a else { /* not in a shift sequence */
4786n/a if (ch == '+') {
4787n/a *out++ = '+';
4788n/a *out++ = '-';
4789n/a }
4790n/a else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4791n/a *out++ = (char) ch;
4792n/a }
4793n/a else {
4794n/a *out++ = '+';
4795n/a inShift = 1;
4796n/a goto encode_char;
4797n/a }
4798n/a }
4799n/a continue;
4800n/aencode_char:
4801n/a if (ch >= 0x10000) {
4802n/a assert(ch <= MAX_UNICODE);
4803n/a
4804n/a /* code first surrogate */
4805n/a base64bits += 16;
4806n/a base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4807n/a while (base64bits >= 6) {
4808n/a *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4809n/a base64bits -= 6;
4810n/a }
4811n/a /* prepare second surrogate */
4812n/a ch = Py_UNICODE_LOW_SURROGATE(ch);
4813n/a }
4814n/a base64bits += 16;
4815n/a base64buffer = (base64buffer << 16) | ch;
4816n/a while (base64bits >= 6) {
4817n/a *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4818n/a base64bits -= 6;
4819n/a }
4820n/a }
4821n/a if (base64bits)
4822n/a *out++= TO_BASE64(base64buffer << (6-base64bits) );
4823n/a if (inShift)
4824n/a *out++ = '-';
4825n/a if (_PyBytes_Resize(&v, out - start) < 0)
4826n/a return NULL;
4827n/a return v;
4828n/a}
4829n/aPyObject *
4830n/aPyUnicode_EncodeUTF7(const Py_UNICODE *s,
4831n/a Py_ssize_t size,
4832n/a int base64SetO,
4833n/a int base64WhiteSpace,
4834n/a const char *errors)
4835n/a{
4836n/a PyObject *result;
4837n/a PyObject *tmp = PyUnicode_FromWideChar(s, size);
4838n/a if (tmp == NULL)
4839n/a return NULL;
4840n/a result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4841n/a base64WhiteSpace, errors);
4842n/a Py_DECREF(tmp);
4843n/a return result;
4844n/a}
4845n/a
4846n/a#undef IS_BASE64
4847n/a#undef FROM_BASE64
4848n/a#undef TO_BASE64
4849n/a#undef DECODE_DIRECT
4850n/a#undef ENCODE_DIRECT
4851n/a
4852n/a/* --- UTF-8 Codec -------------------------------------------------------- */
4853n/a
4854n/aPyObject *
4855n/aPyUnicode_DecodeUTF8(const char *s,
4856n/a Py_ssize_t size,
4857n/a const char *errors)
4858n/a{
4859n/a return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4860n/a}
4861n/a
4862n/a#include "stringlib/asciilib.h"
4863n/a#include "stringlib/codecs.h"
4864n/a#include "stringlib/undef.h"
4865n/a
4866n/a#include "stringlib/ucs1lib.h"
4867n/a#include "stringlib/codecs.h"
4868n/a#include "stringlib/undef.h"
4869n/a
4870n/a#include "stringlib/ucs2lib.h"
4871n/a#include "stringlib/codecs.h"
4872n/a#include "stringlib/undef.h"
4873n/a
4874n/a#include "stringlib/ucs4lib.h"
4875n/a#include "stringlib/codecs.h"
4876n/a#include "stringlib/undef.h"
4877n/a
4878n/a/* Mask to quickly check whether a C 'long' contains a
4879n/a non-ASCII, UTF8-encoded char. */
/* The mask has bit 7 set in every byte of the word, so `value & mask` is
   non-zero exactly when some byte of `value` is >= 0x80. */
4880n/a#if (SIZEOF_LONG == 8)
4881n/a# define ASCII_CHAR_MASK 0x8080808080808080UL
4882n/a#elif (SIZEOF_LONG == 4)
4883n/a# define ASCII_CHAR_MASK 0x80808080UL
4884n/a#else
4885n/a# error C 'long' size should be either 4 or 8!
4886n/a#endif
4887n/a
/* Copy the leading run of ASCII bytes of [start, end) into dest.
 *
 * Scanning stops at the first byte that has its high bit set (or at end).
 * Returns the number of bytes copied, i.e. the offset of that first
 * non-ASCII byte, or end - start when there is none.
 */
4888n/astatic Py_ssize_t
4889n/aascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4890n/a{
4891n/a const char *p = start;
4892n/a const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
4893n/a
4894n/a /*
4895n/a * Issue #17237: m68k is a bit different from most architectures in
4896n/a * that objects do not use "natural alignment" - for example, int and
4897n/a * long are only aligned at 2-byte boundaries. Therefore the assert()
4898n/a * won't work; also, tests have shown that skipping the "optimised
4899n/a * version" will even speed up m68k.
4900n/a */
4901n/a#if !defined(__m68k__)
4902n/a#if SIZEOF_LONG <= SIZEOF_VOID_P
4903n/a assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
/* First variant: source and destination are both long-aligned, so whole
   words can be checked and copied in one step. */
4904n/a if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4905n/a /* Fast path, see in STRINGLIB(utf8_decode) for
4906n/a an explanation. */
4907n/a /* Help allocation */
4908n/a const char *_p = p;
4909n/a Py_UCS1 * q = dest;
4910n/a while (_p < aligned_end) {
4911n/a unsigned long value = *(const unsigned long *) _p;
/* Stop before copying a word that contains a non-ASCII byte. */
4912n/a if (value & ASCII_CHAR_MASK)
4913n/a break;
4914n/a *((unsigned long *)q) = value;
4915n/a _p += SIZEOF_LONG;
4916n/a q += SIZEOF_LONG;
4917n/a }
4918n/a p = _p;
/* Byte-wise tail: the rest of the input, or the bytes of the word that
   failed the mask test up to its first non-ASCII byte. */
4919n/a while (p < end) {
4920n/a if ((unsigned char)*p & 0x80)
4921n/a break;
4922n/a *q++ = *p++;
4923n/a }
4924n/a return p - start;
4925n/a }
4926n/a#endif
4927n/a#endif
/* Second variant: only scan here (word-wise whenever p is aligned) and copy
   the whole ASCII prefix with a single memcpy() afterwards. */
4928n/a while (p < end) {
4929n/a /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4930n/a for an explanation. */
4931n/a if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4932n/a /* Help allocation */
4933n/a const char *_p = p;
4934n/a while (_p < aligned_end) {
4935n/a unsigned long value = *(unsigned long *) _p;
4936n/a if (value & ASCII_CHAR_MASK)
4937n/a break;
4938n/a _p += SIZEOF_LONG;
4939n/a }
4940n/a p = _p;
4941n/a if (_p == end)
4942n/a break;
4943n/a }
4944n/a if ((unsigned char)*p & 0x80)
4945n/a break;
4946n/a ++p;
4947n/a }
4948n/a memcpy(dest, start, p - start);
4949n/a return p - start;
4950n/a}
4951n/a
/* Decode `size` bytes of UTF-8 starting at `s` into a new str object.
 *
 * errors   -- error handler name; it is resolved through get_error_handler()
 *             the first time a decoding error is encountered.
 * consumed -- when non-NULL it receives the number of input bytes decoded
 *             (s - starts on exit).  In that mode a decoder status of 0
 *             ends decoding instead of raising "unexpected end of data".
 *
 * Returns the object produced by _PyUnicodeWriter_Finish(), or NULL (after
 * releasing the writer) when an error path is taken.
 */
4952n/aPyObject *
4953n/aPyUnicode_DecodeUTF8Stateful(const char *s,
4954n/a Py_ssize_t size,
4955n/a const char *errors,
4956n/a Py_ssize_t *consumed)
4957n/a{
4958n/a _PyUnicodeWriter writer;
4959n/a const char *starts = s;
4960n/a const char *end = s + size;
4961n/a
4962n/a Py_ssize_t startinpos;
4963n/a Py_ssize_t endinpos;
4964n/a const char *errmsg = "";
4965n/a PyObject *error_handler_obj = NULL;
4966n/a PyObject *exc = NULL;
4967n/a _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
4968n/a
4969n/a if (size == 0) {
4970n/a if (consumed)
4971n/a *consumed = 0;
4972n/a _Py_RETURN_UNICODE_EMPTY();
4973n/a }
4974n/a
4975n/a /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4976n/a if (size == 1 && (unsigned char)s[0] < 128) {
4977n/a if (consumed)
4978n/a *consumed = 1;
4979n/a return get_latin1_char((unsigned char)s[0]);
4980n/a }
4981n/a
/* Reserve room for `size` characters of ASCII range up front; decoded text
   never has more characters than the input has bytes. */
4982n/a _PyUnicodeWriter_Init(&writer);
4983n/a writer.min_length = size;
4984n/a if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4985n/a goto onError;
4986n/a
/* Bulk-copy the leading ASCII run; ascii_decode() returns how many bytes it
   copied, which is both the new write position and the input advance. */
4987n/a writer.pos = ascii_decode(s, end, writer.data);
4988n/a s += writer.pos;
4989n/a while (s < end) {
4990n/a Py_UCS4 ch;
4991n/a int kind = writer.kind;
4992n/a
/* Dispatch to the stringlib decoder that matches the writer's current
   buffer kind. */
4993n/a if (kind == PyUnicode_1BYTE_KIND) {
4994n/a if (PyUnicode_IS_ASCII(writer.buffer))
4995n/a ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4996n/a else
4997n/a ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4998n/a } else if (kind == PyUnicode_2BYTE_KIND) {
4999n/a ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
5000n/a } else {
5001n/a assert(kind == PyUnicode_4BYTE_KIND);
5002n/a ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
5003n/a }
5004n/a
/* Return values 0-4 are treated as status codes; any other value is a
   character that is written through the writer.
   NOTE(review): presumably a character too wide for the current buffer
   kind -- confirm against stringlib/codecs.h. */
5005n/a switch (ch) {
5006n/a case 0:
5007n/a if (s == end || consumed)
5008n/a goto End;
5009n/a errmsg = "unexpected end of data";
5010n/a startinpos = s - starts;
5011n/a endinpos = end - starts;
5012n/a break;
5013n/a case 1:
5014n/a errmsg = "invalid start byte";
5015n/a startinpos = s - starts;
5016n/a endinpos = startinpos + 1;
5017n/a break;
5018n/a case 2:
5019n/a case 3:
5020n/a case 4:
5021n/a errmsg = "invalid continuation byte";
5022n/a startinpos = s - starts;
/* For status codes 2-4 the offending range covers ch - 1 bytes. */
5023n/a endinpos = startinpos + ch - 1;
5024n/a break;
5025n/a default:
5026n/a if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5027n/a goto onError;
5028n/a continue;
5029n/a }
5030n/a
/* Only reached for decoding errors: resolve the handler once, lazily. */
5031n/a if (error_handler == _Py_ERROR_UNKNOWN)
5032n/a error_handler = get_error_handler(errors);
5033n/a
5034n/a switch (error_handler) {
5035n/a case _Py_ERROR_IGNORE:
5036n/a s += (endinpos - startinpos);
5037n/a break;
5038n/a
5039n/a case _Py_ERROR_REPLACE:
5040n/a if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5041n/a goto onError;
5042n/a s += (endinpos - startinpos);
5043n/a break;
5044n/a
5045n/a case _Py_ERROR_SURROGATEESCAPE:
5046n/a {
5047n/a Py_ssize_t i;
5048n/a
/* Each offending byte b is stored as the code point 0xdc00 + b, which
   needs at least a 2-byte buffer. */
5049n/a if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5050n/a goto onError;
5051n/a for (i=startinpos; i<endinpos; i++) {
5052n/a ch = (Py_UCS4)(unsigned char)(starts[i]);
5053n/a PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5054n/a ch + 0xdc00);
5055n/a writer.pos++;
5056n/a }
5057n/a s += (endinpos - startinpos);
5058n/a break;
5059n/a }
5060n/a
5061n/a default:
/* Every other handler goes through the generic callback helper, which may
   replace starts/end and repositions s itself. */
5062n/a if (unicode_decode_call_errorhandler_writer(
5063n/a errors, &error_handler_obj,
5064n/a "utf-8", errmsg,
5065n/a &starts, &end, &startinpos, &endinpos, &exc, &s,
5066n/a &writer))
5067n/a goto onError;
5068n/a }
5069n/a }
5070n/a
5071n/aEnd:
5072n/a if (consumed)
5073n/a *consumed = s - starts;
5074n/a
5075n/a Py_XDECREF(error_handler_obj);
5076n/a Py_XDECREF(exc);
5077n/a return _PyUnicodeWriter_Finish(&writer);
5078n/a
5079n/aonError:
5080n/a Py_XDECREF(error_handler_obj);
5081n/a Py_XDECREF(exc);
5082n/a _PyUnicodeWriter_Dealloc(&writer);
5083n/a return NULL;
5084n/a}
5085n/a
#if defined(__APPLE__) || defined(__ANDROID__)

/* Simplified UTF-8 decoder using the surrogateescape error handler, used to
   decode the command line arguments on Mac OS X and Android.

   Every byte that cannot be decoded is stored as 0xDC00 + byte.  With a
   2-byte wchar_t, characters above 0xFFFF are stored as a surrogate pair.

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_RawFree() to free the memory), or NULL on memory
   allocation error (including a size too large to allocate). */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *wbuf;
    Py_ssize_t wlen;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
        return NULL;
    }
    wbuf = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (wbuf == NULL) {
        return NULL;
    }

    /* Unpack UTF-8 encoded data */
    end = s + size;
    wlen = 0;
    while (s < end) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, end, (Py_UCS4 *)wbuf, &wlen);
#else
        ch = ucs2lib_utf8_decode(&s, end, (Py_UCS2 *)wbuf, &wlen);
#endif
        if (ch <= 0xFF) {
            /* A zero result with the input exhausted means we are done. */
            if (!ch && s == end) {
                break;
            }
            /* surrogateescape */
            wbuf[wlen++] = 0xDC00 + (unsigned char)*s++;
            continue;
        }
#if SIZEOF_WCHAR_T == 4
        assert(0);
#else
        assert(ch > 0xFFFF && ch <= MAX_UNICODE);
        /* compute and append the two surrogates: */
        wbuf[wlen++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
        wbuf[wlen++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
    }
    wbuf[wlen] = L'\0';
    return wbuf;
}

#endif /* __APPLE__ or __ANDROID__ */
5141n/a
5142n/a/* Primary internal function which creates utf8 encoded bytes objects.
5143n/a
5144n/a Allocation strategy: if the string is short, convert into a stack buffer
5145n/a and allocate exactly as much space needed at the end. Else allocate the
5146n/a maximum possible needed (4 result bytes per Unicode character), and return
5147n/a the excess memory at the end.
5148n/a*/
5149n/aPyObject *
5150n/a_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5151n/a{
5152n/a enum PyUnicode_Kind kind;
5153n/a void *data;
5154n/a Py_ssize_t size;
5155n/a
5156n/a if (!PyUnicode_Check(unicode)) {
5157n/a PyErr_BadArgument();
5158n/a return NULL;
5159n/a }
5160n/a
5161n/a if (PyUnicode_READY(unicode) == -1)
5162n/a return NULL;
5163n/a
5164n/a if (PyUnicode_UTF8(unicode))
5165n/a return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5166n/a PyUnicode_UTF8_LENGTH(unicode));
5167n/a
5168n/a kind = PyUnicode_KIND(unicode);
5169n/a data = PyUnicode_DATA(unicode);
5170n/a size = PyUnicode_GET_LENGTH(unicode);
5171n/a
5172n/a switch (kind) {
5173n/a default:
5174n/a assert(0);
5175n/a case PyUnicode_1BYTE_KIND:
5176n/a /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5177n/a assert(!PyUnicode_IS_ASCII(unicode));
5178n/a return ucs1lib_utf8_encoder(unicode, data, size, errors);
5179n/a case PyUnicode_2BYTE_KIND:
5180n/a return ucs2lib_utf8_encoder(unicode, data, size, errors);
5181n/a case PyUnicode_4BYTE_KIND:
5182n/a return ucs4lib_utf8_encoder(unicode, data, size, errors);
5183n/a }
5184n/a}
5185n/a
5186n/aPyObject *
5187n/aPyUnicode_EncodeUTF8(const Py_UNICODE *s,
5188n/a Py_ssize_t size,
5189n/a const char *errors)
5190n/a{
5191n/a PyObject *v, *unicode;
5192n/a
5193n/a unicode = PyUnicode_FromWideChar(s, size);
5194n/a if (unicode == NULL)
5195n/a return NULL;
5196n/a v = _PyUnicode_AsUTF8String(unicode, errors);
5197n/a Py_DECREF(unicode);
5198n/a return v;
5199n/a}
5200n/a
5201n/aPyObject *
5202n/aPyUnicode_AsUTF8String(PyObject *unicode)
5203n/a{
5204n/a return _PyUnicode_AsUTF8String(unicode, NULL);
5205n/a}
5206n/a
5207n/a/* --- UTF-32 Codec ------------------------------------------------------- */
5208n/a
5209n/aPyObject *
5210n/aPyUnicode_DecodeUTF32(const char *s,
5211n/a Py_ssize_t size,
5212n/a const char *errors,
5213n/a int *byteorder)
5214n/a{
5215n/a return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5216n/a}
5217n/a
5218n/aPyObject *
5219n/aPyUnicode_DecodeUTF32Stateful(const char *s,
5220n/a Py_ssize_t size,
5221n/a const char *errors,
5222n/a int *byteorder,
5223n/a Py_ssize_t *consumed)
5224n/a{
5225n/a const char *starts = s;
5226n/a Py_ssize_t startinpos;
5227n/a Py_ssize_t endinpos;
5228n/a _PyUnicodeWriter writer;
5229n/a const unsigned char *q, *e;
5230n/a int le, bo = 0; /* assume native ordering by default */
5231n/a const char *encoding;
5232n/a const char *errmsg = "";
5233n/a PyObject *errorHandler = NULL;
5234n/a PyObject *exc = NULL;
5235n/a
5236n/a q = (unsigned char *)s;
5237n/a e = q + size;
5238n/a
5239n/a if (byteorder)
5240n/a bo = *byteorder;
5241n/a
5242n/a /* Check for BOM marks (U+FEFF) in the input and adjust current
5243n/a byte order setting accordingly. In native mode, the leading BOM
5244n/a mark is skipped, in all other modes, it is copied to the output
5245n/a stream as-is (giving a ZWNBSP character). */
5246n/a if (bo == 0 && size >= 4) {
5247n/a Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5248n/a if (bom == 0x0000FEFF) {
5249n/a bo = -1;
5250n/a q += 4;
5251n/a }
5252n/a else if (bom == 0xFFFE0000) {
5253n/a bo = 1;
5254n/a q += 4;
5255n/a }
5256n/a if (byteorder)
5257n/a *byteorder = bo;
5258n/a }
5259n/a
5260n/a if (q == e) {
5261n/a if (consumed)
5262n/a *consumed = size;
5263n/a _Py_RETURN_UNICODE_EMPTY();
5264n/a }
5265n/a
5266n/a#ifdef WORDS_BIGENDIAN
5267n/a le = bo < 0;
5268n/a#else
5269n/a le = bo <= 0;
5270n/a#endif
5271n/a encoding = le ? "utf-32-le" : "utf-32-be";
5272n/a
5273n/a _PyUnicodeWriter_Init(&writer);
5274n/a writer.min_length = (e - q + 3) / 4;
5275n/a if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5276n/a goto onError;
5277n/a
5278n/a while (1) {
5279n/a Py_UCS4 ch = 0;
5280n/a Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5281n/a
5282n/a if (e - q >= 4) {
5283n/a enum PyUnicode_Kind kind = writer.kind;
5284n/a void *data = writer.data;
5285n/a const unsigned char *last = e - 4;
5286n/a Py_ssize_t pos = writer.pos;
5287n/a if (le) {
5288n/a do {
5289n/a ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5290n/a if (ch > maxch)
5291n/a break;
5292n/a if (kind != PyUnicode_1BYTE_KIND &&
5293n/a Py_UNICODE_IS_SURROGATE(ch))
5294n/a break;
5295n/a PyUnicode_WRITE(kind, data, pos++, ch);
5296n/a q += 4;
5297n/a } while (q <= last);
5298n/a }
5299n/a else {
5300n/a do {
5301n/a ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5302n/a if (ch > maxch)
5303n/a break;
5304n/a if (kind != PyUnicode_1BYTE_KIND &&
5305n/a Py_UNICODE_IS_SURROGATE(ch))
5306n/a break;
5307n/a PyUnicode_WRITE(kind, data, pos++, ch);
5308n/a q += 4;
5309n/a } while (q <= last);
5310n/a }
5311n/a writer.pos = pos;
5312n/a }
5313n/a
5314n/a if (Py_UNICODE_IS_SURROGATE(ch)) {
5315n/a errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5316n/a startinpos = ((const char *)q) - starts;
5317n/a endinpos = startinpos + 4;
5318n/a }
5319n/a else if (ch <= maxch) {
5320n/a if (q == e || consumed)
5321n/a break;
5322n/a /* remaining bytes at the end? (size should be divisible by 4) */
5323n/a errmsg = "truncated data";
5324n/a startinpos = ((const char *)q) - starts;
5325n/a endinpos = ((const char *)e) - starts;
5326n/a }
5327n/a else {
5328n/a if (ch < 0x110000) {
5329n/a if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5330n/a goto onError;
5331n/a q += 4;
5332n/a continue;
5333n/a }
5334n/a errmsg = "code point not in range(0x110000)";
5335n/a startinpos = ((const char *)q) - starts;
5336n/a endinpos = startinpos + 4;
5337n/a }
5338n/a
5339n/a /* The remaining input chars are ignored if the callback
5340n/a chooses to skip the input */
5341n/a if (unicode_decode_call_errorhandler_writer(
5342n/a errors, &errorHandler,
5343n/a encoding, errmsg,
5344n/a &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5345n/a &writer))
5346n/a goto onError;
5347n/a }
5348n/a
5349n/a if (consumed)
5350n/a *consumed = (const char *)q-starts;
5351n/a
5352n/a Py_XDECREF(errorHandler);
5353n/a Py_XDECREF(exc);
5354n/a return _PyUnicodeWriter_Finish(&writer);
5355n/a
5356n/a onError:
5357n/a _PyUnicodeWriter_Dealloc(&writer);
5358n/a Py_XDECREF(errorHandler);
5359n/a Py_XDECREF(exc);
5360n/a return NULL;
5361n/a}
5362n/a
5363n/aPyObject *
5364n/a_PyUnicode_EncodeUTF32(PyObject *str,
5365n/a const char *errors,
5366n/a int byteorder)
5367n/a{
5368n/a enum PyUnicode_Kind kind;
5369n/a const void *data;
5370n/a Py_ssize_t len;
5371n/a PyObject *v;
5372n/a uint32_t *out;
5373n/a#if PY_LITTLE_ENDIAN
5374n/a int native_ordering = byteorder <= 0;
5375n/a#else
5376n/a int native_ordering = byteorder >= 0;
5377n/a#endif
5378n/a const char *encoding;
5379n/a Py_ssize_t nsize, pos;
5380n/a PyObject *errorHandler = NULL;
5381n/a PyObject *exc = NULL;
5382n/a PyObject *rep = NULL;
5383n/a
5384n/a if (!PyUnicode_Check(str)) {
5385n/a PyErr_BadArgument();
5386n/a return NULL;
5387n/a }
5388n/a if (PyUnicode_READY(str) == -1)
5389n/a return NULL;
5390n/a kind = PyUnicode_KIND(str);
5391n/a data = PyUnicode_DATA(str);
5392n/a len = PyUnicode_GET_LENGTH(str);
5393n/a
5394n/a if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5395n/a return PyErr_NoMemory();
5396n/a nsize = len + (byteorder == 0);
5397n/a v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5398n/a if (v == NULL)
5399n/a return NULL;
5400n/a
5401n/a /* output buffer is 4-bytes aligned */
5402n/a assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5403n/a out = (uint32_t *)PyBytes_AS_STRING(v);
5404n/a if (byteorder == 0)
5405n/a *out++ = 0xFEFF;
5406n/a if (len == 0)
5407n/a goto done;
5408n/a
5409n/a if (byteorder == -1)
5410n/a encoding = "utf-32-le";
5411n/a else if (byteorder == 1)
5412n/a encoding = "utf-32-be";
5413n/a else
5414n/a encoding = "utf-32";
5415n/a
5416n/a if (kind == PyUnicode_1BYTE_KIND) {
5417n/a ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5418n/a goto done;
5419n/a }
5420n/a
5421n/a pos = 0;
5422n/a while (pos < len) {
5423n/a Py_ssize_t repsize, moreunits;
5424n/a
5425n/a if (kind == PyUnicode_2BYTE_KIND) {
5426n/a pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5427n/a &out, native_ordering);
5428n/a }
5429n/a else {
5430n/a assert(kind == PyUnicode_4BYTE_KIND);
5431n/a pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5432n/a &out, native_ordering);
5433n/a }
5434n/a if (pos == len)
5435n/a break;
5436n/a
5437n/a rep = unicode_encode_call_errorhandler(
5438n/a errors, &errorHandler,
5439n/a encoding, "surrogates not allowed",
5440n/a str, &exc, pos, pos + 1, &pos);
5441n/a if (!rep)
5442n/a goto error;
5443n/a
5444n/a if (PyBytes_Check(rep)) {
5445n/a repsize = PyBytes_GET_SIZE(rep);
5446n/a if (repsize & 3) {
5447n/a raise_encode_exception(&exc, encoding,
5448n/a str, pos - 1, pos,
5449n/a "surrogates not allowed");
5450n/a goto error;
5451n/a }
5452n/a moreunits = repsize / 4;
5453n/a }
5454n/a else {
5455n/a assert(PyUnicode_Check(rep));
5456n/a if (PyUnicode_READY(rep) < 0)
5457n/a goto error;
5458n/a moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5459n/a if (!PyUnicode_IS_ASCII(rep)) {
5460n/a raise_encode_exception(&exc, encoding,
5461n/a str, pos - 1, pos,
5462n/a "surrogates not allowed");
5463n/a goto error;
5464n/a }
5465n/a }
5466n/a
5467n/a /* four bytes are reserved for each surrogate */
5468n/a if (moreunits > 1) {
5469n/a Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
5470n/a Py_ssize_t morebytes = 4 * (moreunits - 1);
5471n/a if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5472n/a /* integer overflow */
5473n/a PyErr_NoMemory();
5474n/a goto error;
5475n/a }
5476n/a if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5477n/a goto error;
5478n/a out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
5479n/a }
5480n/a
5481n/a if (PyBytes_Check(rep)) {
5482n/a memcpy(out, PyBytes_AS_STRING(rep), repsize);
5483n/a out += moreunits;
5484n/a } else /* rep is unicode */ {
5485n/a assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5486n/a ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5487n/a &out, native_ordering);
5488n/a }
5489n/a
5490n/a Py_CLEAR(rep);
5491n/a }
5492n/a
5493n/a /* Cut back to size actually needed. This is necessary for, for example,
5494n/a encoding of a string containing isolated surrogates and the 'ignore'
5495n/a handler is used. */
5496n/a nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5497n/a if (nsize != PyBytes_GET_SIZE(v))
5498n/a _PyBytes_Resize(&v, nsize);
5499n/a Py_XDECREF(errorHandler);
5500n/a Py_XDECREF(exc);
5501n/a done:
5502n/a return v;
5503n/a error:
5504n/a Py_XDECREF(rep);
5505n/a Py_XDECREF(errorHandler);
5506n/a Py_XDECREF(exc);
5507n/a Py_XDECREF(v);
5508n/a return NULL;
5509n/a}
5510n/a
5511n/aPyObject *
5512n/aPyUnicode_EncodeUTF32(const Py_UNICODE *s,
5513n/a Py_ssize_t size,
5514n/a const char *errors,
5515n/a int byteorder)
5516n/a{
5517n/a PyObject *result;
5518n/a PyObject *tmp = PyUnicode_FromWideChar(s, size);
5519n/a if (tmp == NULL)
5520n/a return NULL;
5521n/a result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5522n/a Py_DECREF(tmp);
5523n/a return result;
5524n/a}
5525n/a
5526n/aPyObject *
5527n/aPyUnicode_AsUTF32String(PyObject *unicode)
5528n/a{
5529n/a return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5530n/a}
5531n/a
5532n/a/* --- UTF-16 Codec ------------------------------------------------------- */
5533n/a
5534n/aPyObject *
5535n/aPyUnicode_DecodeUTF16(const char *s,
5536n/a Py_ssize_t size,
5537n/a const char *errors,
5538n/a int *byteorder)
5539n/a{
5540n/a return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5541n/a}
5542n/a
5543n/aPyObject *
5544n/aPyUnicode_DecodeUTF16Stateful(const char *s,
5545n/a Py_ssize_t size,
5546n/a const char *errors,
5547n/a int *byteorder,
5548n/a Py_ssize_t *consumed)
5549n/a{
5550n/a const char *starts = s;
5551n/a Py_ssize_t startinpos;
5552n/a Py_ssize_t endinpos;
5553n/a _PyUnicodeWriter writer;
5554n/a const unsigned char *q, *e;
5555n/a int bo = 0; /* assume native ordering by default */
5556n/a int native_ordering;
5557n/a const char *errmsg = "";
5558n/a PyObject *errorHandler = NULL;
5559n/a PyObject *exc = NULL;
5560n/a const char *encoding;
5561n/a
5562n/a q = (unsigned char *)s;
5563n/a e = q + size;
5564n/a
5565n/a if (byteorder)
5566n/a bo = *byteorder;
5567n/a
5568n/a /* Check for BOM marks (U+FEFF) in the input and adjust current
5569n/a byte order setting accordingly. In native mode, the leading BOM
5570n/a mark is skipped, in all other modes, it is copied to the output
5571n/a stream as-is (giving a ZWNBSP character). */
5572n/a if (bo == 0 && size >= 2) {
5573n/a const Py_UCS4 bom = (q[1] << 8) | q[0];
5574n/a if (bom == 0xFEFF) {
5575n/a q += 2;
5576n/a bo = -1;
5577n/a }
5578n/a else if (bom == 0xFFFE) {
5579n/a q += 2;
5580n/a bo = 1;
5581n/a }
5582n/a if (byteorder)
5583n/a *byteorder = bo;
5584n/a }
5585n/a
5586n/a if (q == e) {
5587n/a if (consumed)
5588n/a *consumed = size;
5589n/a _Py_RETURN_UNICODE_EMPTY();
5590n/a }
5591n/a
5592n/a#if PY_LITTLE_ENDIAN
5593n/a native_ordering = bo <= 0;
5594n/a encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
5595n/a#else
5596n/a native_ordering = bo >= 0;
5597n/a encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
5598n/a#endif
5599n/a
5600n/a /* Note: size will always be longer than the resulting Unicode
5601n/a character count */
5602n/a _PyUnicodeWriter_Init(&writer);
5603n/a writer.min_length = (e - q + 1) / 2;
5604n/a if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5605n/a goto onError;
5606n/a
5607n/a while (1) {
5608n/a Py_UCS4 ch = 0;
5609n/a if (e - q >= 2) {
5610n/a int kind = writer.kind;
5611n/a if (kind == PyUnicode_1BYTE_KIND) {
5612n/a if (PyUnicode_IS_ASCII(writer.buffer))
5613n/a ch = asciilib_utf16_decode(&q, e,
5614n/a (Py_UCS1*)writer.data, &writer.pos,
5615n/a native_ordering);
5616n/a else
5617n/a ch = ucs1lib_utf16_decode(&q, e,
5618n/a (Py_UCS1*)writer.data, &writer.pos,
5619n/a native_ordering);
5620n/a } else if (kind == PyUnicode_2BYTE_KIND) {
5621n/a ch = ucs2lib_utf16_decode(&q, e,
5622n/a (Py_UCS2*)writer.data, &writer.pos,
5623n/a native_ordering);
5624n/a } else {
5625n/a assert(kind == PyUnicode_4BYTE_KIND);
5626n/a ch = ucs4lib_utf16_decode(&q, e,
5627n/a (Py_UCS4*)writer.data, &writer.pos,
5628n/a native_ordering);
5629n/a }
5630n/a }
5631n/a
5632n/a switch (ch)
5633n/a {
5634n/a case 0:
5635n/a /* remaining byte at the end? (size should be even) */
5636n/a if (q == e || consumed)
5637n/a goto End;
5638n/a errmsg = "truncated data";
5639n/a startinpos = ((const char *)q) - starts;
5640n/a endinpos = ((const char *)e) - starts;
5641n/a break;
5642n/a /* The remaining input chars are ignored if the callback
5643n/a chooses to skip the input */
5644n/a case 1:
5645n/a q -= 2;
5646n/a if (consumed)
5647n/a goto End;
5648n/a errmsg = "unexpected end of data";
5649n/a startinpos = ((const char *)q) - starts;
5650n/a endinpos = ((const char *)e) - starts;
5651n/a break;
5652n/a case 2:
5653n/a errmsg = "illegal encoding";
5654n/a startinpos = ((const char *)q) - 2 - starts;
5655n/a endinpos = startinpos + 2;
5656n/a break;
5657n/a case 3:
5658n/a errmsg = "illegal UTF-16 surrogate";
5659n/a startinpos = ((const char *)q) - 4 - starts;
5660n/a endinpos = startinpos + 2;
5661n/a break;
5662n/a default:
5663n/a if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5664n/a goto onError;
5665n/a continue;
5666n/a }
5667n/a
5668n/a if (unicode_decode_call_errorhandler_writer(
5669n/a errors,
5670n/a &errorHandler,
5671n/a encoding, errmsg,
5672n/a &starts,
5673n/a (const char **)&e,
5674n/a &startinpos,
5675n/a &endinpos,
5676n/a &exc,
5677n/a (const char **)&q,
5678n/a &writer))
5679n/a goto onError;
5680n/a }
5681n/a
5682n/aEnd:
5683n/a if (consumed)
5684n/a *consumed = (const char *)q-starts;
5685n/a
5686n/a Py_XDECREF(errorHandler);
5687n/a Py_XDECREF(exc);
5688n/a return _PyUnicodeWriter_Finish(&writer);
5689n/a
5690n/a onError:
5691n/a _PyUnicodeWriter_Dealloc(&writer);
5692n/a Py_XDECREF(errorHandler);
5693n/a Py_XDECREF(exc);
5694n/a return NULL;
5695n/a}
5696n/a
5697n/aPyObject *
5698n/a_PyUnicode_EncodeUTF16(PyObject *str,
5699n/a const char *errors,
5700n/a int byteorder)
5701n/a{
5702n/a enum PyUnicode_Kind kind;
5703n/a const void *data;
5704n/a Py_ssize_t len;
5705n/a PyObject *v;
5706n/a unsigned short *out;
5707n/a Py_ssize_t pairs;
5708n/a#if PY_BIG_ENDIAN
5709n/a int native_ordering = byteorder >= 0;
5710n/a#else
5711n/a int native_ordering = byteorder <= 0;
5712n/a#endif
5713n/a const char *encoding;
5714n/a Py_ssize_t nsize, pos;
5715n/a PyObject *errorHandler = NULL;
5716n/a PyObject *exc = NULL;
5717n/a PyObject *rep = NULL;
5718n/a
5719n/a if (!PyUnicode_Check(str)) {
5720n/a PyErr_BadArgument();
5721n/a return NULL;
5722n/a }
5723n/a if (PyUnicode_READY(str) == -1)
5724n/a return NULL;
5725n/a kind = PyUnicode_KIND(str);
5726n/a data = PyUnicode_DATA(str);
5727n/a len = PyUnicode_GET_LENGTH(str);
5728n/a
5729n/a pairs = 0;
5730n/a if (kind == PyUnicode_4BYTE_KIND) {
5731n/a const Py_UCS4 *in = (const Py_UCS4 *)data;
5732n/a const Py_UCS4 *end = in + len;
5733n/a while (in < end) {
5734n/a if (*in++ >= 0x10000) {
5735n/a pairs++;
5736n/a }
5737n/a }
5738n/a }
5739n/a if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
5740n/a return PyErr_NoMemory();
5741n/a }
5742n/a nsize = len + pairs + (byteorder == 0);
5743n/a v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5744n/a if (v == NULL) {
5745n/a return NULL;
5746n/a }
5747n/a
5748n/a /* output buffer is 2-bytes aligned */
5749n/a assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5750n/a out = (unsigned short *)PyBytes_AS_STRING(v);
5751n/a if (byteorder == 0) {
5752n/a *out++ = 0xFEFF;
5753n/a }
5754n/a if (len == 0) {
5755n/a goto done;
5756n/a }
5757n/a
5758n/a if (kind == PyUnicode_1BYTE_KIND) {
5759n/a ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5760n/a goto done;
5761n/a }
5762n/a
5763n/a if (byteorder < 0) {
5764n/a encoding = "utf-16-le";
5765n/a }
5766n/a else if (byteorder > 0) {
5767n/a encoding = "utf-16-be";
5768n/a }
5769n/a else {
5770n/a encoding = "utf-16";
5771n/a }
5772n/a
5773n/a pos = 0;
5774n/a while (pos < len) {
5775n/a Py_ssize_t repsize, moreunits;
5776n/a
5777n/a if (kind == PyUnicode_2BYTE_KIND) {
5778n/a pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5779n/a &out, native_ordering);
5780n/a }
5781n/a else {
5782n/a assert(kind == PyUnicode_4BYTE_KIND);
5783n/a pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5784n/a &out, native_ordering);
5785n/a }
5786n/a if (pos == len)
5787n/a break;
5788n/a
5789n/a rep = unicode_encode_call_errorhandler(
5790n/a errors, &errorHandler,
5791n/a encoding, "surrogates not allowed",
5792n/a str, &exc, pos, pos + 1, &pos);
5793n/a if (!rep)
5794n/a goto error;
5795n/a
5796n/a if (PyBytes_Check(rep)) {
5797n/a repsize = PyBytes_GET_SIZE(rep);
5798n/a if (repsize & 1) {
5799n/a raise_encode_exception(&exc, encoding,
5800n/a str, pos - 1, pos,
5801n/a "surrogates not allowed");
5802n/a goto error;
5803n/a }
5804n/a moreunits = repsize / 2;
5805n/a }
5806n/a else {
5807n/a assert(PyUnicode_Check(rep));
5808n/a if (PyUnicode_READY(rep) < 0)
5809n/a goto error;
5810n/a moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5811n/a if (!PyUnicode_IS_ASCII(rep)) {
5812n/a raise_encode_exception(&exc, encoding,
5813n/a str, pos - 1, pos,
5814n/a "surrogates not allowed");
5815n/a goto error;
5816n/a }
5817n/a }
5818n/a
5819n/a /* two bytes are reserved for each surrogate */
5820n/a if (moreunits > 1) {
5821n/a Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5822n/a Py_ssize_t morebytes = 2 * (moreunits - 1);
5823n/a if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5824n/a /* integer overflow */
5825n/a PyErr_NoMemory();
5826n/a goto error;
5827n/a }
5828n/a if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5829n/a goto error;
5830n/a out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5831n/a }
5832n/a
5833n/a if (PyBytes_Check(rep)) {
5834n/a memcpy(out, PyBytes_AS_STRING(rep), repsize);
5835n/a out += moreunits;
5836n/a } else /* rep is unicode */ {
5837n/a assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5838n/a ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5839n/a &out, native_ordering);
5840n/a }
5841n/a
5842n/a Py_CLEAR(rep);
5843n/a }
5844n/a
5845n/a /* Cut back to size actually needed. This is necessary for, for example,
5846n/a encoding of a string containing isolated surrogates and the 'ignore' handler
5847n/a is used. */
5848n/a nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5849n/a if (nsize != PyBytes_GET_SIZE(v))
5850n/a _PyBytes_Resize(&v, nsize);
5851n/a Py_XDECREF(errorHandler);
5852n/a Py_XDECREF(exc);
5853n/a done:
5854n/a return v;
5855n/a error:
5856n/a Py_XDECREF(rep);
5857n/a Py_XDECREF(errorHandler);
5858n/a Py_XDECREF(exc);
5859n/a Py_XDECREF(v);
5860n/a return NULL;
5861n/a#undef STORECHAR
5862n/a}
5863n/a
5864n/aPyObject *
5865n/aPyUnicode_EncodeUTF16(const Py_UNICODE *s,
5866n/a Py_ssize_t size,
5867n/a const char *errors,
5868n/a int byteorder)
5869n/a{
5870n/a PyObject *result;
5871n/a PyObject *tmp = PyUnicode_FromWideChar(s, size);
5872n/a if (tmp == NULL)
5873n/a return NULL;
5874n/a result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5875n/a Py_DECREF(tmp);
5876n/a return result;
5877n/a}
5878n/a
5879n/aPyObject *
5880n/aPyUnicode_AsUTF16String(PyObject *unicode)
5881n/a{
5882n/a return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5883n/a}
5884n/a
5885n/a/* --- Unicode Escape Codec ----------------------------------------------- */
5886n/a
5887n/astatic _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5888n/a
5889n/aPyObject *
5890n/a_PyUnicode_DecodeUnicodeEscape(const char *s,
5891n/a Py_ssize_t size,
5892n/a const char *errors,
5893n/a const char **first_invalid_escape)
5894n/a{
5895n/a const char *starts = s;
5896n/a _PyUnicodeWriter writer;
5897n/a const char *end;
5898n/a PyObject *errorHandler = NULL;
5899n/a PyObject *exc = NULL;
5900n/a
5901n/a // so we can remember if we've seen an invalid escape char or not
5902n/a *first_invalid_escape = NULL;
5903n/a
5904n/a if (size == 0) {
5905n/a _Py_RETURN_UNICODE_EMPTY();
5906n/a }
5907n/a /* Escaped strings will always be longer than the resulting
5908n/a Unicode string, so we start with size here and then reduce the
5909n/a length after conversion to the true value.
5910n/a (but if the error callback returns a long replacement string
5911n/a we'll have to allocate more space) */
5912n/a _PyUnicodeWriter_Init(&writer);
5913n/a writer.min_length = size;
5914n/a if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5915n/a goto onError;
5916n/a }
5917n/a
5918n/a end = s + size;
5919n/a while (s < end) {
5920n/a unsigned char c = (unsigned char) *s++;
5921n/a Py_UCS4 ch;
5922n/a int count;
5923n/a Py_ssize_t startinpos;
5924n/a Py_ssize_t endinpos;
5925n/a const char *message;
5926n/a
5927n/a#define WRITE_ASCII_CHAR(ch) \
5928n/a do { \
5929n/a assert(ch <= 127); \
5930n/a assert(writer.pos < writer.size); \
5931n/a PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5932n/a } while(0)
5933n/a
5934n/a#define WRITE_CHAR(ch) \
5935n/a do { \
5936n/a if (ch <= writer.maxchar) { \
5937n/a assert(writer.pos < writer.size); \
5938n/a PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5939n/a } \
5940n/a else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5941n/a goto onError; \
5942n/a } \
5943n/a } while(0)
5944n/a
5945n/a /* Non-escape characters are interpreted as Unicode ordinals */
5946n/a if (c != '\\') {
5947n/a WRITE_CHAR(c);
5948n/a continue;
5949n/a }
5950n/a
5951n/a startinpos = s - starts - 1;
5952n/a /* \ - Escapes */
5953n/a if (s >= end) {
5954n/a message = "\\ at end of string";
5955n/a goto error;
5956n/a }
5957n/a c = (unsigned char) *s++;
5958n/a
5959n/a assert(writer.pos < writer.size);
5960n/a switch (c) {
5961n/a
5962n/a /* \x escapes */
5963n/a case '\n': continue;
5964n/a case '\\': WRITE_ASCII_CHAR('\\'); continue;
5965n/a case '\'': WRITE_ASCII_CHAR('\''); continue;
5966n/a case '\"': WRITE_ASCII_CHAR('\"'); continue;
5967n/a case 'b': WRITE_ASCII_CHAR('\b'); continue;
5968n/a /* FF */
5969n/a case 'f': WRITE_ASCII_CHAR('\014'); continue;
5970n/a case 't': WRITE_ASCII_CHAR('\t'); continue;
5971n/a case 'n': WRITE_ASCII_CHAR('\n'); continue;
5972n/a case 'r': WRITE_ASCII_CHAR('\r'); continue;
5973n/a /* VT */
5974n/a case 'v': WRITE_ASCII_CHAR('\013'); continue;
5975n/a /* BEL, not classic C */
5976n/a case 'a': WRITE_ASCII_CHAR('\007'); continue;
5977n/a
5978n/a /* \OOO (octal) escapes */
5979n/a case '0': case '1': case '2': case '3':
5980n/a case '4': case '5': case '6': case '7':
5981n/a ch = c - '0';
5982n/a if (s < end && '0' <= *s && *s <= '7') {
5983n/a ch = (ch<<3) + *s++ - '0';
5984n/a if (s < end && '0' <= *s && *s <= '7') {
5985n/a ch = (ch<<3) + *s++ - '0';
5986n/a }
5987n/a }
5988n/a WRITE_CHAR(ch);
5989n/a continue;
5990n/a
5991n/a /* hex escapes */
5992n/a /* \xXX */
5993n/a case 'x':
5994n/a count = 2;
5995n/a message = "truncated \\xXX escape";
5996n/a goto hexescape;
5997n/a
5998n/a /* \uXXXX */
5999n/a case 'u':
6000n/a count = 4;
6001n/a message = "truncated \\uXXXX escape";
6002n/a goto hexescape;
6003n/a
6004n/a /* \UXXXXXXXX */
6005n/a case 'U':
6006n/a count = 8;
6007n/a message = "truncated \\UXXXXXXXX escape";
6008n/a hexescape:
6009n/a for (ch = 0; count && s < end; ++s, --count) {
6010n/a c = (unsigned char)*s;
6011n/a ch <<= 4;
6012n/a if (c >= '0' && c <= '9') {
6013n/a ch += c - '0';
6014n/a }
6015n/a else if (c >= 'a' && c <= 'f') {
6016n/a ch += c - ('a' - 10);
6017n/a }
6018n/a else if (c >= 'A' && c <= 'F') {
6019n/a ch += c - ('A' - 10);
6020n/a }
6021n/a else {
6022n/a break;
6023n/a }
6024n/a }
6025n/a if (count) {
6026n/a goto error;
6027n/a }
6028n/a
6029n/a /* when we get here, ch is a 32-bit unicode character */
6030n/a if (ch > MAX_UNICODE) {
6031n/a message = "illegal Unicode character";
6032n/a goto error;
6033n/a }
6034n/a
6035n/a WRITE_CHAR(ch);
6036n/a continue;
6037n/a
6038n/a /* \N{name} */
6039n/a case 'N':
6040n/a if (ucnhash_CAPI == NULL) {
6041n/a /* load the unicode data module */
6042n/a ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6043n/a PyUnicodeData_CAPSULE_NAME, 1);
6044n/a if (ucnhash_CAPI == NULL) {
6045n/a PyErr_SetString(
6046n/a PyExc_UnicodeError,
6047n/a "\\N escapes not supported (can't load unicodedata module)"
6048n/a );
6049n/a goto onError;
6050n/a }
6051n/a }
6052n/a
6053n/a message = "malformed \\N character escape";
6054n/a if (*s == '{') {
6055n/a const char *start = ++s;
6056n/a size_t namelen;
6057n/a /* look for the closing brace */
6058n/a while (s < end && *s != '}')
6059n/a s++;
6060n/a namelen = s - start;
6061n/a if (namelen && s < end) {
6062n/a /* found a name. look it up in the unicode database */
6063n/a s++;
6064n/a ch = 0xffffffff; /* in case 'getcode' messes up */
6065n/a if (namelen <= INT_MAX &&
6066n/a ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6067n/a &ch, 0)) {
6068n/a assert(ch <= MAX_UNICODE);
6069n/a WRITE_CHAR(ch);
6070n/a continue;
6071n/a }
6072n/a message = "unknown Unicode character name";
6073n/a }
6074n/a }
6075n/a goto error;
6076n/a
6077n/a default:
6078n/a if (*first_invalid_escape == NULL) {
6079n/a *first_invalid_escape = s-1; /* Back up one char, since we've
6080n/a already incremented s. */
6081n/a }
6082n/a WRITE_ASCII_CHAR('\\');
6083n/a WRITE_CHAR(c);
6084n/a continue;
6085n/a }
6086n/a
6087n/a error:
6088n/a endinpos = s-starts;
6089n/a writer.min_length = end - s + writer.pos;
6090n/a if (unicode_decode_call_errorhandler_writer(
6091n/a errors, &errorHandler,
6092n/a "unicodeescape", message,
6093n/a &starts, &end, &startinpos, &endinpos, &exc, &s,
6094n/a &writer)) {
6095n/a goto onError;
6096n/a }
6097n/a if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6098n/a goto onError;
6099n/a }
6100n/a
6101n/a#undef WRITE_ASCII_CHAR
6102n/a#undef WRITE_CHAR
6103n/a }
6104n/a
6105n/a Py_XDECREF(errorHandler);
6106n/a Py_XDECREF(exc);
6107n/a return _PyUnicodeWriter_Finish(&writer);
6108n/a
6109n/a onError:
6110n/a _PyUnicodeWriter_Dealloc(&writer);
6111n/a Py_XDECREF(errorHandler);
6112n/a Py_XDECREF(exc);
6113n/a return NULL;
6114n/a}
6115n/a
6116n/aPyObject *
6117n/aPyUnicode_DecodeUnicodeEscape(const char *s,
6118n/a Py_ssize_t size,
6119n/a const char *errors)
6120n/a{
6121n/a const char *first_invalid_escape;
6122n/a PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6123n/a &first_invalid_escape);
6124n/a if (result == NULL)
6125n/a return NULL;
6126n/a if (first_invalid_escape != NULL) {
6127n/a if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6128n/a "invalid escape sequence '\\%c'",
6129n/a *first_invalid_escape) < 0) {
6130n/a Py_DECREF(result);
6131n/a return NULL;
6132n/a }
6133n/a }
6134n/a return result;
6135n/a}
6136n/a
6137n/a/* Return a Unicode-Escape string version of the Unicode object. */
6138n/a
6139n/aPyObject *
6140n/aPyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6141n/a{
6142n/a Py_ssize_t i, len;
6143n/a PyObject *repr;
6144n/a char *p;
6145n/a enum PyUnicode_Kind kind;
6146n/a void *data;
6147n/a Py_ssize_t expandsize;
6148n/a
6149n/a /* Initial allocation is based on the longest-possible character
6150n/a escape.
6151n/a
6152n/a For UCS1 strings it's '\xxx', 4 bytes per source character.
6153n/a For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6154n/a For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6155n/a */
6156n/a
6157n/a if (!PyUnicode_Check(unicode)) {
6158n/a PyErr_BadArgument();
6159n/a return NULL;
6160n/a }
6161n/a if (PyUnicode_READY(unicode) == -1) {
6162n/a return NULL;
6163n/a }
6164n/a
6165n/a len = PyUnicode_GET_LENGTH(unicode);
6166n/a if (len == 0) {
6167n/a return PyBytes_FromStringAndSize(NULL, 0);
6168n/a }
6169n/a
6170n/a kind = PyUnicode_KIND(unicode);
6171n/a data = PyUnicode_DATA(unicode);
6172n/a /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6173n/a bytes, and 1 byte characters 4. */
6174n/a expandsize = kind * 2 + 2;
6175n/a if (len > PY_SSIZE_T_MAX / expandsize) {
6176n/a return PyErr_NoMemory();
6177n/a }
6178n/a repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6179n/a if (repr == NULL) {
6180n/a return NULL;
6181n/a }
6182n/a
6183n/a p = PyBytes_AS_STRING(repr);
6184n/a for (i = 0; i < len; i++) {
6185n/a Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6186n/a
6187n/a /* U+0000-U+00ff range */
6188n/a if (ch < 0x100) {
6189n/a if (ch >= ' ' && ch < 127) {
6190n/a if (ch != '\\') {
6191n/a /* Copy printable US ASCII as-is */
6192n/a *p++ = (char) ch;
6193n/a }
6194n/a /* Escape backslashes */
6195n/a else {
6196n/a *p++ = '\\';
6197n/a *p++ = '\\';
6198n/a }
6199n/a }
6200n/a
6201n/a /* Map special whitespace to '\t', \n', '\r' */
6202n/a else if (ch == '\t') {
6203n/a *p++ = '\\';
6204n/a *p++ = 't';
6205n/a }
6206n/a else if (ch == '\n') {
6207n/a *p++ = '\\';
6208n/a *p++ = 'n';
6209n/a }
6210n/a else if (ch == '\r') {
6211n/a *p++ = '\\';
6212n/a *p++ = 'r';
6213n/a }
6214n/a
6215n/a /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6216n/a else {
6217n/a *p++ = '\\';
6218n/a *p++ = 'x';
6219n/a *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6220n/a *p++ = Py_hexdigits[ch & 0x000F];
6221n/a }
6222n/a }
6223n/a /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6224n/a else if (ch < 0x10000) {
6225n/a *p++ = '\\';
6226n/a *p++ = 'u';
6227n/a *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6228n/a *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6229n/a *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6230n/a *p++ = Py_hexdigits[ch & 0x000F];
6231n/a }
6232n/a /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6233n/a else {
6234n/a
6235n/a /* Make sure that the first two digits are zero */
6236n/a assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6237n/a *p++ = '\\';
6238n/a *p++ = 'U';
6239n/a *p++ = '0';
6240n/a *p++ = '0';
6241n/a *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6242n/a *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6243n/a *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6244n/a *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6245n/a *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6246n/a *p++ = Py_hexdigits[ch & 0x0000000F];
6247n/a }
6248n/a }
6249n/a
6250n/a assert(p - PyBytes_AS_STRING(repr) > 0);
6251n/a if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6252n/a return NULL;
6253n/a }
6254n/a return repr;
6255n/a}
6256n/a
6257n/aPyObject *
6258n/aPyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6259n/a Py_ssize_t size)
6260n/a{
6261n/a PyObject *result;
6262n/a PyObject *tmp = PyUnicode_FromWideChar(s, size);
6263n/a if (tmp == NULL) {
6264n/a return NULL;
6265n/a }
6266n/a
6267n/a result = PyUnicode_AsUnicodeEscapeString(tmp);
6268n/a Py_DECREF(tmp);
6269n/a return result;
6270n/a}
6271n/a
6272n/a/* --- Raw Unicode Escape Codec ------------------------------------------- */
6273n/a
6274n/aPyObject *
6275n/aPyUnicode_DecodeRawUnicodeEscape(const char *s,
6276n/a Py_ssize_t size,
6277n/a const char *errors)
6278n/a{
6279n/a const char *starts = s;
6280n/a _PyUnicodeWriter writer;
6281n/a const char *end;
6282n/a PyObject *errorHandler = NULL;
6283n/a PyObject *exc = NULL;
6284n/a
6285n/a if (size == 0) {
6286n/a _Py_RETURN_UNICODE_EMPTY();
6287n/a }
6288n/a
6289n/a /* Escaped strings will always be longer than the resulting
6290n/a Unicode string, so we start with size here and then reduce the
6291n/a length after conversion to the true value. (But decoding error
6292n/a handler might have to resize the string) */
6293n/a _PyUnicodeWriter_Init(&writer);
6294n/a writer.min_length = size;
6295n/a if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6296n/a goto onError;
6297n/a }
6298n/a
6299n/a end = s + size;
6300n/a while (s < end) {
6301n/a unsigned char c = (unsigned char) *s++;
6302n/a Py_UCS4 ch;
6303n/a int count;
6304n/a Py_ssize_t startinpos;
6305n/a Py_ssize_t endinpos;
6306n/a const char *message;
6307n/a
6308n/a#define WRITE_CHAR(ch) \
6309n/a do { \
6310n/a if (ch <= writer.maxchar) { \
6311n/a assert(writer.pos < writer.size); \
6312n/a PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6313n/a } \
6314n/a else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6315n/a goto onError; \
6316n/a } \
6317n/a } while(0)
6318n/a
6319n/a /* Non-escape characters are interpreted as Unicode ordinals */
6320n/a if (c != '\\' || s >= end) {
6321n/a WRITE_CHAR(c);
6322n/a continue;
6323n/a }
6324n/a
6325n/a c = (unsigned char) *s++;
6326n/a if (c == 'u') {
6327n/a count = 4;
6328n/a message = "truncated \\uXXXX escape";
6329n/a }
6330n/a else if (c == 'U') {
6331n/a count = 8;
6332n/a message = "truncated \\UXXXXXXXX escape";
6333n/a }
6334n/a else {
6335n/a assert(writer.pos < writer.size);
6336n/a PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6337n/a WRITE_CHAR(c);
6338n/a continue;
6339n/a }
6340n/a startinpos = s - starts - 2;
6341n/a
6342n/a /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6343n/a for (ch = 0; count && s < end; ++s, --count) {
6344n/a c = (unsigned char)*s;
6345n/a ch <<= 4;
6346n/a if (c >= '0' && c <= '9') {
6347n/a ch += c - '0';
6348n/a }
6349n/a else if (c >= 'a' && c <= 'f') {
6350n/a ch += c - ('a' - 10);
6351n/a }
6352n/a else if (c >= 'A' && c <= 'F') {
6353n/a ch += c - ('A' - 10);
6354n/a }
6355n/a else {
6356n/a break;
6357n/a }
6358n/a }
6359n/a if (!count) {
6360n/a if (ch <= MAX_UNICODE) {
6361n/a WRITE_CHAR(ch);
6362n/a continue;
6363n/a }
6364n/a message = "\\Uxxxxxxxx out of range";
6365n/a }
6366n/a
6367n/a endinpos = s-starts;
6368n/a writer.min_length = end - s + writer.pos;
6369n/a if (unicode_decode_call_errorhandler_writer(
6370n/a errors, &errorHandler,
6371n/a "rawunicodeescape", message,
6372n/a &starts, &end, &startinpos, &endinpos, &exc, &s,
6373n/a &writer)) {
6374n/a goto onError;
6375n/a }
6376n/a if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6377n/a goto onError;
6378n/a }
6379n/a
6380n/a#undef WRITE_CHAR
6381n/a }
6382n/a Py_XDECREF(errorHandler);
6383n/a Py_XDECREF(exc);
6384n/a return _PyUnicodeWriter_Finish(&writer);
6385n/a
6386n/a onError:
6387n/a _PyUnicodeWriter_Dealloc(&writer);
6388n/a Py_XDECREF(errorHandler);
6389n/a Py_XDECREF(exc);
6390n/a return NULL;
6391n/a
6392n/a}
6393n/a
6394n/a
6395n/aPyObject *
6396n/aPyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6397n/a{
6398n/a PyObject *repr;
6399n/a char *p;
6400n/a Py_ssize_t expandsize, pos;
6401n/a int kind;
6402n/a void *data;
6403n/a Py_ssize_t len;
6404n/a
6405n/a if (!PyUnicode_Check(unicode)) {
6406n/a PyErr_BadArgument();
6407n/a return NULL;
6408n/a }
6409n/a if (PyUnicode_READY(unicode) == -1) {
6410n/a return NULL;
6411n/a }
6412n/a kind = PyUnicode_KIND(unicode);
6413n/a data = PyUnicode_DATA(unicode);
6414n/a len = PyUnicode_GET_LENGTH(unicode);
6415n/a if (kind == PyUnicode_1BYTE_KIND) {
6416n/a return PyBytes_FromStringAndSize(data, len);
6417n/a }
6418n/a
6419n/a /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6420n/a bytes, and 1 byte characters 4. */
6421n/a expandsize = kind * 2 + 2;
6422n/a
6423n/a if (len > PY_SSIZE_T_MAX / expandsize) {
6424n/a return PyErr_NoMemory();
6425n/a }
6426n/a repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6427n/a if (repr == NULL) {
6428n/a return NULL;
6429n/a }
6430n/a if (len == 0) {
6431n/a return repr;
6432n/a }
6433n/a
6434n/a p = PyBytes_AS_STRING(repr);
6435n/a for (pos = 0; pos < len; pos++) {
6436n/a Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6437n/a
6438n/a /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6439n/a if (ch < 0x100) {
6440n/a *p++ = (char) ch;
6441n/a }
6442n/a /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6443n/a else if (ch < 0x10000) {
6444n/a *p++ = '\\';
6445n/a *p++ = 'u';
6446n/a *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6447n/a *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6448n/a *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6449n/a *p++ = Py_hexdigits[ch & 15];
6450n/a }
6451n/a /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6452n/a else {
6453n/a assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6454n/a *p++ = '\\';
6455n/a *p++ = 'U';
6456n/a *p++ = '0';
6457n/a *p++ = '0';
6458n/a *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6459n/a *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6460n/a *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6461n/a *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6462n/a *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6463n/a *p++ = Py_hexdigits[ch & 15];
6464n/a }
6465n/a }
6466n/a
6467n/a assert(p > PyBytes_AS_STRING(repr));
6468n/a if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6469n/a return NULL;
6470n/a }
6471n/a return repr;
6472n/a}
6473n/a
6474n/aPyObject *
6475n/aPyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6476n/a Py_ssize_t size)
6477n/a{
6478n/a PyObject *result;
6479n/a PyObject *tmp = PyUnicode_FromWideChar(s, size);
6480n/a if (tmp == NULL)
6481n/a return NULL;
6482n/a result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6483n/a Py_DECREF(tmp);
6484n/a return result;
6485n/a}
6486n/a
6487n/a/* --- Unicode Internal Codec ------------------------------------------- */
6488n/a
6489n/aPyObject *
6490n/a_PyUnicode_DecodeUnicodeInternal(const char *s,
6491n/a Py_ssize_t size,
6492n/a const char *errors)
6493n/a{
6494n/a const char *starts = s;
6495n/a Py_ssize_t startinpos;
6496n/a Py_ssize_t endinpos;
6497n/a _PyUnicodeWriter writer;
6498n/a const char *end;
6499n/a const char *reason;
6500n/a PyObject *errorHandler = NULL;
6501n/a PyObject *exc = NULL;
6502n/a
6503n/a if (PyErr_WarnEx(PyExc_DeprecationWarning,
6504n/a "unicode_internal codec has been deprecated",
6505n/a 1))
6506n/a return NULL;
6507n/a
6508n/a if (size == 0)
6509n/a _Py_RETURN_UNICODE_EMPTY();
6510n/a
6511n/a _PyUnicodeWriter_Init(&writer);
6512n/a if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6513n/a PyErr_NoMemory();
6514n/a goto onError;
6515n/a }
6516n/a writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
6517n/a
6518n/a end = s + size;
6519n/a while (s < end) {
6520n/a Py_UNICODE uch;
6521n/a Py_UCS4 ch;
6522n/a if (end - s < Py_UNICODE_SIZE) {
6523n/a endinpos = end-starts;
6524n/a reason = "truncated input";
6525n/a goto error;
6526n/a }
6527n/a /* We copy the raw representation one byte at a time because the
6528n/a pointer may be unaligned (see test_codeccallbacks). */
6529n/a ((char *) &uch)[0] = s[0];
6530n/a ((char *) &uch)[1] = s[1];
6531n/a#ifdef Py_UNICODE_WIDE
6532n/a ((char *) &uch)[2] = s[2];
6533n/a ((char *) &uch)[3] = s[3];
6534n/a#endif
6535n/a ch = uch;
6536n/a#ifdef Py_UNICODE_WIDE
6537n/a /* We have to sanity check the raw data, otherwise doom looms for
6538n/a some malformed UCS-4 data. */
6539n/a if (ch > 0x10ffff) {
6540n/a endinpos = s - starts + Py_UNICODE_SIZE;
6541n/a reason = "illegal code point (> 0x10FFFF)";
6542n/a goto error;
6543n/a }
6544n/a#endif
6545n/a s += Py_UNICODE_SIZE;
6546n/a#ifndef Py_UNICODE_WIDE
6547n/a if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
6548n/a {
6549n/a Py_UNICODE uch2;
6550n/a ((char *) &uch2)[0] = s[0];
6551n/a ((char *) &uch2)[1] = s[1];
6552n/a if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
6553n/a {
6554n/a ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
6555n/a s += Py_UNICODE_SIZE;
6556n/a }
6557n/a }
6558n/a#endif
6559n/a
6560n/a if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6561n/a goto onError;
6562n/a continue;
6563n/a
6564n/a error:
6565n/a startinpos = s - starts;
6566n/a if (unicode_decode_call_errorhandler_writer(
6567n/a errors, &errorHandler,
6568n/a "unicode_internal", reason,
6569n/a &starts, &end, &startinpos, &endinpos, &exc, &s,
6570n/a &writer))
6571n/a goto onError;
6572n/a }
6573n/a
6574n/a Py_XDECREF(errorHandler);
6575n/a Py_XDECREF(exc);
6576n/a return _PyUnicodeWriter_Finish(&writer);
6577n/a
6578n/a onError:
6579n/a _PyUnicodeWriter_Dealloc(&writer);
6580n/a Py_XDECREF(errorHandler);
6581n/a Py_XDECREF(exc);
6582n/a return NULL;
6583n/a}
6584n/a
6585n/a/* --- Latin-1 Codec ------------------------------------------------------ */
6586n/a
6587n/aPyObject *
6588n/aPyUnicode_DecodeLatin1(const char *s,
6589n/a Py_ssize_t size,
6590n/a const char *errors)
6591n/a{
6592n/a /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6593n/a return _PyUnicode_FromUCS1((unsigned char*)s, size);
6594n/a}
6595n/a
6596n/a/* create or adjust a UnicodeEncodeError */
6597n/astatic void
6598n/amake_encode_exception(PyObject **exceptionObject,
6599n/a const char *encoding,
6600n/a PyObject *unicode,
6601n/a Py_ssize_t startpos, Py_ssize_t endpos,
6602n/a const char *reason)
6603n/a{
6604n/a if (*exceptionObject == NULL) {
6605n/a *exceptionObject = PyObject_CallFunction(
6606n/a PyExc_UnicodeEncodeError, "sOnns",
6607n/a encoding, unicode, startpos, endpos, reason);
6608n/a }
6609n/a else {
6610n/a if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6611n/a goto onError;
6612n/a if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6613n/a goto onError;
6614n/a if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6615n/a goto onError;
6616n/a return;
6617n/a onError:
6618n/a Py_CLEAR(*exceptionObject);
6619n/a }
6620n/a}
6621n/a
6622n/a/* raises a UnicodeEncodeError */
6623n/astatic void
6624n/araise_encode_exception(PyObject **exceptionObject,
6625n/a const char *encoding,
6626n/a PyObject *unicode,
6627n/a Py_ssize_t startpos, Py_ssize_t endpos,
6628n/a const char *reason)
6629n/a{
6630n/a make_encode_exception(exceptionObject,
6631n/a encoding, unicode, startpos, endpos, reason);
6632n/a if (*exceptionObject != NULL)
6633n/a PyCodec_StrictErrors(*exceptionObject);
6634n/a}
6635n/a
6636n/a/* error handling callback helper:
6637n/a build arguments, call the callback and check the arguments,
6638n/a put the result into newpos and return the replacement string, which
6639n/a has to be freed by the caller */
6640n/astatic PyObject *
6641n/aunicode_encode_call_errorhandler(const char *errors,
6642n/a PyObject **errorHandler,
6643n/a const char *encoding, const char *reason,
6644n/a PyObject *unicode, PyObject **exceptionObject,
6645n/a Py_ssize_t startpos, Py_ssize_t endpos,
6646n/a Py_ssize_t *newpos)
6647n/a{
6648n/a static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6649n/a Py_ssize_t len;
6650n/a PyObject *restuple;
6651n/a PyObject *resunicode;
6652n/a
6653n/a if (*errorHandler == NULL) {
6654n/a *errorHandler = PyCodec_LookupError(errors);
6655n/a if (*errorHandler == NULL)
6656n/a return NULL;
6657n/a }
6658n/a
6659n/a if (PyUnicode_READY(unicode) == -1)
6660n/a return NULL;
6661n/a len = PyUnicode_GET_LENGTH(unicode);
6662n/a
6663n/a make_encode_exception(exceptionObject,
6664n/a encoding, unicode, startpos, endpos, reason);
6665n/a if (*exceptionObject == NULL)
6666n/a return NULL;
6667n/a
6668n/a restuple = PyObject_CallFunctionObjArgs(
6669n/a *errorHandler, *exceptionObject, NULL);
6670n/a if (restuple == NULL)
6671n/a return NULL;
6672n/a if (!PyTuple_Check(restuple)) {
6673n/a PyErr_SetString(PyExc_TypeError, &argparse[3]);
6674n/a Py_DECREF(restuple);
6675n/a return NULL;
6676n/a }
6677n/a if (!PyArg_ParseTuple(restuple, argparse,
6678n/a &resunicode, newpos)) {
6679n/a Py_DECREF(restuple);
6680n/a return NULL;
6681n/a }
6682n/a if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6683n/a PyErr_SetString(PyExc_TypeError, &argparse[3]);
6684n/a Py_DECREF(restuple);
6685n/a return NULL;
6686n/a }
6687n/a if (*newpos<0)
6688n/a *newpos = len + *newpos;
6689n/a if (*newpos<0 || *newpos>len) {
6690n/a PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6691n/a Py_DECREF(restuple);
6692n/a return NULL;
6693n/a }
6694n/a Py_INCREF(resunicode);
6695n/a Py_DECREF(restuple);
6696n/a return resunicode;
6697n/a}
6698n/a
6699n/astatic PyObject *
6700n/aunicode_encode_ucs1(PyObject *unicode,
6701n/a const char *errors,
6702n/a const Py_UCS4 limit)
6703n/a{
6704n/a /* input state */
6705n/a Py_ssize_t pos=0, size;
6706n/a int kind;
6707n/a void *data;
6708n/a /* pointer into the output */
6709n/a char *str;
6710n/a const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6711n/a const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6712n/a PyObject *error_handler_obj = NULL;
6713n/a PyObject *exc = NULL;
6714n/a _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6715n/a PyObject *rep = NULL;
6716n/a /* output object */
6717n/a _PyBytesWriter writer;
6718n/a
6719n/a if (PyUnicode_READY(unicode) == -1)
6720n/a return NULL;
6721n/a size = PyUnicode_GET_LENGTH(unicode);
6722n/a kind = PyUnicode_KIND(unicode);
6723n/a data = PyUnicode_DATA(unicode);
6724n/a /* allocate enough for a simple encoding without
6725n/a replacements, if we need more, we'll resize */
6726n/a if (size == 0)
6727n/a return PyBytes_FromStringAndSize(NULL, 0);
6728n/a
6729n/a _PyBytesWriter_Init(&writer);
6730n/a str = _PyBytesWriter_Alloc(&writer, size);
6731n/a if (str == NULL)
6732n/a return NULL;
6733n/a
6734n/a while (pos < size) {
6735n/a Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6736n/a
6737n/a /* can we encode this? */
6738n/a if (ch < limit) {
6739n/a /* no overflow check, because we know that the space is enough */
6740n/a *str++ = (char)ch;
6741n/a ++pos;
6742n/a }
6743n/a else {
6744n/a Py_ssize_t newpos, i;
6745n/a /* startpos for collecting unencodable chars */
6746n/a Py_ssize_t collstart = pos;
6747n/a Py_ssize_t collend = collstart + 1;
6748n/a /* find all unecodable characters */
6749n/a
6750n/a while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6751n/a ++collend;
6752n/a
6753n/a /* Only overallocate the buffer if it's not the last write */
6754n/a writer.overallocate = (collend < size);
6755n/a
6756n/a /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6757n/a if (error_handler == _Py_ERROR_UNKNOWN)
6758n/a error_handler = get_error_handler(errors);
6759n/a
6760n/a switch (error_handler) {
6761n/a case _Py_ERROR_STRICT:
6762n/a raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6763n/a goto onError;
6764n/a
6765n/a case _Py_ERROR_REPLACE:
6766n/a memset(str, '?', collend - collstart);
6767n/a str += (collend - collstart);
6768n/a /* fall through ignore error handler */
6769n/a case _Py_ERROR_IGNORE:
6770n/a pos = collend;
6771n/a break;
6772n/a
6773n/a case _Py_ERROR_BACKSLASHREPLACE:
6774n/a /* subtract preallocated bytes */
6775n/a writer.min_size -= (collend - collstart);
6776n/a str = backslashreplace(&writer, str,
6777n/a unicode, collstart, collend);
6778n/a if (str == NULL)
6779n/a goto onError;
6780n/a pos = collend;
6781n/a break;
6782n/a
6783n/a case _Py_ERROR_XMLCHARREFREPLACE:
6784n/a /* subtract preallocated bytes */
6785n/a writer.min_size -= (collend - collstart);
6786n/a str = xmlcharrefreplace(&writer, str,
6787n/a unicode, collstart, collend);
6788n/a if (str == NULL)
6789n/a goto onError;
6790n/a pos = collend;
6791n/a break;
6792n/a
6793n/a case _Py_ERROR_SURROGATEESCAPE:
6794n/a for (i = collstart; i < collend; ++i) {
6795n/a ch = PyUnicode_READ(kind, data, i);
6796n/a if (ch < 0xdc80 || 0xdcff < ch) {
6797n/a /* Not a UTF-8b surrogate */
6798n/a break;
6799n/a }
6800n/a *str++ = (char)(ch - 0xdc00);
6801n/a ++pos;
6802n/a }
6803n/a if (i >= collend)
6804n/a break;
6805n/a collstart = pos;
6806n/a assert(collstart != collend);
6807n/a /* fallback to general error handling */
6808n/a
6809n/a default:
6810n/a rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6811n/a encoding, reason, unicode, &exc,
6812n/a collstart, collend, &newpos);
6813n/a if (rep == NULL)
6814n/a goto onError;
6815n/a
6816n/a /* subtract preallocated bytes */
6817n/a writer.min_size -= newpos - collstart;
6818n/a
6819n/a if (PyBytes_Check(rep)) {
6820n/a /* Directly copy bytes result to output. */
6821n/a str = _PyBytesWriter_WriteBytes(&writer, str,
6822n/a PyBytes_AS_STRING(rep),
6823n/a PyBytes_GET_SIZE(rep));
6824n/a if (str == NULL)
6825n/a goto onError;
6826n/a }
6827n/a else {
6828n/a assert(PyUnicode_Check(rep));
6829n/a
6830n/a if (PyUnicode_READY(rep) < 0)
6831n/a goto onError;
6832n/a
6833n/a if (limit == 256 ?
6834n/a PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6835n/a !PyUnicode_IS_ASCII(rep))
6836n/a {
6837n/a /* Not all characters are smaller than limit */
6838n/a raise_encode_exception(&exc, encoding, unicode,
6839n/a collstart, collend, reason);
6840n/a goto onError;
6841n/a }
6842n/a assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6843n/a str = _PyBytesWriter_WriteBytes(&writer, str,
6844n/a PyUnicode_DATA(rep),
6845n/a PyUnicode_GET_LENGTH(rep));
6846n/a }
6847n/a pos = newpos;
6848n/a Py_CLEAR(rep);
6849n/a }
6850n/a
6851n/a /* If overallocation was disabled, ensure that it was the last
6852n/a write. Otherwise, we missed an optimization */
6853n/a assert(writer.overallocate || pos == size);
6854n/a }
6855n/a }
6856n/a
6857n/a Py_XDECREF(error_handler_obj);
6858n/a Py_XDECREF(exc);
6859n/a return _PyBytesWriter_Finish(&writer, str);
6860n/a
6861n/a onError:
6862n/a Py_XDECREF(rep);
6863n/a _PyBytesWriter_Dealloc(&writer);
6864n/a Py_XDECREF(error_handler_obj);
6865n/a Py_XDECREF(exc);
6866n/a return NULL;
6867n/a}
6868n/a
6869n/a/* Deprecated */
6870n/aPyObject *
6871n/aPyUnicode_EncodeLatin1(const Py_UNICODE *p,
6872n/a Py_ssize_t size,
6873n/a const char *errors)
6874n/a{
6875n/a PyObject *result;
6876n/a PyObject *unicode = PyUnicode_FromWideChar(p, size);
6877n/a if (unicode == NULL)
6878n/a return NULL;
6879n/a result = unicode_encode_ucs1(unicode, errors, 256);
6880n/a Py_DECREF(unicode);
6881n/a return result;
6882n/a}
6883n/a
6884n/aPyObject *
6885n/a_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6886n/a{
6887n/a if (!PyUnicode_Check(unicode)) {
6888n/a PyErr_BadArgument();
6889n/a return NULL;
6890n/a }
6891n/a if (PyUnicode_READY(unicode) == -1)
6892n/a return NULL;
6893n/a /* Fast path: if it is a one-byte string, construct
6894n/a bytes object directly. */
6895n/a if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6896n/a return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6897n/a PyUnicode_GET_LENGTH(unicode));
6898n/a /* Non-Latin-1 characters present. Defer to above function to
6899n/a raise the exception. */
6900n/a return unicode_encode_ucs1(unicode, errors, 256);
6901n/a}
6902n/a
6903n/aPyObject*
6904n/aPyUnicode_AsLatin1String(PyObject *unicode)
6905n/a{
6906n/a return _PyUnicode_AsLatin1String(unicode, NULL);
6907n/a}
6908n/a
6909n/a/* --- 7-bit ASCII Codec -------------------------------------------------- */
6910n/a
6911n/aPyObject *
6912n/aPyUnicode_DecodeASCII(const char *s,
6913n/a Py_ssize_t size,
6914n/a const char *errors)
6915n/a{
6916n/a const char *starts = s;
6917n/a _PyUnicodeWriter writer;
6918n/a int kind;
6919n/a void *data;
6920n/a Py_ssize_t startinpos;
6921n/a Py_ssize_t endinpos;
6922n/a Py_ssize_t outpos;
6923n/a const char *e;
6924n/a PyObject *error_handler_obj = NULL;
6925n/a PyObject *exc = NULL;
6926n/a _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6927n/a
6928n/a if (size == 0)
6929n/a _Py_RETURN_UNICODE_EMPTY();
6930n/a
6931n/a /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6932n/a if (size == 1 && (unsigned char)s[0] < 128)
6933n/a return get_latin1_char((unsigned char)s[0]);
6934n/a
6935n/a _PyUnicodeWriter_Init(&writer);
6936n/a writer.min_length = size;
6937n/a if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
6938n/a return NULL;
6939n/a
6940n/a e = s + size;
6941n/a data = writer.data;
6942n/a outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6943n/a writer.pos = outpos;
6944n/a if (writer.pos == size)
6945n/a return _PyUnicodeWriter_Finish(&writer);
6946n/a
6947n/a s += writer.pos;
6948n/a kind = writer.kind;
6949n/a while (s < e) {
6950n/a unsigned char c = (unsigned char)*s;
6951n/a if (c < 128) {
6952n/a PyUnicode_WRITE(kind, data, writer.pos, c);
6953n/a writer.pos++;
6954n/a ++s;
6955n/a continue;
6956n/a }
6957n/a
6958n/a /* byte outsize range 0x00..0x7f: call the error handler */
6959n/a
6960n/a if (error_handler == _Py_ERROR_UNKNOWN)
6961n/a error_handler = get_error_handler(errors);
6962n/a
6963n/a switch (error_handler)
6964n/a {
6965n/a case _Py_ERROR_REPLACE:
6966n/a case _Py_ERROR_SURROGATEESCAPE:
6967n/a /* Fast-path: the error handler only writes one character,
6968n/a but we may switch to UCS2 at the first write */
6969n/a if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6970n/a goto onError;
6971n/a kind = writer.kind;
6972n/a data = writer.data;
6973n/a
6974n/a if (error_handler == _Py_ERROR_REPLACE)
6975n/a PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6976n/a else
6977n/a PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6978n/a writer.pos++;
6979n/a ++s;
6980n/a break;
6981n/a
6982n/a case _Py_ERROR_IGNORE:
6983n/a ++s;
6984n/a break;
6985n/a
6986n/a default:
6987n/a startinpos = s-starts;
6988n/a endinpos = startinpos + 1;
6989n/a if (unicode_decode_call_errorhandler_writer(
6990n/a errors, &error_handler_obj,
6991n/a "ascii", "ordinal not in range(128)",
6992n/a &starts, &e, &startinpos, &endinpos, &exc, &s,
6993n/a &writer))
6994n/a goto onError;
6995n/a kind = writer.kind;
6996n/a data = writer.data;
6997n/a }
6998n/a }
6999n/a Py_XDECREF(error_handler_obj);
7000n/a Py_XDECREF(exc);
7001n/a return _PyUnicodeWriter_Finish(&writer);
7002n/a
7003n/a onError:
7004n/a _PyUnicodeWriter_Dealloc(&writer);
7005n/a Py_XDECREF(error_handler_obj);
7006n/a Py_XDECREF(exc);
7007n/a return NULL;
7008n/a}
7009n/a
7010n/a/* Deprecated */
7011n/aPyObject *
7012n/aPyUnicode_EncodeASCII(const Py_UNICODE *p,
7013n/a Py_ssize_t size,
7014n/a const char *errors)
7015n/a{
7016n/a PyObject *result;
7017n/a PyObject *unicode = PyUnicode_FromWideChar(p, size);
7018n/a if (unicode == NULL)
7019n/a return NULL;
7020n/a result = unicode_encode_ucs1(unicode, errors, 128);
7021n/a Py_DECREF(unicode);
7022n/a return result;
7023n/a}
7024n/a
7025n/aPyObject *
7026n/a_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7027n/a{
7028n/a if (!PyUnicode_Check(unicode)) {
7029n/a PyErr_BadArgument();
7030n/a return NULL;
7031n/a }
7032n/a if (PyUnicode_READY(unicode) == -1)
7033n/a return NULL;
7034n/a /* Fast path: if it is an ASCII-only string, construct bytes object
7035n/a directly. Else defer to above function to raise the exception. */
7036n/a if (PyUnicode_IS_ASCII(unicode))
7037n/a return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7038n/a PyUnicode_GET_LENGTH(unicode));
7039n/a return unicode_encode_ucs1(unicode, errors, 128);
7040n/a}
7041n/a
7042n/aPyObject *
7043n/aPyUnicode_AsASCIIString(PyObject *unicode)
7044n/a{
7045n/a return _PyUnicode_AsASCIIString(unicode, NULL);
7046n/a}
7047n/a
7048n/a#ifdef MS_WINDOWS
7049n/a
7050n/a/* --- MBCS codecs for Windows -------------------------------------------- */
7051n/a
7052n/a#if SIZEOF_INT < SIZEOF_SIZE_T
7053n/a#define NEED_RETRY
7054n/a#endif
7055n/a
7056n/a#ifndef WC_ERR_INVALID_CHARS
7057n/a# define WC_ERR_INVALID_CHARS 0x0080
7058n/a#endif
7059n/a
7060n/astatic const char*
7061n/acode_page_name(UINT code_page, PyObject **obj)
7062n/a{
7063n/a *obj = NULL;
7064n/a if (code_page == CP_ACP)
7065n/a return "mbcs";
7066n/a if (code_page == CP_UTF7)
7067n/a return "CP_UTF7";
7068n/a if (code_page == CP_UTF8)
7069n/a return "CP_UTF8";
7070n/a
7071n/a *obj = PyBytes_FromFormat("cp%u", code_page);
7072n/a if (*obj == NULL)
7073n/a return NULL;
7074n/a return PyBytes_AS_STRING(*obj);
7075n/a}
7076n/a
7077n/astatic DWORD
7078n/adecode_code_page_flags(UINT code_page)
7079n/a{
7080n/a if (code_page == CP_UTF7) {
7081n/a /* The CP_UTF7 decoder only supports flags=0 */
7082n/a return 0;
7083n/a }
7084n/a else
7085n/a return MB_ERR_INVALID_CHARS;
7086n/a}
7087n/a
7088n/a/*
7089n/a * Decode a byte string from a Windows code page into unicode object in strict
7090n/a * mode.
7091n/a *
7092n/a * Returns consumed size if succeed, returns -2 on decode error, or raise an
7093n/a * OSError and returns -1 on other error.
7094n/a */
7095n/astatic int
7096n/adecode_code_page_strict(UINT code_page,
7097n/a PyObject **v,
7098n/a const char *in,
7099n/a int insize)
7100n/a{
7101n/a const DWORD flags = decode_code_page_flags(code_page);
7102n/a wchar_t *out;
7103n/a DWORD outsize;
7104n/a
7105n/a /* First get the size of the result */
7106n/a assert(insize > 0);
7107n/a outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7108n/a if (outsize <= 0)
7109n/a goto error;
7110n/a
7111n/a if (*v == NULL) {
7112n/a /* Create unicode object */
7113n/a /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
7114n/a *v = (PyObject*)_PyUnicode_New(outsize);
7115n/a if (*v == NULL)
7116n/a return -1;
7117n/a out = PyUnicode_AS_UNICODE(*v);
7118n/a }
7119n/a else {
7120n/a /* Extend unicode object */
7121n/a Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7122n/a if (unicode_resize(v, n + outsize) < 0)
7123n/a return -1;
7124n/a out = PyUnicode_AS_UNICODE(*v) + n;
7125n/a }
7126n/a
7127n/a /* Do the conversion */
7128n/a outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7129n/a if (outsize <= 0)
7130n/a goto error;
7131n/a return insize;
7132n/a
7133n/aerror:
7134n/a if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7135n/a return -2;
7136n/a PyErr_SetFromWindowsErr(0);
7137n/a return -1;
7138n/a}
7139n/a
7140n/a/*
7141n/a * Decode a byte string from a code page into unicode object with an error
7142n/a * handler.
7143n/a *
7144n/a * Returns consumed size if succeed, or raise an OSError or
7145n/a * UnicodeDecodeError exception and returns -1 on error.
7146n/a */
7147n/astatic int
7148n/adecode_code_page_errors(UINT code_page,
7149n/a PyObject **v,
7150n/a const char *in, const int size,
7151n/a const char *errors, int final)
7152n/a{
7153n/a const char *startin = in;
7154n/a const char *endin = in + size;
7155n/a const DWORD flags = decode_code_page_flags(code_page);
7156n/a /* Ideally, we should get reason from FormatMessage. This is the Windows
7157n/a 2000 English version of the message. */
7158n/a const char *reason = "No mapping for the Unicode character exists "
7159n/a "in the target code page.";
7160n/a /* each step cannot decode more than 1 character, but a character can be
7161n/a represented as a surrogate pair */
7162n/a wchar_t buffer[2], *startout, *out;
7163n/a int insize;
7164n/a Py_ssize_t outsize;
7165n/a PyObject *errorHandler = NULL;
7166n/a PyObject *exc = NULL;
7167n/a PyObject *encoding_obj = NULL;
7168n/a const char *encoding;
7169n/a DWORD err;
7170n/a int ret = -1;
7171n/a
7172n/a assert(size > 0);
7173n/a
7174n/a encoding = code_page_name(code_page, &encoding_obj);
7175n/a if (encoding == NULL)
7176n/a return -1;
7177n/a
7178n/a if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7179n/a /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7180n/a UnicodeDecodeError. */
7181n/a make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7182n/a if (exc != NULL) {
7183n/a PyCodec_StrictErrors(exc);
7184n/a Py_CLEAR(exc);
7185n/a }
7186n/a goto error;
7187n/a }
7188n/a
7189n/a if (*v == NULL) {
7190n/a /* Create unicode object */
7191n/a if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7192n/a PyErr_NoMemory();
7193n/a goto error;
7194n/a }
7195n/a /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
7196n/a *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
7197n/a if (*v == NULL)
7198n/a goto error;
7199n/a startout = PyUnicode_AS_UNICODE(*v);
7200n/a }
7201n/a else {
7202n/a /* Extend unicode object */
7203n/a Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7204n/a if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7205n/a PyErr_NoMemory();
7206n/a goto error;
7207n/a }
7208n/a if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
7209n/a goto error;
7210n/a startout = PyUnicode_AS_UNICODE(*v) + n;
7211n/a }
7212n/a
7213n/a /* Decode the byte string character per character */
7214n/a out = startout;
7215n/a while (in < endin)
7216n/a {
7217n/a /* Decode a character */
7218n/a insize = 1;
7219n/a do
7220n/a {
7221n/a outsize = MultiByteToWideChar(code_page, flags,
7222n/a in, insize,
7223n/a buffer, Py_ARRAY_LENGTH(buffer));
7224n/a if (outsize > 0)
7225n/a break;
7226n/a err = GetLastError();
7227n/a if (err != ERROR_NO_UNICODE_TRANSLATION
7228n/a && err != ERROR_INSUFFICIENT_BUFFER)
7229n/a {
7230n/a PyErr_SetFromWindowsErr(0);
7231n/a goto error;
7232n/a }
7233n/a insize++;
7234n/a }
7235n/a /* 4=maximum length of a UTF-8 sequence */
7236n/a while (insize <= 4 && (in + insize) <= endin);
7237n/a
7238n/a if (outsize <= 0) {
7239n/a Py_ssize_t startinpos, endinpos, outpos;
7240n/a
7241n/a /* last character in partial decode? */
7242n/a if (in + insize >= endin && !final)
7243n/a break;
7244n/a
7245n/a startinpos = in - startin;
7246n/a endinpos = startinpos + 1;
7247n/a outpos = out - PyUnicode_AS_UNICODE(*v);
7248n/a if (unicode_decode_call_errorhandler_wchar(
7249n/a errors, &errorHandler,
7250n/a encoding, reason,
7251n/a &startin, &endin, &startinpos, &endinpos, &exc, &in,
7252n/a v, &outpos))
7253n/a {
7254n/a goto error;
7255n/a }
7256n/a out = PyUnicode_AS_UNICODE(*v) + outpos;
7257n/a }
7258n/a else {
7259n/a in += insize;
7260n/a memcpy(out, buffer, outsize * sizeof(wchar_t));
7261n/a out += outsize;
7262n/a }
7263n/a }
7264n/a
7265n/a /* write a NUL character at the end */
7266n/a *out = 0;
7267n/a
7268n/a /* Extend unicode object */
7269n/a outsize = out - startout;
7270n/a assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7271n/a if (unicode_resize(v, outsize) < 0)
7272n/a goto error;
7273n/a /* (in - startin) <= size and size is an int */
7274n/a ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7275n/a
7276n/aerror:
7277n/a Py_XDECREF(encoding_obj);
7278n/a Py_XDECREF(errorHandler);
7279n/a Py_XDECREF(exc);
7280n/a return ret;
7281n/a}
7282n/a
7283n/astatic PyObject *
7284n/adecode_code_page_stateful(int code_page,
7285n/a const char *s, Py_ssize_t size,
7286n/a const char *errors, Py_ssize_t *consumed)
7287n/a{
7288n/a PyObject *v = NULL;
7289n/a int chunk_size, final, converted, done;
7290n/a
7291n/a if (code_page < 0) {
7292n/a PyErr_SetString(PyExc_ValueError, "invalid code page number");
7293n/a return NULL;
7294n/a }
7295n/a
7296n/a if (consumed)
7297n/a *consumed = 0;
7298n/a
7299n/a do
7300n/a {
7301n/a#ifdef NEED_RETRY
7302n/a if (size > INT_MAX) {
7303n/a chunk_size = INT_MAX;
7304n/a final = 0;
7305n/a done = 0;
7306n/a }
7307n/a else
7308n/a#endif
7309n/a {
7310n/a chunk_size = (int)size;
7311n/a final = (consumed == NULL);
7312n/a done = 1;
7313n/a }
7314n/a
7315n/a if (chunk_size == 0 && done) {
7316n/a if (v != NULL)
7317n/a break;
7318n/a _Py_RETURN_UNICODE_EMPTY();
7319n/a }
7320n/a
7321n/a converted = decode_code_page_strict(code_page, &v,
7322n/a s, chunk_size);
7323n/a if (converted == -2)
7324n/a converted = decode_code_page_errors(code_page, &v,
7325n/a s, chunk_size,
7326n/a errors, final);
7327n/a assert(converted != 0 || done);
7328n/a
7329n/a if (converted < 0) {
7330n/a Py_XDECREF(v);
7331n/a return NULL;
7332n/a }
7333n/a
7334n/a if (consumed)
7335n/a *consumed += converted;
7336n/a
7337n/a s += converted;
7338n/a size -= converted;
7339n/a } while (!done);
7340n/a
7341n/a return unicode_result(v);
7342n/a}
7343n/a
7344n/aPyObject *
7345n/aPyUnicode_DecodeCodePageStateful(int code_page,
7346n/a const char *s,
7347n/a Py_ssize_t size,
7348n/a const char *errors,
7349n/a Py_ssize_t *consumed)
7350n/a{
7351n/a return decode_code_page_stateful(code_page, s, size, errors, consumed);
7352n/a}
7353n/a
7354n/aPyObject *
7355n/aPyUnicode_DecodeMBCSStateful(const char *s,
7356n/a Py_ssize_t size,
7357n/a const char *errors,
7358n/a Py_ssize_t *consumed)
7359n/a{
7360n/a return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7361n/a}
7362n/a
7363n/aPyObject *
7364n/aPyUnicode_DecodeMBCS(const char *s,
7365n/a Py_ssize_t size,
7366n/a const char *errors)
7367n/a{
7368n/a return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7369n/a}
7370n/a
7371n/astatic DWORD
7372n/aencode_code_page_flags(UINT code_page, const char *errors)
7373n/a{
7374n/a if (code_page == CP_UTF8) {
7375n/a return WC_ERR_INVALID_CHARS;
7376n/a }
7377n/a else if (code_page == CP_UTF7) {
7378n/a /* CP_UTF7 only supports flags=0 */
7379n/a return 0;
7380n/a }
7381n/a else {
7382n/a if (errors != NULL && strcmp(errors, "replace") == 0)
7383n/a return 0;
7384n/a else
7385n/a return WC_NO_BEST_FIT_CHARS;
7386n/a }
7387n/a}
7388n/a
7389n/a/*
7390n/a * Encode a Unicode string to a Windows code page into a byte string in strict
7391n/a * mode.
7392n/a *
7393n/a * Returns consumed characters if succeed, returns -2 on encode error, or raise
7394n/a * an OSError and returns -1 on other error.
7395n/a */
7396n/astatic int
7397n/aencode_code_page_strict(UINT code_page, PyObject **outbytes,
7398n/a PyObject *unicode, Py_ssize_t offset, int len,
7399n/a const char* errors)
7400n/a{
7401n/a BOOL usedDefaultChar = FALSE;
7402n/a BOOL *pusedDefaultChar = &usedDefaultChar;
7403n/a int outsize;
7404n/a wchar_t *p;
7405n/a Py_ssize_t size;
7406n/a const DWORD flags = encode_code_page_flags(code_page, NULL);
7407n/a char *out;
7408n/a /* Create a substring so that we can get the UTF-16 representation
7409n/a of just the slice under consideration. */
7410n/a PyObject *substring;
7411n/a
7412n/a assert(len > 0);
7413n/a
7414n/a if (code_page != CP_UTF8 && code_page != CP_UTF7)
7415n/a pusedDefaultChar = &usedDefaultChar;
7416n/a else
7417n/a pusedDefaultChar = NULL;
7418n/a
7419n/a substring = PyUnicode_Substring(unicode, offset, offset+len);
7420n/a if (substring == NULL)
7421n/a return -1;
7422n/a p = PyUnicode_AsUnicodeAndSize(substring, &size);
7423n/a if (p == NULL) {
7424n/a Py_DECREF(substring);
7425n/a return -1;
7426n/a }
7427n/a assert(size <= INT_MAX);
7428n/a
7429n/a /* First get the size of the result */
7430n/a outsize = WideCharToMultiByte(code_page, flags,
7431n/a p, (int)size,
7432n/a NULL, 0,
7433n/a NULL, pusedDefaultChar);
7434n/a if (outsize <= 0)
7435n/a goto error;
7436n/a /* If we used a default char, then we failed! */
7437n/a if (pusedDefaultChar && *pusedDefaultChar) {
7438n/a Py_DECREF(substring);
7439n/a return -2;
7440n/a }
7441n/a
7442n/a if (*outbytes == NULL) {
7443n/a /* Create string object */
7444n/a *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7445n/a if (*outbytes == NULL) {
7446n/a Py_DECREF(substring);
7447n/a return -1;
7448n/a }
7449n/a out = PyBytes_AS_STRING(*outbytes);
7450n/a }
7451n/a else {
7452n/a /* Extend string object */
7453n/a const Py_ssize_t n = PyBytes_Size(*outbytes);
7454n/a if (outsize > PY_SSIZE_T_MAX - n) {
7455n/a PyErr_NoMemory();
7456n/a Py_DECREF(substring);
7457n/a return -1;
7458n/a }
7459n/a if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7460n/a Py_DECREF(substring);
7461n/a return -1;
7462n/a }
7463n/a out = PyBytes_AS_STRING(*outbytes) + n;
7464n/a }
7465n/a
7466n/a /* Do the conversion */
7467n/a outsize = WideCharToMultiByte(code_page, flags,
7468n/a p, (int)size,
7469n/a out, outsize,
7470n/a NULL, pusedDefaultChar);
7471n/a Py_CLEAR(substring);
7472n/a if (outsize <= 0)
7473n/a goto error;
7474n/a if (pusedDefaultChar && *pusedDefaultChar)
7475n/a return -2;
7476n/a return 0;
7477n/a
7478n/aerror:
7479n/a Py_XDECREF(substring);
7480n/a if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7481n/a return -2;
7482n/a PyErr_SetFromWindowsErr(0);
7483n/a return -1;
7484n/a}
7485n/a
7486n/a/*
7487n/a * Encode a Unicode string to a Windows code page into a byte string using an
7488n/a * error handler.
7489n/a *
7490n/a * Returns consumed characters if succeed, or raise an OSError and returns
7491n/a * -1 on other error.
7492n/a */
7493n/astatic int
7494n/aencode_code_page_errors(UINT code_page, PyObject **outbytes,
7495n/a PyObject *unicode, Py_ssize_t unicode_offset,
7496n/a Py_ssize_t insize, const char* errors)
7497n/a{
7498n/a const DWORD flags = encode_code_page_flags(code_page, errors);
7499n/a Py_ssize_t pos = unicode_offset;
7500n/a Py_ssize_t endin = unicode_offset + insize;
7501n/a /* Ideally, we should get reason from FormatMessage. This is the Windows
7502n/a 2000 English version of the message. */
7503n/a const char *reason = "invalid character";
7504n/a /* 4=maximum length of a UTF-8 sequence */
7505n/a char buffer[4];
7506n/a BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7507n/a Py_ssize_t outsize;
7508n/a char *out;
7509n/a PyObject *errorHandler = NULL;
7510n/a PyObject *exc = NULL;
7511n/a PyObject *encoding_obj = NULL;
7512n/a const char *encoding;
7513n/a Py_ssize_t newpos, newoutsize;
7514n/a PyObject *rep;
7515n/a int ret = -1;
7516n/a
7517n/a assert(insize > 0);
7518n/a
7519n/a encoding = code_page_name(code_page, &encoding_obj);
7520n/a if (encoding == NULL)
7521n/a return -1;
7522n/a
7523n/a if (errors == NULL || strcmp(errors, "strict") == 0) {
7524n/a /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7525n/a then we raise a UnicodeEncodeError. */
7526n/a make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7527n/a if (exc != NULL) {
7528n/a PyCodec_StrictErrors(exc);
7529n/a Py_DECREF(exc);
7530n/a }
7531n/a Py_XDECREF(encoding_obj);
7532n/a return -1;
7533n/a }
7534n/a
7535n/a if (code_page != CP_UTF8 && code_page != CP_UTF7)
7536n/a pusedDefaultChar = &usedDefaultChar;
7537n/a else
7538n/a pusedDefaultChar = NULL;
7539n/a
7540n/a if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7541n/a PyErr_NoMemory();
7542n/a goto error;
7543n/a }
7544n/a outsize = insize * Py_ARRAY_LENGTH(buffer);
7545n/a
7546n/a if (*outbytes == NULL) {
7547n/a /* Create string object */
7548n/a *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7549n/a if (*outbytes == NULL)
7550n/a goto error;
7551n/a out = PyBytes_AS_STRING(*outbytes);
7552n/a }
7553n/a else {
7554n/a /* Extend string object */
7555n/a Py_ssize_t n = PyBytes_Size(*outbytes);
7556n/a if (n > PY_SSIZE_T_MAX - outsize) {
7557n/a PyErr_NoMemory();
7558n/a goto error;
7559n/a }
7560n/a if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7561n/a goto error;
7562n/a out = PyBytes_AS_STRING(*outbytes) + n;
7563n/a }
7564n/a
7565n/a /* Encode the string character per character */
7566n/a while (pos < endin)
7567n/a {
7568n/a Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7569n/a wchar_t chars[2];
7570n/a int charsize;
7571n/a if (ch < 0x10000) {
7572n/a chars[0] = (wchar_t)ch;
7573n/a charsize = 1;
7574n/a }
7575n/a else {
7576n/a chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7577n/a chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7578n/a charsize = 2;
7579n/a }
7580n/a
7581n/a outsize = WideCharToMultiByte(code_page, flags,
7582n/a chars, charsize,
7583n/a buffer, Py_ARRAY_LENGTH(buffer),
7584n/a NULL, pusedDefaultChar);
7585n/a if (outsize > 0) {
7586n/a if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7587n/a {
7588n/a pos++;
7589n/a memcpy(out, buffer, outsize);
7590n/a out += outsize;
7591n/a continue;
7592n/a }
7593n/a }
7594n/a else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7595n/a PyErr_SetFromWindowsErr(0);
7596n/a goto error;
7597n/a }
7598n/a
7599n/a rep = unicode_encode_call_errorhandler(
7600n/a errors, &errorHandler, encoding, reason,
7601n/a unicode, &exc,
7602n/a pos, pos + 1, &newpos);
7603n/a if (rep == NULL)
7604n/a goto error;
7605n/a pos = newpos;
7606n/a
7607n/a if (PyBytes_Check(rep)) {
7608n/a outsize = PyBytes_GET_SIZE(rep);
7609n/a if (outsize != 1) {
7610n/a Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7611n/a newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7612n/a if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7613n/a Py_DECREF(rep);
7614n/a goto error;
7615n/a }
7616n/a out = PyBytes_AS_STRING(*outbytes) + offset;
7617n/a }
7618n/a memcpy(out, PyBytes_AS_STRING(rep), outsize);
7619n/a out += outsize;
7620n/a }
7621n/a else {
7622n/a Py_ssize_t i;
7623n/a enum PyUnicode_Kind kind;
7624n/a void *data;
7625n/a
7626n/a if (PyUnicode_READY(rep) == -1) {
7627n/a Py_DECREF(rep);
7628n/a goto error;
7629n/a }
7630n/a
7631n/a outsize = PyUnicode_GET_LENGTH(rep);
7632n/a if (outsize != 1) {
7633n/a Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7634n/a newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7635n/a if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7636n/a Py_DECREF(rep);
7637n/a goto error;
7638n/a }
7639n/a out = PyBytes_AS_STRING(*outbytes) + offset;
7640n/a }
7641n/a kind = PyUnicode_KIND(rep);
7642n/a data = PyUnicode_DATA(rep);
7643n/a for (i=0; i < outsize; i++) {
7644n/a Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7645n/a if (ch > 127) {
7646n/a raise_encode_exception(&exc,
7647n/a encoding, unicode,
7648n/a pos, pos + 1,
7649n/a "unable to encode error handler result to ASCII");
7650n/a Py_DECREF(rep);
7651n/a goto error;
7652n/a }
7653n/a *out = (unsigned char)ch;
7654n/a out++;
7655n/a }
7656n/a }
7657n/a Py_DECREF(rep);
7658n/a }
7659n/a /* write a NUL byte */
7660n/a *out = 0;
7661n/a outsize = out - PyBytes_AS_STRING(*outbytes);
7662n/a assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7663n/a if (_PyBytes_Resize(outbytes, outsize) < 0)
7664n/a goto error;
7665n/a ret = 0;
7666n/a
7667n/aerror:
7668n/a Py_XDECREF(encoding_obj);
7669n/a Py_XDECREF(errorHandler);
7670n/a Py_XDECREF(exc);
7671n/a return ret;
7672n/a}
7673n/a
7674n/astatic PyObject *
7675n/aencode_code_page(int code_page,
7676n/a PyObject *unicode,
7677n/a const char *errors)
7678n/a{
7679n/a Py_ssize_t len;
7680n/a PyObject *outbytes = NULL;
7681n/a Py_ssize_t offset;
7682n/a int chunk_len, ret, done;
7683n/a
7684n/a if (!PyUnicode_Check(unicode)) {
7685n/a PyErr_BadArgument();
7686n/a return NULL;
7687n/a }
7688n/a
7689n/a if (PyUnicode_READY(unicode) == -1)
7690n/a return NULL;
7691n/a len = PyUnicode_GET_LENGTH(unicode);
7692n/a
7693n/a if (code_page < 0) {
7694n/a PyErr_SetString(PyExc_ValueError, "invalid code page number");
7695n/a return NULL;
7696n/a }
7697n/a
7698n/a if (len == 0)
7699n/a return PyBytes_FromStringAndSize(NULL, 0);
7700n/a
7701n/a offset = 0;
7702n/a do
7703n/a {
7704n/a#ifdef NEED_RETRY
7705n/a /* UTF-16 encoding may double the size, so use only INT_MAX/2
7706n/a chunks. */
7707n/a if (len > INT_MAX/2) {
7708n/a chunk_len = INT_MAX/2;
7709n/a done = 0;
7710n/a }
7711n/a else
7712n/a#endif
7713n/a {
7714n/a chunk_len = (int)len;
7715n/a done = 1;
7716n/a }
7717n/a
7718n/a ret = encode_code_page_strict(code_page, &outbytes,
7719n/a unicode, offset, chunk_len,
7720n/a errors);
7721n/a if (ret == -2)
7722n/a ret = encode_code_page_errors(code_page, &outbytes,
7723n/a unicode, offset,
7724n/a chunk_len, errors);
7725n/a if (ret < 0) {
7726n/a Py_XDECREF(outbytes);
7727n/a return NULL;
7728n/a }
7729n/a
7730n/a offset += chunk_len;
7731n/a len -= chunk_len;
7732n/a } while (!done);
7733n/a
7734n/a return outbytes;
7735n/a}
7736n/a
7737n/aPyObject *
7738n/aPyUnicode_EncodeMBCS(const Py_UNICODE *p,
7739n/a Py_ssize_t size,
7740n/a const char *errors)
7741n/a{
7742n/a PyObject *unicode, *res;
7743n/a unicode = PyUnicode_FromWideChar(p, size);
7744n/a if (unicode == NULL)
7745n/a return NULL;
7746n/a res = encode_code_page(CP_ACP, unicode, errors);
7747n/a Py_DECREF(unicode);
7748n/a return res;
7749n/a}
7750n/a
7751n/aPyObject *
7752n/aPyUnicode_EncodeCodePage(int code_page,
7753n/a PyObject *unicode,
7754n/a const char *errors)
7755n/a{
7756n/a return encode_code_page(code_page, unicode, errors);
7757n/a}
7758n/a
7759n/aPyObject *
7760n/aPyUnicode_AsMBCSString(PyObject *unicode)
7761n/a{
7762n/a return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7763n/a}
7764n/a
7765n/a#undef NEED_RETRY
7766n/a
7767n/a#endif /* MS_WINDOWS */
7768n/a
7769n/a/* --- Character Mapping Codec -------------------------------------------- */
7770n/a
7771n/astatic int
7772n/acharmap_decode_string(const char *s,
7773n/a Py_ssize_t size,
7774n/a PyObject *mapping,
7775n/a const char *errors,
7776n/a _PyUnicodeWriter *writer)
7777n/a{
7778n/a const char *starts = s;
7779n/a const char *e;
7780n/a Py_ssize_t startinpos, endinpos;
7781n/a PyObject *errorHandler = NULL, *exc = NULL;
7782n/a Py_ssize_t maplen;
7783n/a enum PyUnicode_Kind mapkind;
7784n/a void *mapdata;
7785n/a Py_UCS4 x;
7786n/a unsigned char ch;
7787n/a
7788n/a if (PyUnicode_READY(mapping) == -1)
7789n/a return -1;
7790n/a
7791n/a maplen = PyUnicode_GET_LENGTH(mapping);
7792n/a mapdata = PyUnicode_DATA(mapping);
7793n/a mapkind = PyUnicode_KIND(mapping);
7794n/a
7795n/a e = s + size;
7796n/a
7797n/a if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7798n/a /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7799n/a * is disabled in encoding aliases, latin1 is preferred because
7800n/a * its implementation is faster. */
7801n/a Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7802n/a Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7803n/a Py_UCS4 maxchar = writer->maxchar;
7804n/a
7805n/a assert (writer->kind == PyUnicode_1BYTE_KIND);
7806n/a while (s < e) {
7807n/a ch = *s;
7808n/a x = mapdata_ucs1[ch];
7809n/a if (x > maxchar) {
7810n/a if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7811n/a goto onError;
7812n/a maxchar = writer->maxchar;
7813n/a outdata = (Py_UCS1 *)writer->data;
7814n/a }
7815n/a outdata[writer->pos] = x;
7816n/a writer->pos++;
7817n/a ++s;
7818n/a }
7819n/a return 0;
7820n/a }
7821n/a
7822n/a while (s < e) {
7823n/a if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7824n/a enum PyUnicode_Kind outkind = writer->kind;
7825n/a Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7826n/a if (outkind == PyUnicode_1BYTE_KIND) {
7827n/a Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7828n/a Py_UCS4 maxchar = writer->maxchar;
7829n/a while (s < e) {
7830n/a ch = *s;
7831n/a x = mapdata_ucs2[ch];
7832n/a if (x > maxchar)
7833n/a goto Error;
7834n/a outdata[writer->pos] = x;
7835n/a writer->pos++;
7836n/a ++s;
7837n/a }
7838n/a break;
7839n/a }
7840n/a else if (outkind == PyUnicode_2BYTE_KIND) {
7841n/a Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7842n/a while (s < e) {
7843n/a ch = *s;
7844n/a x = mapdata_ucs2[ch];
7845n/a if (x == 0xFFFE)
7846n/a goto Error;
7847n/a outdata[writer->pos] = x;
7848n/a writer->pos++;
7849n/a ++s;
7850n/a }
7851n/a break;
7852n/a }
7853n/a }
7854n/a ch = *s;
7855n/a
7856n/a if (ch < maplen)
7857n/a x = PyUnicode_READ(mapkind, mapdata, ch);
7858n/a else
7859n/a x = 0xfffe; /* invalid value */
7860n/aError:
7861n/a if (x == 0xfffe)
7862n/a {
7863n/a /* undefined mapping */
7864n/a startinpos = s-starts;
7865n/a endinpos = startinpos+1;
7866n/a if (unicode_decode_call_errorhandler_writer(
7867n/a errors, &errorHandler,
7868n/a "charmap", "character maps to <undefined>",
7869n/a &starts, &e, &startinpos, &endinpos, &exc, &s,
7870n/a writer)) {
7871n/a goto onError;
7872n/a }
7873n/a continue;
7874n/a }
7875n/a
7876n/a if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7877n/a goto onError;
7878n/a ++s;
7879n/a }
7880n/a Py_XDECREF(errorHandler);
7881n/a Py_XDECREF(exc);
7882n/a return 0;
7883n/a
7884n/aonError:
7885n/a Py_XDECREF(errorHandler);
7886n/a Py_XDECREF(exc);
7887n/a return -1;
7888n/a}
7889n/a
7890n/astatic int
7891n/acharmap_decode_mapping(const char *s,
7892n/a Py_ssize_t size,
7893n/a PyObject *mapping,
7894n/a const char *errors,
7895n/a _PyUnicodeWriter *writer)
7896n/a{
7897n/a const char *starts = s;
7898n/a const char *e;
7899n/a Py_ssize_t startinpos, endinpos;
7900n/a PyObject *errorHandler = NULL, *exc = NULL;
7901n/a unsigned char ch;
7902n/a PyObject *key, *item = NULL;
7903n/a
7904n/a e = s + size;
7905n/a
7906n/a while (s < e) {
7907n/a ch = *s;
7908n/a
7909n/a /* Get mapping (char ordinal -> integer, Unicode char or None) */
7910n/a key = PyLong_FromLong((long)ch);
7911n/a if (key == NULL)
7912n/a goto onError;
7913n/a
7914n/a item = PyObject_GetItem(mapping, key);
7915n/a Py_DECREF(key);
7916n/a if (item == NULL) {
7917n/a if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7918n/a /* No mapping found means: mapping is undefined. */
7919n/a PyErr_Clear();
7920n/a goto Undefined;
7921n/a } else
7922n/a goto onError;
7923n/a }
7924n/a
7925n/a /* Apply mapping */
7926n/a if (item == Py_None)
7927n/a goto Undefined;
7928n/a if (PyLong_Check(item)) {
7929n/a long value = PyLong_AS_LONG(item);
7930n/a if (value == 0xFFFE)
7931n/a goto Undefined;
7932n/a if (value < 0 || value > MAX_UNICODE) {
7933n/a PyErr_Format(PyExc_TypeError,
7934n/a "character mapping must be in range(0x%lx)",
7935n/a (unsigned long)MAX_UNICODE + 1);
7936n/a goto onError;
7937n/a }
7938n/a
7939n/a if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7940n/a goto onError;
7941n/a }
7942n/a else if (PyUnicode_Check(item)) {
7943n/a if (PyUnicode_READY(item) == -1)
7944n/a goto onError;
7945n/a if (PyUnicode_GET_LENGTH(item) == 1) {
7946n/a Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7947n/a if (value == 0xFFFE)
7948n/a goto Undefined;
7949n/a if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7950n/a goto onError;
7951n/a }
7952n/a else {
7953n/a writer->overallocate = 1;
7954n/a if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7955n/a goto onError;
7956n/a }
7957n/a }
7958n/a else {
7959n/a /* wrong return value */
7960n/a PyErr_SetString(PyExc_TypeError,
7961n/a "character mapping must return integer, None or str");
7962n/a goto onError;
7963n/a }
7964n/a Py_CLEAR(item);
7965n/a ++s;
7966n/a continue;
7967n/a
7968n/aUndefined:
7969n/a /* undefined mapping */
7970n/a Py_CLEAR(item);
7971n/a startinpos = s-starts;
7972n/a endinpos = startinpos+1;
7973n/a if (unicode_decode_call_errorhandler_writer(
7974n/a errors, &errorHandler,
7975n/a "charmap", "character maps to <undefined>",
7976n/a &starts, &e, &startinpos, &endinpos, &exc, &s,
7977n/a writer)) {
7978n/a goto onError;
7979n/a }
7980n/a }
7981n/a Py_XDECREF(errorHandler);
7982n/a Py_XDECREF(exc);
7983n/a return 0;
7984n/a
7985n/aonError:
7986n/a Py_XDECREF(item);
7987n/a Py_XDECREF(errorHandler);
7988n/a Py_XDECREF(exc);
7989n/a return -1;
7990n/a}
7991n/a
7992n/aPyObject *
7993n/aPyUnicode_DecodeCharmap(const char *s,
7994n/a Py_ssize_t size,
7995n/a PyObject *mapping,
7996n/a const char *errors)
7997n/a{
7998n/a _PyUnicodeWriter writer;
7999n/a
8000n/a /* Default to Latin-1 */
8001n/a if (mapping == NULL)
8002n/a return PyUnicode_DecodeLatin1(s, size, errors);
8003n/a
8004n/a if (size == 0)
8005n/a _Py_RETURN_UNICODE_EMPTY();
8006n/a _PyUnicodeWriter_Init(&writer);
8007n/a writer.min_length = size;
8008n/a if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8009n/a goto onError;
8010n/a
8011n/a if (PyUnicode_CheckExact(mapping)) {
8012n/a if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8013n/a goto onError;
8014n/a }
8015n/a else {
8016n/a if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8017n/a goto onError;
8018n/a }
8019n/a return _PyUnicodeWriter_Finish(&writer);
8020n/a
8021n/a onError:
8022n/a _PyUnicodeWriter_Dealloc(&writer);
8023n/a return NULL;
8024n/a}
8025n/a
8026n/a/* Charmap encoding: the lookup table */
8027n/a
8028n/astruct encoding_map {
8029n/a PyObject_HEAD
8030n/a unsigned char level1[32];
8031n/a int count2, count3;
8032n/a unsigned char level23[1];
8033n/a};
8034n/a
8035n/astatic PyObject*
8036n/aencoding_map_size(PyObject *obj, PyObject* args)
8037n/a{
8038n/a struct encoding_map *map = (struct encoding_map*)obj;
8039n/a return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
8040n/a 128*map->count3);
8041n/a}
8042n/a
8043n/astatic PyMethodDef encoding_map_methods[] = {
8044n/a {"size", encoding_map_size, METH_NOARGS,
8045n/a PyDoc_STR("Return the size (in bytes) of this object") },
8046n/a { 0 }
8047n/a};
8048n/a
8049n/astatic void
8050n/aencoding_map_dealloc(PyObject* o)
8051n/a{
8052n/a PyObject_FREE(o);
8053n/a}
8054n/a
8055n/astatic PyTypeObject EncodingMapType = {
8056n/a PyVarObject_HEAD_INIT(NULL, 0)
8057n/a "EncodingMap", /*tp_name*/
8058n/a sizeof(struct encoding_map), /*tp_basicsize*/
8059n/a 0, /*tp_itemsize*/
8060n/a /* methods */
8061n/a encoding_map_dealloc, /*tp_dealloc*/
8062n/a 0, /*tp_print*/
8063n/a 0, /*tp_getattr*/
8064n/a 0, /*tp_setattr*/
8065n/a 0, /*tp_reserved*/
8066n/a 0, /*tp_repr*/
8067n/a 0, /*tp_as_number*/
8068n/a 0, /*tp_as_sequence*/
8069n/a 0, /*tp_as_mapping*/
8070n/a 0, /*tp_hash*/
8071n/a 0, /*tp_call*/
8072n/a 0, /*tp_str*/
8073n/a 0, /*tp_getattro*/
8074n/a 0, /*tp_setattro*/
8075n/a 0, /*tp_as_buffer*/
8076n/a Py_TPFLAGS_DEFAULT, /*tp_flags*/
8077n/a 0, /*tp_doc*/
8078n/a 0, /*tp_traverse*/
8079n/a 0, /*tp_clear*/
8080n/a 0, /*tp_richcompare*/
8081n/a 0, /*tp_weaklistoffset*/
8082n/a 0, /*tp_iter*/
8083n/a 0, /*tp_iternext*/
8084n/a encoding_map_methods, /*tp_methods*/
8085n/a 0, /*tp_members*/
8086n/a 0, /*tp_getset*/
8087n/a 0, /*tp_base*/
8088n/a 0, /*tp_dict*/
8089n/a 0, /*tp_descr_get*/
8090n/a 0, /*tp_descr_set*/
8091n/a 0, /*tp_dictoffset*/
8092n/a 0, /*tp_init*/
8093n/a 0, /*tp_alloc*/
8094n/a 0, /*tp_new*/
8095n/a 0, /*tp_free*/
8096n/a 0, /*tp_is_gc*/
8097n/a};
8098n/a
8099n/aPyObject*
8100n/aPyUnicode_BuildEncodingMap(PyObject* string)
8101n/a{
8102n/a PyObject *result;
8103n/a struct encoding_map *mresult;
8104n/a int i;
8105n/a int need_dict = 0;
8106n/a unsigned char level1[32];
8107n/a unsigned char level2[512];
8108n/a unsigned char *mlevel1, *mlevel2, *mlevel3;
8109n/a int count2 = 0, count3 = 0;
8110n/a int kind;
8111n/a void *data;
8112n/a Py_ssize_t length;
8113n/a Py_UCS4 ch;
8114n/a
8115n/a if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8116n/a PyErr_BadArgument();
8117n/a return NULL;
8118n/a }
8119n/a kind = PyUnicode_KIND(string);
8120n/a data = PyUnicode_DATA(string);
8121n/a length = PyUnicode_GET_LENGTH(string);
8122n/a length = Py_MIN(length, 256);
8123n/a memset(level1, 0xFF, sizeof level1);
8124n/a memset(level2, 0xFF, sizeof level2);
8125n/a
8126n/a /* If there isn't a one-to-one mapping of NULL to \0,
8127n/a or if there are non-BMP characters, we need to use
8128n/a a mapping dictionary. */
8129n/a if (PyUnicode_READ(kind, data, 0) != 0)
8130n/a need_dict = 1;
8131n/a for (i = 1; i < length; i++) {
8132n/a int l1, l2;
8133n/a ch = PyUnicode_READ(kind, data, i);
8134n/a if (ch == 0 || ch > 0xFFFF) {
8135n/a need_dict = 1;
8136n/a break;
8137n/a }
8138n/a if (ch == 0xFFFE)
8139n/a /* unmapped character */
8140n/a continue;
8141n/a l1 = ch >> 11;
8142n/a l2 = ch >> 7;
8143n/a if (level1[l1] == 0xFF)
8144n/a level1[l1] = count2++;
8145n/a if (level2[l2] == 0xFF)
8146n/a level2[l2] = count3++;
8147n/a }
8148n/a
8149n/a if (count2 >= 0xFF || count3 >= 0xFF)
8150n/a need_dict = 1;
8151n/a
8152n/a if (need_dict) {
8153n/a PyObject *result = PyDict_New();
8154n/a PyObject *key, *value;
8155n/a if (!result)
8156n/a return NULL;
8157n/a for (i = 0; i < length; i++) {
8158n/a key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8159n/a value = PyLong_FromLong(i);
8160n/a if (!key || !value)
8161n/a goto failed1;
8162n/a if (PyDict_SetItem(result, key, value) == -1)
8163n/a goto failed1;
8164n/a Py_DECREF(key);
8165n/a Py_DECREF(value);
8166n/a }
8167n/a return result;
8168n/a failed1:
8169n/a Py_XDECREF(key);
8170n/a Py_XDECREF(value);
8171n/a Py_DECREF(result);
8172n/a return NULL;
8173n/a }
8174n/a
8175n/a /* Create a three-level trie */
8176n/a result = PyObject_MALLOC(sizeof(struct encoding_map) +
8177n/a 16*count2 + 128*count3 - 1);
8178n/a if (!result)
8179n/a return PyErr_NoMemory();
8180n/a PyObject_Init(result, &EncodingMapType);
8181n/a mresult = (struct encoding_map*)result;
8182n/a mresult->count2 = count2;
8183n/a mresult->count3 = count3;
8184n/a mlevel1 = mresult->level1;
8185n/a mlevel2 = mresult->level23;
8186n/a mlevel3 = mresult->level23 + 16*count2;
8187n/a memcpy(mlevel1, level1, 32);
8188n/a memset(mlevel2, 0xFF, 16*count2);
8189n/a memset(mlevel3, 0, 128*count3);
8190n/a count3 = 0;
8191n/a for (i = 1; i < length; i++) {
8192n/a int o1, o2, o3, i2, i3;
8193n/a Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8194n/a if (ch == 0xFFFE)
8195n/a /* unmapped character */
8196n/a continue;
8197n/a o1 = ch>>11;
8198n/a o2 = (ch>>7) & 0xF;
8199n/a i2 = 16*mlevel1[o1] + o2;
8200n/a if (mlevel2[i2] == 0xFF)
8201n/a mlevel2[i2] = count3++;
8202n/a o3 = ch & 0x7F;
8203n/a i3 = 128*mlevel2[i2] + o3;
8204n/a mlevel3[i3] = i;
8205n/a }
8206n/a return result;
8207n/a}
8208n/a
8209n/astatic int
8210n/aencoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8211n/a{
8212n/a struct encoding_map *map = (struct encoding_map*)mapping;
8213n/a int l1 = c>>11;
8214n/a int l2 = (c>>7) & 0xF;
8215n/a int l3 = c & 0x7F;
8216n/a int i;
8217n/a
8218n/a if (c > 0xFFFF)
8219n/a return -1;
8220n/a if (c == 0)
8221n/a return 0;
8222n/a /* level 1*/
8223n/a i = map->level1[l1];
8224n/a if (i == 0xFF) {
8225n/a return -1;
8226n/a }
8227n/a /* level 2*/
8228n/a i = map->level23[16*i+l2];
8229n/a if (i == 0xFF) {
8230n/a return -1;
8231n/a }
8232n/a /* level 3 */
8233n/a i = map->level23[16*map->count2 + 128*i + l3];
8234n/a if (i == 0) {
8235n/a return -1;
8236n/a }
8237n/a return i;
8238n/a}
8239n/a
8240n/a/* Lookup the character ch in the mapping. If the character
8241n/a can't be found, Py_None is returned (or NULL, if another
8242n/a error occurred). */
8243n/astatic PyObject *
8244n/acharmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8245n/a{
8246n/a PyObject *w = PyLong_FromLong((long)c);
8247n/a PyObject *x;
8248n/a
8249n/a if (w == NULL)
8250n/a return NULL;
8251n/a x = PyObject_GetItem(mapping, w);
8252n/a Py_DECREF(w);
8253n/a if (x == NULL) {
8254n/a if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8255n/a /* No mapping found means: mapping is undefined. */
8256n/a PyErr_Clear();
8257n/a Py_RETURN_NONE;
8258n/a } else
8259n/a return NULL;
8260n/a }
8261n/a else if (x == Py_None)
8262n/a return x;
8263n/a else if (PyLong_Check(x)) {
8264n/a long value = PyLong_AS_LONG(x);
8265n/a if (value < 0 || value > 255) {
8266n/a PyErr_SetString(PyExc_TypeError,
8267n/a "character mapping must be in range(256)");
8268n/a Py_DECREF(x);
8269n/a return NULL;
8270n/a }
8271n/a return x;
8272n/a }
8273n/a else if (PyBytes_Check(x))
8274n/a return x;
8275n/a else {
8276n/a /* wrong return value */
8277n/a PyErr_Format(PyExc_TypeError,
8278n/a "character mapping must return integer, bytes or None, not %.400s",
8279n/a x->ob_type->tp_name);
8280n/a Py_DECREF(x);
8281n/a return NULL;
8282n/a }
8283n/a}
8284n/a
8285n/astatic int
8286n/acharmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8287n/a{
8288n/a Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8289n/a /* exponentially overallocate to minimize reallocations */
8290n/a if (requiredsize < 2*outsize)
8291n/a requiredsize = 2*outsize;
8292n/a if (_PyBytes_Resize(outobj, requiredsize))
8293n/a return -1;
8294n/a return 0;
8295n/a}
8296n/a
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an exception was raised */
} charmapencode_result;
8300n/a/* lookup the character, put the result in the output string and adjust
8301n/a various state variables. Resize the output bytes object if not enough
8302n/a space is available. Return a new reference to the object that
8303n/a was put in the output buffer, or Py_None, if the mapping was undefined
8304n/a (in which case no character was written) or NULL, if a
8305n/a reallocation error occurred. The caller must decref the result */
8306n/astatic charmapencode_result
8307n/acharmapencode_output(Py_UCS4 c, PyObject *mapping,
8308n/a PyObject **outobj, Py_ssize_t *outpos)
8309n/a{
8310n/a PyObject *rep;
8311n/a char *outstart;
8312n/a Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8313n/a
8314n/a if (Py_TYPE(mapping) == &EncodingMapType) {
8315n/a int res = encoding_map_lookup(c, mapping);
8316n/a Py_ssize_t requiredsize = *outpos+1;
8317n/a if (res == -1)
8318n/a return enc_FAILED;
8319n/a if (outsize<requiredsize)
8320n/a if (charmapencode_resize(outobj, outpos, requiredsize))
8321n/a return enc_EXCEPTION;
8322n/a outstart = PyBytes_AS_STRING(*outobj);
8323n/a outstart[(*outpos)++] = (char)res;
8324n/a return enc_SUCCESS;
8325n/a }
8326n/a
8327n/a rep = charmapencode_lookup(c, mapping);
8328n/a if (rep==NULL)
8329n/a return enc_EXCEPTION;
8330n/a else if (rep==Py_None) {
8331n/a Py_DECREF(rep);
8332n/a return enc_FAILED;
8333n/a } else {
8334n/a if (PyLong_Check(rep)) {
8335n/a Py_ssize_t requiredsize = *outpos+1;
8336n/a if (outsize<requiredsize)
8337n/a if (charmapencode_resize(outobj, outpos, requiredsize)) {
8338n/a Py_DECREF(rep);
8339n/a return enc_EXCEPTION;
8340n/a }
8341n/a outstart = PyBytes_AS_STRING(*outobj);
8342n/a outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8343n/a }
8344n/a else {
8345n/a const char *repchars = PyBytes_AS_STRING(rep);
8346n/a Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8347n/a Py_ssize_t requiredsize = *outpos+repsize;
8348n/a if (outsize<requiredsize)
8349n/a if (charmapencode_resize(outobj, outpos, requiredsize)) {
8350n/a Py_DECREF(rep);
8351n/a return enc_EXCEPTION;
8352n/a }
8353n/a outstart = PyBytes_AS_STRING(*outobj);
8354n/a memcpy(outstart + *outpos, repchars, repsize);
8355n/a *outpos += repsize;
8356n/a }
8357n/a }
8358n/a Py_DECREF(rep);
8359n/a return enc_SUCCESS;
8360n/a}
8361n/a
8362n/a/* handle an error in PyUnicode_EncodeCharmap
8363n/a Return 0 on success, -1 on error */
8364n/astatic int
8365n/acharmap_encoding_error(
8366n/a PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8367n/a PyObject **exceptionObject,
8368n/a _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8369n/a PyObject **res, Py_ssize_t *respos)
8370n/a{
8371n/a PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8372n/a Py_ssize_t size, repsize;
8373n/a Py_ssize_t newpos;
8374n/a enum PyUnicode_Kind kind;
8375n/a void *data;
8376n/a Py_ssize_t index;
8377n/a /* startpos for collecting unencodable chars */
8378n/a Py_ssize_t collstartpos = *inpos;
8379n/a Py_ssize_t collendpos = *inpos+1;
8380n/a Py_ssize_t collpos;
8381n/a char *encoding = "charmap";
8382n/a char *reason = "character maps to <undefined>";
8383n/a charmapencode_result x;
8384n/a Py_UCS4 ch;
8385n/a int val;
8386n/a
8387n/a if (PyUnicode_READY(unicode) == -1)
8388n/a return -1;
8389n/a size = PyUnicode_GET_LENGTH(unicode);
8390n/a /* find all unencodable characters */
8391n/a while (collendpos < size) {
8392n/a PyObject *rep;
8393n/a if (Py_TYPE(mapping) == &EncodingMapType) {
8394n/a ch = PyUnicode_READ_CHAR(unicode, collendpos);
8395n/a val = encoding_map_lookup(ch, mapping);
8396n/a if (val != -1)
8397n/a break;
8398n/a ++collendpos;
8399n/a continue;
8400n/a }
8401n/a
8402n/a ch = PyUnicode_READ_CHAR(unicode, collendpos);
8403n/a rep = charmapencode_lookup(ch, mapping);
8404n/a if (rep==NULL)
8405n/a return -1;
8406n/a else if (rep!=Py_None) {
8407n/a Py_DECREF(rep);
8408n/a break;
8409n/a }
8410n/a Py_DECREF(rep);
8411n/a ++collendpos;
8412n/a }
8413n/a /* cache callback name lookup
8414n/a * (if not done yet, i.e. it's the first error) */
8415n/a if (*error_handler == _Py_ERROR_UNKNOWN)
8416n/a *error_handler = get_error_handler(errors);
8417n/a
8418n/a switch (*error_handler) {
8419n/a case _Py_ERROR_STRICT:
8420n/a raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8421n/a return -1;
8422n/a
8423n/a case _Py_ERROR_REPLACE:
8424n/a for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8425n/a x = charmapencode_output('?', mapping, res, respos);
8426n/a if (x==enc_EXCEPTION) {
8427n/a return -1;
8428n/a }
8429n/a else if (x==enc_FAILED) {
8430n/a raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8431n/a return -1;
8432n/a }
8433n/a }
8434n/a /* fall through */
8435n/a case _Py_ERROR_IGNORE:
8436n/a *inpos = collendpos;
8437n/a break;
8438n/a
8439n/a case _Py_ERROR_XMLCHARREFREPLACE:
8440n/a /* generate replacement (temporarily (mis)uses p) */
8441n/a for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8442n/a char buffer[2+29+1+1];
8443n/a char *cp;
8444n/a sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8445n/a for (cp = buffer; *cp; ++cp) {
8446n/a x = charmapencode_output(*cp, mapping, res, respos);
8447n/a if (x==enc_EXCEPTION)
8448n/a return -1;
8449n/a else if (x==enc_FAILED) {
8450n/a raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8451n/a return -1;
8452n/a }
8453n/a }
8454n/a }
8455n/a *inpos = collendpos;
8456n/a break;
8457n/a
8458n/a default:
8459n/a repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8460n/a encoding, reason, unicode, exceptionObject,
8461n/a collstartpos, collendpos, &newpos);
8462n/a if (repunicode == NULL)
8463n/a return -1;
8464n/a if (PyBytes_Check(repunicode)) {
8465n/a /* Directly copy bytes result to output. */
8466n/a Py_ssize_t outsize = PyBytes_Size(*res);
8467n/a Py_ssize_t requiredsize;
8468n/a repsize = PyBytes_Size(repunicode);
8469n/a requiredsize = *respos + repsize;
8470n/a if (requiredsize > outsize)
8471n/a /* Make room for all additional bytes. */
8472n/a if (charmapencode_resize(res, respos, requiredsize)) {
8473n/a Py_DECREF(repunicode);
8474n/a return -1;
8475n/a }
8476n/a memcpy(PyBytes_AsString(*res) + *respos,
8477n/a PyBytes_AsString(repunicode), repsize);
8478n/a *respos += repsize;
8479n/a *inpos = newpos;
8480n/a Py_DECREF(repunicode);
8481n/a break;
8482n/a }
8483n/a /* generate replacement */
8484n/a if (PyUnicode_READY(repunicode) == -1) {
8485n/a Py_DECREF(repunicode);
8486n/a return -1;
8487n/a }
8488n/a repsize = PyUnicode_GET_LENGTH(repunicode);
8489n/a data = PyUnicode_DATA(repunicode);
8490n/a kind = PyUnicode_KIND(repunicode);
8491n/a for (index = 0; index < repsize; index++) {
8492n/a Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8493n/a x = charmapencode_output(repch, mapping, res, respos);
8494n/a if (x==enc_EXCEPTION) {
8495n/a Py_DECREF(repunicode);
8496n/a return -1;
8497n/a }
8498n/a else if (x==enc_FAILED) {
8499n/a Py_DECREF(repunicode);
8500n/a raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8501n/a return -1;
8502n/a }
8503n/a }
8504n/a *inpos = newpos;
8505n/a Py_DECREF(repunicode);
8506n/a }
8507n/a return 0;
8508n/a}
8509n/a
8510n/aPyObject *
8511n/a_PyUnicode_EncodeCharmap(PyObject *unicode,
8512n/a PyObject *mapping,
8513n/a const char *errors)
8514n/a{
8515n/a /* output object */
8516n/a PyObject *res = NULL;
8517n/a /* current input position */
8518n/a Py_ssize_t inpos = 0;
8519n/a Py_ssize_t size;
8520n/a /* current output position */
8521n/a Py_ssize_t respos = 0;
8522n/a PyObject *error_handler_obj = NULL;
8523n/a PyObject *exc = NULL;
8524n/a _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8525n/a void *data;
8526n/a int kind;
8527n/a
8528n/a if (PyUnicode_READY(unicode) == -1)
8529n/a return NULL;
8530n/a size = PyUnicode_GET_LENGTH(unicode);
8531n/a data = PyUnicode_DATA(unicode);
8532n/a kind = PyUnicode_KIND(unicode);
8533n/a
8534n/a /* Default to Latin-1 */
8535n/a if (mapping == NULL)
8536n/a return unicode_encode_ucs1(unicode, errors, 256);
8537n/a
8538n/a /* allocate enough for a simple encoding without
8539n/a replacements, if we need more, we'll resize */
8540n/a res = PyBytes_FromStringAndSize(NULL, size);
8541n/a if (res == NULL)
8542n/a goto onError;
8543n/a if (size == 0)
8544n/a return res;
8545n/a
8546n/a while (inpos<size) {
8547n/a Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8548n/a /* try to encode it */
8549n/a charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8550n/a if (x==enc_EXCEPTION) /* error */
8551n/a goto onError;
8552n/a if (x==enc_FAILED) { /* unencodable character */
8553n/a if (charmap_encoding_error(unicode, &inpos, mapping,
8554n/a &exc,
8555n/a &error_handler, &error_handler_obj, errors,
8556n/a &res, &respos)) {
8557n/a goto onError;
8558n/a }
8559n/a }
8560n/a else
8561n/a /* done with this character => adjust input position */
8562n/a ++inpos;
8563n/a }
8564n/a
8565n/a /* Resize if we allocated to much */
8566n/a if (respos<PyBytes_GET_SIZE(res))
8567n/a if (_PyBytes_Resize(&res, respos) < 0)
8568n/a goto onError;
8569n/a
8570n/a Py_XDECREF(exc);
8571n/a Py_XDECREF(error_handler_obj);
8572n/a return res;
8573n/a
8574n/a onError:
8575n/a Py_XDECREF(res);
8576n/a Py_XDECREF(exc);
8577n/a Py_XDECREF(error_handler_obj);
8578n/a return NULL;
8579n/a}
8580n/a
8581n/a/* Deprecated */
8582n/aPyObject *
8583n/aPyUnicode_EncodeCharmap(const Py_UNICODE *p,
8584n/a Py_ssize_t size,
8585n/a PyObject *mapping,
8586n/a const char *errors)
8587n/a{
8588n/a PyObject *result;
8589n/a PyObject *unicode = PyUnicode_FromWideChar(p, size);
8590n/a if (unicode == NULL)
8591n/a return NULL;
8592n/a result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8593n/a Py_DECREF(unicode);
8594n/a return result;
8595n/a}
8596n/a
8597n/aPyObject *
8598n/aPyUnicode_AsCharmapString(PyObject *unicode,
8599n/a PyObject *mapping)
8600n/a{
8601n/a if (!PyUnicode_Check(unicode) || mapping == NULL) {
8602n/a PyErr_BadArgument();
8603n/a return NULL;
8604n/a }
8605n/a return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8606n/a}
8607n/a
8608n/a/* create or adjust a UnicodeTranslateError */
8609n/astatic void
8610n/amake_translate_exception(PyObject **exceptionObject,
8611n/a PyObject *unicode,
8612n/a Py_ssize_t startpos, Py_ssize_t endpos,
8613n/a const char *reason)
8614n/a{
8615n/a if (*exceptionObject == NULL) {
8616n/a *exceptionObject = _PyUnicodeTranslateError_Create(
8617n/a unicode, startpos, endpos, reason);
8618n/a }
8619n/a else {
8620n/a if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8621n/a goto onError;
8622n/a if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8623n/a goto onError;
8624n/a if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8625n/a goto onError;
8626n/a return;
8627n/a onError:
8628n/a Py_CLEAR(*exceptionObject);
8629n/a }
8630n/a}
8631n/a
8632n/a/* error handling callback helper:
8633n/a build arguments, call the callback and check the arguments,
8634n/a put the result into newpos and return the replacement string, which
8635n/a has to be freed by the caller */
8636n/astatic PyObject *
8637n/aunicode_translate_call_errorhandler(const char *errors,
8638n/a PyObject **errorHandler,
8639n/a const char *reason,
8640n/a PyObject *unicode, PyObject **exceptionObject,
8641n/a Py_ssize_t startpos, Py_ssize_t endpos,
8642n/a Py_ssize_t *newpos)
8643n/a{
8644n/a static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8645n/a
8646n/a Py_ssize_t i_newpos;
8647n/a PyObject *restuple;
8648n/a PyObject *resunicode;
8649n/a
8650n/a if (*errorHandler == NULL) {
8651n/a *errorHandler = PyCodec_LookupError(errors);
8652n/a if (*errorHandler == NULL)
8653n/a return NULL;
8654n/a }
8655n/a
8656n/a make_translate_exception(exceptionObject,
8657n/a unicode, startpos, endpos, reason);
8658n/a if (*exceptionObject == NULL)
8659n/a return NULL;
8660n/a
8661n/a restuple = PyObject_CallFunctionObjArgs(
8662n/a *errorHandler, *exceptionObject, NULL);
8663n/a if (restuple == NULL)
8664n/a return NULL;
8665n/a if (!PyTuple_Check(restuple)) {
8666n/a PyErr_SetString(PyExc_TypeError, &argparse[3]);
8667n/a Py_DECREF(restuple);
8668n/a return NULL;
8669n/a }
8670n/a if (!PyArg_ParseTuple(restuple, argparse,
8671n/a &resunicode, &i_newpos)) {
8672n/a Py_DECREF(restuple);
8673n/a return NULL;
8674n/a }
8675n/a if (i_newpos<0)
8676n/a *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8677n/a else
8678n/a *newpos = i_newpos;
8679n/a if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8680