» Core Development > Code coverage > Objects/unicodeobject.c

Python code coverage for Objects/unicodeobject.c

#countcontent
1n/a/*
2n/a
3n/aUnicode implementation based on original code by Fredrik Lundh,
4n/amodified by Marc-Andre Lemburg <mal@lemburg.com>.
5n/a
6n/aMajor speed upgrades to the method implementations at the Reykjavik
7n/aNeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8n/a
9n/aCopyright (c) Corporation for National Research Initiatives.
10n/a
11n/a--------------------------------------------------------------------
12n/aThe original string type implementation is:
13n/a
14n/a Copyright (c) 1999 by Secret Labs AB
15n/a Copyright (c) 1999 by Fredrik Lundh
16n/a
17n/aBy obtaining, using, and/or copying this software and/or its
18n/aassociated documentation, you agree that you have read, understood,
19n/aand will comply with the following terms and conditions:
20n/a
21n/aPermission to use, copy, modify, and distribute this software and its
22n/aassociated documentation for any purpose and without fee is hereby
23n/agranted, provided that the above copyright notice appears in all
24n/acopies, and that both that copyright notice and this permission notice
25n/aappear in supporting documentation, and that the name of Secret Labs
26n/aAB or the author not be used in advertising or publicity pertaining to
27n/adistribution of the software without specific, written prior
28n/apermission.
29n/a
30n/aSECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31n/aTHIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32n/aFITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33n/aANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34n/aWHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35n/aACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36n/aOF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37n/a--------------------------------------------------------------------
38n/a
39n/a*/
40n/a
41n/a#define PY_SSIZE_T_CLEAN
42n/a#include "Python.h"
43n/a#include "ucnhash.h"
44n/a#include "bytes_methods.h"
45n/a#include "stringlib/eq.h"
46n/a
47n/a#ifdef MS_WINDOWS
48n/a#include <windows.h>
49n/a#endif
50n/a
51n/a/*[clinic input]
52n/aclass str "PyObject *" "&PyUnicode_Type"
53n/a[clinic start generated code]*/
54n/a/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
55n/a
56n/a/*[python input]
57n/aclass Py_UCS4_converter(CConverter):
58n/a type = 'Py_UCS4'
59n/a converter = 'convert_uc'
60n/a
61n/a def converter_init(self):
62n/a if self.default is not unspecified:
63n/a self.c_default = ascii(self.default)
64n/a if len(self.c_default) > 4 or self.c_default[0] != "'":
65n/a self.c_default = hex(ord(self.default))
66n/a
67n/a[python start generated code]*/
68n/a/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
69n/a
70n/a/* --- Globals ------------------------------------------------------------
71n/a
72n/aNOTE: In the interpreter's initialization phase, some globals are currently
73n/a initialized dynamically as needed. In the process Unicode objects may
74n/a be created before the Unicode type is ready.
75n/a
76n/a*/
77n/a
78n/a
79n/a#ifdef __cplusplus
80n/aextern "C" {
81n/a#endif
82n/a
83n/a/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
84n/a#define MAX_UNICODE 0x10ffff
85n/a
86n/a#ifdef Py_DEBUG
87n/a# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
88n/a#else
89n/a# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
90n/a#endif
91n/a
92n/a#define _PyUnicode_UTF8(op) \
93n/a (((PyCompactUnicodeObject*)(op))->utf8)
94n/a#define PyUnicode_UTF8(op) \
95n/a (assert(_PyUnicode_CHECK(op)), \
96n/a assert(PyUnicode_IS_READY(op)), \
97n/a PyUnicode_IS_COMPACT_ASCII(op) ? \
98n/a ((char*)((PyASCIIObject*)(op) + 1)) : \
99n/a _PyUnicode_UTF8(op))
100n/a#define _PyUnicode_UTF8_LENGTH(op) \
101n/a (((PyCompactUnicodeObject*)(op))->utf8_length)
102n/a#define PyUnicode_UTF8_LENGTH(op) \
103n/a (assert(_PyUnicode_CHECK(op)), \
104n/a assert(PyUnicode_IS_READY(op)), \
105n/a PyUnicode_IS_COMPACT_ASCII(op) ? \
106n/a ((PyASCIIObject*)(op))->length : \
107n/a _PyUnicode_UTF8_LENGTH(op))
108n/a#define _PyUnicode_WSTR(op) \
109n/a (((PyASCIIObject*)(op))->wstr)
110n/a#define _PyUnicode_WSTR_LENGTH(op) \
111n/a (((PyCompactUnicodeObject*)(op))->wstr_length)
112n/a#define _PyUnicode_LENGTH(op) \
113n/a (((PyASCIIObject *)(op))->length)
114n/a#define _PyUnicode_STATE(op) \
115n/a (((PyASCIIObject *)(op))->state)
116n/a#define _PyUnicode_HASH(op) \
117n/a (((PyASCIIObject *)(op))->hash)
118n/a#define _PyUnicode_KIND(op) \
119n/a (assert(_PyUnicode_CHECK(op)), \
120n/a ((PyASCIIObject *)(op))->state.kind)
121n/a#define _PyUnicode_GET_LENGTH(op) \
122n/a (assert(_PyUnicode_CHECK(op)), \
123n/a ((PyASCIIObject *)(op))->length)
124n/a#define _PyUnicode_DATA_ANY(op) \
125n/a (((PyUnicodeObject*)(op))->data.any)
126n/a
127n/a#undef PyUnicode_READY
128n/a#define PyUnicode_READY(op) \
129n/a (assert(_PyUnicode_CHECK(op)), \
130n/a (PyUnicode_IS_READY(op) ? \
131n/a 0 : \
132n/a _PyUnicode_Ready(op)))
133n/a
134n/a#define _PyUnicode_SHARE_UTF8(op) \
135n/a (assert(_PyUnicode_CHECK(op)), \
136n/a assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
137n/a (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True when the wstr pointer of `op` is the same pointer as its character
   data (PyUnicode_DATA), i.e. the wchar_t representation is shared rather
   than stored in a separate buffer.
   Only the macro parameter `op` is referenced, so the macro can be used at
   call sites where the object is not held in a variable named `unicode`. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
141n/a
142n/a/* true if the Unicode object has an allocated UTF-8 memory block
143n/a (not shared with other data) */
144n/a#define _PyUnicode_HAS_UTF8_MEMORY(op) \
145n/a ((!PyUnicode_IS_COMPACT_ASCII(op) \
146n/a && _PyUnicode_UTF8(op) \
147n/a && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
148n/a
149n/a/* true if the Unicode object has an allocated wstr memory block
150n/a (not shared with other data) */
151n/a#define _PyUnicode_HAS_WSTR_MEMORY(op) \
152n/a ((_PyUnicode_WSTR(op) && \
153n/a (!PyUnicode_IS_READY(op) || \
154n/a _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
155n/a
/* Copy the characters in [begin, end) into the buffer `to`, converting each
   element from `from_type` to `to_type` with a plain cast.

   from_type / to_type  must be valid type names.
   begin / end          point at source characters of type "from_type *".
   to                   points at a destination buffer of type "to_type *"
                        that must have room for (end - begin) elements.

   The bulk of the copy is done four elements at a time; the remaining 0-3
   elements are copied one by one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)     \
    do {                                                                 \
        to_type *_to = (to_type *)(to);                                  \
        const from_type *_iter = (from_type *)(begin);                   \
        const from_type *_end = (from_type *)(end);                      \
        Py_ssize_t _count = _end - _iter;                                \
        const from_type *_unrolled_end =                                 \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);                      \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) {            \
            _to[0] = (to_type) _iter[0];                                 \
            _to[1] = (to_type) _iter[1];                                 \
            _to[2] = (to_type) _iter[2];                                 \
            _to[3] = (to_type) _iter[3];                                 \
        }                                                                \
        for (; _iter < _end; ++_iter, ++_to) {                           \
            *_to = (to_type) *_iter;                                     \
        }                                                                \
    } while (0)
179n/a
180n/a#ifdef MS_WINDOWS
181n/a /* On Windows, overallocate by 50% is the best factor */
182n/a# define OVERALLOCATE_FACTOR 2
183n/a#else
184n/a /* On Linux, overallocate by 25% is the best factor */
185n/a# define OVERALLOCATE_FACTOR 4
186n/a#endif
187n/a
188n/a/* This dictionary holds all interned unicode strings. Note that references
189n/a to strings in this dictionary are *not* counted in the string's ob_refcnt.
190n/a When the interned string reaches a refcnt of 0 the string deallocation
191n/a function will delete the reference from this dictionary.
192n/a
193n/a Another way to look at this is to say that the actual reference
194n/a count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
195n/a*/
196n/astatic PyObject *interned = NULL;
197n/a
198n/a/* The empty Unicode object is shared to improve performance. */
199n/astatic PyObject *unicode_empty = NULL;
200n/a
/* Take a new reference to the shared empty string, creating the singleton
   with PyUnicode_New(0, 0) the first time it is needed.  If that creation
   fails, unicode_empty is left NULL and no reference is taken, so callers
   must check unicode_empty afterwards. */
#define _Py_INCREF_UNICODE_EMPTY()                                  \
    do {                                                            \
        if (unicode_empty == NULL) {                                \
            unicode_empty = PyUnicode_New(0, 0);                    \
            if (unicode_empty != NULL) {                            \
                Py_INCREF(unicode_empty);                           \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                                       \
        }                                                           \
        else {                                                      \
            Py_INCREF(unicode_empty);                               \
        }                                                           \
    } while (0)
213n/a
214n/a#define _Py_RETURN_UNICODE_EMPTY() \
215n/a do { \
216n/a _Py_INCREF_UNICODE_EMPTY(); \
217n/a return unicode_empty; \
218n/a } while (0)
219n/a
220n/a/* Forward declaration */
221n/astatic inline int
222n/a_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
223n/a
224n/a/* List of static strings. */
225n/astatic _Py_Identifier *static_strings = NULL;
226n/a
227n/a/* Single character Unicode strings in the Latin-1 range are being
228n/a shared as well. */
229n/astatic PyObject *unicode_latin1[256] = {NULL};
230n/a
231n/a/* Fast detection of the most frequent whitespace characters */
232n/aconst unsigned char _Py_ascii_whitespace[] = {
233n/a 0, 0, 0, 0, 0, 0, 0, 0,
234n/a/* case 0x0009: * CHARACTER TABULATION */
235n/a/* case 0x000A: * LINE FEED */
236n/a/* case 0x000B: * LINE TABULATION */
237n/a/* case 0x000C: * FORM FEED */
238n/a/* case 0x000D: * CARRIAGE RETURN */
239n/a 0, 1, 1, 1, 1, 1, 0, 0,
240n/a 0, 0, 0, 0, 0, 0, 0, 0,
241n/a/* case 0x001C: * FILE SEPARATOR */
242n/a/* case 0x001D: * GROUP SEPARATOR */
243n/a/* case 0x001E: * RECORD SEPARATOR */
244n/a/* case 0x001F: * UNIT SEPARATOR */
245n/a 0, 0, 0, 0, 1, 1, 1, 1,
246n/a/* case 0x0020: * SPACE */
247n/a 1, 0, 0, 0, 0, 0, 0, 0,
248n/a 0, 0, 0, 0, 0, 0, 0, 0,
249n/a 0, 0, 0, 0, 0, 0, 0, 0,
250n/a 0, 0, 0, 0, 0, 0, 0, 0,
251n/a
252n/a 0, 0, 0, 0, 0, 0, 0, 0,
253n/a 0, 0, 0, 0, 0, 0, 0, 0,
254n/a 0, 0, 0, 0, 0, 0, 0, 0,
255n/a 0, 0, 0, 0, 0, 0, 0, 0,
256n/a 0, 0, 0, 0, 0, 0, 0, 0,
257n/a 0, 0, 0, 0, 0, 0, 0, 0,
258n/a 0, 0, 0, 0, 0, 0, 0, 0,
259n/a 0, 0, 0, 0, 0, 0, 0, 0
260n/a};
261n/a
262n/a/* forward */
263n/astatic PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
264n/astatic PyObject* get_latin1_char(unsigned char ch);
265n/astatic int unicode_modifiable(PyObject *unicode);
266n/a
267n/a
268n/astatic PyObject *
269n/a_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
270n/astatic PyObject *
271n/a_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
272n/astatic PyObject *
273n/a_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
274n/a
275n/astatic PyObject *
276n/aunicode_encode_call_errorhandler(const char *errors,
277n/a PyObject **errorHandler,const char *encoding, const char *reason,
278n/a PyObject *unicode, PyObject **exceptionObject,
279n/a Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
280n/a
281n/astatic void
282n/araise_encode_exception(PyObject **exceptionObject,
283n/a const char *encoding,
284n/a PyObject *unicode,
285n/a Py_ssize_t startpos, Py_ssize_t endpos,
286n/a const char *reason);
287n/a
288n/a/* Same for linebreaks */
289n/astatic const unsigned char ascii_linebreak[] = {
290n/a 0, 0, 0, 0, 0, 0, 0, 0,
291n/a/* 0x000A, * LINE FEED */
292n/a/* 0x000B, * LINE TABULATION */
293n/a/* 0x000C, * FORM FEED */
294n/a/* 0x000D, * CARRIAGE RETURN */
295n/a 0, 0, 1, 1, 1, 1, 0, 0,
296n/a 0, 0, 0, 0, 0, 0, 0, 0,
297n/a/* 0x001C, * FILE SEPARATOR */
298n/a/* 0x001D, * GROUP SEPARATOR */
299n/a/* 0x001E, * RECORD SEPARATOR */
300n/a 0, 0, 0, 0, 1, 1, 1, 0,
301n/a 0, 0, 0, 0, 0, 0, 0, 0,
302n/a 0, 0, 0, 0, 0, 0, 0, 0,
303n/a 0, 0, 0, 0, 0, 0, 0, 0,
304n/a 0, 0, 0, 0, 0, 0, 0, 0,
305n/a
306n/a 0, 0, 0, 0, 0, 0, 0, 0,
307n/a 0, 0, 0, 0, 0, 0, 0, 0,
308n/a 0, 0, 0, 0, 0, 0, 0, 0,
309n/a 0, 0, 0, 0, 0, 0, 0, 0,
310n/a 0, 0, 0, 0, 0, 0, 0, 0,
311n/a 0, 0, 0, 0, 0, 0, 0, 0,
312n/a 0, 0, 0, 0, 0, 0, 0, 0,
313n/a 0, 0, 0, 0, 0, 0, 0, 0
314n/a};
315n/a
316n/astatic int convert_uc(PyObject *obj, void *addr);
317n/a
318n/a#include "clinic/unicodeobject.c.h"
319n/a
/* Identifiers for the codec error handlers that are recognised by name.
   _Py_ERROR_OTHER stands for any name that is not in the list below. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler value.

   errors: handler name, or NULL (treated like "strict").
   Returns the matching enum value, or _Py_ERROR_OTHER when the name is not
   one of the handlers known here.  The comparison is exact (strcmp). */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
358n/a
359n/a/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
360n/a This function is kept for backward compatibility with the old API. */
361n/aPy_UNICODE
362n/aPyUnicode_GetMax(void)
363n/a{
364n/a#ifdef Py_UNICODE_WIDE
365n/a return 0x10FFFF;
366n/a#else
367n/a /* This is actually an illegal character, so it should
368n/a not be passed to unichr. */
369n/a return 0xFFFF;
370n/a#endif
371n/a}
372n/a
#ifdef Py_DEBUG
/* Debug-only invariant checker for a Unicode object.

   op:            object to inspect; must satisfy PyUnicode_Check().
   check_content: when non-zero (and the string is not in the legacy wchar_t
                  form) also scan the characters to verify that the narrowest
                  suitable kind is used and that the buffer is
                  NUL-terminated.

   Every violation is reported through assert().  The function itself always
   returns 1, which lets callers write assert(_PyUnicode_CheckConsistency()). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii != 1 || ascii->state.compact != 1) {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* Kind whose storage is shared with the wstr representation. */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wstr_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wstr_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* legacy string that has not been made ready yet */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(ascii->length == 0);
            assert(ascii->hash == -1);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
        }
        else {
            /* legacy string that is ready */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert (compact->utf8 == data);
                assert (compact->utf8_length == ascii->length);
            }
            else {
                assert (compact->utf8 != data);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wstr_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    else {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t i;

        for (i = 0; i < ascii->length; i++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar) {
                maxchar = ch;
            }
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character just past the end must be the NUL terminator */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
488n/a
489n/astatic PyObject*
490n/aunicode_result_wchar(PyObject *unicode)
491n/a{
492n/a#ifndef Py_DEBUG
493n/a Py_ssize_t len;
494n/a
495n/a len = _PyUnicode_WSTR_LENGTH(unicode);
496n/a if (len == 0) {
497n/a Py_DECREF(unicode);
498n/a _Py_RETURN_UNICODE_EMPTY();
499n/a }
500n/a
501n/a if (len == 1) {
502n/a wchar_t ch = _PyUnicode_WSTR(unicode)[0];
503n/a if ((Py_UCS4)ch < 256) {
504n/a PyObject *latin1_char = get_latin1_char((unsigned char)ch);
505n/a Py_DECREF(unicode);
506n/a return latin1_char;
507n/a }
508n/a }
509n/a
510n/a if (_PyUnicode_Ready(unicode) < 0) {
511n/a Py_DECREF(unicode);
512n/a return NULL;
513n/a }
514n/a#else
515n/a assert(Py_REFCNT(unicode) == 1);
516n/a
517n/a /* don't make the result ready in debug mode to ensure that the caller
518n/a makes the string ready before using it */
519n/a assert(_PyUnicode_CheckConsistency(unicode, 1));
520n/a#endif
521n/a return unicode;
522n/a}
523n/a
524n/astatic PyObject*
525n/aunicode_result_ready(PyObject *unicode)
526n/a{
527n/a Py_ssize_t length;
528n/a
529n/a length = PyUnicode_GET_LENGTH(unicode);
530n/a if (length == 0) {
531n/a if (unicode != unicode_empty) {
532n/a Py_DECREF(unicode);
533n/a _Py_RETURN_UNICODE_EMPTY();
534n/a }
535n/a return unicode_empty;
536n/a }
537n/a
538n/a if (length == 1) {
539n/a void *data = PyUnicode_DATA(unicode);
540n/a int kind = PyUnicode_KIND(unicode);
541n/a Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
542n/a if (ch < 256) {
543n/a PyObject *latin1_char = unicode_latin1[ch];
544n/a if (latin1_char != NULL) {
545n/a if (unicode != latin1_char) {
546n/a Py_INCREF(latin1_char);
547n/a Py_DECREF(unicode);
548n/a }
549n/a return latin1_char;
550n/a }
551n/a else {
552n/a assert(_PyUnicode_CheckConsistency(unicode, 1));
553n/a Py_INCREF(unicode);
554n/a unicode_latin1[ch] = unicode;
555n/a return unicode;
556n/a }
557n/a }
558n/a }
559n/a
560n/a assert(_PyUnicode_CheckConsistency(unicode, 1));
561n/a return unicode;
562n/a}
563n/a
564n/astatic PyObject*
565n/aunicode_result(PyObject *unicode)
566n/a{
567n/a assert(_PyUnicode_CHECK(unicode));
568n/a if (PyUnicode_IS_READY(unicode))
569n/a return unicode_result_ready(unicode);
570n/a else
571n/a return unicode_result_wchar(unicode);
572n/a}
573n/a
574n/astatic PyObject*
575n/aunicode_result_unchanged(PyObject *unicode)
576n/a{
577n/a if (PyUnicode_CheckExact(unicode)) {
578n/a if (PyUnicode_READY(unicode) == -1)
579n/a return NULL;
580n/a Py_INCREF(unicode);
581n/a return unicode;
582n/a }
583n/a else
584n/a /* Subtype -- return genuine unicode string with the same value. */
585n/a return _PyUnicode_Copy(unicode);
586n/a}
587n/a
588n/a/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
589n/a ASCII, Latin1, UTF-8, etc. */
590n/astatic char*
591n/abackslashreplace(_PyBytesWriter *writer, char *str,
592n/a PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
593n/a{
594n/a Py_ssize_t size, i;
595n/a Py_UCS4 ch;
596n/a enum PyUnicode_Kind kind;
597n/a void *data;
598n/a
599n/a assert(PyUnicode_IS_READY(unicode));
600n/a kind = PyUnicode_KIND(unicode);
601n/a data = PyUnicode_DATA(unicode);
602n/a
603n/a size = 0;
604n/a /* determine replacement size */
605n/a for (i = collstart; i < collend; ++i) {
606n/a Py_ssize_t incr;
607n/a
608n/a ch = PyUnicode_READ(kind, data, i);
609n/a if (ch < 0x100)
610n/a incr = 2+2;
611n/a else if (ch < 0x10000)
612n/a incr = 2+4;
613n/a else {
614n/a assert(ch <= MAX_UNICODE);
615n/a incr = 2+8;
616n/a }
617n/a if (size > PY_SSIZE_T_MAX - incr) {
618n/a PyErr_SetString(PyExc_OverflowError,
619n/a "encoded result is too long for a Python string");
620n/a return NULL;
621n/a }
622n/a size += incr;
623n/a }
624n/a
625n/a str = _PyBytesWriter_Prepare(writer, str, size);
626n/a if (str == NULL)
627n/a return NULL;
628n/a
629n/a /* generate replacement */
630n/a for (i = collstart; i < collend; ++i) {
631n/a ch = PyUnicode_READ(kind, data, i);
632n/a *str++ = '\\';
633n/a if (ch >= 0x00010000) {
634n/a *str++ = 'U';
635n/a *str++ = Py_hexdigits[(ch>>28)&0xf];
636n/a *str++ = Py_hexdigits[(ch>>24)&0xf];
637n/a *str++ = Py_hexdigits[(ch>>20)&0xf];
638n/a *str++ = Py_hexdigits[(ch>>16)&0xf];
639n/a *str++ = Py_hexdigits[(ch>>12)&0xf];
640n/a *str++ = Py_hexdigits[(ch>>8)&0xf];
641n/a }
642n/a else if (ch >= 0x100) {
643n/a *str++ = 'u';
644n/a *str++ = Py_hexdigits[(ch>>12)&0xf];
645n/a *str++ = Py_hexdigits[(ch>>8)&0xf];
646n/a }
647n/a else
648n/a *str++ = 'x';
649n/a *str++ = Py_hexdigits[(ch>>4)&0xf];
650n/a *str++ = Py_hexdigits[ch&0xf];
651n/a }
652n/a return str;
653n/a}
654n/a
655n/a/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
656n/a ASCII, Latin1, UTF-8, etc. */
657n/astatic char*
658n/axmlcharrefreplace(_PyBytesWriter *writer, char *str,
659n/a PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
660n/a{
661n/a Py_ssize_t size, i;
662n/a Py_UCS4 ch;
663n/a enum PyUnicode_Kind kind;
664n/a void *data;
665n/a
666n/a assert(PyUnicode_IS_READY(unicode));
667n/a kind = PyUnicode_KIND(unicode);
668n/a data = PyUnicode_DATA(unicode);
669n/a
670n/a size = 0;
671n/a /* determine replacement size */
672n/a for (i = collstart; i < collend; ++i) {
673n/a Py_ssize_t incr;
674n/a
675n/a ch = PyUnicode_READ(kind, data, i);
676n/a if (ch < 10)
677n/a incr = 2+1+1;
678n/a else if (ch < 100)
679n/a incr = 2+2+1;
680n/a else if (ch < 1000)
681n/a incr = 2+3+1;
682n/a else if (ch < 10000)
683n/a incr = 2+4+1;
684n/a else if (ch < 100000)
685n/a incr = 2+5+1;
686n/a else if (ch < 1000000)
687n/a incr = 2+6+1;
688n/a else {
689n/a assert(ch <= MAX_UNICODE);
690n/a incr = 2+7+1;
691n/a }
692n/a if (size > PY_SSIZE_T_MAX - incr) {
693n/a PyErr_SetString(PyExc_OverflowError,
694n/a "encoded result is too long for a Python string");
695n/a return NULL;
696n/a }
697n/a size += incr;
698n/a }
699n/a
700n/a str = _PyBytesWriter_Prepare(writer, str, size);
701n/a if (str == NULL)
702n/a return NULL;
703n/a
704n/a /* generate replacement */
705n/a for (i = collstart; i < collend; ++i) {
706n/a str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
707n/a }
708n/a return str;
709n/a}
710n/a
711n/a/* --- Bloom Filters ----------------------------------------------------- */
712n/a
713n/a/* stuff to implement simple "bloom filters" for Unicode characters.
714n/a to keep things simple, we use a single bitmask, using the low-order
715n/a bits of each Unicode character, (ch & (BLOOM_WIDTH - 1)), as the bit index. */
716n/a
717n/a/* the linebreak mask is set up by Unicode_Init below */
718n/a
719n/a#if LONG_BIT >= 128
720n/a#define BLOOM_WIDTH 128
721n/a#elif LONG_BIT >= 64
722n/a#define BLOOM_WIDTH 64
723n/a#elif LONG_BIT >= 32
724n/a#define BLOOM_WIDTH 32
725n/a#else
726n/a#error "LONG_BIT is smaller than 32"
727n/a#endif
728n/a
729n/a#define BLOOM_MASK unsigned long
730n/a
731n/astatic BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
732n/a
/* Non-zero when the bit selected by the low-order bits of `ch`
   (ch & (BLOOM_WIDTH - 1)) is set in `mask`.  Both arguments are
   parenthesised so that compound expressions such as `a | b` can be passed
   as `mask` without being re-associated by operator precedence. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
734n/a
735n/a#define BLOOM_LINEBREAK(ch) \
736n/a ((ch) < 128U ? ascii_linebreak[(ch)] : \
737n/a (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
738n/a
739n/astatic inline BLOOM_MASK
740n/amake_bloom_mask(int kind, void* ptr, Py_ssize_t len)
741n/a{
742n/a#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
743n/a do { \
744n/a TYPE *data = (TYPE *)PTR; \
745n/a TYPE *end = data + LEN; \
746n/a Py_UCS4 ch; \
747n/a for (; data != end; data++) { \
748n/a ch = *data; \
749n/a MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
750n/a } \
751n/a break; \
752n/a } while (0)
753n/a
754n/a /* calculate simple bloom-style bitmask for a given unicode string */
755n/a
756n/a BLOOM_MASK mask;
757n/a
758n/a mask = 0;
759n/a switch (kind) {
760n/a case PyUnicode_1BYTE_KIND:
761n/a BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
762n/a break;
763n/a case PyUnicode_2BYTE_KIND:
764n/a BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
765n/a break;
766n/a case PyUnicode_4BYTE_KIND:
767n/a BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
768n/a break;
769n/a default:
770n/a assert(0);
771n/a }
772n/a return mask;
773n/a
774n/a#undef BLOOM_UPDATE
775n/a}
776n/a
777n/astatic int
778n/aensure_unicode(PyObject *obj)
779n/a{
780n/a if (!PyUnicode_Check(obj)) {
781n/a PyErr_Format(PyExc_TypeError,
782n/a "must be str, not %.100s",
783n/a Py_TYPE(obj)->tp_name);
784n/a return -1;
785n/a }
786n/a return PyUnicode_READY(obj);
787n/a}
788n/a
789n/a/* Compilation of templated routines */
790n/a
791n/a#include "stringlib/asciilib.h"
792n/a#include "stringlib/fastsearch.h"
793n/a#include "stringlib/partition.h"
794n/a#include "stringlib/split.h"
795n/a#include "stringlib/count.h"
796n/a#include "stringlib/find.h"
797n/a#include "stringlib/find_max_char.h"
798n/a#include "stringlib/localeutil.h"
799n/a#include "stringlib/undef.h"
800n/a
801n/a#include "stringlib/ucs1lib.h"
802n/a#include "stringlib/fastsearch.h"
803n/a#include "stringlib/partition.h"
804n/a#include "stringlib/split.h"
805n/a#include "stringlib/count.h"
806n/a#include "stringlib/find.h"
807n/a#include "stringlib/replace.h"
808n/a#include "stringlib/find_max_char.h"
809n/a#include "stringlib/localeutil.h"
810n/a#include "stringlib/undef.h"
811n/a
812n/a#include "stringlib/ucs2lib.h"
813n/a#include "stringlib/fastsearch.h"
814n/a#include "stringlib/partition.h"
815n/a#include "stringlib/split.h"
816n/a#include "stringlib/count.h"
817n/a#include "stringlib/find.h"
818n/a#include "stringlib/replace.h"
819n/a#include "stringlib/find_max_char.h"
820n/a#include "stringlib/localeutil.h"
821n/a#include "stringlib/undef.h"
822n/a
823n/a#include "stringlib/ucs4lib.h"
824n/a#include "stringlib/fastsearch.h"
825n/a#include "stringlib/partition.h"
826n/a#include "stringlib/split.h"
827n/a#include "stringlib/count.h"
828n/a#include "stringlib/find.h"
829n/a#include "stringlib/replace.h"
830n/a#include "stringlib/find_max_char.h"
831n/a#include "stringlib/localeutil.h"
832n/a#include "stringlib/undef.h"
833n/a
834n/a#include "stringlib/unicodedefs.h"
835n/a#include "stringlib/fastsearch.h"
836n/a#include "stringlib/count.h"
837n/a#include "stringlib/find.h"
838n/a#include "stringlib/undef.h"
839n/a
840n/a/* --- Unicode Object ----------------------------------------------------- */
841n/a
842n/astatic PyObject *
843n/afixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
844n/a
845n/astatic inline Py_ssize_t
846n/afindchar(const void *s, int kind,
847n/a Py_ssize_t size, Py_UCS4 ch,
848n/a int direction)
849n/a{
850n/a switch (kind) {
851n/a case PyUnicode_1BYTE_KIND:
852n/a if ((Py_UCS1) ch != ch)
853n/a return -1;
854n/a if (direction > 0)
855n/a return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
856n/a else
857n/a return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
858n/a case PyUnicode_2BYTE_KIND:
859n/a if ((Py_UCS2) ch != ch)
860n/a return -1;
861n/a if (direction > 0)
862n/a return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
863n/a else
864n/a return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
865n/a case PyUnicode_4BYTE_KIND:
866n/a if (direction > 0)
867n/a return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
868n/a else
869n/a return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
870n/a default:
871n/a assert(0);
872n/a return -1;
873n/a }
874n/a}
875n/a
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters added by a resize with 0xff bytes
   so that code which forgets to initialise them is caught early.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0.

   unicode:    string whose length field already holds the new length.
   old_length: length before the resize; nothing is written unless the
               string grew. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
894n/a
895n/astatic PyObject*
896n/aresize_compact(PyObject *unicode, Py_ssize_t length)
897n/a{
898n/a Py_ssize_t char_size;
899n/a Py_ssize_t struct_size;
900n/a Py_ssize_t new_size;
901n/a int share_wstr;
902n/a PyObject *new_unicode;
903n/a#ifdef Py_DEBUG
904n/a Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
905n/a#endif
906n/a
907n/a assert(unicode_modifiable(unicode));
908n/a assert(PyUnicode_IS_READY(unicode));
909n/a assert(PyUnicode_IS_COMPACT(unicode));
910n/a
911n/a char_size = PyUnicode_KIND(unicode);
912n/a if (PyUnicode_IS_ASCII(unicode))
913n/a struct_size = sizeof(PyASCIIObject);
914n/a else
915n/a struct_size = sizeof(PyCompactUnicodeObject);
916n/a share_wstr = _PyUnicode_SHARE_WSTR(unicode);
917n/a
918n/a if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
919n/a PyErr_NoMemory();
920n/a return NULL;
921n/a }
922n/a new_size = (struct_size + (length + 1) * char_size);
923n/a
924n/a if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
925n/a PyObject_DEL(_PyUnicode_UTF8(unicode));
926n/a _PyUnicode_UTF8(unicode) = NULL;
927n/a _PyUnicode_UTF8_LENGTH(unicode) = 0;
928n/a }
929n/a _Py_DEC_REFTOTAL;
930n/a _Py_ForgetReference(unicode);
931n/a
932n/a new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
933n/a if (new_unicode == NULL) {
934n/a _Py_NewReference(unicode);
935n/a PyErr_NoMemory();
936n/a return NULL;
937n/a }
938n/a unicode = new_unicode;
939n/a _Py_NewReference(unicode);
940n/a
941n/a _PyUnicode_LENGTH(unicode) = length;
942n/a if (share_wstr) {
943n/a _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
944n/a if (!PyUnicode_IS_ASCII(unicode))
945n/a _PyUnicode_WSTR_LENGTH(unicode) = length;
946n/a }
947n/a else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
948n/a PyObject_DEL(_PyUnicode_WSTR(unicode));
949n/a _PyUnicode_WSTR(unicode) = NULL;
950n/a if (!PyUnicode_IS_ASCII(unicode))
951n/a _PyUnicode_WSTR_LENGTH(unicode) = 0;
952n/a }
953n/a#ifdef Py_DEBUG
954n/a unicode_fill_invalid(unicode, old_length);
955n/a#endif
956n/a PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
957n/a length, 0);
958n/a assert(_PyUnicode_CheckConsistency(unicode, 0));
959n/a return unicode;
960n/a}
961n/a
962n/astatic int
963n/aresize_inplace(PyObject *unicode, Py_ssize_t length)
964n/a{
965n/a wchar_t *wstr;
966n/a Py_ssize_t new_size;
967n/a assert(!PyUnicode_IS_COMPACT(unicode));
968n/a assert(Py_REFCNT(unicode) == 1);
969n/a
970n/a if (PyUnicode_IS_READY(unicode)) {
971n/a Py_ssize_t char_size;
972n/a int share_wstr, share_utf8;
973n/a void *data;
974n/a#ifdef Py_DEBUG
975n/a Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
976n/a#endif
977n/a
978n/a data = _PyUnicode_DATA_ANY(unicode);
979n/a char_size = PyUnicode_KIND(unicode);
980n/a share_wstr = _PyUnicode_SHARE_WSTR(unicode);
981n/a share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
982n/a
983n/a if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
984n/a PyErr_NoMemory();
985n/a return -1;
986n/a }
987n/a new_size = (length + 1) * char_size;
988n/a
989n/a if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
990n/a {
991n/a PyObject_DEL(_PyUnicode_UTF8(unicode));
992n/a _PyUnicode_UTF8(unicode) = NULL;
993n/a _PyUnicode_UTF8_LENGTH(unicode) = 0;
994n/a }
995n/a
996n/a data = (PyObject *)PyObject_REALLOC(data, new_size);
997n/a if (data == NULL) {
998n/a PyErr_NoMemory();
999n/a return -1;
1000n/a }
1001n/a _PyUnicode_DATA_ANY(unicode) = data;
1002n/a if (share_wstr) {
1003n/a _PyUnicode_WSTR(unicode) = data;
1004n/a _PyUnicode_WSTR_LENGTH(unicode) = length;
1005n/a }
1006n/a if (share_utf8) {
1007n/a _PyUnicode_UTF8(unicode) = data;
1008n/a _PyUnicode_UTF8_LENGTH(unicode) = length;
1009n/a }
1010n/a _PyUnicode_LENGTH(unicode) = length;
1011n/a PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1012n/a#ifdef Py_DEBUG
1013n/a unicode_fill_invalid(unicode, old_length);
1014n/a#endif
1015n/a if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
1016n/a assert(_PyUnicode_CheckConsistency(unicode, 0));
1017n/a return 0;
1018n/a }
1019n/a }
1020n/a assert(_PyUnicode_WSTR(unicode) != NULL);
1021n/a
1022n/a /* check for integer overflow */
1023n/a if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1024n/a PyErr_NoMemory();
1025n/a return -1;
1026n/a }
1027n/a new_size = sizeof(wchar_t) * (length + 1);
1028n/a wstr = _PyUnicode_WSTR(unicode);
1029n/a wstr = PyObject_REALLOC(wstr, new_size);
1030n/a if (!wstr) {
1031n/a PyErr_NoMemory();
1032n/a return -1;
1033n/a }
1034n/a _PyUnicode_WSTR(unicode) = wstr;
1035n/a _PyUnicode_WSTR(unicode)[length] = 0;
1036n/a _PyUnicode_WSTR_LENGTH(unicode) = length;
1037n/a assert(_PyUnicode_CheckConsistency(unicode, 0));
1038n/a return 0;
1039n/a}
1040n/a
1041n/astatic PyObject*
1042n/aresize_copy(PyObject *unicode, Py_ssize_t length)
1043n/a{
1044n/a Py_ssize_t copy_length;
1045n/a if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1046n/a PyObject *copy;
1047n/a
1048n/a assert(PyUnicode_IS_READY(unicode));
1049n/a
1050n/a copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1051n/a if (copy == NULL)
1052n/a return NULL;
1053n/a
1054n/a copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1055n/a _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1056n/a return copy;
1057n/a }
1058n/a else {
1059n/a PyObject *w;
1060n/a
1061n/a w = (PyObject*)_PyUnicode_New(length);
1062n/a if (w == NULL)
1063n/a return NULL;
1064n/a copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1065n/a copy_length = Py_MIN(copy_length, length);
1066n/a memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1067n/a copy_length * sizeof(wchar_t));
1068n/a return w;
1069n/a }
1070n/a}
1071n/a
1072n/a/* We allocate one more byte to make sure the string is
1073n/a Ux0000 terminated; some code (e.g. new_identifier)
1074n/a relies on that.
1075n/a
1076n/a XXX This allocator could further be enhanced by assuring that the
1077n/a free list never reduces its size below 1.
1078n/a
1079n/a*/
1080n/a
1081n/astatic PyUnicodeObject *
1082n/a_PyUnicode_New(Py_ssize_t length)
1083n/a{
1084n/a PyUnicodeObject *unicode;
1085n/a size_t new_size;
1086n/a
1087n/a /* Optimization for empty strings */
1088n/a if (length == 0 && unicode_empty != NULL) {
1089n/a Py_INCREF(unicode_empty);
1090n/a return (PyUnicodeObject*)unicode_empty;
1091n/a }
1092n/a
1093n/a /* Ensure we won't overflow the size. */
1094n/a if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1095n/a return (PyUnicodeObject *)PyErr_NoMemory();
1096n/a }
1097n/a if (length < 0) {
1098n/a PyErr_SetString(PyExc_SystemError,
1099n/a "Negative size passed to _PyUnicode_New");
1100n/a return NULL;
1101n/a }
1102n/a
1103n/a unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1104n/a if (unicode == NULL)
1105n/a return NULL;
1106n/a new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1107n/a
1108n/a _PyUnicode_WSTR_LENGTH(unicode) = length;
1109n/a _PyUnicode_HASH(unicode) = -1;
1110n/a _PyUnicode_STATE(unicode).interned = 0;
1111n/a _PyUnicode_STATE(unicode).kind = 0;
1112n/a _PyUnicode_STATE(unicode).compact = 0;
1113n/a _PyUnicode_STATE(unicode).ready = 0;
1114n/a _PyUnicode_STATE(unicode).ascii = 0;
1115n/a _PyUnicode_DATA_ANY(unicode) = NULL;
1116n/a _PyUnicode_LENGTH(unicode) = 0;
1117n/a _PyUnicode_UTF8(unicode) = NULL;
1118n/a _PyUnicode_UTF8_LENGTH(unicode) = 0;
1119n/a
1120n/a _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1121n/a if (!_PyUnicode_WSTR(unicode)) {
1122n/a Py_DECREF(unicode);
1123n/a PyErr_NoMemory();
1124n/a return NULL;
1125n/a }
1126n/a
1127n/a /* Initialize the first element to guard against cases where
1128n/a * the caller fails before initializing str -- unicode_resize()
1129n/a * reads str[0], and the Keep-Alive optimization can keep memory
1130n/a * allocated for str alive across a call to unicode_dealloc(unicode).
1131n/a * We don't want unicode_resize to read uninitialized memory in
1132n/a * that case.
1133n/a */
1134n/a _PyUnicode_WSTR(unicode)[0] = 0;
1135n/a _PyUnicode_WSTR(unicode)[length] = 0;
1136n/a
1137n/a assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1138n/a return unicode;
1139n/a}
1140n/a
1141n/astatic const char*
1142n/aunicode_kind_name(PyObject *unicode)
1143n/a{
1144n/a /* don't check consistency: unicode_kind_name() is called from
1145n/a _PyUnicode_Dump() */
1146n/a if (!PyUnicode_IS_COMPACT(unicode))
1147n/a {
1148n/a if (!PyUnicode_IS_READY(unicode))
1149n/a return "wstr";
1150n/a switch (PyUnicode_KIND(unicode))
1151n/a {
1152n/a case PyUnicode_1BYTE_KIND:
1153n/a if (PyUnicode_IS_ASCII(unicode))
1154n/a return "legacy ascii";
1155n/a else
1156n/a return "legacy latin1";
1157n/a case PyUnicode_2BYTE_KIND:
1158n/a return "legacy UCS2";
1159n/a case PyUnicode_4BYTE_KIND:
1160n/a return "legacy UCS4";
1161n/a default:
1162n/a return "<legacy invalid kind>";
1163n/a }
1164n/a }
1165n/a assert(PyUnicode_IS_READY(unicode));
1166n/a switch (PyUnicode_KIND(unicode)) {
1167n/a case PyUnicode_1BYTE_KIND:
1168n/a if (PyUnicode_IS_ASCII(unicode))
1169n/a return "ascii";
1170n/a else
1171n/a return "latin1";
1172n/a case PyUnicode_2BYTE_KIND:
1173n/a return "UCS2";
1174n/a case PyUnicode_4BYTE_KIND:
1175n/a return "UCS4";
1176n/a default:
1177n/a return "<invalid compact kind>";
1178n/a }
1179n/a}
1180n/a
1181n/a#ifdef Py_DEBUG
1182n/a/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1186n/a
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *payload = _PyUnicode_COMPACT_DATA(unicode);
    return payload;
}
1190n/avoid *_PyUnicode_data(void *unicode){
1191n/a printf("obj %p\n", unicode);
1192n/a printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1193n/a printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1194n/a printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1195n/a printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1196n/a printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1197n/a return PyUnicode_DATA(unicode);
1198n/a}
1199n/a
1200n/avoid
1201n/a_PyUnicode_Dump(PyObject *op)
1202n/a{
1203n/a PyASCIIObject *ascii = (PyASCIIObject *)op;
1204n/a PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1205n/a PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1206n/a void *data;
1207n/a
1208n/a if (ascii->state.compact)
1209n/a {
1210n/a if (ascii->state.ascii)
1211n/a data = (ascii + 1);
1212n/a else
1213n/a data = (compact + 1);
1214n/a }
1215n/a else
1216n/a data = unicode->data.any;
1217n/a printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1218n/a unicode_kind_name(op), ascii->length);
1219n/a
1220n/a if (ascii->wstr == data)
1221n/a printf("shared ");
1222n/a printf("wstr=%p", ascii->wstr);
1223n/a
1224n/a if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1225n/a printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1226n/a if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1227n/a printf("shared ");
1228n/a printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1229n/a compact->utf8, compact->utf8_length);
1230n/a }
1231n/a printf(", data=%p\n", data);
1232n/a}
1233n/a#endif
1234n/a
1235n/aPyObject *
1236n/aPyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1237n/a{
1238n/a PyObject *obj;
1239n/a PyCompactUnicodeObject *unicode;
1240n/a void *data;
1241n/a enum PyUnicode_Kind kind;
1242n/a int is_sharing, is_ascii;
1243n/a Py_ssize_t char_size;
1244n/a Py_ssize_t struct_size;
1245n/a
1246n/a /* Optimization for empty strings */
1247n/a if (size == 0 && unicode_empty != NULL) {
1248n/a Py_INCREF(unicode_empty);
1249n/a return unicode_empty;
1250n/a }
1251n/a
1252n/a is_ascii = 0;
1253n/a is_sharing = 0;
1254n/a struct_size = sizeof(PyCompactUnicodeObject);
1255n/a if (maxchar < 128) {
1256n/a kind = PyUnicode_1BYTE_KIND;
1257n/a char_size = 1;
1258n/a is_ascii = 1;
1259n/a struct_size = sizeof(PyASCIIObject);
1260n/a }
1261n/a else if (maxchar < 256) {
1262n/a kind = PyUnicode_1BYTE_KIND;
1263n/a char_size = 1;
1264n/a }
1265n/a else if (maxchar < 65536) {
1266n/a kind = PyUnicode_2BYTE_KIND;
1267n/a char_size = 2;
1268n/a if (sizeof(wchar_t) == 2)
1269n/a is_sharing = 1;
1270n/a }
1271n/a else {
1272n/a if (maxchar > MAX_UNICODE) {
1273n/a PyErr_SetString(PyExc_SystemError,
1274n/a "invalid maximum character passed to PyUnicode_New");
1275n/a return NULL;
1276n/a }
1277n/a kind = PyUnicode_4BYTE_KIND;
1278n/a char_size = 4;
1279n/a if (sizeof(wchar_t) == 4)
1280n/a is_sharing = 1;
1281n/a }
1282n/a
1283n/a /* Ensure we won't overflow the size. */
1284n/a if (size < 0) {
1285n/a PyErr_SetString(PyExc_SystemError,
1286n/a "Negative size passed to PyUnicode_New");
1287n/a return NULL;
1288n/a }
1289n/a if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1290n/a return PyErr_NoMemory();
1291n/a
1292n/a /* Duplicated allocation code from _PyObject_New() instead of a call to
1293n/a * PyObject_New() so we are able to allocate space for the object and
1294n/a * it's data buffer.
1295n/a */
1296n/a obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1297n/a if (obj == NULL)
1298n/a return PyErr_NoMemory();
1299n/a obj = PyObject_INIT(obj, &PyUnicode_Type);
1300n/a if (obj == NULL)
1301n/a return NULL;
1302n/a
1303n/a unicode = (PyCompactUnicodeObject *)obj;
1304n/a if (is_ascii)
1305n/a data = ((PyASCIIObject*)obj) + 1;
1306n/a else
1307n/a data = unicode + 1;
1308n/a _PyUnicode_LENGTH(unicode) = size;
1309n/a _PyUnicode_HASH(unicode) = -1;
1310n/a _PyUnicode_STATE(unicode).interned = 0;
1311n/a _PyUnicode_STATE(unicode).kind = kind;
1312n/a _PyUnicode_STATE(unicode).compact = 1;
1313n/a _PyUnicode_STATE(unicode).ready = 1;
1314n/a _PyUnicode_STATE(unicode).ascii = is_ascii;
1315n/a if (is_ascii) {
1316n/a ((char*)data)[size] = 0;
1317n/a _PyUnicode_WSTR(unicode) = NULL;
1318n/a }
1319n/a else if (kind == PyUnicode_1BYTE_KIND) {
1320n/a ((char*)data)[size] = 0;
1321n/a _PyUnicode_WSTR(unicode) = NULL;
1322n/a _PyUnicode_WSTR_LENGTH(unicode) = 0;
1323n/a unicode->utf8 = NULL;
1324n/a unicode->utf8_length = 0;
1325n/a }
1326n/a else {
1327n/a unicode->utf8 = NULL;
1328n/a unicode->utf8_length = 0;
1329n/a if (kind == PyUnicode_2BYTE_KIND)
1330n/a ((Py_UCS2*)data)[size] = 0;
1331n/a else /* kind == PyUnicode_4BYTE_KIND */
1332n/a ((Py_UCS4*)data)[size] = 0;
1333n/a if (is_sharing) {
1334n/a _PyUnicode_WSTR_LENGTH(unicode) = size;
1335n/a _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1336n/a }
1337n/a else {
1338n/a _PyUnicode_WSTR_LENGTH(unicode) = 0;
1339n/a _PyUnicode_WSTR(unicode) = NULL;
1340n/a }
1341n/a }
1342n/a#ifdef Py_DEBUG
1343n/a unicode_fill_invalid((PyObject*)unicode, 0);
1344n/a#endif
1345n/a assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1346n/a return obj;
1347n/a}
1348n/a
#if SIZEOF_WCHAR_T == 2
/* Decode the 16-bit wchar_t range [begin, end) into the UCS4 buffer of
   `unicode`.  A high surrogate immediately followed by a low surrogate is
   joined into one code point; every other unit, lone surrogates included,
   is copied unchanged.  The other conversions are implemented as macros
   for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst = *src;
            src += 1;
        }
        dst++;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1388n/a
1389n/astatic int
1390n/aunicode_check_modifiable(PyObject *unicode)
1391n/a{
1392n/a if (!unicode_modifiable(unicode)) {
1393n/a PyErr_SetString(PyExc_SystemError,
1394n/a "Cannot modify a string currently used");
1395n/a return -1;
1396n/a }
1397n/a return 0;
1398n/a}
1399n/a
1400n/astatic int
1401n/a_copy_characters(PyObject *to, Py_ssize_t to_start,
1402n/a PyObject *from, Py_ssize_t from_start,
1403n/a Py_ssize_t how_many, int check_maxchar)
1404n/a{
1405n/a unsigned int from_kind, to_kind;
1406n/a void *from_data, *to_data;
1407n/a
1408n/a assert(0 <= how_many);
1409n/a assert(0 <= from_start);
1410n/a assert(0 <= to_start);
1411n/a assert(PyUnicode_Check(from));
1412n/a assert(PyUnicode_IS_READY(from));
1413n/a assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1414n/a
1415n/a assert(PyUnicode_Check(to));
1416n/a assert(PyUnicode_IS_READY(to));
1417n/a assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1418n/a
1419n/a if (how_many == 0)
1420n/a return 0;
1421n/a
1422n/a from_kind = PyUnicode_KIND(from);
1423n/a from_data = PyUnicode_DATA(from);
1424n/a to_kind = PyUnicode_KIND(to);
1425n/a to_data = PyUnicode_DATA(to);
1426n/a
1427n/a#ifdef Py_DEBUG
1428n/a if (!check_maxchar
1429n/a && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1430n/a {
1431n/a const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1432n/a Py_UCS4 ch;
1433n/a Py_ssize_t i;
1434n/a for (i=0; i < how_many; i++) {
1435n/a ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1436n/a assert(ch <= to_maxchar);
1437n/a }
1438n/a }
1439n/a#endif
1440n/a
1441n/a if (from_kind == to_kind) {
1442n/a if (check_maxchar
1443n/a && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1444n/a {
1445n/a /* Writing Latin-1 characters into an ASCII string requires to
1446n/a check that all written characters are pure ASCII */
1447n/a Py_UCS4 max_char;
1448n/a max_char = ucs1lib_find_max_char(from_data,
1449n/a (Py_UCS1*)from_data + how_many);
1450n/a if (max_char >= 128)
1451n/a return -1;
1452n/a }
1453n/a memcpy((char*)to_data + to_kind * to_start,
1454n/a (char*)from_data + from_kind * from_start,
1455n/a to_kind * how_many);
1456n/a }
1457n/a else if (from_kind == PyUnicode_1BYTE_KIND
1458n/a && to_kind == PyUnicode_2BYTE_KIND)
1459n/a {
1460n/a _PyUnicode_CONVERT_BYTES(
1461n/a Py_UCS1, Py_UCS2,
1462n/a PyUnicode_1BYTE_DATA(from) + from_start,
1463n/a PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1464n/a PyUnicode_2BYTE_DATA(to) + to_start
1465n/a );
1466n/a }
1467n/a else if (from_kind == PyUnicode_1BYTE_KIND
1468n/a && to_kind == PyUnicode_4BYTE_KIND)
1469n/a {
1470n/a _PyUnicode_CONVERT_BYTES(
1471n/a Py_UCS1, Py_UCS4,
1472n/a PyUnicode_1BYTE_DATA(from) + from_start,
1473n/a PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1474n/a PyUnicode_4BYTE_DATA(to) + to_start
1475n/a );
1476n/a }
1477n/a else if (from_kind == PyUnicode_2BYTE_KIND
1478n/a && to_kind == PyUnicode_4BYTE_KIND)
1479n/a {
1480n/a _PyUnicode_CONVERT_BYTES(
1481n/a Py_UCS2, Py_UCS4,
1482n/a PyUnicode_2BYTE_DATA(from) + from_start,
1483n/a PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1484n/a PyUnicode_4BYTE_DATA(to) + to_start
1485n/a );
1486n/a }
1487n/a else {
1488n/a assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1489n/a
1490n/a if (!check_maxchar) {
1491n/a if (from_kind == PyUnicode_2BYTE_KIND
1492n/a && to_kind == PyUnicode_1BYTE_KIND)
1493n/a {
1494n/a _PyUnicode_CONVERT_BYTES(
1495n/a Py_UCS2, Py_UCS1,
1496n/a PyUnicode_2BYTE_DATA(from) + from_start,
1497n/a PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1498n/a PyUnicode_1BYTE_DATA(to) + to_start
1499n/a );
1500n/a }
1501n/a else if (from_kind == PyUnicode_4BYTE_KIND
1502n/a && to_kind == PyUnicode_1BYTE_KIND)
1503n/a {
1504n/a _PyUnicode_CONVERT_BYTES(
1505n/a Py_UCS4, Py_UCS1,
1506n/a PyUnicode_4BYTE_DATA(from) + from_start,
1507n/a PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1508n/a PyUnicode_1BYTE_DATA(to) + to_start
1509n/a );
1510n/a }
1511n/a else if (from_kind == PyUnicode_4BYTE_KIND
1512n/a && to_kind == PyUnicode_2BYTE_KIND)
1513n/a {
1514n/a _PyUnicode_CONVERT_BYTES(
1515n/a Py_UCS4, Py_UCS2,
1516n/a PyUnicode_4BYTE_DATA(from) + from_start,
1517n/a PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1518n/a PyUnicode_2BYTE_DATA(to) + to_start
1519n/a );
1520n/a }
1521n/a else {
1522n/a assert(0);
1523n/a return -1;
1524n/a }
1525n/a }
1526n/a else {
1527n/a const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1528n/a Py_UCS4 ch;
1529n/a Py_ssize_t i;
1530n/a
1531n/a for (i=0; i < how_many; i++) {
1532n/a ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1533n/a if (ch > to_maxchar)
1534n/a return -1;
1535n/a PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1536n/a }
1537n/a }
1538n/a }
1539n/a return 0;
1540n/a}
1541n/a
1542n/avoid
1543n/a_PyUnicode_FastCopyCharacters(
1544n/a PyObject *to, Py_ssize_t to_start,
1545n/a PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1546n/a{
1547n/a (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1548n/a}
1549n/a
1550n/aPy_ssize_t
1551n/aPyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1552n/a PyObject *from, Py_ssize_t from_start,
1553n/a Py_ssize_t how_many)
1554n/a{
1555n/a int err;
1556n/a
1557n/a if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1558n/a PyErr_BadInternalCall();
1559n/a return -1;
1560n/a }
1561n/a
1562n/a if (PyUnicode_READY(from) == -1)
1563n/a return -1;
1564n/a if (PyUnicode_READY(to) == -1)
1565n/a return -1;
1566n/a
1567n/a if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1568n/a PyErr_SetString(PyExc_IndexError, "string index out of range");
1569n/a return -1;
1570n/a }
1571n/a if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1572n/a PyErr_SetString(PyExc_IndexError, "string index out of range");
1573n/a return -1;
1574n/a }
1575n/a if (how_many < 0) {
1576n/a PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1577n/a return -1;
1578n/a }
1579n/a how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1580n/a if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1581n/a PyErr_Format(PyExc_SystemError,
1582n/a "Cannot write %zi characters at %zi "
1583n/a "in a string of %zi characters",
1584n/a how_many, to_start, PyUnicode_GET_LENGTH(to));
1585n/a return -1;
1586n/a }
1587n/a
1588n/a if (how_many == 0)
1589n/a return 0;
1590n/a
1591n/a if (unicode_check_modifiable(to))
1592n/a return -1;
1593n/a
1594n/a err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1595n/a if (err) {
1596n/a PyErr_Format(PyExc_SystemError,
1597n/a "Cannot copy %s characters "
1598n/a "into a string of %s characters",
1599n/a unicode_kind_name(from),
1600n/a unicode_kind_name(to));
1601n/a return -1;
1602n/a }
1603n/a return how_many;
1604n/a}
1605n/a
1606n/a/* Find the maximum code point and count the number of surrogate pairs so a
1607n/a correct string length can be computed before converting a string to UCS4.
1608n/a This function counts single surrogates as a character and not as a pair.
1609n/a
1610n/a Return 0 on success, or -1 on error. */
1611n/astatic int
1612n/afind_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1613n/a Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1614n/a{
1615n/a const wchar_t *iter;
1616n/a Py_UCS4 ch;
1617n/a
1618n/a assert(num_surrogates != NULL && maxchar != NULL);
1619n/a *num_surrogates = 0;
1620n/a *maxchar = 0;
1621n/a
1622n/a for (iter = begin; iter < end; ) {
1623n/a#if SIZEOF_WCHAR_T == 2
1624n/a if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1625n/a && (iter+1) < end
1626n/a && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1627n/a {
1628n/a ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1629n/a ++(*num_surrogates);
1630n/a iter += 2;
1631n/a }
1632n/a else
1633n/a#endif
1634n/a {
1635n/a ch = *iter;
1636n/a iter++;
1637n/a }
1638n/a if (ch > *maxchar) {
1639n/a *maxchar = ch;
1640n/a if (*maxchar > MAX_UNICODE) {
1641n/a PyErr_Format(PyExc_ValueError,
1642n/a "character U+%x is not in range [U+0000; U+10ffff]",
1643n/a ch);
1644n/a return -1;
1645n/a }
1646n/a }
1647n/a }
1648n/a return 0;
1649n/a}
1650n/a
1651n/aint
1652n/a_PyUnicode_Ready(PyObject *unicode)
1653n/a{
1654n/a wchar_t *end;
1655n/a Py_UCS4 maxchar = 0;
1656n/a Py_ssize_t num_surrogates;
1657n/a#if SIZEOF_WCHAR_T == 2
1658n/a Py_ssize_t length_wo_surrogates;
1659n/a#endif
1660n/a
1661n/a /* _PyUnicode_Ready() is only intended for old-style API usage where
1662n/a strings were created using _PyObject_New() and where no canonical
1663n/a representation (the str field) has been set yet aka strings
1664n/a which are not yet ready. */
1665n/a assert(_PyUnicode_CHECK(unicode));
1666n/a assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1667n/a assert(_PyUnicode_WSTR(unicode) != NULL);
1668n/a assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1669n/a assert(_PyUnicode_UTF8(unicode) == NULL);
1670n/a /* Actually, it should neither be interned nor be anything else: */
1671n/a assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1672n/a
1673n/a end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1674n/a if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1675n/a &maxchar, &num_surrogates) == -1)
1676n/a return -1;
1677n/a
1678n/a if (maxchar < 256) {
1679n/a _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1680n/a if (!_PyUnicode_DATA_ANY(unicode)) {
1681n/a PyErr_NoMemory();
1682n/a return -1;
1683n/a }
1684n/a _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1685n/a _PyUnicode_WSTR(unicode), end,
1686n/a PyUnicode_1BYTE_DATA(unicode));
1687n/a PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1688n/a _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1689n/a _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1690n/a if (maxchar < 128) {
1691n/a _PyUnicode_STATE(unicode).ascii = 1;
1692n/a _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1693n/a _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1694n/a }
1695n/a else {
1696n/a _PyUnicode_STATE(unicode).ascii = 0;
1697n/a _PyUnicode_UTF8(unicode) = NULL;
1698n/a _PyUnicode_UTF8_LENGTH(unicode) = 0;
1699n/a }
1700n/a PyObject_FREE(_PyUnicode_WSTR(unicode));
1701n/a _PyUnicode_WSTR(unicode) = NULL;
1702n/a _PyUnicode_WSTR_LENGTH(unicode) = 0;
1703n/a }
1704n/a /* In this case we might have to convert down from 4-byte native
1705n/a wchar_t to 2-byte unicode. */
1706n/a else if (maxchar < 65536) {
1707n/a assert(num_surrogates == 0 &&
1708n/a "FindMaxCharAndNumSurrogatePairs() messed up");
1709n/a
1710n/a#if SIZEOF_WCHAR_T == 2
1711n/a /* We can share representations and are done. */
1712n/a _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1713n/a PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1714n/a _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1715n/a _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1716n/a _PyUnicode_UTF8(unicode) = NULL;
1717n/a _PyUnicode_UTF8_LENGTH(unicode) = 0;
1718n/a#else
1719n/a /* sizeof(wchar_t) == 4 */
1720n/a _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1721n/a 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1722n/a if (!_PyUnicode_DATA_ANY(unicode)) {
1723n/a PyErr_NoMemory();
1724n/a return -1;
1725n/a }
1726n/a _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1727n/a _PyUnicode_WSTR(unicode), end,
1728n/a PyUnicode_2BYTE_DATA(unicode));
1729n/a PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1730n/a _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1731n/a _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1732n/a _PyUnicode_UTF8(unicode) = NULL;
1733n/a _PyUnicode_UTF8_LENGTH(unicode) = 0;
1734n/a PyObject_FREE(_PyUnicode_WSTR(unicode));
1735n/a _PyUnicode_WSTR(unicode) = NULL;
1736n/a _PyUnicode_WSTR_LENGTH(unicode) = 0;
1737n/a#endif
1738n/a }
1739n/a /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1740n/a else {
1741n/a#if SIZEOF_WCHAR_T == 2
1742n/a /* in case the native representation is 2-bytes, we need to allocate a
1743n/a new normalized 4-byte version. */
1744n/a length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1745n/a if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1746n/a PyErr_NoMemory();
1747n/a return -1;
1748n/a }
1749n/a _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1750n/a if (!_PyUnicode_DATA_ANY(unicode)) {
1751n/a PyErr_NoMemory();
1752n/a return -1;
1753n/a }
1754n/a _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1755n/a _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1756n/a _PyUnicode_UTF8(unicode) = NULL;
1757n/a _PyUnicode_UTF8_LENGTH(unicode) = 0;
1758n/a /* unicode_convert_wchar_to_ucs4() requires a ready string */
1759n/a _PyUnicode_STATE(unicode).ready = 1;
1760n/a unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1761n/a PyObject_FREE(_PyUnicode_WSTR(unicode));
1762n/a _PyUnicode_WSTR(unicode) = NULL;
1763n/a _PyUnicode_WSTR_LENGTH(unicode) = 0;
1764n/a#else
1765n/a assert(num_surrogates == 0);
1766n/a
1767n/a _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1768n/a _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1769n/a _PyUnicode_UTF8(unicode) = NULL;
1770n/a _PyUnicode_UTF8_LENGTH(unicode) = 0;
1771n/a _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1772n/a#endif
1773n/a PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1774n/a }
1775n/a _PyUnicode_STATE(unicode).ready = 1;
1776n/a assert(_PyUnicode_CheckConsistency(unicode, 1));
1777n/a return 0;
1778n/a}
1779n/a
1780n/astatic void
1781n/aunicode_dealloc(PyObject *unicode)
1782n/a{
1783n/a switch (PyUnicode_CHECK_INTERNED(unicode)) {
1784n/a case SSTATE_NOT_INTERNED:
1785n/a break;
1786n/a
1787n/a case SSTATE_INTERNED_MORTAL:
1788n/a /* revive dead object temporarily for DelItem */
1789n/a Py_REFCNT(unicode) = 3;
1790n/a if (PyDict_DelItem(interned, unicode) != 0)
1791n/a Py_FatalError(
1792n/a "deletion of interned string failed");
1793n/a break;
1794n/a
1795n/a case SSTATE_INTERNED_IMMORTAL:
1796n/a Py_FatalError("Immortal interned string died.");
1797n/a
1798n/a default:
1799n/a Py_FatalError("Inconsistent interned string state.");
1800n/a }
1801n/a
1802n/a if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1803n/a PyObject_DEL(_PyUnicode_WSTR(unicode));
1804n/a if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1805n/a PyObject_DEL(_PyUnicode_UTF8(unicode));
1806n/a if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1807n/a PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1808n/a
1809n/a Py_TYPE(unicode)->tp_free(unicode);
1810n/a}
1811n/a
#ifdef Py_DEBUG
/* Return 1 if `unicode` is the shared empty string or the cached
   one-character string for its Latin-1 code point, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* only a one-character string with a canonical form can be cached */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1828n/a
1829n/astatic int
1830n/aunicode_modifiable(PyObject *unicode)
1831n/a{
1832n/a assert(_PyUnicode_CHECK(unicode));
1833n/a if (Py_REFCNT(unicode) != 1)
1834n/a return 0;
1835n/a if (_PyUnicode_HASH(unicode) != -1)
1836n/a return 0;
1837n/a if (PyUnicode_CHECK_INTERNED(unicode))
1838n/a return 0;
1839n/a if (!PyUnicode_CheckExact(unicode))
1840n/a return 0;
1841n/a#ifdef Py_DEBUG
1842n/a /* singleton refcount is greater than 1 */
1843n/a assert(!unicode_is_singleton(unicode));
1844n/a#endif
1845n/a return 1;
1846n/a}
1847n/a
1848n/astatic int
1849n/aunicode_resize(PyObject **p_unicode, Py_ssize_t length)
1850n/a{
1851n/a PyObject *unicode;
1852n/a Py_ssize_t old_length;
1853n/a
1854n/a assert(p_unicode != NULL);
1855n/a unicode = *p_unicode;
1856n/a
1857n/a assert(unicode != NULL);
1858n/a assert(PyUnicode_Check(unicode));
1859n/a assert(0 <= length);
1860n/a
1861n/a if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1862n/a old_length = PyUnicode_WSTR_LENGTH(unicode);
1863n/a else
1864n/a old_length = PyUnicode_GET_LENGTH(unicode);
1865n/a if (old_length == length)
1866n/a return 0;
1867n/a
1868n/a if (length == 0) {
1869n/a _Py_INCREF_UNICODE_EMPTY();
1870n/a if (!unicode_empty)
1871n/a return -1;
1872n/a Py_SETREF(*p_unicode, unicode_empty);
1873n/a return 0;
1874n/a }
1875n/a
1876n/a if (!unicode_modifiable(unicode)) {
1877n/a PyObject *copy = resize_copy(unicode, length);
1878n/a if (copy == NULL)
1879n/a return -1;
1880n/a Py_SETREF(*p_unicode, copy);
1881n/a return 0;
1882n/a }
1883n/a
1884n/a if (PyUnicode_IS_COMPACT(unicode)) {
1885n/a PyObject *new_unicode = resize_compact(unicode, length);
1886n/a if (new_unicode == NULL)
1887n/a return -1;
1888n/a *p_unicode = new_unicode;
1889n/a return 0;
1890n/a }
1891n/a return resize_inplace(unicode, length);
1892n/a}
1893n/a
1894n/aint
1895n/aPyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1896n/a{
1897n/a PyObject *unicode;
1898n/a if (p_unicode == NULL) {
1899n/a PyErr_BadInternalCall();
1900n/a return -1;
1901n/a }
1902n/a unicode = *p_unicode;
1903n/a if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1904n/a {
1905n/a PyErr_BadInternalCall();
1906n/a return -1;
1907n/a }
1908n/a return unicode_resize(p_unicode, length);
1909n/a}
1910n/a
1911n/a/* Copy an ASCII or latin1 char* string into a Python Unicode string.
1912n/a
1913n/a WARNING: The function doesn't copy the terminating null character and
1914n/a doesn't check the maximum character (may write a latin1 character in an
1915n/a ASCII string). */
1916n/astatic void
1917n/aunicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1918n/a const char *str, Py_ssize_t len)
1919n/a{
1920n/a enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1921n/a void *data = PyUnicode_DATA(unicode);
1922n/a const char *end = str + len;
1923n/a
1924n/a switch (kind) {
1925n/a case PyUnicode_1BYTE_KIND: {
1926n/a assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1927n/a#ifdef Py_DEBUG
1928n/a if (PyUnicode_IS_ASCII(unicode)) {
1929n/a Py_UCS4 maxchar = ucs1lib_find_max_char(
1930n/a (const Py_UCS1*)str,
1931n/a (const Py_UCS1*)str + len);
1932n/a assert(maxchar < 128);
1933n/a }
1934n/a#endif
1935n/a memcpy((char *) data + index, str, len);
1936n/a break;
1937n/a }
1938n/a case PyUnicode_2BYTE_KIND: {
1939n/a Py_UCS2 *start = (Py_UCS2 *)data + index;
1940n/a Py_UCS2 *ucs2 = start;
1941n/a assert(index <= PyUnicode_GET_LENGTH(unicode));
1942n/a
1943n/a for (; str < end; ++ucs2, ++str)
1944n/a *ucs2 = (Py_UCS2)*str;
1945n/a
1946n/a assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1947n/a break;
1948n/a }
1949n/a default: {
1950n/a Py_UCS4 *start = (Py_UCS4 *)data + index;
1951n/a Py_UCS4 *ucs4 = start;
1952n/a assert(kind == PyUnicode_4BYTE_KIND);
1953n/a assert(index <= PyUnicode_GET_LENGTH(unicode));
1954n/a
1955n/a for (; str < end; ++ucs4, ++str)
1956n/a *ucs4 = (Py_UCS4)*str;
1957n/a
1958n/a assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1959n/a }
1960n/a }
1961n/a}
1962n/a
1963n/astatic PyObject*
1964n/aget_latin1_char(unsigned char ch)
1965n/a{
1966n/a PyObject *unicode = unicode_latin1[ch];
1967n/a if (!unicode) {
1968n/a unicode = PyUnicode_New(1, ch);
1969n/a if (!unicode)
1970n/a return NULL;
1971n/a PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1972n/a assert(_PyUnicode_CheckConsistency(unicode, 1));
1973n/a unicode_latin1[ch] = unicode;
1974n/a }
1975n/a Py_INCREF(unicode);
1976n/a return unicode;
1977n/a}
1978n/a
1979n/astatic PyObject*
1980n/aunicode_char(Py_UCS4 ch)
1981n/a{
1982n/a PyObject *unicode;
1983n/a
1984n/a assert(ch <= MAX_UNICODE);
1985n/a
1986n/a if (ch < 256)
1987n/a return get_latin1_char(ch);
1988n/a
1989n/a unicode = PyUnicode_New(1, ch);
1990n/a if (unicode == NULL)
1991n/a return NULL;
1992n/a
1993n/a assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1994n/a if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1995n/a PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1996n/a } else {
1997n/a assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1998n/a PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1999n/a }
2000n/a assert(_PyUnicode_CheckConsistency(unicode, 1));
2001n/a return unicode;
2002n/a}
2003n/a
2004n/aPyObject *
2005n/aPyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2006n/a{
2007n/a if (u == NULL)
2008n/a return (PyObject*)_PyUnicode_New(size);
2009n/a
2010n/a if (size < 0) {
2011n/a PyErr_BadInternalCall();
2012n/a return NULL;
2013n/a }
2014n/a
2015n/a return PyUnicode_FromWideChar(u, size);
2016n/a}
2017n/a
2018n/aPyObject *
2019n/aPyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2020n/a{
2021n/a PyObject *unicode;
2022n/a Py_UCS4 maxchar = 0;
2023n/a Py_ssize_t num_surrogates;
2024n/a
2025n/a if (u == NULL && size != 0) {
2026n/a PyErr_BadInternalCall();
2027n/a return NULL;
2028n/a }
2029n/a
2030n/a if (size == -1) {
2031n/a size = wcslen(u);
2032n/a }
2033n/a
2034n/a /* If the Unicode data is known at construction time, we can apply
2035n/a some optimizations which share commonly used objects. */
2036n/a
2037n/a /* Optimization for empty strings */
2038n/a if (size == 0)
2039n/a _Py_RETURN_UNICODE_EMPTY();
2040n/a
2041n/a /* Single character Unicode objects in the Latin-1 range are
2042n/a shared when using this constructor */
2043n/a if (size == 1 && (Py_UCS4)*u < 256)
2044n/a return get_latin1_char((unsigned char)*u);
2045n/a
2046n/a /* If not empty and not single character, copy the Unicode data
2047n/a into the new object */
2048n/a if (find_maxchar_surrogates(u, u + size,
2049n/a &maxchar, &num_surrogates) == -1)
2050n/a return NULL;
2051n/a
2052n/a unicode = PyUnicode_New(size - num_surrogates, maxchar);
2053n/a if (!unicode)
2054n/a return NULL;
2055n/a
2056n/a switch (PyUnicode_KIND(unicode)) {
2057n/a case PyUnicode_1BYTE_KIND:
2058n/a _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2059n/a u, u + size, PyUnicode_1BYTE_DATA(unicode));
2060n/a break;
2061n/a case PyUnicode_2BYTE_KIND:
2062n/a#if Py_UNICODE_SIZE == 2
2063n/a memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2064n/a#else
2065n/a _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2066n/a u, u + size, PyUnicode_2BYTE_DATA(unicode));
2067n/a#endif
2068n/a break;
2069n/a case PyUnicode_4BYTE_KIND:
2070n/a#if SIZEOF_WCHAR_T == 2
2071n/a /* This is the only case which has to process surrogates, thus
2072n/a a simple copy loop is not enough and we need a function. */
2073n/a unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2074n/a#else
2075n/a assert(num_surrogates == 0);
2076n/a memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2077n/a#endif
2078n/a break;
2079n/a default:
2080n/a assert(0 && "Impossible state");
2081n/a }
2082n/a
2083n/a return unicode_result(unicode);
2084n/a}
2085n/a
2086n/aPyObject *
2087n/aPyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2088n/a{
2089n/a if (size < 0) {
2090n/a PyErr_SetString(PyExc_SystemError,
2091n/a "Negative size passed to PyUnicode_FromStringAndSize");
2092n/a return NULL;
2093n/a }
2094n/a if (u != NULL)
2095n/a return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2096n/a else
2097n/a return (PyObject *)_PyUnicode_New(size);
2098n/a}
2099n/a
2100n/aPyObject *
2101n/aPyUnicode_FromString(const char *u)
2102n/a{
2103n/a size_t size = strlen(u);
2104n/a if (size > PY_SSIZE_T_MAX) {
2105n/a PyErr_SetString(PyExc_OverflowError, "input too long");
2106n/a return NULL;
2107n/a }
2108n/a return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2109n/a}
2110n/a
2111n/aPyObject *
2112n/a_PyUnicode_FromId(_Py_Identifier *id)
2113n/a{
2114n/a if (!id->object) {
2115n/a id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2116n/a strlen(id->string),
2117n/a NULL, NULL);
2118n/a if (!id->object)
2119n/a return NULL;
2120n/a PyUnicode_InternInPlace(&id->object);
2121n/a assert(!id->next);
2122n/a id->next = static_strings;
2123n/a static_strings = id;
2124n/a }
2125n/a return id->object;
2126n/a}
2127n/a
2128n/avoid
2129n/a_PyUnicode_ClearStaticStrings()
2130n/a{
2131n/a _Py_Identifier *tmp, *s = static_strings;
2132n/a while (s) {
2133n/a Py_CLEAR(s->object);
2134n/a tmp = s->next;
2135n/a s->next = NULL;
2136n/a s = tmp;
2137n/a }
2138n/a static_strings = NULL;
2139n/a}
2140n/a
2141n/a/* Internal function, doesn't check maximum character */
2142n/a
2143n/aPyObject*
2144n/a_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2145n/a{
2146n/a const unsigned char *s = (const unsigned char *)buffer;
2147n/a PyObject *unicode;
2148n/a if (size == 1) {
2149n/a#ifdef Py_DEBUG
2150n/a assert((unsigned char)s[0] < 128);
2151n/a#endif
2152n/a return get_latin1_char(s[0]);
2153n/a }
2154n/a unicode = PyUnicode_New(size, 127);
2155n/a if (!unicode)
2156n/a return NULL;
2157n/a memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2158n/a assert(_PyUnicode_CheckConsistency(unicode, 1));
2159n/a return unicode;
2160n/a}
2161n/a
2162n/astatic Py_UCS4
2163n/akind_maxchar_limit(unsigned int kind)
2164n/a{
2165n/a switch (kind) {
2166n/a case PyUnicode_1BYTE_KIND:
2167n/a return 0x80;
2168n/a case PyUnicode_2BYTE_KIND:
2169n/a return 0x100;
2170n/a case PyUnicode_4BYTE_KIND:
2171n/a return 0x10000;
2172n/a default:
2173n/a assert(0 && "invalid kind");
2174n/a return MAX_UNICODE;
2175n/a }
2176n/a}
2177n/a
2178n/astatic inline Py_UCS4
2179n/aalign_maxchar(Py_UCS4 maxchar)
2180n/a{
2181n/a if (maxchar <= 127)
2182n/a return 127;
2183n/a else if (maxchar <= 255)
2184n/a return 255;
2185n/a else if (maxchar <= 65535)
2186n/a return 65535;
2187n/a else
2188n/a return MAX_UNICODE;
2189n/a}
2190n/a
2191n/astatic PyObject*
2192n/a_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2193n/a{
2194n/a PyObject *res;
2195n/a unsigned char max_char;
2196n/a
2197n/a if (size == 0)
2198n/a _Py_RETURN_UNICODE_EMPTY();
2199n/a assert(size > 0);
2200n/a if (size == 1)
2201n/a return get_latin1_char(u[0]);
2202n/a
2203n/a max_char = ucs1lib_find_max_char(u, u + size);
2204n/a res = PyUnicode_New(size, max_char);
2205n/a if (!res)
2206n/a return NULL;
2207n/a memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2208n/a assert(_PyUnicode_CheckConsistency(res, 1));
2209n/a return res;
2210n/a}
2211n/a
2212n/astatic PyObject*
2213n/a_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2214n/a{
2215n/a PyObject *res;
2216n/a Py_UCS2 max_char;
2217n/a
2218n/a if (size == 0)
2219n/a _Py_RETURN_UNICODE_EMPTY();
2220n/a assert(size > 0);
2221n/a if (size == 1)
2222n/a return unicode_char(u[0]);
2223n/a
2224n/a max_char = ucs2lib_find_max_char(u, u + size);
2225n/a res = PyUnicode_New(size, max_char);
2226n/a if (!res)
2227n/a return NULL;
2228n/a if (max_char >= 256)
2229n/a memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2230n/a else {
2231n/a _PyUnicode_CONVERT_BYTES(
2232n/a Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2233n/a }
2234n/a assert(_PyUnicode_CheckConsistency(res, 1));
2235n/a return res;
2236n/a}
2237n/a
2238n/astatic PyObject*
2239n/a_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2240n/a{
2241n/a PyObject *res;
2242n/a Py_UCS4 max_char;
2243n/a
2244n/a if (size == 0)
2245n/a _Py_RETURN_UNICODE_EMPTY();
2246n/a assert(size > 0);
2247n/a if (size == 1)
2248n/a return unicode_char(u[0]);
2249n/a
2250n/a max_char = ucs4lib_find_max_char(u, u + size);
2251n/a res = PyUnicode_New(size, max_char);
2252n/a if (!res)
2253n/a return NULL;
2254n/a if (max_char < 256)
2255n/a _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2256n/a PyUnicode_1BYTE_DATA(res));
2257n/a else if (max_char < 0x10000)
2258n/a _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2259n/a PyUnicode_2BYTE_DATA(res));
2260n/a else
2261n/a memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2262n/a assert(_PyUnicode_CheckConsistency(res, 1));
2263n/a return res;
2264n/a}
2265n/a
2266n/aPyObject*
2267n/aPyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2268n/a{
2269n/a if (size < 0) {
2270n/a PyErr_SetString(PyExc_ValueError, "size must be positive");
2271n/a return NULL;
2272n/a }
2273n/a switch (kind) {
2274n/a case PyUnicode_1BYTE_KIND:
2275n/a return _PyUnicode_FromUCS1(buffer, size);
2276n/a case PyUnicode_2BYTE_KIND:
2277n/a return _PyUnicode_FromUCS2(buffer, size);
2278n/a case PyUnicode_4BYTE_KIND:
2279n/a return _PyUnicode_FromUCS4(buffer, size);
2280n/a default:
2281n/a PyErr_SetString(PyExc_SystemError, "invalid kind");
2282n/a return NULL;
2283n/a }
2284n/a}
2285n/a
2286n/aPy_UCS4
2287n/a_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2288n/a{
2289n/a enum PyUnicode_Kind kind;
2290n/a void *startptr, *endptr;
2291n/a
2292n/a assert(PyUnicode_IS_READY(unicode));
2293n/a assert(0 <= start);
2294n/a assert(end <= PyUnicode_GET_LENGTH(unicode));
2295n/a assert(start <= end);
2296n/a
2297n/a if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2298n/a return PyUnicode_MAX_CHAR_VALUE(unicode);
2299n/a
2300n/a if (start == end)
2301n/a return 127;
2302n/a
2303n/a if (PyUnicode_IS_ASCII(unicode))
2304n/a return 127;
2305n/a
2306n/a kind = PyUnicode_KIND(unicode);
2307n/a startptr = PyUnicode_DATA(unicode);
2308n/a endptr = (char *)startptr + end * kind;
2309n/a startptr = (char *)startptr + start * kind;
2310n/a switch(kind) {
2311n/a case PyUnicode_1BYTE_KIND:
2312n/a return ucs1lib_find_max_char(startptr, endptr);
2313n/a case PyUnicode_2BYTE_KIND:
2314n/a return ucs2lib_find_max_char(startptr, endptr);
2315n/a case PyUnicode_4BYTE_KIND:
2316n/a return ucs4lib_find_max_char(startptr, endptr);
2317n/a default:
2318n/a assert(0);
2319n/a return 0;
2320n/a }
2321n/a}
2322n/a
2323n/a/* Ensure that a string uses the most efficient storage, if it is not the
2324n/a case: create a new string with of the right kind. Write NULL into *p_unicode
2325n/a on error. */
2326n/astatic void
2327n/aunicode_adjust_maxchar(PyObject **p_unicode)
2328n/a{
2329n/a PyObject *unicode, *copy;
2330n/a Py_UCS4 max_char;
2331n/a Py_ssize_t len;
2332n/a unsigned int kind;
2333n/a
2334n/a assert(p_unicode != NULL);
2335n/a unicode = *p_unicode;
2336n/a assert(PyUnicode_IS_READY(unicode));
2337n/a if (PyUnicode_IS_ASCII(unicode))
2338n/a return;
2339n/a
2340n/a len = PyUnicode_GET_LENGTH(unicode);
2341n/a kind = PyUnicode_KIND(unicode);
2342n/a if (kind == PyUnicode_1BYTE_KIND) {
2343n/a const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2344n/a max_char = ucs1lib_find_max_char(u, u + len);
2345n/a if (max_char >= 128)
2346n/a return;
2347n/a }
2348n/a else if (kind == PyUnicode_2BYTE_KIND) {
2349n/a const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2350n/a max_char = ucs2lib_find_max_char(u, u + len);
2351n/a if (max_char >= 256)
2352n/a return;
2353n/a }
2354n/a else {
2355n/a const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2356n/a assert(kind == PyUnicode_4BYTE_KIND);
2357n/a max_char = ucs4lib_find_max_char(u, u + len);
2358n/a if (max_char >= 0x10000)
2359n/a return;
2360n/a }
2361n/a copy = PyUnicode_New(len, max_char);
2362n/a if (copy != NULL)
2363n/a _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2364n/a Py_DECREF(unicode);
2365n/a *p_unicode = copy;
2366n/a}
2367n/a
2368n/aPyObject*
2369n/a_PyUnicode_Copy(PyObject *unicode)
2370n/a{
2371n/a Py_ssize_t length;
2372n/a PyObject *copy;
2373n/a
2374n/a if (!PyUnicode_Check(unicode)) {
2375n/a PyErr_BadInternalCall();
2376n/a return NULL;
2377n/a }
2378n/a if (PyUnicode_READY(unicode) == -1)
2379n/a return NULL;
2380n/a
2381n/a length = PyUnicode_GET_LENGTH(unicode);
2382n/a copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2383n/a if (!copy)
2384n/a return NULL;
2385n/a assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2386n/a
2387n/a memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2388n/a length * PyUnicode_KIND(unicode));
2389n/a assert(_PyUnicode_CheckConsistency(copy, 1));
2390n/a return copy;
2391n/a}
2392n/a
2393n/a
2394n/a/* Widen Unicode objects to larger buffers. Don't write terminating null
2395n/a character. Return NULL on error. */
2396n/a
2397n/avoid*
2398n/a_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2399n/a{
2400n/a Py_ssize_t len;
2401n/a void *result;
2402n/a unsigned int skind;
2403n/a
2404n/a if (PyUnicode_READY(s) == -1)
2405n/a return NULL;
2406n/a
2407n/a len = PyUnicode_GET_LENGTH(s);
2408n/a skind = PyUnicode_KIND(s);
2409n/a if (skind >= kind) {
2410n/a PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2411n/a return NULL;
2412n/a }
2413n/a switch (kind) {
2414n/a case PyUnicode_2BYTE_KIND:
2415n/a result = PyMem_New(Py_UCS2, len);
2416n/a if (!result)
2417n/a return PyErr_NoMemory();
2418n/a assert(skind == PyUnicode_1BYTE_KIND);
2419n/a _PyUnicode_CONVERT_BYTES(
2420n/a Py_UCS1, Py_UCS2,
2421n/a PyUnicode_1BYTE_DATA(s),
2422n/a PyUnicode_1BYTE_DATA(s) + len,
2423n/a result);
2424n/a return result;
2425n/a case PyUnicode_4BYTE_KIND:
2426n/a result = PyMem_New(Py_UCS4, len);
2427n/a if (!result)
2428n/a return PyErr_NoMemory();
2429n/a if (skind == PyUnicode_2BYTE_KIND) {
2430n/a _PyUnicode_CONVERT_BYTES(
2431n/a Py_UCS2, Py_UCS4,
2432n/a PyUnicode_2BYTE_DATA(s),
2433n/a PyUnicode_2BYTE_DATA(s) + len,
2434n/a result);
2435n/a }
2436n/a else {
2437n/a assert(skind == PyUnicode_1BYTE_KIND);
2438n/a _PyUnicode_CONVERT_BYTES(
2439n/a Py_UCS1, Py_UCS4,
2440n/a PyUnicode_1BYTE_DATA(s),
2441n/a PyUnicode_1BYTE_DATA(s) + len,
2442n/a result);
2443n/a }
2444n/a return result;
2445n/a default:
2446n/a break;
2447n/a }
2448n/a PyErr_SetString(PyExc_SystemError, "invalid kind");
2449n/a return NULL;
2450n/a}
2451n/a
2452n/astatic Py_UCS4*
2453n/aas_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2454n/a int copy_null)
2455n/a{
2456n/a int kind;
2457n/a void *data;
2458n/a Py_ssize_t len, targetlen;
2459n/a if (PyUnicode_READY(string) == -1)
2460n/a return NULL;
2461n/a kind = PyUnicode_KIND(string);
2462n/a data = PyUnicode_DATA(string);
2463n/a len = PyUnicode_GET_LENGTH(string);
2464n/a targetlen = len;
2465n/a if (copy_null)
2466n/a targetlen++;
2467n/a if (!target) {
2468n/a target = PyMem_New(Py_UCS4, targetlen);
2469n/a if (!target) {
2470n/a PyErr_NoMemory();
2471n/a return NULL;
2472n/a }
2473n/a }
2474n/a else {
2475n/a if (targetsize < targetlen) {
2476n/a PyErr_Format(PyExc_SystemError,
2477n/a "string is longer than the buffer");
2478n/a if (copy_null && 0 < targetsize)
2479n/a target[0] = 0;
2480n/a return NULL;
2481n/a }
2482n/a }
2483n/a if (kind == PyUnicode_1BYTE_KIND) {
2484n/a Py_UCS1 *start = (Py_UCS1 *) data;
2485n/a _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2486n/a }
2487n/a else if (kind == PyUnicode_2BYTE_KIND) {
2488n/a Py_UCS2 *start = (Py_UCS2 *) data;
2489n/a _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2490n/a }
2491n/a else {
2492n/a assert(kind == PyUnicode_4BYTE_KIND);
2493n/a memcpy(target, data, len * sizeof(Py_UCS4));
2494n/a }
2495n/a if (copy_null)
2496n/a target[len] = 0;
2497n/a return target;
2498n/a}
2499n/a
2500n/aPy_UCS4*
2501n/aPyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2502n/a int copy_null)
2503n/a{
2504n/a if (target == NULL || targetsize < 0) {
2505n/a PyErr_BadInternalCall();
2506n/a return NULL;
2507n/a }
2508n/a return as_ucs4(string, target, targetsize, copy_null);
2509n/a}
2510n/a
2511n/aPy_UCS4*
2512n/aPyUnicode_AsUCS4Copy(PyObject *string)
2513n/a{
2514n/a return as_ucs4(string, NULL, 0, 1);
2515n/a}
2516n/a
2517n/a/* maximum number of characters required for output of %lld or %p.
2518n/a We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2519n/a plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2520n/a#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2521n/a
2522n/astatic int
2523n/aunicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2524n/a Py_ssize_t width, Py_ssize_t precision)
2525n/a{
2526n/a Py_ssize_t length, fill, arglen;
2527n/a Py_UCS4 maxchar;
2528n/a
2529n/a if (PyUnicode_READY(str) == -1)
2530n/a return -1;
2531n/a
2532n/a length = PyUnicode_GET_LENGTH(str);
2533n/a if ((precision == -1 || precision >= length)
2534n/a && width <= length)
2535n/a return _PyUnicodeWriter_WriteStr(writer, str);
2536n/a
2537n/a if (precision != -1)
2538n/a length = Py_MIN(precision, length);
2539n/a
2540n/a arglen = Py_MAX(length, width);
2541n/a if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2542n/a maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2543n/a else
2544n/a maxchar = writer->maxchar;
2545n/a
2546n/a if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2547n/a return -1;
2548n/a
2549n/a if (width > length) {
2550n/a fill = width - length;
2551n/a if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2552n/a return -1;
2553n/a writer->pos += fill;
2554n/a }
2555n/a
2556n/a _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2557n/a str, 0, length);
2558n/a writer->pos += length;
2559n/a return 0;
2560n/a}
2561n/a
2562n/astatic int
2563n/aunicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2564n/a Py_ssize_t width, Py_ssize_t precision)
2565n/a{
2566n/a /* UTF-8 */
2567n/a Py_ssize_t length;
2568n/a PyObject *unicode;
2569n/a int res;
2570n/a
2571n/a length = strlen(str);
2572n/a if (precision != -1)
2573n/a length = Py_MIN(length, precision);
2574n/a unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2575n/a if (unicode == NULL)
2576n/a return -1;
2577n/a
2578n/a res = unicode_fromformat_write_str(writer, unicode, width, -1);
2579n/a Py_DECREF(unicode);
2580n/a return res;
2581n/a}
2582n/a
2583n/astatic const char*
2584n/aunicode_fromformat_arg(_PyUnicodeWriter *writer,
2585n/a const char *f, va_list *vargs)
2586n/a{
2587n/a const char *p;
2588n/a Py_ssize_t len;
2589n/a int zeropad;
2590n/a Py_ssize_t width;
2591n/a Py_ssize_t precision;
2592n/a int longflag;
2593n/a int longlongflag;
2594n/a int size_tflag;
2595n/a Py_ssize_t fill;
2596n/a
2597n/a p = f;
2598n/a f++;
2599n/a zeropad = 0;
2600n/a if (*f == '0') {
2601n/a zeropad = 1;
2602n/a f++;
2603n/a }
2604n/a
2605n/a /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2606n/a width = -1;
2607n/a if (Py_ISDIGIT((unsigned)*f)) {
2608n/a width = *f - '0';
2609n/a f++;
2610n/a while (Py_ISDIGIT((unsigned)*f)) {
2611n/a if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2612n/a PyErr_SetString(PyExc_ValueError,
2613n/a "width too big");
2614n/a return NULL;
2615n/a }
2616n/a width = (width * 10) + (*f - '0');
2617n/a f++;
2618n/a }
2619n/a }
2620n/a precision = -1;
2621n/a if (*f == '.') {
2622n/a f++;
2623n/a if (Py_ISDIGIT((unsigned)*f)) {
2624n/a precision = (*f - '0');
2625n/a f++;
2626n/a while (Py_ISDIGIT((unsigned)*f)) {
2627n/a if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2628n/a PyErr_SetString(PyExc_ValueError,
2629n/a "precision too big");
2630n/a return NULL;
2631n/a }
2632n/a precision = (precision * 10) + (*f - '0');
2633n/a f++;
2634n/a }
2635n/a }
2636n/a if (*f == '%') {
2637n/a /* "%.3%s" => f points to "3" */
2638n/a f--;
2639n/a }
2640n/a }
2641n/a if (*f == '\0') {
2642n/a /* bogus format "%.123" => go backward, f points to "3" */
2643n/a f--;
2644n/a }
2645n/a
2646n/a /* Handle %ld, %lu, %lld and %llu. */
2647n/a longflag = 0;
2648n/a longlongflag = 0;
2649n/a size_tflag = 0;
2650n/a if (*f == 'l') {
2651n/a if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2652n/a longflag = 1;
2653n/a ++f;
2654n/a }
2655n/a else if (f[1] == 'l' &&
2656n/a (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2657n/a longlongflag = 1;
2658n/a f += 2;
2659n/a }
2660n/a }
2661n/a /* handle the size_t flag. */
2662n/a else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2663n/a size_tflag = 1;
2664n/a ++f;
2665n/a }
2666n/a
2667n/a if (f[1] == '\0')
2668n/a writer->overallocate = 0;
2669n/a
2670n/a switch (*f) {
2671n/a case 'c':
2672n/a {
2673n/a int ordinal = va_arg(*vargs, int);
2674n/a if (ordinal < 0 || ordinal > MAX_UNICODE) {
2675n/a PyErr_SetString(PyExc_OverflowError,
2676n/a "character argument not in range(0x110000)");
2677n/a return NULL;
2678n/a }
2679n/a if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2680n/a return NULL;
2681n/a break;
2682n/a }
2683n/a
2684n/a case 'i':
2685n/a case 'd':
2686n/a case 'u':
2687n/a case 'x':
2688n/a {
2689n/a /* used by sprintf */
2690n/a char buffer[MAX_LONG_LONG_CHARS];
2691n/a Py_ssize_t arglen;
2692n/a
2693n/a if (*f == 'u') {
2694n/a if (longflag)
2695n/a len = sprintf(buffer, "%lu",
2696n/a va_arg(*vargs, unsigned long));
2697n/a else if (longlongflag)
2698n/a len = sprintf(buffer, "%llu",
2699n/a va_arg(*vargs, unsigned long long));
2700n/a else if (size_tflag)
2701n/a len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
2702n/a va_arg(*vargs, size_t));
2703n/a else
2704n/a len = sprintf(buffer, "%u",
2705n/a va_arg(*vargs, unsigned int));
2706n/a }
2707n/a else if (*f == 'x') {
2708n/a len = sprintf(buffer, "%x", va_arg(*vargs, int));
2709n/a }
2710n/a else {
2711n/a if (longflag)
2712n/a len = sprintf(buffer, "%li",
2713n/a va_arg(*vargs, long));
2714n/a else if (longlongflag)
2715n/a len = sprintf(buffer, "%lli",
2716n/a va_arg(*vargs, long long));
2717n/a else if (size_tflag)
2718n/a len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
2719n/a va_arg(*vargs, Py_ssize_t));
2720n/a else
2721n/a len = sprintf(buffer, "%i",
2722n/a va_arg(*vargs, int));
2723n/a }
2724n/a assert(len >= 0);
2725n/a
2726n/a if (precision < len)
2727n/a precision = len;
2728n/a
2729n/a arglen = Py_MAX(precision, width);
2730n/a if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2731n/a return NULL;
2732n/a
2733n/a if (width > precision) {
2734n/a Py_UCS4 fillchar;
2735n/a fill = width - precision;
2736n/a fillchar = zeropad?'0':' ';
2737n/a if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2738n/a return NULL;
2739n/a writer->pos += fill;
2740n/a }
2741n/a if (precision > len) {
2742n/a fill = precision - len;
2743n/a if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2744n/a return NULL;
2745n/a writer->pos += fill;
2746n/a }
2747n/a
2748n/a if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2749n/a return NULL;
2750n/a break;
2751n/a }
2752n/a
2753n/a case 'p':
2754n/a {
2755n/a char number[MAX_LONG_LONG_CHARS];
2756n/a
2757n/a len = sprintf(number, "%p", va_arg(*vargs, void*));
2758n/a assert(len >= 0);
2759n/a
2760n/a /* %p is ill-defined: ensure leading 0x. */
2761n/a if (number[1] == 'X')
2762n/a number[1] = 'x';
2763n/a else if (number[1] != 'x') {
2764n/a memmove(number + 2, number,
2765n/a strlen(number) + 1);
2766n/a number[0] = '0';
2767n/a number[1] = 'x';
2768n/a len += 2;
2769n/a }
2770n/a
2771n/a if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2772n/a return NULL;
2773n/a break;
2774n/a }
2775n/a
2776n/a case 's':
2777n/a {
2778n/a /* UTF-8 */
2779n/a const char *s = va_arg(*vargs, const char*);
2780n/a if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
2781n/a return NULL;
2782n/a break;
2783n/a }
2784n/a
2785n/a case 'U':
2786n/a {
2787n/a PyObject *obj = va_arg(*vargs, PyObject *);
2788n/a assert(obj && _PyUnicode_CHECK(obj));
2789n/a
2790n/a if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2791n/a return NULL;
2792n/a break;
2793n/a }
2794n/a
2795n/a case 'V':
2796n/a {
2797n/a PyObject *obj = va_arg(*vargs, PyObject *);
2798n/a const char *str = va_arg(*vargs, const char *);
2799n/a if (obj) {
2800n/a assert(_PyUnicode_CHECK(obj));
2801n/a if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2802n/a return NULL;
2803n/a }
2804n/a else {
2805n/a assert(str != NULL);
2806n/a if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
2807n/a return NULL;
2808n/a }
2809n/a break;
2810n/a }
2811n/a
2812n/a case 'S':
2813n/a {
2814n/a PyObject *obj = va_arg(*vargs, PyObject *);
2815n/a PyObject *str;
2816n/a assert(obj);
2817n/a str = PyObject_Str(obj);
2818n/a if (!str)
2819n/a return NULL;
2820n/a if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
2821n/a Py_DECREF(str);
2822n/a return NULL;
2823n/a }
2824n/a Py_DECREF(str);
2825n/a break;
2826n/a }
2827n/a
2828n/a case 'R':
2829n/a {
2830n/a PyObject *obj = va_arg(*vargs, PyObject *);
2831n/a PyObject *repr;
2832n/a assert(obj);
2833n/a repr = PyObject_Repr(obj);
2834n/a if (!repr)
2835n/a return NULL;
2836n/a if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
2837n/a Py_DECREF(repr);
2838n/a return NULL;
2839n/a }
2840n/a Py_DECREF(repr);
2841n/a break;
2842n/a }
2843n/a
2844n/a case 'A':
2845n/a {
2846n/a PyObject *obj = va_arg(*vargs, PyObject *);
2847n/a PyObject *ascii;
2848n/a assert(obj);
2849n/a ascii = PyObject_ASCII(obj);
2850n/a if (!ascii)
2851n/a return NULL;
2852n/a if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
2853n/a Py_DECREF(ascii);
2854n/a return NULL;
2855n/a }
2856n/a Py_DECREF(ascii);
2857n/a break;
2858n/a }
2859n/a
2860n/a case '%':
2861n/a if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2862n/a return NULL;
2863n/a break;
2864n/a
2865n/a default:
2866n/a /* if we stumble upon an unknown formatting code, copy the rest
2867n/a of the format string to the output string. (we cannot just
2868n/a skip the code, since there's no way to know what's in the
2869n/a argument list) */
2870n/a len = strlen(p);
2871n/a if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
2872n/a return NULL;
2873n/a f = p+len;
2874n/a return f;
2875n/a }
2876n/a
2877n/a f++;
2878n/a return f;
2879n/a}
2880n/a
2881n/aPyObject *
2882n/aPyUnicode_FromFormatV(const char *format, va_list vargs)
2883n/a{
2884n/a va_list vargs2;
2885n/a const char *f;
2886n/a _PyUnicodeWriter writer;
2887n/a
2888n/a _PyUnicodeWriter_Init(&writer);
2889n/a writer.min_length = strlen(format) + 100;
2890n/a writer.overallocate = 1;
2891n/a
2892n/a // Copy varags to be able to pass a reference to a subfunction.
2893n/a va_copy(vargs2, vargs);
2894n/a
2895n/a for (f = format; *f; ) {
2896n/a if (*f == '%') {
2897n/a f = unicode_fromformat_arg(&writer, f, &vargs2);
2898n/a if (f == NULL)
2899n/a goto fail;
2900n/a }
2901n/a else {
2902n/a const char *p;
2903n/a Py_ssize_t len;
2904n/a
2905n/a p = f;
2906n/a do
2907n/a {
2908n/a if ((unsigned char)*p > 127) {
2909n/a PyErr_Format(PyExc_ValueError,
2910n/a "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2911n/a "string, got a non-ASCII byte: 0x%02x",
2912n/a (unsigned char)*p);
2913n/a goto fail;
2914n/a }
2915n/a p++;
2916n/a }
2917n/a while (*p != '\0' && *p != '%');
2918n/a len = p - f;
2919n/a
2920n/a if (*p == '\0')
2921n/a writer.overallocate = 0;
2922n/a
2923n/a if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2924n/a goto fail;
2925n/a
2926n/a f = p;
2927n/a }
2928n/a }
2929n/a va_end(vargs2);
2930n/a return _PyUnicodeWriter_Finish(&writer);
2931n/a
2932n/a fail:
2933n/a va_end(vargs2);
2934n/a _PyUnicodeWriter_Dealloc(&writer);
2935n/a return NULL;
2936n/a}
2937n/a
2938n/aPyObject *
2939n/aPyUnicode_FromFormat(const char *format, ...)
2940n/a{
2941n/a PyObject* ret;
2942n/a va_list vargs;
2943n/a
2944n/a#ifdef HAVE_STDARG_PROTOTYPES
2945n/a va_start(vargs, format);
2946n/a#else
2947n/a va_start(vargs);
2948n/a#endif
2949n/a ret = PyUnicode_FromFormatV(format, vargs);
2950n/a va_end(vargs);
2951n/a return ret;
2952n/a}
2953n/a
2954n/a#ifdef HAVE_WCHAR_H
2955n/a
2956n/a/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2957n/a convert a Unicode object to a wide character string.
2958n/a
2959n/a - If w is NULL: return the number of wide characters (including the null
2960n/a character) required to convert the unicode object. Ignore size argument.
2961n/a
2962n/a - Otherwise: return the number of wide characters (excluding the null
2963n/a character) written into w. Write at most size wide characters (including
2964n/a the null character). */
2965n/astatic Py_ssize_t
2966n/aunicode_aswidechar(PyObject *unicode,
2967n/a wchar_t *w,
2968n/a Py_ssize_t size)
2969n/a{
2970n/a Py_ssize_t res;
2971n/a const wchar_t *wstr;
2972n/a
2973n/a wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2974n/a if (wstr == NULL)
2975n/a return -1;
2976n/a
2977n/a if (w != NULL) {
2978n/a if (size > res)
2979n/a size = res + 1;
2980n/a else
2981n/a res = size;
2982n/a memcpy(w, wstr, size * sizeof(wchar_t));
2983n/a return res;
2984n/a }
2985n/a else
2986n/a return res + 1;
2987n/a}
2988n/a
2989n/aPy_ssize_t
2990n/aPyUnicode_AsWideChar(PyObject *unicode,
2991n/a wchar_t *w,
2992n/a Py_ssize_t size)
2993n/a{
2994n/a if (unicode == NULL) {
2995n/a PyErr_BadInternalCall();
2996n/a return -1;
2997n/a }
2998n/a return unicode_aswidechar(unicode, w, size);
2999n/a}
3000n/a
3001n/awchar_t*
3002n/aPyUnicode_AsWideCharString(PyObject *unicode,
3003n/a Py_ssize_t *size)
3004n/a{
3005n/a wchar_t* buffer;
3006n/a Py_ssize_t buflen;
3007n/a
3008n/a if (unicode == NULL) {
3009n/a PyErr_BadInternalCall();
3010n/a return NULL;
3011n/a }
3012n/a
3013n/a buflen = unicode_aswidechar(unicode, NULL, 0);
3014n/a if (buflen == -1)
3015n/a return NULL;
3016n/a buffer = PyMem_NEW(wchar_t, buflen);
3017n/a if (buffer == NULL) {
3018n/a PyErr_NoMemory();
3019n/a return NULL;
3020n/a }
3021n/a buflen = unicode_aswidechar(unicode, buffer, buflen);
3022n/a if (buflen == -1) {
3023n/a PyMem_FREE(buffer);
3024n/a return NULL;
3025n/a }
3026n/a if (size != NULL)
3027n/a *size = buflen;
3028n/a return buffer;
3029n/a}
3030n/a
3031n/a#endif /* HAVE_WCHAR_H */
3032n/a
3033n/aPyObject *
3034n/aPyUnicode_FromOrdinal(int ordinal)
3035n/a{
3036n/a if (ordinal < 0 || ordinal > MAX_UNICODE) {
3037n/a PyErr_SetString(PyExc_ValueError,
3038n/a "chr() arg not in range(0x110000)");
3039n/a return NULL;
3040n/a }
3041n/a
3042n/a return unicode_char((Py_UCS4)ordinal);
3043n/a}
3044n/a
3045n/aPyObject *
3046n/aPyUnicode_FromObject(PyObject *obj)
3047n/a{
3048n/a /* XXX Perhaps we should make this API an alias of
3049n/a PyObject_Str() instead ?! */
3050n/a if (PyUnicode_CheckExact(obj)) {
3051n/a if (PyUnicode_READY(obj) == -1)
3052n/a return NULL;
3053n/a Py_INCREF(obj);
3054n/a return obj;
3055n/a }
3056n/a if (PyUnicode_Check(obj)) {
3057n/a /* For a Unicode subtype that's not a Unicode object,
3058n/a return a true Unicode object with the same data. */
3059n/a return _PyUnicode_Copy(obj);
3060n/a }
3061n/a PyErr_Format(PyExc_TypeError,
3062n/a "Can't convert '%.100s' object to str implicitly",
3063n/a Py_TYPE(obj)->tp_name);
3064n/a return NULL;
3065n/a}
3066n/a
3067n/aPyObject *
3068n/aPyUnicode_FromEncodedObject(PyObject *obj,
3069n/a const char *encoding,
3070n/a const char *errors)
3071n/a{
3072n/a Py_buffer buffer;
3073n/a PyObject *v;
3074n/a
3075n/a if (obj == NULL) {
3076n/a PyErr_BadInternalCall();
3077n/a return NULL;
3078n/a }
3079n/a
3080n/a /* Decoding bytes objects is the most common case and should be fast */
3081n/a if (PyBytes_Check(obj)) {
3082n/a if (PyBytes_GET_SIZE(obj) == 0)
3083n/a _Py_RETURN_UNICODE_EMPTY();
3084n/a v = PyUnicode_Decode(
3085n/a PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3086n/a encoding, errors);
3087n/a return v;
3088n/a }
3089n/a
3090n/a if (PyUnicode_Check(obj)) {
3091n/a PyErr_SetString(PyExc_TypeError,
3092n/a "decoding str is not supported");
3093n/a return NULL;
3094n/a }
3095n/a
3096n/a /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3097n/a if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3098n/a PyErr_Format(PyExc_TypeError,
3099n/a "decoding to str: need a bytes-like object, %.80s found",
3100n/a Py_TYPE(obj)->tp_name);
3101n/a return NULL;
3102n/a }
3103n/a
3104n/a if (buffer.len == 0) {
3105n/a PyBuffer_Release(&buffer);
3106n/a _Py_RETURN_UNICODE_EMPTY();
3107n/a }
3108n/a
3109n/a v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3110n/a PyBuffer_Release(&buffer);
3111n/a return v;
3112n/a}
3113n/a
/* Normalize an encoding name into `lower` (a buffer of `lower_len` bytes).

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters is collapsed into a single '_' which is emitted only
   between two copied characters (never at the start, never at the end).
   This is similar to encodings.normalize_encoding(), plus lowercasing.

   Return 1 on success, or 0 if the normalized name does not fit in
   lower_len-1 characters plus the terminating NUL. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *in;
    char *out = lower;
    char *const out_last = &lower[lower_len - 1];  /* slot reserved for NUL */
    int pending_sep = 0;

    assert(encoding != NULL);

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; emit it lazily before the next
               regular character. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_last) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_last) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3162n/a
3163n/aPyObject *
3164n/aPyUnicode_Decode(const char *s,
3165n/a Py_ssize_t size,
3166n/a const char *encoding,
3167n/a const char *errors)
3168n/a{
3169n/a PyObject *buffer = NULL, *unicode;
3170n/a Py_buffer info;
3171n/a char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3172n/a
3173n/a if (encoding == NULL) {
3174n/a return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3175n/a }
3176n/a
3177n/a /* Shortcuts for common default encodings */
3178n/a if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3179n/a char *lower = buflower;
3180n/a
3181n/a /* Fast paths */
3182n/a if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3183n/a lower += 3;
3184n/a if (*lower == '_') {
3185n/a /* Match "utf8" and "utf_8" */
3186n/a lower++;
3187n/a }
3188n/a
3189n/a if (lower[0] == '8' && lower[1] == 0) {
3190n/a return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3191n/a }
3192n/a else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3193n/a return PyUnicode_DecodeUTF16(s, size, errors, 0);
3194n/a }
3195n/a else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3196n/a return PyUnicode_DecodeUTF32(s, size, errors, 0);
3197n/a }
3198n/a }
3199n/a else {
3200n/a if (strcmp(lower, "ascii") == 0
3201n/a || strcmp(lower, "us_ascii") == 0) {
3202n/a return PyUnicode_DecodeASCII(s, size, errors);
3203n/a }
3204n/a #ifdef MS_WINDOWS
3205n/a else if (strcmp(lower, "mbcs") == 0) {
3206n/a return PyUnicode_DecodeMBCS(s, size, errors);
3207n/a }
3208n/a #endif
3209n/a else if (strcmp(lower, "latin1") == 0
3210n/a || strcmp(lower, "latin_1") == 0
3211n/a || strcmp(lower, "iso_8859_1") == 0
3212n/a || strcmp(lower, "iso8859_1") == 0) {
3213n/a return PyUnicode_DecodeLatin1(s, size, errors);
3214n/a }
3215n/a }
3216n/a }
3217n/a
3218n/a /* Decode via the codec registry */
3219n/a buffer = NULL;
3220n/a if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3221n/a goto onError;
3222n/a buffer = PyMemoryView_FromBuffer(&info);
3223n/a if (buffer == NULL)
3224n/a goto onError;
3225n/a unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3226n/a if (unicode == NULL)
3227n/a goto onError;
3228n/a if (!PyUnicode_Check(unicode)) {
3229n/a PyErr_Format(PyExc_TypeError,
3230n/a "'%.400s' decoder returned '%.400s' instead of 'str'; "
3231n/a "use codecs.decode() to decode to arbitrary types",
3232n/a encoding,
3233n/a Py_TYPE(unicode)->tp_name);
3234n/a Py_DECREF(unicode);
3235n/a goto onError;
3236n/a }
3237n/a Py_DECREF(buffer);
3238n/a return unicode_result(unicode);
3239n/a
3240n/a onError:
3241n/a Py_XDECREF(buffer);
3242n/a return NULL;
3243n/a}
3244n/a
3245n/aPyObject *
3246n/aPyUnicode_AsDecodedObject(PyObject *unicode,
3247n/a const char *encoding,
3248n/a const char *errors)
3249n/a{
3250n/a if (!PyUnicode_Check(unicode)) {
3251n/a PyErr_BadArgument();
3252n/a return NULL;
3253n/a }
3254n/a
3255n/a if (PyErr_WarnEx(PyExc_DeprecationWarning,
3256n/a "PyUnicode_AsDecodedObject() is deprecated; "
3257n/a "use PyCodec_Decode() to decode from str", 1) < 0)
3258n/a return NULL;
3259n/a
3260n/a if (encoding == NULL)
3261n/a encoding = PyUnicode_GetDefaultEncoding();
3262n/a
3263n/a /* Decode via the codec registry */
3264n/a return PyCodec_Decode(unicode, encoding, errors);
3265n/a}
3266n/a
3267n/aPyObject *
3268n/aPyUnicode_AsDecodedUnicode(PyObject *unicode,
3269n/a const char *encoding,
3270n/a const char *errors)
3271n/a{
3272n/a PyObject *v;
3273n/a
3274n/a if (!PyUnicode_Check(unicode)) {
3275n/a PyErr_BadArgument();
3276n/a goto onError;
3277n/a }
3278n/a
3279n/a if (PyErr_WarnEx(PyExc_DeprecationWarning,
3280n/a "PyUnicode_AsDecodedUnicode() is deprecated; "
3281n/a "use PyCodec_Decode() to decode from str to str", 1) < 0)
3282n/a return NULL;
3283n/a
3284n/a if (encoding == NULL)
3285n/a encoding = PyUnicode_GetDefaultEncoding();
3286n/a
3287n/a /* Decode via the codec registry */
3288n/a v = PyCodec_Decode(unicode, encoding, errors);
3289n/a if (v == NULL)
3290n/a goto onError;
3291n/a if (!PyUnicode_Check(v)) {
3292n/a PyErr_Format(PyExc_TypeError,
3293n/a "'%.400s' decoder returned '%.400s' instead of 'str'; "
3294n/a "use codecs.decode() to decode to arbitrary types",
3295n/a encoding,
3296n/a Py_TYPE(unicode)->tp_name);
3297n/a Py_DECREF(v);
3298n/a goto onError;
3299n/a }
3300n/a return unicode_result(v);
3301n/a
3302n/a onError:
3303n/a return NULL;
3304n/a}
3305n/a
3306n/aPyObject *
3307n/aPyUnicode_Encode(const Py_UNICODE *s,
3308n/a Py_ssize_t size,
3309n/a const char *encoding,
3310n/a const char *errors)
3311n/a{
3312n/a PyObject *v, *unicode;
3313n/a
3314n/a unicode = PyUnicode_FromWideChar(s, size);
3315n/a if (unicode == NULL)
3316n/a return NULL;
3317n/a v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3318n/a Py_DECREF(unicode);
3319n/a return v;
3320n/a}
3321n/a
3322n/aPyObject *
3323n/aPyUnicode_AsEncodedObject(PyObject *unicode,
3324n/a const char *encoding,
3325n/a const char *errors)
3326n/a{
3327n/a PyObject *v;
3328n/a
3329n/a if (!PyUnicode_Check(unicode)) {
3330n/a PyErr_BadArgument();
3331n/a goto onError;
3332n/a }
3333n/a
3334n/a if (PyErr_WarnEx(PyExc_DeprecationWarning,
3335n/a "PyUnicode_AsEncodedObject() is deprecated; "
3336n/a "use PyUnicode_AsEncodedString() to encode from str to bytes "
3337n/a "or PyCodec_Encode() for generic encoding", 1) < 0)
3338n/a return NULL;
3339n/a
3340n/a if (encoding == NULL)
3341n/a encoding = PyUnicode_GetDefaultEncoding();
3342n/a
3343n/a /* Encode via the codec registry */
3344n/a v = PyCodec_Encode(unicode, encoding, errors);
3345n/a if (v == NULL)
3346n/a goto onError;
3347n/a return v;
3348n/a
3349n/a onError:
3350n/a return NULL;
3351n/a}
3352n/a
/* Locate the first wide character of `wstr` that wcstombs() cannot encode.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  Returns the index, in
   wchar_t units, of the first failing character, or 0 when every
   character converts successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};   /* surrogate pair + terminator */
#else
    wchar_t unit[2] = {0, 0};      /* one character + terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1) {
            return unit_start - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3399n/a
3400n/astatic int
3401n/alocale_error_handler(const char *errors, int *surrogateescape)
3402n/a{
3403n/a _Py_error_handler error_handler = get_error_handler(errors);
3404n/a switch (error_handler)
3405n/a {
3406n/a case _Py_ERROR_STRICT:
3407n/a *surrogateescape = 0;
3408n/a return 0;
3409n/a case _Py_ERROR_SURROGATEESCAPE:
3410n/a *surrogateescape = 1;
3411n/a return 0;
3412n/a default:
3413n/a PyErr_Format(PyExc_ValueError,
3414n/a "only 'strict' and 'surrogateescape' error handlers "
3415n/a "are supported, not '%s'",
3416n/a errors);
3417n/a return -1;
3418n/a }
3419n/a}
3420n/a
/* Encode the str object `unicode` to bytes using the C locale functions.

   `errors` is validated by locale_error_handler(), which accepts only the
   "strict" and "surrogateescape" handlers.  A string whose wide-character
   form contains an embedded null character is rejected with ValueError.

   Returns a new bytes object, or NULL with an exception set.  Encoding
   failures are turned into a UnicodeEncodeError (codec name "locale")
   that is raised through PyCodec_StrictErrors(). */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    Py_ssize_t wlen, wlen2;
    wchar_t *wstr;
    char *errmsg;
    PyObject *bytes, *reason, *exc;
    size_t error_pos, errlen;
    int surrogateescape;

    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

    /* wstr is owned by this function from here on and must be released
       with PyMem_Free() on every path. */
    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
    if (wstr == NULL)
        return NULL;

    /* wcslen() stops at the first null character, so a length mismatch
       means the string contains an embedded null. */
    wlen2 = wcslen(wstr);
    if (wlen2 != wlen) {
        PyMem_Free(wstr);
        PyErr_SetString(PyExc_ValueError, "embedded null character");
        return NULL;
    }

    if (surrogateescape) {
        /* "surrogateescape" error handler */
        char *str;

        str = Py_EncodeLocale(wstr, &error_pos);
        if (str == NULL) {
            /* error_pos == (size_t)-1 is treated as a memory error; any
               other value is used as the position of the failure. */
            if (error_pos == (size_t)-1) {
                PyErr_NoMemory();
                PyMem_Free(wstr);
                return NULL;
            }
            else {
                goto encode_error;
            }
        }
        PyMem_Free(wstr);

        bytes = PyBytes_FromString(str);
        PyMem_Free(str);
    }
    else {
        /* strict mode */
        size_t len, len2;

        /* First pass: compute the required output size. */
        len = wcstombs(NULL, wstr, 0);
        if (len == (size_t)-1) {
            error_pos = (size_t)-1;
            goto encode_error;
        }

        bytes = PyBytes_FromStringAndSize(NULL, len);
        if (bytes == NULL) {
            PyMem_Free(wstr);
            return NULL;
        }

        /* Second pass: convert into the bytes object (len+1 leaves room
           for the terminating null byte). */
        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
        if (len2 == (size_t)-1 || len2 > len) {
            Py_DECREF(bytes);
            error_pos = (size_t)-1;
            goto encode_error;
        }
        PyMem_Free(wstr);
    }
    return bytes;

encode_error:
    /* Capture the errno message before any other call is made here. */
    errmsg = strerror(errno);
    assert(errmsg != NULL);

    /* (size_t)-1 means the position is unknown: search for it. */
    if (error_pos == (size_t)-1)
        error_pos = wcstombs_errorpos(wstr);

    PyMem_Free(wstr);

    /* wstr is reused below for the decoded error message, which is
       released with PyMem_RawFree(). */
    wstr = Py_DecodeLocale(errmsg, &errlen);
    if (wstr != NULL) {
        reason = PyUnicode_FromWideChar(wstr, errlen);
        PyMem_RawFree(wstr);
    } else {
        errmsg = NULL;
    }

    /* Fall back to a fixed message when the errno text cannot be decoded. */
    if (errmsg == NULL)
        reason = PyUnicode_FromString(
            "wcstombs() encountered an unencodable "
            "wide character");
    if (reason == NULL)
        return NULL;

    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
                                "locale", unicode,
                                (Py_ssize_t)error_pos,
                                (Py_ssize_t)(error_pos+1),
                                reason);
    Py_DECREF(reason);
    if (exc != NULL) {
        PyCodec_StrictErrors(exc);
        Py_DECREF(exc);
    }
    return NULL;
}
3527n/a
3528n/aPyObject *
3529n/aPyUnicode_EncodeFSDefault(PyObject *unicode)
3530n/a{
3531n/a#if defined(__APPLE__)
3532n/a return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
3533n/a#else
3534n/a PyInterpreterState *interp = PyThreadState_GET()->interp;
3535n/a /* Bootstrap check: if the filesystem codec is implemented in Python, we
3536n/a cannot use it to encode and decode filenames before it is loaded. Load
3537n/a the Python codec requires to encode at least its own filename. Use the C
3538n/a version of the locale codec until the codec registry is initialized and
3539n/a the Python codec is loaded.
3540n/a
3541n/a Py_FileSystemDefaultEncoding is shared between all interpreters, we
3542n/a cannot only rely on it: check also interp->fscodec_initialized for
3543n/a subinterpreters. */
3544n/a if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3545n/a return PyUnicode_AsEncodedString(unicode,
3546n/a Py_FileSystemDefaultEncoding,
3547n/a Py_FileSystemDefaultEncodeErrors);
3548n/a }
3549n/a else {
3550n/a return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
3551n/a }
3552n/a#endif
3553n/a}
3554n/a
3555n/aPyObject *
3556n/aPyUnicode_AsEncodedString(PyObject *unicode,
3557n/a const char *encoding,
3558n/a const char *errors)
3559n/a{
3560n/a PyObject *v;
3561n/a char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
3562n/a
3563n/a if (!PyUnicode_Check(unicode)) {
3564n/a PyErr_BadArgument();
3565n/a return NULL;
3566n/a }
3567n/a
3568n/a if (encoding == NULL) {
3569n/a return _PyUnicode_AsUTF8String(unicode, errors);
3570n/a }
3571n/a
3572n/a /* Shortcuts for common default encodings */
3573n/a if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3574n/a char *lower = buflower;
3575n/a
3576n/a /* Fast paths */
3577n/a if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3578n/a lower += 3;
3579n/a if (*lower == '_') {
3580n/a /* Match "utf8" and "utf_8" */
3581n/a lower++;
3582n/a }
3583n/a
3584n/a if (lower[0] == '8' && lower[1] == 0) {
3585n/a return _PyUnicode_AsUTF8String(unicode, errors);
3586n/a }
3587n/a else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3588n/a return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3589n/a }
3590n/a else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3591n/a return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3592n/a }
3593n/a }
3594n/a else {
3595n/a if (strcmp(lower, "ascii") == 0
3596n/a || strcmp(lower, "us_ascii") == 0) {
3597n/a return _PyUnicode_AsASCIIString(unicode, errors);
3598n/a }
3599n/a#ifdef MS_WINDOWS
3600n/a else if (strcmp(lower, "mbcs") == 0) {
3601n/a return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3602n/a }
3603n/a#endif
3604n/a else if (strcmp(lower, "latin1") == 0 ||
3605n/a strcmp(lower, "latin_1") == 0 ||
3606n/a strcmp(lower, "iso_8859_1") == 0 ||
3607n/a strcmp(lower, "iso8859_1") == 0) {
3608n/a return _PyUnicode_AsLatin1String(unicode, errors);
3609n/a }
3610n/a }
3611n/a }
3612n/a
3613n/a /* Encode via the codec registry */
3614n/a v = _PyCodec_EncodeText(unicode, encoding, errors);
3615n/a if (v == NULL)
3616n/a return NULL;
3617n/a
3618n/a /* The normal path */
3619n/a if (PyBytes_Check(v))
3620n/a return v;
3621n/a
3622n/a /* If the codec returns a buffer, raise a warning and convert to bytes */
3623n/a if (PyByteArray_Check(v)) {
3624n/a int error;
3625n/a PyObject *b;
3626n/a
3627n/a error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3628n/a "encoder %s returned bytearray instead of bytes; "
3629n/a "use codecs.encode() to encode to arbitrary types",
3630n/a encoding);
3631n/a if (error) {
3632n/a Py_DECREF(v);
3633n/a return NULL;
3634n/a }
3635n/a
3636n/a b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3637n/a Py_DECREF(v);
3638n/a return b;
3639n/a }
3640n/a
3641n/a PyErr_Format(PyExc_TypeError,
3642n/a "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3643n/a "use codecs.encode() to encode to arbitrary types",
3644n/a encoding,
3645n/a Py_TYPE(v)->tp_name);
3646n/a Py_DECREF(v);
3647n/a return NULL;
3648n/a}
3649n/a
3650n/aPyObject *
3651n/aPyUnicode_AsEncodedUnicode(PyObject *unicode,
3652n/a const char *encoding,
3653n/a const char *errors)
3654n/a{
3655n/a PyObject *v;
3656n/a
3657n/a if (!PyUnicode_Check(unicode)) {
3658n/a PyErr_BadArgument();
3659n/a goto onError;
3660n/a }
3661n/a
3662n/a if (PyErr_WarnEx(PyExc_DeprecationWarning,
3663n/a "PyUnicode_AsEncodedUnicode() is deprecated; "
3664n/a "use PyCodec_Encode() to encode from str to str", 1) < 0)
3665n/a return NULL;
3666n/a
3667n/a if (encoding == NULL)
3668n/a encoding = PyUnicode_GetDefaultEncoding();
3669n/a
3670n/a /* Encode via the codec registry */
3671n/a v = PyCodec_Encode(unicode, encoding, errors);
3672n/a if (v == NULL)
3673n/a goto onError;
3674n/a if (!PyUnicode_Check(v)) {
3675n/a PyErr_Format(PyExc_TypeError,
3676n/a "'%.400s' encoder returned '%.400s' instead of 'str'; "
3677n/a "use codecs.encode() to encode to arbitrary types",
3678n/a encoding,
3679n/a Py_TYPE(v)->tp_name);
3680n/a Py_DECREF(v);
3681n/a goto onError;
3682n/a }
3683n/a return v;
3684n/a
3685n/a onError:
3686n/a return NULL;
3687n/a}
3688n/a
/* Locate the first byte sequence in str[0..len) that mbrtowc() rejects.

   Returns the byte offset of the first invalid or incomplete multibyte
   sequence.  Returns 0 when no such sequence is found, and always returns
   0 when HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
#endif
    /* failed to find the undecodable byte sequence */
    return 0;
}
3719n/a
/* Decode `len` bytes at `str` into a str object using the C locale
   functions.

   `errors` is validated by locale_error_handler(), which accepts only the
   "strict" and "surrogateescape" handlers.  The input must be
   NUL-terminated at str[len] and must not contain an earlier NUL byte,
   otherwise ValueError is raised.

   Returns a str object, or NULL with an exception set.  In strict mode a
   decoding failure is turned into a UnicodeDecodeError (codec name
   "locale") that is raised through PyCodec_StrictErrors(). */
PyObject*
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
                              const char *errors)
{
    /* Stack buffer used for short results to avoid a heap allocation. */
    wchar_t smallbuf[256];
    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
    wchar_t *wstr;
    size_t wlen, wlen2;
    PyObject *unicode;
    int surrogateescape;
    size_t error_pos, errlen;
    char *errmsg;
    PyObject *exc, *reason = NULL;   /* initialize to prevent gcc warning */

    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

    if (str[len] != '\0' || (size_t)len != strlen(str)) {
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
        return NULL;
    }

    if (surrogateescape) {
        /* "surrogateescape" error handler */
        wstr = Py_DecodeLocale(str, &wlen);
        if (wstr == NULL) {
            /* wlen == (size_t)-1 is treated as a memory error; any other
               failure is reported from errno as OSError. */
            if (wlen == (size_t)-1)
                PyErr_NoMemory();
            else
                PyErr_SetFromErrno(PyExc_OSError);
            return NULL;
        }

        unicode = PyUnicode_FromWideChar(wstr, wlen);
        PyMem_RawFree(wstr);
    }
    else {
        /* strict mode */
#ifndef HAVE_BROKEN_MBSTOWCS
        /* First pass: ask mbstowcs() for the required length. */
        wlen = mbstowcs(NULL, str, 0);
#else
        /* The length query is not used here; the byte length is taken as
           the wide-character count instead. */
        wlen = len;
#endif
        if (wlen == (size_t)-1)
            goto decode_error;
        if (wlen+1 <= smallbuf_len) {
            wstr = smallbuf;
        }
        else {
            wstr = PyMem_New(wchar_t, wlen+1);
            if (!wstr)
                return PyErr_NoMemory();
        }

        /* Second pass: convert; wstr is freed below only when it was
           heap-allocated. */
        wlen2 = mbstowcs(wstr, str, wlen+1);
        if (wlen2 == (size_t)-1) {
            if (wstr != smallbuf)
                PyMem_Free(wstr);
            goto decode_error;
        }
#ifdef HAVE_BROKEN_MBSTOWCS
        assert(wlen2 == wlen);
#endif
        unicode = PyUnicode_FromWideChar(wstr, wlen2);
        if (wstr != smallbuf)
            PyMem_Free(wstr);
    }
    return unicode;

decode_error:
    /* Capture the errno message before any other call is made here. */
    errmsg = strerror(errno);
    assert(errmsg != NULL);

    error_pos = mbstowcs_errorpos(str, len);
    /* wstr is reused for the decoded error message, which is released
       with PyMem_RawFree(). */
    wstr = Py_DecodeLocale(errmsg, &errlen);
    if (wstr != NULL) {
        reason = PyUnicode_FromWideChar(wstr, errlen);
        PyMem_RawFree(wstr);
    }

    /* Fall back to a fixed message when no reason could be built from
       the errno text. */
    if (reason == NULL)
        reason = PyUnicode_FromString(
            "mbstowcs() encountered an invalid multibyte sequence");
    if (reason == NULL)
        return NULL;

    exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
                                "locale", str, len,
                                (Py_ssize_t)error_pos,
                                (Py_ssize_t)(error_pos+1),
                                reason);
    Py_DECREF(reason);
    if (exc != NULL) {
        PyCodec_StrictErrors(exc);
        Py_DECREF(exc);
    }
    return NULL;
}
3818n/a
3819n/aPyObject*
3820n/aPyUnicode_DecodeLocale(const char *str, const char *errors)
3821n/a{
3822n/a Py_ssize_t size = (Py_ssize_t)strlen(str);
3823n/a return PyUnicode_DecodeLocaleAndSize(str, size, errors);
3824n/a}
3825n/a
3826n/a
3827n/aPyObject*
3828n/aPyUnicode_DecodeFSDefault(const char *s) {
3829n/a Py_ssize_t size = (Py_ssize_t)strlen(s);
3830n/a return PyUnicode_DecodeFSDefaultAndSize(s, size);
3831n/a}
3832n/a
3833n/aPyObject*
3834n/aPyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3835n/a{
3836n/a#if defined(__APPLE__)
3837n/a return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
3838n/a#else
3839n/a PyInterpreterState *interp = PyThreadState_GET()->interp;
3840n/a /* Bootstrap check: if the filesystem codec is implemented in Python, we
3841n/a cannot use it to encode and decode filenames before it is loaded. Load
3842n/a the Python codec requires to encode at least its own filename. Use the C
3843n/a version of the locale codec until the codec registry is initialized and
3844n/a the Python codec is loaded.
3845n/a
3846n/a Py_FileSystemDefaultEncoding is shared between all interpreters, we
3847n/a cannot only rely on it: check also interp->fscodec_initialized for
3848n/a subinterpreters. */
3849n/a if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3850n/a return PyUnicode_Decode(s, size,
3851n/a Py_FileSystemDefaultEncoding,
3852n/a Py_FileSystemDefaultEncodeErrors);
3853n/a }
3854n/a else {
3855n/a return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
3856n/a }
3857n/a#endif
3858n/a}
3859n/a
3860n/a
3861n/aint
3862n/aPyUnicode_FSConverter(PyObject* arg, void* addr)
3863n/a{
3864n/a PyObject *path = NULL;
3865n/a PyObject *output = NULL;
3866n/a Py_ssize_t size;
3867n/a void *data;
3868n/a if (arg == NULL) {
3869n/a Py_DECREF(*(PyObject**)addr);
3870n/a *(PyObject**)addr = NULL;
3871n/a return 1;
3872n/a }
3873n/a path = PyOS_FSPath(arg);
3874n/a if (path == NULL) {
3875n/a return 0;
3876n/a }
3877n/a if (PyBytes_Check(path)) {
3878n/a output = path;
3879n/a }
3880n/a else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3881n/a output = PyUnicode_EncodeFSDefault(path);
3882n/a Py_DECREF(path);
3883n/a if (!output) {
3884n/a return 0;
3885n/a }
3886n/a assert(PyBytes_Check(output));
3887n/a }
3888n/a
3889n/a size = PyBytes_GET_SIZE(output);
3890n/a data = PyBytes_AS_STRING(output);
3891n/a if ((size_t)size != strlen(data)) {
3892n/a PyErr_SetString(PyExc_ValueError, "embedded null byte");
3893n/a Py_DECREF(output);
3894n/a return 0;
3895n/a }
3896n/a *(PyObject**)addr = output;
3897n/a return Py_CLEANUP_SUPPORTED;
3898n/a}
3899n/a
3900n/a
3901n/aint
3902n/aPyUnicode_FSDecoder(PyObject* arg, void* addr)
3903n/a{
3904n/a int is_buffer = 0;
3905n/a PyObject *path = NULL;
3906n/a PyObject *output = NULL;
3907n/a if (arg == NULL) {
3908n/a Py_DECREF(*(PyObject**)addr);
3909n/a return 1;
3910n/a }
3911n/a
3912n/a is_buffer = PyObject_CheckBuffer(arg);
3913n/a if (!is_buffer) {
3914n/a path = PyOS_FSPath(arg);
3915n/a if (path == NULL) {
3916n/a return 0;
3917n/a }
3918n/a }
3919n/a else {
3920n/a path = arg;
3921n/a Py_INCREF(arg);
3922n/a }
3923n/a
3924n/a if (PyUnicode_Check(path)) {
3925n/a if (PyUnicode_READY(path) == -1) {
3926n/a Py_DECREF(path);
3927n/a return 0;
3928n/a }
3929n/a output = path;
3930n/a }
3931n/a else if (PyBytes_Check(path) || is_buffer) {
3932n/a PyObject *path_bytes = NULL;
3933n/a
3934n/a if (!PyBytes_Check(path) &&
3935n/a PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3936n/a "path should be string, bytes, or os.PathLike, not %.200s",
3937n/a Py_TYPE(arg)->tp_name)) {
3938n/a Py_DECREF(path);
3939n/a return 0;
3940n/a }
3941n/a path_bytes = PyBytes_FromObject(path);
3942n/a Py_DECREF(path);
3943n/a if (!path_bytes) {
3944n/a return 0;
3945n/a }
3946n/a output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3947n/a PyBytes_GET_SIZE(path_bytes));
3948n/a Py_DECREF(path_bytes);
3949n/a if (!output) {
3950n/a return 0;
3951n/a }
3952n/a }
3953n/a else {
3954n/a PyErr_Format(PyExc_TypeError,
3955n/a "path should be string, bytes, or os.PathLike, not %.200s",
3956n/a Py_TYPE(arg)->tp_name);
3957n/a Py_DECREF(path);
3958n/a return 0;
3959n/a }
3960n/a if (PyUnicode_READY(output) == -1) {
3961n/a Py_DECREF(output);
3962n/a return 0;
3963n/a }
3964n/a if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3965n/a PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3966n/a PyErr_SetString(PyExc_ValueError, "embedded null character");
3967n/a Py_DECREF(output);
3968n/a return 0;
3969n/a }
3970n/a *(PyObject**)addr = output;
3971n/a return Py_CLEANUP_SUPPORTED;
3972n/a}
3973n/a
3974n/a
3975n/aconst char *
3976n/aPyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3977n/a{
3978n/a PyObject *bytes;
3979n/a
3980n/a if (!PyUnicode_Check(unicode)) {
3981n/a PyErr_BadArgument();
3982n/a return NULL;
3983n/a }
3984n/a if (PyUnicode_READY(unicode) == -1)
3985n/a return NULL;
3986n/a
3987n/a if (PyUnicode_UTF8(unicode) == NULL) {
3988n/a assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3989n/a bytes = _PyUnicode_AsUTF8String(unicode, NULL);
3990n/a if (bytes == NULL)
3991n/a return NULL;
3992n/a _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3993n/a if (_PyUnicode_UTF8(unicode) == NULL) {
3994n/a PyErr_NoMemory();
3995n/a Py_DECREF(bytes);
3996n/a return NULL;
3997n/a }
3998n/a _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3999n/a memcpy(_PyUnicode_UTF8(unicode),
4000n/a PyBytes_AS_STRING(bytes),
4001n/a _PyUnicode_UTF8_LENGTH(unicode) + 1);
4002n/a Py_DECREF(bytes);
4003n/a }
4004n/a
4005n/a if (psize)
4006n/a *psize = PyUnicode_UTF8_LENGTH(unicode);
4007n/a return PyUnicode_UTF8(unicode);
4008n/a}
4009n/a
4010n/aconst char *
4011n/aPyUnicode_AsUTF8(PyObject *unicode)
4012n/a{
4013n/a return PyUnicode_AsUTF8AndSize(unicode, NULL);
4014n/a}
4015n/a
/* Return the wchar_t (wstr) representation of `unicode`, storing its
   length in *size when `size` is not NULL.

   When the object has no wstr buffer yet, one is allocated, filled from
   the object's character data and stored on the object; later calls
   return the stored buffer.  With a 16-bit wchar_t, code points above
   0xFFFF are written as surrogate pairs.

   Returns NULL with an exception set when `unicode` is not a str or when
   memory runs out. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            /* First pass: count the code points that need two wchar_t
               units so the buffer can be sized exactly. */
            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            /* Second pass: copy, splitting large code points. */
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= MAX_UNICODE);
                    /* encode surrogate pair in this case */
                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
                    *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
                }
                else
                    *w = *four_bytes;

                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            /* null-terminate the wstr */
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* Guard the size computation below against overflow. */
            if ((size_t)_PyUnicode_LENGTH(unicode) >
                    PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* The wstr length is only stored for objects that are not
               compact ASCII. */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                /* Widen each byte to a wchar_t. */
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                /* Widen each 16-bit unit to a wchar_t. */
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
4132n/a
4133n/aPy_UNICODE *
4134n/aPyUnicode_AsUnicode(PyObject *unicode)
4135n/a{
4136n/a return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4137n/a}
4138n/a
4139n/a
4140n/aPy_ssize_t
4141n/aPyUnicode_GetSize(PyObject *unicode)
4142n/a{
4143n/a if (!PyUnicode_Check(unicode)) {
4144n/a PyErr_BadArgument();
4145n/a goto onError;
4146n/a }
4147n/a if (_PyUnicode_WSTR(unicode) == NULL) {
4148n/a if (PyUnicode_AsUnicode(unicode) == NULL)
4149n/a goto onError;
4150n/a }
4151n/a return PyUnicode_WSTR_LENGTH(unicode);
4152n/a
4153n/a onError:
4154n/a return -1;
4155n/a}
4156n/a
4157n/aPy_ssize_t
4158n/aPyUnicode_GetLength(PyObject *unicode)
4159n/a{
4160n/a if (!PyUnicode_Check(unicode)) {
4161n/a PyErr_BadArgument();
4162n/a return -1;
4163n/a }
4164n/a if (PyUnicode_READY(unicode) == -1)
4165n/a return -1;
4166n/a return PyUnicode_GET_LENGTH(unicode);
4167n/a}
4168n/a
4169n/aPy_UCS4
4170n/aPyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4171n/a{
4172n/a void *data;
4173n/a int kind;
4174n/a
4175n/a if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4176n/a PyErr_BadArgument();
4177n/a return (Py_UCS4)-1;
4178n/a }
4179n/a if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4180n/a PyErr_SetString(PyExc_IndexError, "string index out of range");
4181n/a return (Py_UCS4)-1;
4182n/a }
4183n/a data = PyUnicode_DATA(unicode);
4184n/a kind = PyUnicode_KIND(unicode);
4185n/a return PyUnicode_READ(kind, data, index);
4186n/a}
4187n/a
4188n/aint
4189n/aPyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4190n/a{
4191n/a if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4192n/a PyErr_BadArgument();
4193n/a return -1;
4194n/a }
4195n/a assert(PyUnicode_IS_READY(unicode));
4196n/a if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4197n/a PyErr_SetString(PyExc_IndexError, "string index out of range");
4198n/a return -1;
4199n/a }
4200n/a if (unicode_check_modifiable(unicode))
4201n/a return -1;
4202n/a if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4203n/a PyErr_SetString(PyExc_ValueError, "character out of range");
4204n/a return -1;
4205n/a }
4206n/a PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4207n/a index, ch);
4208n/a return 0;
4209n/a}
4210n/a
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4216n/a
4217n/a/* create or adjust a UnicodeDecodeError */
4218n/astatic void
4219n/amake_decode_exception(PyObject **exceptionObject,
4220n/a const char *encoding,
4221n/a const char *input, Py_ssize_t length,
4222n/a Py_ssize_t startpos, Py_ssize_t endpos,
4223n/a const char *reason)
4224n/a{
4225n/a if (*exceptionObject == NULL) {
4226n/a *exceptionObject = PyUnicodeDecodeError_Create(
4227n/a encoding, input, length, startpos, endpos, reason);
4228n/a }
4229n/a else {
4230n/a if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4231n/a goto onError;
4232n/a if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4233n/a goto onError;
4234n/a if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4235n/a goto onError;
4236n/a }
4237n/a return;
4238n/a
4239n/aonError:
4240n/a Py_CLEAR(*exceptionObject);
4241n/a}
4242n/a
4243n/a#ifdef MS_WINDOWS
4244n/a/* error handling callback helper:
4245n/a build arguments, call the callback and check the arguments,
4246n/a if no exception occurred, copy the replacement to the output
4247n/a and adjust various state variables.
4248n/a return 0 on success, -1 on error
4249n/a*/
4250n/a
4251n/astatic int
4252n/aunicode_decode_call_errorhandler_wchar(
4253n/a const char *errors, PyObject **errorHandler,
4254n/a const char *encoding, const char *reason,
4255n/a const char **input, const char **inend, Py_ssize_t *startinpos,
4256n/a Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4257n/a PyObject **output, Py_ssize_t *outpos)
4258n/a{
4259n/a static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4260n/a
4261n/a PyObject *restuple = NULL;
4262n/a PyObject *repunicode = NULL;
4263n/a Py_ssize_t outsize;
4264n/a Py_ssize_t insize;
4265n/a Py_ssize_t requiredsize;
4266n/a Py_ssize_t newpos;
4267n/a PyObject *inputobj = NULL;
4268n/a wchar_t *repwstr;
4269n/a Py_ssize_t repwlen;
4270n/a
4271n/a assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
4272n/a outsize = _PyUnicode_WSTR_LENGTH(*output);
4273n/a
4274n/a if (*errorHandler == NULL) {
4275n/a *errorHandler = PyCodec_LookupError(errors);
4276n/a if (*errorHandler == NULL)
4277n/a goto onError;
4278n/a }
4279n/a
4280n/a make_decode_exception(exceptionObject,
4281n/a encoding,
4282n/a *input, *inend - *input,
4283n/a *startinpos, *endinpos,
4284n/a reason);
4285n/a if (*exceptionObject == NULL)
4286n/a goto onError;
4287n/a
4288n/a restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4289n/a if (restuple == NULL)
4290n/a goto onError;
4291n/a if (!PyTuple_Check(restuple)) {
4292n/a PyErr_SetString(PyExc_TypeError, &argparse[3]);
4293n/a goto onError;
4294n/a }
4295n/a if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4296n/a goto onError;
4297n/a
4298n/a /* Copy back the bytes variables, which might have been modified by the
4299n/a callback */
4300n/a inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4301n/a if (!inputobj)
4302n/a goto onError;
4303n/a *input = PyBytes_AS_STRING(inputobj);
4304n/a insize = PyBytes_GET_SIZE(inputobj);
4305n/a *inend = *input + insize;
4306n/a /* we can DECREF safely, as the exception has another reference,
4307n/a so the object won't go away. */
4308n/a Py_DECREF(inputobj);
4309n/a
4310n/a if (newpos<0)
4311n/a newpos = insize+newpos;
4312n/a if (newpos<0 || newpos>insize) {
4313n/a PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4314n/a goto onError;
4315n/a }
4316n/a
4317n/a repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4318n/a if (repwstr == NULL)
4319n/a goto onError;
4320n/a /* need more space? (at least enough for what we
4321n/a have+the replacement+the rest of the string (starting
4322n/a at the new input position), so we won't have to check space
4323n/a when there are no errors in the rest of the string) */
4324n/a requiredsize = *outpos;
4325n/a if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4326n/a goto overflow;
4327n/a requiredsize += repwlen;
4328n/a if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4329n/a goto overflow;
4330n/a requiredsize += insize - newpos;
4331n/a if (requiredsize > outsize) {
4332n/a if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4333n/a requiredsize = 2*outsize;
4334n/a if (unicode_resize(output, requiredsize) < 0)
4335n/a goto onError;
4336n/a }
4337n/a wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4338n/a *outpos += repwlen;
4339n/a *endinpos = newpos;
4340n/a *inptr = *input + newpos;
4341n/a
4342n/a /* we made it! */
4343n/a Py_DECREF(restuple);
4344n/a return 0;
4345n/a
4346n/a overflow:
4347n/a PyErr_SetString(PyExc_OverflowError,
4348n/a "decoded result is too long for a Python string");
4349n/a
4350n/a onError:
4351n/a Py_XDECREF(restuple);
4352n/a return -1;
4353n/a}
4354n/a#endif /* MS_WINDOWS */
4355n/a
4356n/astatic int
4357n/aunicode_decode_call_errorhandler_writer(
4358n/a const char *errors, PyObject **errorHandler,
4359n/a const char *encoding, const char *reason,
4360n/a const char **input, const char **inend, Py_ssize_t *startinpos,
4361n/a Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4362n/a _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4363n/a{
4364n/a static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4365n/a
4366n/a PyObject *restuple = NULL;
4367n/a PyObject *repunicode = NULL;
4368n/a Py_ssize_t insize;
4369n/a Py_ssize_t newpos;
4370n/a Py_ssize_t replen;
4371n/a PyObject *inputobj = NULL;
4372n/a
4373n/a if (*errorHandler == NULL) {
4374n/a *errorHandler = PyCodec_LookupError(errors);
4375n/a if (*errorHandler == NULL)
4376n/a goto onError;
4377n/a }
4378n/a
4379n/a make_decode_exception(exceptionObject,
4380n/a encoding,
4381n/a *input, *inend - *input,
4382n/a *startinpos, *endinpos,
4383n/a reason);
4384n/a if (*exceptionObject == NULL)
4385n/a goto onError;
4386n/a
4387n/a restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4388n/a if (restuple == NULL)
4389n/a goto onError;
4390n/a if (!PyTuple_Check(restuple)) {
4391n/a PyErr_SetString(PyExc_TypeError, &argparse[3]);
4392n/a goto onError;
4393n/a }
4394n/a if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4395n/a goto onError;
4396n/a
4397n/a /* Copy back the bytes variables, which might have been modified by the
4398n/a callback */
4399n/a inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4400n/a if (!inputobj)
4401n/a goto onError;
4402n/a *input = PyBytes_AS_STRING(inputobj);
4403n/a insize = PyBytes_GET_SIZE(inputobj);
4404n/a *inend = *input + insize;
4405n/a /* we can DECREF safely, as the exception has another reference,
4406n/a so the object won't go away. */
4407n/a Py_DECREF(inputobj);
4408n/a
4409n/a if (newpos<0)
4410n/a newpos = insize+newpos;
4411n/a if (newpos<0 || newpos>insize) {
4412n/a PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4413n/a goto onError;
4414n/a }
4415n/a
4416n/a replen = PyUnicode_GET_LENGTH(repunicode);
4417n/a if (replen > 1) {
4418n/a writer->min_length += replen - 1;
4419n/a writer->overallocate = 1;
4420n/a if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4421n/a PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4422n/a goto onError;
4423n/a }
4424n/a if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4425n/a goto onError;
4426n/a
4427n/a *endinpos = newpos;
4428n/a *inptr = *input + newpos;
4429n/a
4430n/a /* we made it! */
4431n/a Py_DECREF(restuple);
4432n/a return 0;
4433n/a
4434n/a onError:
4435n/a Py_XDECREF(restuple);
4436n/a return -1;
4437n/a}
4438n/a
4439n/a/* --- UTF-7 Codec -------------------------------------------------------- */
4440n/a
4441n/a/* See RFC2152 for details. We encode conservatively and decode liberally. */
4442n/a
/* Three simple macros defining base-64.  All of them may evaluate their
   argument more than once, so pass only side-effect free expressions. */

/* True if c is one of the 64 base-64 characters: A-Z, a-z, 0-9, '+', '/'. */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))

/* Value (0..63) of the base-64 character c.  Only meaningful when
   IS_BASE64(c) holds; any other input yields 63. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character encoding the lowest 6 bits of n. */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if byte c found in a UTF-7 string outside a shift
 * sequence decodes as itself.  Decoding is permissive: every ASCII byte
 * qualifies except '+', which starts a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4473n/a
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0..127) and is read-only, hence
 * const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is checked against the range 1..127 before it is used as a table
 * index.  c is evaluated several times, so it must be free of side
 * effects.  directO / directWS are parenthesized so that any expression
 * may be passed for them. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
4519n/a
4520n/aPyObject *
4521n/aPyUnicode_DecodeUTF7(const char *s,
4522n/a Py_ssize_t size,
4523n/a const char *errors)
4524n/a{
4525n/a return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4526n/a}
4527n/a
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size  : the UTF-7 encoded input bytes.
 * errors   : error handler name, passed on to
 *            unicode_decode_call_errorhandler_writer().
 * consumed : if non-NULL, receives the number of input bytes consumed;
 *            input ending inside a shift sequence is then not an error,
 *            and *consumed is set to the offset of the '+' that opened
 *            it.  If NULL, an inconsistent trailing shift sequence is
 *            reported through the error handler.
 *
 * Returns the decoded string, or NULL on error. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    /* input offset of the current error span; set when a '+' or an
       unexpected byte is seen */
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                 /* non-zero inside a base-64 section */
    Py_ssize_t shiftOutStart;        /* writer.pos when the section began */
    unsigned int base64bits = 0;     /* number of pending bits */
    unsigned long base64buffer = 0;  /* the pending bits themselves */
    Py_UCS4 surrogate = 0;           /* pending high surrogate, or 0 */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      /* re-entry point used after the end-of-input error handler call
         below left more input to decode */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* no low surrogate followed: emit the
                               pending high surrogate on its own */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* a pending high surrogate is written out only when the
                   terminating byte decodes directly; otherwise it is
                   dropped */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
                /* otherwise s is not advanced: the byte is decoded again
                   on the next iteration, now outside the shift sequence */
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            /* byte >= 128: not valid in UTF-7 */
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* the error span is [startinpos, s); the handler may move s and
           replace the input buffer (starts / e) */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler may have repositioned s before the end */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            *consumed = startinpos;
            /* Characters were written since the shift sequence began and
               the writer holds non-ASCII data: build the result directly
               from the first shiftOutStart characters instead of
               finishing the writer.  NOTE(review): presumably because
               writer.maxchar may stem from the discarded characters --
               confirm. */
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4726n/a
4727n/a
4728n/aPyObject *
4729n/a_PyUnicode_EncodeUTF7(PyObject *str,
4730n/a int base64SetO,
4731n/a int base64WhiteSpace,
4732n/a const char *errors)
4733n/a{
4734n/a int kind;
4735n/a void *data;
4736n/a Py_ssize_t len;
4737n/a PyObject *v;
4738n/a int inShift = 0;
4739n/a Py_ssize_t i;
4740n/a unsigned int base64bits = 0;
4741n/a unsigned long base64buffer = 0;
4742n/a char * out;
4743n/a char * start;
4744n/a
4745n/a if (PyUnicode_READY(str) == -1)
4746n/a return NULL;
4747n/a kind = PyUnicode_KIND(str);
4748n/a data = PyUnicode_DATA(str);
4749n/a len = PyUnicode_GET_LENGTH(str);
4750n/a
4751n/a if (len == 0)
4752n/a return PyBytes_FromStringAndSize(NULL, 0);
4753n/a
4754n/a /* It might be possible to tighten this worst case */
4755n/a if (len > PY_SSIZE_T_MAX / 8)
4756n/a return PyErr_NoMemory();
4757n/a v = PyBytes_FromStringAndSize(NULL, len * 8);
4758n/a if (v == NULL)
4759n/a return NULL;
4760n/a
4761n/a start = out = PyBytes_AS_STRING(v);
4762n/a for (i = 0; i < len; ++i) {
4763n/a Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4764n/a
4765n/a if (inShift) {
4766n/a if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4767n/a /* shifting out */
4768n/a if (base64bits) { /* output remaining bits */
4769n/a *out++ = TO_BASE64(base64buffer << (6-base64bits));
4770n/a base64buffer = 0;
4771n/a base64bits = 0;
4772n/a }
4773n/a inShift = 0;
4774n/a /* Characters not in the BASE64 set implicitly unshift the sequence
4775n/a so no '-' is required, except if the character is itself a '-' */
4776n/a if (IS_BASE64(ch) || ch == '-') {
4777n/a *out++ = '-';
4778n/a }
4779n/a *out++ = (char) ch;
4780n/a }
4781n/a else {
4782n/a goto encode_char;
4783n/a }
4784n/a }
4785n/a else { /* not in a shift sequence */
4786n/a if (ch == '+') {
4787n/a *out++ = '+';
4788n/a *out++ = '-';
4789n/a }
4790n/a else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4791n/a *out++ = (char) ch;
4792n/a }
4793n/a else {
4794n/a *out++ = '+';
4795n/a inShift = 1;
4796n/a goto encode_char;
4797n/a }
4798n/a }
4799n/a continue;
4800n/aencode_char:
4801n/a if (ch >= 0x10000) {
4802n/a assert(ch <= MAX_UNICODE);
4803n/a
4804n/a /* code first surrogate */
4805n/a base64bits += 16;
4806n/a base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4807n/a while (base64bits >= 6) {
4808n/a *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4809n/a base64bits -= 6;
4810n/a }
4811n/a /* prepare second surrogate */
4812n/a ch = Py_UNICODE_LOW_SURROGATE(ch);
4813n/a }
4814n/a base64bits += 16;
4815n/a base64buffer = (base64buffer << 16) | ch;
4816n/a while (base64bits >= 6) {
4817n/a *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4818n/a base64bits -= 6;
4819n/a }
4820n/a }
4821n/a if (base64bits)
4822n/a *out++= TO_BASE64(base64buffer << (6-base64bits) );
4823n/a if (inShift)
4824n/a *out++ = '-';
4825n/a if (_PyBytes_Resize(&v, out - start) < 0)
4826n/a return NULL;
4827n/a return v;
4828n/a}
4829n/aPyObject *
4830n/aPyUnicode_EncodeUTF7(const Py_UNICODE *s,
4831n/a Py_ssize_t size,
4832n/a int base64SetO,
4833n/a int base64WhiteSpace,
4834n/a const char *errors)
4835n/a{
4836n/a PyObject *result;
4837n/a PyObject *tmp = PyUnicode_FromWideChar(s, size);
4838n/a if (tmp == NULL)
4839n/a return NULL;
4840n/a result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4841n/a base64WhiteSpace, errors);
4842n/a Py_DECREF(tmp);
4843n/a return result;
4844n/a}
4845n/a
4846n/a#undef IS_BASE64
4847n/a#undef FROM_BASE64
4848n/a#undef TO_BASE64
4849n/a#undef DECODE_DIRECT
4850n/a#undef ENCODE_DIRECT
4851n/a
4852n/a/* --- UTF-8 Codec -------------------------------------------------------- */
4853n/a
4854n/aPyObject *
4855n/aPyUnicode_DecodeUTF8(const char *s,
4856n/a Py_ssize_t size,
4857n/a const char *errors)
4858n/a{
4859n/a return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4860n/a}
4861n/a
4862n/a#include "stringlib/asciilib.h"
4863n/a#include "stringlib/codecs.h"
4864n/a#include "stringlib/undef.h"
4865n/a
4866n/a#include "stringlib/ucs1lib.h"
4867n/a#include "stringlib/codecs.h"
4868n/a#include "stringlib/undef.h"
4869n/a
4870n/a#include "stringlib/ucs2lib.h"
4871n/a#include "stringlib/codecs.h"
4872n/a#include "stringlib/undef.h"
4873n/a
4874n/a#include "stringlib/ucs4lib.h"
4875n/a#include "stringlib/codecs.h"
4876n/a#include "stringlib/undef.h"
4877n/a
4878n/a/* Mask to quickly check whether a C 'long' contains a
4879n/a non-ASCII, UTF8-encoded char. */
4880n/a#if (SIZEOF_LONG == 8)
4881n/a# define ASCII_CHAR_MASK 0x8080808080808080UL
4882n/a#elif (SIZEOF_LONG == 4)
4883n/a# define ASCII_CHAR_MASK 0x80808080UL
4884n/a#else
4885n/a# error C 'long' size should be either 4 or 8!
4886n/a#endif
4887n/a
/* Copy the leading run of ASCII bytes of [start, end) to `dest`.

   Scanning stops at the first byte that has its high bit set (or at
   `end`).  Returns the number of bytes before that point, all of which
   have been stored to `dest`.  `dest` must have room for them; the
   word-wise fast path additionally asserts that it is aligned to
   SIZEOF_LONG. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    /* end rounded down to a SIZEOF_LONG boundary: word-wise reads must
       stay below it */
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);

    /*
     * Issue #17237: m68k is a bit different from most architectures in
     * that objects do not use "natural alignment" - for example, int and
     * long are only aligned at 2-byte boundaries.  Therefore the assert()
     * won't work; also, tests have shown that skipping the "optimised
     * version" will even speed up m68k.
     */
#if !defined(__m68k__)
#if SIZEOF_LONG <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Source and destination are both aligned: test and copy one
           long at a time.  Local copies of the pointers are used to help
           allocation. */
        const char *_p = p;
        Py_UCS1 * q = dest;
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* finish byte by byte: the unaligned tail, or the word that
           contains the first non-ASCII byte */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
#endif
    /* Generic path: only scan here, then copy the ASCII prefix in one
       memcpy() at the end. */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
            /* Help allocation */
            const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            /* everything up to end was ASCII: do not read *p below */
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
4951n/a
/* Decode `size` bytes of UTF-8 data starting at `s`.

   errors   : error handler name.  "ignore", "replace" and
              "surrogateescape" (as classified by get_error_handler())
              are handled inline; everything else goes through
              unicode_decode_call_errorhandler_writer().
   consumed : if non-NULL, receives the number of input bytes consumed;
              a sequence cut off at the end of the input is then left
              unconsumed instead of being reported as an error.

   Returns the decoded string, or NULL on error. */
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    _PyUnicodeWriter writer;
    const char *starts = s;
    const char *end = s + size;

    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *errmsg = "";
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    /* resolved from `errors` on the first decoding error only */
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed)
            *consumed = 1;
        return get_latin1_char((unsigned char)s[0]);
    }

    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    /* copy the leading ASCII run in one go */
    writer.pos = ascii_decode(s, end, writer.data);
    s += writer.pos;
    while (s < end) {
        Py_UCS4 ch;
        int kind = writer.kind;

        /* run the decoder specialised for the writer's current kind; it
           advances s and writer.pos itself */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(writer.buffer))
                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
            else
                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
        }

        /* 0..4 are status codes; any other value is a decoded character
           that still has to be written through the writer */
        switch (ch) {
        case 0:
            /* input exhausted, or a sequence cut off at the end */
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = end - starts;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
        case 3:
        case 4:
            /* the error span covers ch - 1 bytes */
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + ch - 1;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = get_error_handler(errors);

        switch (error_handler) {
        case _Py_ERROR_IGNORE:
            /* skip the offending bytes */
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_REPLACE:
            /* one U+FFFD for the whole error span */
            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
                goto onError;
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_SURROGATEESCAPE:
        {
            Py_ssize_t i;

            /* each offending byte b is written as 0xDC00 + b, which
               needs at least the 2-byte kind */
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            for (i=startinpos; i<endinpos; i++) {
                ch = (Py_UCS4)(unsigned char)(starts[i]);
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
                                ch + 0xdc00);
                writer.pos++;
            }
            s += (endinpos - startinpos);
            break;
        }

        default:
            /* generic handler: may reposition s and replace the input
               buffer (starts / end) */
            if (unicode_decode_call_errorhandler_writer(
                    errors, &error_handler_obj,
                    "utf-8", errmsg,
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
        }
    }

End:
    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

onError:
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
5085n/a
5086n/a#if defined(__APPLE__) || defined(__ANDROID__)
5087n/a
5088n/a/* Simplified UTF-8 decoder using surrogateescape error handler,
5089n/a used to decode the command line arguments on Mac OS X and Android.
5090n/a
5091n/a Return a pointer to a newly allocated wide character string (use
5092n/a PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
5093n/a
5094n/awchar_t*
5095n/a_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
5096n/a{
5097n/a const char *e;
5098n/a wchar_t *unicode;
5099n/a Py_ssize_t outpos;
5100n/a
5101n/a /* Note: size will always be longer than the resulting Unicode
5102n/a character count */
5103n/a if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
5104n/a return NULL;
5105n/a unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5106n/a if (!unicode)
5107n/a return NULL;
5108n/a
5109n/a /* Unpack UTF-8 encoded data */
5110n/a e = s + size;
5111n/a outpos = 0;
5112n/a while (s < e) {
5113n/a Py_UCS4 ch;
5114n/a#if SIZEOF_WCHAR_T == 4
5115n/a ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5116n/a#else
5117n/a ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5118n/a#endif
5119n/a if (ch > 0xFF) {
5120n/a#if SIZEOF_WCHAR_T == 4
5121n/a assert(0);
5122n/a#else
5123n/a assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5124n/a /* compute and append the two surrogates: */
5125n/a unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5126n/a unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5127n/a#endif
5128n/a }
5129n/a else {
5130n/a if (!ch && s == e)
5131n/a break;
5132n/a /* surrogateescape */
5133n/a unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5134n/a }
5135n/a }
5136n/a unicode[outpos] = L'\0';
5137n/a return unicode;
5138n/a}
5139n/a
5140n/a#endif /* __APPLE__ or __ANDROID__ */
5141n/a
5142n/a/* Primary internal function which creates utf8 encoded bytes objects.
5143n/a
5144n/a Allocation strategy: if the string is short, convert into a stack buffer
5145n/a and allocate exactly as much space needed at the end. Else allocate the
5146n/a maximum possible needed (4 result bytes per Unicode character), and return
5147n/a the excess memory at the end.
5148n/a*/
5149n/aPyObject *
5150n/a_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5151n/a{
5152n/a enum PyUnicode_Kind kind;
5153n/a void *data;
5154n/a Py_ssize_t size;
5155n/a
5156n/a if (!PyUnicode_Check(unicode)) {
5157n/a PyErr_BadArgument();
5158n/a return NULL;
5159n/a }
5160n/a
5161n/a if (PyUnicode_READY(unicode) == -1)
5162n/a return NULL;
5163n/a
5164n/a if (PyUnicode_UTF8(unicode))
5165n/a return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5166n/a PyUnicode_UTF8_LENGTH(unicode));
5167n/a
5168n/a kind = PyUnicode_KIND(unicode);
5169n/a data = PyUnicode_DATA(unicode);
5170n/a size = PyUnicode_GET_LENGTH(unicode);
5171n/a
5172n/a switch (kind) {
5173n/a default:
5174n/a assert(0);
5175n/a case PyUnicode_1BYTE_KIND:
5176n/a /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5177n/a assert(!PyUnicode_IS_ASCII(unicode));
5178n/a return ucs1lib_utf8_encoder(unicode, data, size, errors);
5179n/a case PyUnicode_2BYTE_KIND:
5180n/a return ucs2lib_utf8_encoder(unicode, data, size, errors);
5181n/a case PyUnicode_4BYTE_KIND:
5182n/a return ucs4lib_utf8_encoder(unicode, data, size, errors);
5183n/a }
5184n/a}
5185n/a
5186n/aPyObject *
5187n/aPyUnicode_EncodeUTF8(const Py_UNICODE *s,
5188n/a Py_ssize_t size,
5189n/a const char *errors)
5190n/a{
5191n/a PyObject *v, *unicode;
5192n/a
5193n/a unicode = PyUnicode_FromWideChar(s, size);
5194n/a if (unicode == NULL)
5195n/a return NULL;
5196n/a v = _PyUnicode_AsUTF8String(unicode, errors);
5197n/a Py_DECREF(unicode);
5198n/a return v;
5199n/a}
5200n/a
5201n/aPyObject *
5202n/aPyUnicode_AsUTF8String(PyObject *unicode)
5203n/a{
5204n/a return _PyUnicode_AsUTF8String(unicode, NULL);
5205n/a}
5206n/a
5207n/a/* --- UTF-32 Codec ------------------------------------------------------- */
5208n/a
5209n/aPyObject *
5210n/aPyUnicode_DecodeUTF32(const char *s,
5211n/a Py_ssize_t size,
5212n/a const char *errors,
5213n/a int *byteorder)
5214n/a{
5215n/a return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5216n/a}
5217n/a
/* Decode the `size` bytes at `s` as UTF-32 and return a new str object,
   or NULL with an exception set.

   byteorder: optional in/out pointer.  The value read from it selects the
     byte order: negative = little-endian, positive = big-endian, zero =
     look for a BOM and otherwise use the platform's native order.  When the
     incoming order is 0 and at least 4 bytes are available, the resulting
     order (-1, 1, or still 0 when no BOM was found) is stored back.
   consumed: optional out pointer.  When non-NULL, trailing bytes that do
     not form a complete 4-byte unit are left undecoded instead of being
     reported as "truncated data", and the number of bytes decoded is
     stored through it.
   errors: forwarded to unicode_decode_call_errorhandler_writer(). */
5218n/aPyObject *
5219n/aPyUnicode_DecodeUTF32Stateful(const char *s,
5220n/a Py_ssize_t size,
5221n/a const char *errors,
5222n/a int *byteorder,
5223n/a Py_ssize_t *consumed)
5224n/a{
5225n/a const char *starts = s;
5226n/a Py_ssize_t startinpos;
5227n/a Py_ssize_t endinpos;
5228n/a _PyUnicodeWriter writer;
5229n/a const unsigned char *q, *e;
5230n/a int le, bo = 0; /* assume native ordering by default */
5231n/a const char *encoding;
5232n/a const char *errmsg = "";
5233n/a PyObject *errorHandler = NULL;
5234n/a PyObject *exc = NULL;
5235n/a
5236n/a q = (unsigned char *)s;
5237n/a e = q + size;
5238n/a
5239n/a if (byteorder)
5240n/a bo = *byteorder;
5241n/a
5242n/a /* Check for BOM marks (U+FEFF) in the input and adjust current
5243n/a byte order setting accordingly. In native mode, the leading BOM
5244n/a mark is skipped, in all other modes, it is copied to the output
5245n/a stream as-is (giving a ZWNBSP character). */
5246n/a if (bo == 0 && size >= 4) {
    /* The first unit is assembled as a little-endian value: reading
       0x0000FEFF means the data is little-endian, reading 0xFFFE0000 means
       the bytes are swapped, i.e. the data is big-endian. */
5247n/a Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5248n/a if (bom == 0x0000FEFF) {
5249n/a bo = -1;
5250n/a q += 4;
5251n/a }
5252n/a else if (bom == 0xFFFE0000) {
5253n/a bo = 1;
5254n/a q += 4;
5255n/a }
5256n/a if (byteorder)
5257n/a *byteorder = bo;
5258n/a }
5259n/a
    /* Nothing left after the optional BOM: the result is the empty string. */
5260n/a if (q == e) {
5261n/a if (consumed)
5262n/a *consumed = size;
5263n/a _Py_RETURN_UNICODE_EMPTY();
5264n/a }
5265n/a
    /* bo == 0 at this point means "native": big-endian on WORDS_BIGENDIAN
       builds, little-endian otherwise. */
5266n/a#ifdef WORDS_BIGENDIAN
5267n/a le = bo < 0;
5268n/a#else
5269n/a le = bo <= 0;
5270n/a#endif
5271n/a encoding = le ? "utf-32-le" : "utf-32-be";
5272n/a
    /* One output character per 4 input bytes, rounded up. */
5273n/a _PyUnicodeWriter_Init(&writer);
5274n/a writer.min_length = (e - q + 3) / 4;
5275n/a if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5276n/a goto onError;
5277n/a
5278n/a while (1) {
5279n/a Py_UCS4 ch = 0;
5280n/a Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5281n/a
    /* Fast path: store code points straight into the writer's buffer for
       as long as they fit its current maximum character and are not
       surrogates.  On exit `ch` is either the code point that stopped the
       loop (q still points at it) or the last one stored. */
5282n/a if (e - q >= 4) {
5283n/a enum PyUnicode_Kind kind = writer.kind;
5284n/a void *data = writer.data;
5285n/a const unsigned char *last = e - 4;
5286n/a Py_ssize_t pos = writer.pos;
5287n/a if (le) {
5288n/a do {
5289n/a ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5290n/a if (ch > maxch)
5291n/a break;
5292n/a if (kind != PyUnicode_1BYTE_KIND &&
5293n/a Py_UNICODE_IS_SURROGATE(ch))
5294n/a break;
5295n/a PyUnicode_WRITE(kind, data, pos++, ch);
5296n/a q += 4;
5297n/a } while (q <= last);
5298n/a }
5299n/a else {
5300n/a do {
5301n/a ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5302n/a if (ch > maxch)
5303n/a break;
5304n/a if (kind != PyUnicode_1BYTE_KIND &&
5305n/a Py_UNICODE_IS_SURROGATE(ch))
5306n/a break;
5307n/a PyUnicode_WRITE(kind, data, pos++, ch);
5308n/a q += 4;
5309n/a } while (q <= last);
5310n/a }
5311n/a writer.pos = pos;
5312n/a }
5313n/a
    /* Classify why the fast path stopped. */
5314n/a if (Py_UNICODE_IS_SURROGATE(ch)) {
5315n/a errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5316n/a startinpos = ((const char *)q) - starts;
5317n/a endinpos = startinpos + 4;
5318n/a }
5319n/a else if (ch <= maxch) {
    /* No offending code point: fewer than 4 bytes remain.  That is either
       the normal end of input, or (when `consumed` is given) a partial
       unit left for the caller; otherwise it is an error. */
5320n/a if (q == e || consumed)
5321n/a break;
5322n/a /* remaining bytes at the end? (size should be divisible by 4) */
5323n/a errmsg = "truncated data";
5324n/a startinpos = ((const char *)q) - starts;
5325n/a endinpos = ((const char *)e) - starts;
5326n/a }
5327n/a else {
    /* ch is above the buffer's current maximum character: if it is below
       0x110000 it is written through the writer's generic path, then the
       fast path is retried. */
5328n/a if (ch < 0x110000) {
5329n/a if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5330n/a goto onError;
5331n/a q += 4;
5332n/a continue;
5333n/a }
5334n/a errmsg = "code point not in range(0x110000)";
5335n/a startinpos = ((const char *)q) - starts;
5336n/a endinpos = startinpos + 4;
5337n/a }
5338n/a
5339n/a /* The remaining input chars are ignored if the callback
5340n/a chooses to skip the input */
5341n/a if (unicode_decode_call_errorhandler_writer(
5342n/a errors, &errorHandler,
5343n/a encoding, errmsg,
5344n/a &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5345n/a &writer))
5346n/a goto onError;
5347n/a }
5348n/a
5349n/a if (consumed)
5350n/a *consumed = (const char *)q-starts;
5351n/a
5352n/a Py_XDECREF(errorHandler);
5353n/a Py_XDECREF(exc);
5354n/a return _PyUnicodeWriter_Finish(&writer);
5355n/a
5356n/a onError:
5357n/a _PyUnicodeWriter_Dealloc(&writer);
5358n/a Py_XDECREF(errorHandler);
5359n/a Py_XDECREF(exc);
5360n/a return NULL;
5361n/a}
5362n/a
5363n/aPyObject *
5364n/a_PyUnicode_EncodeUTF32(PyObject *str,
5365n/a const char *errors,
5366n/a int byteorder)
5367n/a{
5368n/a enum PyUnicode_Kind kind;
5369n/a const void *data;
5370n/a Py_ssize_t len;
5371n/a PyObject *v;
5372n/a uint32_t *out;
5373n/a#if PY_LITTLE_ENDIAN
5374n/a int native_ordering = byteorder <= 0;
5375n/a#else
5376n/a int native_ordering = byteorder >= 0;
5377n/a#endif
5378n/a const char *encoding;
5379n/a Py_ssize_t nsize, pos;
5380n/a PyObject *errorHandler = NULL;
5381n/a PyObject *exc = NULL;
5382n/a PyObject *rep = NULL;
5383n/a
5384n/a if (!PyUnicode_Check(str)) {
5385n/a PyErr_BadArgument();
5386n/a return NULL;
5387n/a }
5388n/a if (PyUnicode_READY(str) == -1)
5389n/a return NULL;
5390n/a kind = PyUnicode_KIND(str);
5391n/a data = PyUnicode_DATA(str);
5392n/a len = PyUnicode_GET_LENGTH(str);
5393n/a
5394n/a if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5395n/a return PyErr_NoMemory();
5396n/a nsize = len + (byteorder == 0);
5397n/a v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5398n/a if (v == NULL)
5399n/a return NULL;
5400n/a
5401n/a /* output buffer is 4-bytes aligned */
5402n/a assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5403n/a out = (uint32_t *)PyBytes_AS_STRING(v);
5404n/a if (byteorder == 0)
5405n/a *out++ = 0xFEFF;
5406n/a if (len == 0)
5407n/a goto done;
5408n/a
5409n/a if (byteorder == -1)
5410n/a encoding = "utf-32-le";
5411n/a else if (byteorder == 1)
5412n/a encoding = "utf-32-be";
5413n/a else
5414n/a encoding = "utf-32";
5415n/a
5416n/a if (kind == PyUnicode_1BYTE_KIND) {
5417n/a ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5418n/a goto done;
5419n/a }
5420n/a
5421n/a pos = 0;
5422n/a while (pos < len) {
5423n/a Py_ssize_t repsize, moreunits;
5424n/a
5425n/a if (kind == PyUnicode_2BYTE_KIND) {
5426n/a pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5427n/a &out, native_ordering);
5428n/a }
5429n/a else {
5430n/a assert(kind == PyUnicode_4BYTE_KIND);
5431n/a pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5432n/a &out, native_ordering);
5433n/a }
5434n/a if (pos == len)
5435n/a break;
5436n/a
5437n/a rep = unicode_encode_call_errorhandler(
5438n/a errors, &errorHandler,
5439n/a encoding, "surrogates not allowed",
5440n/a str, &exc, pos, pos + 1, &pos);
5441n/a if (!rep)
5442n/a goto error;
5443n/a
5444n/a if (PyBytes_Check(rep)) {
5445n/a repsize = PyBytes_GET_SIZE(rep);
5446n/a if (repsize & 3) {
5447n/a raise_encode_exception(&exc, encoding,
5448n/a str, pos - 1, pos,
5449n/a "surrogates not allowed");
5450n/a goto error;
5451n/a }
5452n/a moreunits = repsize / 4;
5453n/a }
5454n/a else {
5455n/a assert(PyUnicode_Check(rep));
5456n/a if (PyUnicode_READY(rep) < 0)
5457n/a goto error;
5458n/a moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5459n/a if (!PyUnicode_IS_ASCII(rep)) {
5460n/a raise_encode_exception(&exc, encoding,
5461n/a str, pos - 1, pos,
5462n/a "surrogates not allowed");
5463n/a goto error;
5464n/a }
5465n/a }
5466n/a
5467n/a /* four bytes are reserved for each surrogate */
5468n/a if (moreunits > 1) {
5469n/a Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
5470n/a Py_ssize_t morebytes = 4 * (moreunits - 1);
5471n/a if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5472n/a /* integer overflow */
5473n/a PyErr_NoMemory();
5474n/a goto error;
5475n/a }
5476n/a if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5477n/a goto error;
5478n/a out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
5479n/a }
5480n/a
5481n/a if (PyBytes_Check(rep)) {
5482n/a memcpy(out, PyBytes_AS_STRING(rep), repsize);
5483n/a out += moreunits;
5484n/a } else /* rep is unicode */ {
5485n/a assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5486n/a ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5487n/a &out, native_ordering);
5488n/a }
5489n/a
5490n/a Py_CLEAR(rep);
5491n/a }
5492n/a
5493n/a /* Cut back to size actually needed. This is necessary for, for example,
5494n/a encoding of a string containing isolated surrogates and the 'ignore'
5495n/a handler is used. */
5496n/a nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5497n/a if (nsize != PyBytes_GET_SIZE(v))
5498n/a _PyBytes_Resize(&v, nsize);
5499n/a Py_XDECREF(errorHandler);
5500n/a Py_XDECREF(exc);
5501n/a done:
5502n/a return v;
5503n/a error:
5504n/a Py_XDECREF(rep);
5505n/a Py_XDECREF(errorHandler);
5506n/a Py_XDECREF(exc);
5507n/a Py_XDECREF(v);
5508n/a return NULL;
5509n/a}
5510n/a
5511n/aPyObject *
5512n/aPyUnicode_EncodeUTF32(const Py_UNICODE *s,
5513n/a Py_ssize_t size,
5514n/a const char *errors,
5515n/a int byteorder)
5516n/a{
5517n/a PyObject *result;
5518n/a PyObject *tmp = PyUnicode_FromWideChar(s, size);
5519n/a if (tmp == NULL)
5520n/a return NULL;
5521n/a result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5522n/a Py_DECREF(tmp);
5523n/a return result;
5524n/a}
5525n/a
5526n/aPyObject *
5527n/aPyUnicode_AsUTF32String(PyObject *unicode)
5528n/a{
5529n/a return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5530n/a}
5531n/a
5532n/a/* --- UTF-16 Codec ------------------------------------------------------- */
5533n/a
5534n/aPyObject *
5535n/aPyUnicode_DecodeUTF16(const char *s,
5536n/a Py_ssize_t size,
5537n/a const char *errors,
5538n/a int *byteorder)
5539n/a{
5540n/a return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5541n/a}
5542n/a
/* Decode the `size` bytes at `s` as UTF-16 and return a new str object,
   or NULL with an exception set.

   byteorder: optional in/out pointer.  The value read from it selects the
     byte order: negative = little-endian, positive = big-endian, zero =
     look for a BOM and otherwise use the platform's native order.  When the
     incoming order is 0 and at least 2 bytes are available, the resulting
     order (-1, 1, or still 0 when no BOM was found) is stored back.
   consumed: optional out pointer.  When non-NULL, incomplete data at the
     end of the input is left undecoded instead of raising an error, and the
     number of bytes decoded is stored through it.
   errors: forwarded to unicode_decode_call_errorhandler_writer(). */
5543n/aPyObject *
5544n/aPyUnicode_DecodeUTF16Stateful(const char *s,
5545n/a Py_ssize_t size,
5546n/a const char *errors,
5547n/a int *byteorder,
5548n/a Py_ssize_t *consumed)
5549n/a{
5550n/a const char *starts = s;
5551n/a Py_ssize_t startinpos;
5552n/a Py_ssize_t endinpos;
5553n/a _PyUnicodeWriter writer;
5554n/a const unsigned char *q, *e;
5555n/a int bo = 0; /* assume native ordering by default */
5556n/a int native_ordering;
5557n/a const char *errmsg = "";
5558n/a PyObject *errorHandler = NULL;
5559n/a PyObject *exc = NULL;
5560n/a const char *encoding;
5561n/a
5562n/a q = (unsigned char *)s;
5563n/a e = q + size;
5564n/a
5565n/a if (byteorder)
5566n/a bo = *byteorder;
5567n/a
5568n/a /* Check for BOM marks (U+FEFF) in the input and adjust current
5569n/a byte order setting accordingly. In native mode, the leading BOM
5570n/a mark is skipped, in all other modes, it is copied to the output
5571n/a stream as-is (giving a ZWNBSP character). */
5572n/a if (bo == 0 && size >= 2) {
    /* The first unit is assembled as a little-endian value: reading 0xFEFF
       means the data is little-endian, reading 0xFFFE means the bytes are
       swapped, i.e. the data is big-endian. */
5573n/a const Py_UCS4 bom = (q[1] << 8) | q[0];
5574n/a if (bom == 0xFEFF) {
5575n/a q += 2;
5576n/a bo = -1;
5577n/a }
5578n/a else if (bom == 0xFFFE) {
5579n/a q += 2;
5580n/a bo = 1;
5581n/a }
5582n/a if (byteorder)
5583n/a *byteorder = bo;
5584n/a }
5585n/a
    /* Nothing left after the optional BOM: the result is the empty string. */
5586n/a if (q == e) {
5587n/a if (consumed)
5588n/a *consumed = size;
5589n/a _Py_RETURN_UNICODE_EMPTY();
5590n/a }
5591n/a
    /* native_ordering is true when the selected byte order matches the
       platform's; bo == 0 counts as native. */
5592n/a#if PY_LITTLE_ENDIAN
5593n/a native_ordering = bo <= 0;
5594n/a encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
5595n/a#else
5596n/a native_ordering = bo >= 0;
5597n/a encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
5598n/a#endif
5599n/a
5600n/a /* Note: size will always be longer than the resulting Unicode
5601n/a character count */
5602n/a _PyUnicodeWriter_Init(&writer);
5603n/a writer.min_length = (e - q + 1) / 2;
5604n/a if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5605n/a goto onError;
5606n/a
5607n/a while (1) {
5608n/a Py_UCS4 ch = 0;
    /* Hand the bulk of the work to the stringlib decoder matching the
       writer's current storage kind.  The helpers are defined elsewhere;
       judging by the switch below they return 0-3 as status codes and any
       other value as a code point the caller must store itself
       (NOTE(review): confirm against the stringlib template). */
5609n/a if (e - q >= 2) {
5610n/a int kind = writer.kind;
5611n/a if (kind == PyUnicode_1BYTE_KIND) {
5612n/a if (PyUnicode_IS_ASCII(writer.buffer))
5613n/a ch = asciilib_utf16_decode(&q, e,
5614n/a (Py_UCS1*)writer.data, &writer.pos,
5615n/a native_ordering);
5616n/a else
5617n/a ch = ucs1lib_utf16_decode(&q, e,
5618n/a (Py_UCS1*)writer.data, &writer.pos,
5619n/a native_ordering);
5620n/a } else if (kind == PyUnicode_2BYTE_KIND) {
5621n/a ch = ucs2lib_utf16_decode(&q, e,
5622n/a (Py_UCS2*)writer.data, &writer.pos,
5623n/a native_ordering);
5624n/a } else {
5625n/a assert(kind == PyUnicode_4BYTE_KIND);
5626n/a ch = ucs4lib_utf16_decode(&q, e,
5627n/a (Py_UCS4*)writer.data, &writer.pos,
5628n/a native_ordering);
5629n/a }
5630n/a }
5631n/a
    /* ch is still 0 when fewer than 2 bytes were left and no helper ran. */
5632n/a switch (ch)
5633n/a {
5634n/a case 0:
5635n/a /* remaining byte at the end? (size should be even) */
5636n/a if (q == e || consumed)
5637n/a goto End;
5638n/a errmsg = "truncated data";
5639n/a startinpos = ((const char *)q) - starts;
5640n/a endinpos = ((const char *)e) - starts;
5641n/a break;
5642n/a /* The remaining input chars are ignored if the callback
5643n/a chooses to skip the input */
5644n/a case 1:
    /* Step q back by one 16-bit unit (presumably the unit the helper
       consumed before running out of input) so that the unit is either
       left for the caller or included in the error range. */
5645n/a q -= 2;
5646n/a if (consumed)
5647n/a goto End;
5648n/a errmsg = "unexpected end of data";
5649n/a startinpos = ((const char *)q) - starts;
5650n/a endinpos = ((const char *)e) - starts;
5651n/a break;
5652n/a case 2:
    /* The error range is the single unit just before q. */
5653n/a errmsg = "illegal encoding";
5654n/a startinpos = ((const char *)q) - 2 - starts;
5655n/a endinpos = startinpos + 2;
5656n/a break;
5657n/a case 3:
    /* The error range is the first of the two units just before q. */
5658n/a errmsg = "illegal UTF-16 surrogate";
5659n/a startinpos = ((const char *)q) - 4 - starts;
5660n/a endinpos = startinpos + 2;
5661n/a break;
5662n/a default:
    /* Any other value is a decoded code point: store it via the writer
       and go round again. */
5663n/a if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5664n/a goto onError;
5665n/a continue;
5666n/a }
5667n/a
5668n/a if (unicode_decode_call_errorhandler_writer(
5669n/a errors,
5670n/a &errorHandler,
5671n/a encoding, errmsg,
5672n/a &starts,
5673n/a (const char **)&e,
5674n/a &startinpos,
5675n/a &endinpos,
5676n/a &exc,
5677n/a (const char **)&q,
5678n/a &writer))
5679n/a goto onError;
5680n/a }
5681n/a
5682n/aEnd:
5683n/a if (consumed)
5684n/a *consumed = (const char *)q-starts;
5685n/a
5686n/a Py_XDECREF(errorHandler);
5687n/a Py_XDECREF(exc);
5688n/a return _PyUnicodeWriter_Finish(&writer);
5689n/a
5690n/a onError:
5691n/a _PyUnicodeWriter_Dealloc(&writer);
5692n/a Py_XDECREF(errorHandler);
5693n/a Py_XDECREF(exc);
5694n/a return NULL;
5695n/a}
5696n/a
5697n/aPyObject *
5698n/a_PyUnicode_EncodeUTF16(PyObject *str,
5699n/a const char *errors,
5700n/a int byteorder)
5701n/a{
5702n/a enum PyUnicode_Kind kind;
5703n/a const void *data;
5704n/a Py_ssize_t len;
5705n/a PyObject *v;
5706n/a unsigned short *out;
5707n/a Py_ssize_t pairs;
5708n/a#if PY_BIG_ENDIAN
5709n/a int native_ordering = byteorder >= 0;
5710n/a#else
5711n/a int native_ordering = byteorder <= 0;
5712n/a#endif
5713n/a const char *encoding;
5714n/a Py_ssize_t nsize, pos;
5715n/a PyObject *errorHandler = NULL;
5716n/a PyObject *exc = NULL;
5717n/a PyObject *rep = NULL;
5718n/a
5719n/a if (!PyUnicode_Check(str)) {
5720n/a PyErr_BadArgument();
5721n/a return NULL;
5722n/a }
5723n/a if (PyUnicode_READY(str) == -1)
5724n/a return NULL;
5725n/a kind = PyUnicode_KIND(str);
5726n/a data = PyUnicode_DATA(str);
5727n/a len = PyUnicode_GET_LENGTH(str);
5728n/a
5729n/a pairs = 0;
5730n/a if (kind == PyUnicode_4BYTE_KIND) {
5731n/a const Py_UCS4 *in = (const Py_UCS4 *)data;
5732n/a const Py_UCS4 *end = in + len;
5733n/a while (in < end) {
5734n/a if (*in++ >= 0x10000) {
5735n/a pairs++;
5736n/a }
5737n/a }
5738n/a }
5739n/a if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
5740n/a return PyErr_NoMemory();
5741n/a }
5742n/a nsize = len + pairs + (byteorder == 0);
5743n/a v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5744n/a if (v == NULL) {
5745n/a return NULL;
5746n/a }
5747n/a
5748n/a /* output buffer is 2-bytes aligned */
5749n/a assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5750n/a out = (unsigned short *)PyBytes_AS_STRING(v);
5751n/a if (byteorder == 0) {
5752n/a *out++ = 0xFEFF;
5753n/a }
5754n/a if (len == 0) {
5755n/a goto done;
5756n/a }
5757n/a
5758n/a if (kind == PyUnicode_1BYTE_KIND) {
5759n/a ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5760n/a goto done;
5761n/a }
5762n/a
5763n/a if (byteorder < 0) {
5764n/a encoding = "utf-16-le";
5765n/a }
5766n/a else if (byteorder > 0) {
5767n/a encoding = "utf-16-be";
5768n/a }
5769n/a else {
5770n/a encoding = "utf-16";
5771n/a }
5772n/a
5773n/a pos = 0;
5774n/a while (pos < len) {
5775n/a Py_ssize_t repsize, moreunits;
5776n/a
5777n/a if (kind == PyUnicode_2BYTE_KIND) {
5778n/a pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5779n/a &out, native_ordering);
5780n/a }
5781n/a else {
5782n/a assert(kind == PyUnicode_4BYTE_KIND);
5783n/a pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5784n/a &out, native_ordering);
5785n/a }
5786n/a if (pos == len)
5787n/a break;
5788n/a
5789n/a rep = unicode_encode_call_errorhandler(
5790n/a errors, &errorHandler,
5791n/a encoding, "surrogates not allowed",
5792n/a str, &exc, pos, pos + 1, &pos);
5793n/a if (!rep)
5794n/a goto error;
5795n/a
5796n/a if (PyBytes_Check(rep)) {
5797n/a repsize = PyBytes_GET_SIZE(rep);
5798n/a if (repsize & 1) {
5799n/a raise_encode_exception(&exc, encoding,
5800n/a str, pos - 1, pos,
5801n/a "surrogates not allowed");
5802n/a goto error;
5803n/a }
5804n/a moreunits = repsize / 2;
5805n/a }
5806n/a else {
5807n/a assert(PyUnicode_Check(rep));
5808n/a if (PyUnicode_READY(rep) < 0)
5809n/a goto error;
5810n/a moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5811n/a if (!PyUnicode_IS_ASCII(rep)) {
5812n/a raise_encode_exception(&exc, encoding,
5813n/a str, pos - 1, pos,
5814n/a "surrogates not allowed");
5815n/a goto error;
5816n/a }
5817n/a }
5818n/a
5819n/a /* two bytes are reserved for each surrogate */
5820n/a if (moreunits > 1) {
5821n/a Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5822n/a Py_ssize_t morebytes = 2 * (moreunits - 1);
5823n/a if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5824n/a /* integer overflow */
5825n/a PyErr_NoMemory();
5826n/a goto error;
5827n/a }
5828n/a if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5829n/a goto error;
5830n/a out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5831n/a }
5832n/a
5833n/a if (PyBytes_Check(rep)) {
5834n/a memcpy(out, PyBytes_AS_STRING(rep), repsize);
5835n/a out += moreunits;
5836n/a } else /* rep is unicode */ {
5837n/a assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5838n/a ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5839n/a &out, native_ordering);
5840n/a }
5841n/a
5842n/a Py_CLEAR(rep);
5843n/a }
5844n/a
5845n/a /* Cut back to size actually needed. This is necessary for, for example,
5846n/a encoding of a string containing isolated surrogates and the 'ignore' handler
5847n/a is used. */
5848n/a nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5849n/a if (nsize != PyBytes_GET_SIZE(v))
5850n/a _PyBytes_Resize(&v, nsize);
5851n/a Py_XDECREF(errorHandler);
5852n/a Py_XDECREF(exc);
5853n/a done:
5854n/a return v;
5855n/a error:
5856n/a Py_XDECREF(rep);
5857n/a Py_XDECREF(errorHandler);
5858n/a Py_XDECREF(exc);
5859n/a Py_XDECREF(v);
5860n/a return NULL;
5861n/a#undef STORECHAR
5862n/a}
5863n/a
5864n/aPyObject *
5865n/aPyUnicode_EncodeUTF16(const Py_UNICODE *s,
5866n/a Py_ssize_t size,
5867n/a const char *errors,
5868n/a int byteorder)
5869n/a{
5870n/a PyObject *result;
5871n/a PyObject *tmp = PyUnicode_FromWideChar(s, size);
5872n/a if (tmp == NULL)
5873n/a return NULL;
5874n/a result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5875n/a Py_DECREF(tmp);
5876n/a return result;
5877n/a}
5878n/a
5879n/aPyObject *
5880n/aPyUnicode_AsUTF16String(PyObject *unicode)
5881n/a{
5882n/a return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5883n/a}
5884n/a
5885n/a/* --- Unicode Escape Codec ----------------------------------------------- */
5886n/a
/* Cached pointer to the unicodedata name-lookup C API.  It starts out NULL
   and is filled in by _PyUnicode_DecodeUnicodeEscape() via PyCapsule_Import()
   the first time a \N{...} escape is decoded. */
5887n/astatic _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5888n/a
5889n/aPyObject *
5890n/a_PyUnicode_DecodeUnicodeEscape(const char *s,
5891n/a Py_ssize_t size,
5892n/a const char *errors,
5893n/a const char **first_invalid_escape)
5894n/a{
5895n/a const char *starts = s;
5896n/a _PyUnicodeWriter writer;
5897n/a const char *end;
5898n/a PyObject *errorHandler = NULL;
5899n/a PyObject *exc = NULL;
5900n/a
5901n/a // so we can remember if we've seen an invalid escape char or not
5902n/a *first_invalid_escape = NULL;
5903n/a
5904n/a if (size == 0) {
5905n/a _Py_RETURN_UNICODE_EMPTY();
5906n/a }
5907n/a /* Escaped strings will always be longer than the resulting
5908n/a Unicode string, so we start with size here and then reduce the
5909n/a length after conversion to the true value.
5910n/a (but if the error callback returns a long replacement string
5911n/a we'll have to allocate more space) */
5912n/a _PyUnicodeWriter_Init(&writer);
5913n/a writer.min_length = size;
5914n/a if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5915n/a goto onError;
5916n/a }
5917n/a
5918n/a end = s + size;
5919n/a while (s < end) {
5920n/a unsigned char c = (unsigned char) *s++;
5921n/a Py_UCS4 ch;
5922n/a int count;
5923n/a Py_ssize_t startinpos;
5924n/a Py_ssize_t endinpos;
5925n/a const char *message;
5926n/a
5927n/a#define WRITE_ASCII_CHAR(ch) \
5928n/a do { \
5929n/a assert(ch <= 127); \
5930n/a assert(writer.pos < writer.size); \
5931n/a PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5932n/a } while(0)
5933n/a
5934n/a#define WRITE_CHAR(ch) \
5935n/a do { \
5936n/a if (ch <= writer.maxchar) { \
5937n/a assert(writer.pos < writer.size); \
5938n/a PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5939n/a } \
5940n/a else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5941n/a goto onError; \
5942n/a } \
5943n/a } while(0)
5944n/a
5945n/a /* Non-escape characters are interpreted as Unicode ordinals */
5946n/a if (c != '\\') {
5947n/a WRITE_CHAR(c);
5948n/a continue;
5949n/a }
5950n/a
5951n/a startinpos = s - starts - 1;
5952n/a /* \ - Escapes */
5953n/a if (s >= end) {
5954n/a message = "\\ at end of string";
5955n/a goto error;
5956n/a }
5957n/a c = (unsigned char) *s++;
5958n/a
5959n/a assert(writer.pos < writer.size);
5960n/a switch (c) {
5961n/a
5962n/a /* \x escapes */
5963n/a case '\n': continue;
5964n/a case '\\': WRITE_ASCII_CHAR('\\'); continue;
5965n/a case '\'': WRITE_ASCII_CHAR('\''); continue;
5966n/a case '\"': WRITE_ASCII_CHAR('\"'); continue;
5967n/a case 'b': WRITE_ASCII_CHAR('\b'); continue;
5968n/a /* FF */
5969n/a case 'f': WRITE_ASCII_CHAR('\014'); continue;
5970n/a case 't': WRITE_ASCII_CHAR('\t'); continue;
5971n/a case 'n': WRITE_ASCII_CHAR('\n'); continue;
5972n/a case 'r': WRITE_ASCII_CHAR('\r'); continue;
5973n/a /* VT */
5974n/a case 'v': WRITE_ASCII_CHAR('\013'); continue;
5975n/a /* BEL, not classic C */
5976n/a case 'a': WRITE_ASCII_CHAR('\007'); continue;
5977n/a
5978n/a /* \OOO (octal) escapes */
5979n/a case '0': case '1': case '2': case '3':
5980n/a case '4': case '5': case '6': case '7':
5981n/a ch = c - '0';
5982n/a if (s < end && '0' <= *s && *s <= '7') {
5983n/a ch = (ch<<3) + *s++ - '0';
5984n/a if (s < end && '0' <= *s && *s <= '7') {
5985n/a ch = (ch<<3) + *s++ - '0';
5986n/a }
5987n/a }
5988n/a WRITE_CHAR(ch);
5989n/a continue;
5990n/a
5991n/a /* hex escapes */
5992n/a /* \xXX */
5993n/a case 'x':
5994n/a count = 2;
5995n/a message = "truncated \\xXX escape";
5996n/a goto hexescape;
5997n/a
5998n/a /* \uXXXX */
5999n/a case 'u':
6000n/a count = 4;
6001n/a message = "truncated \\uXXXX escape";
6002n/a goto hexescape;
6003n/a
6004n/a /* \UXXXXXXXX */
6005n/a case 'U':
6006n/a count = 8;
6007n/a message = "truncated \\UXXXXXXXX escape";
6008n/a hexescape:
6009n/a for (ch = 0; count && s < end; ++s, --count) {
6010n/a c = (unsigned char)*s;
6011n/a ch <<= 4;
6012n/a if (c >= '0' && c <= '9') {
6013n/a ch += c - '0';
6014n/a }
6015n/a else if (c >= 'a' && c <= 'f') {
6016n/a ch += c - ('a' - 10);
6017n/a }
6018n/a else if (c >= 'A' && c <= 'F') {
6019n/a ch += c - ('A' - 10);
6020n/a }
6021n/a else {
6022n/a break;
6023n/a }
6024n/a }
6025n/a if (count) {
6026n/a goto error;
6027n/a }
6028n/a
6029n/a /* when we get here, ch is a 32-bit unicode character */
6030n/a if (ch > MAX_UNICODE) {
6031n/a message = "illegal Unicode character";
6032n/a goto error;
6033n/a }
6034n/a
6035n/a WRITE_CHAR(ch);
6036n/a continue;
6037n/a
6038n/a /* \N{name} */
6039n/a case 'N':
6040n/a if (ucnhash_CAPI == NULL) {
6041n/a /* load the unicode data module */
6042n/a ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6043n/a PyUnicodeData_CAPSULE_NAME, 1);
6044n/a if (ucnhash_CAPI == NULL) {
6045n/a PyErr_SetString(
6046n/a PyExc_UnicodeError,
6047n/a "\\N escapes not supported (can't load unicodedata module)"
6048n/a );
6049n/a goto onError;
6050n/a }
6051n/a }
6052n/a
6053n/a message = "malformed \\N character escape";
6054n/a if (*s == '{') {
6055n/a const char *start = ++s;
6056n/a size_t namelen;
6057n/a /* look for the closing brace */
6058n/a while (s < end && *s != '}')
6059n/a s++;
6060n/a namelen = s - start;
6061n/a if (namelen && s < end) {
6062n/a /* found a name. look it up in the unicode database */
6063n/a s++;
6064n/a ch = 0xffffffff; /* in case 'getcode' messes up */
6065n/a if (namelen <= INT_MAX &&
6066n/a ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6067n/a &ch, 0)) {
6068n/a assert(ch <= MAX_UNICODE);
6069n/a WRITE_CHAR(ch);
6070n/a continue;
6071n/a }
6072n/a message = "unknown Unicode character name";
6073n/a }
6074n/a }
6075n/a goto error;
6076n/a
6077n/a default:
6078n/a if (*first_invalid_escape == NULL) {
6079n/a *first_invalid_escape = s-1; /* Back up one char, since we've
6080n/a already incremented s. */
6081n/a }
6082n/a WRITE_ASCII_CHAR('\\');
6083n/a WRITE_CHAR(c);
6084n/a continue;
6085n/a }
6086n/a
6087n/a error:
6088n/a endinpos = s-starts;
6089n/a writer.min_length = end - s + writer.pos;
6090n/a if (unicode_decode_call_errorhandler_writer(
6091n/a errors, &errorHandler,
6092n/a "unicodeescape", message,
6093n/a &starts, &end, &startinpos, &endinpos, &exc, &s,
6094n/a &writer)) {
6095n/a goto onError;
6096n/a }
6097n/a if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6098n/a goto onError;
6099n/a }
6100n/a
6101n/a#undef WRITE_ASCII_CHAR
6102n/a#undef WRITE_CHAR
6103n/a }
6104n/a
6105n/a Py_XDECREF(errorHandler);
6106n/a Py_XDECREF(exc);
6107n/a return _PyUnicodeWriter_Finish(&writer);
6108n/a
6109n/a onError:
6110n/a _PyUnicodeWriter_Dealloc(&writer);
6111n/a Py_XDECREF(errorHandler);
6112n/a Py_XDECREF(exc);
6113n/a return NULL;
6114n/a}
6115n/a
6116n/aPyObject *
6117n/aPyUnicode_DecodeUnicodeEscape(const char *s,
6118n/a Py_ssize_t size,
6119n/a const char *errors)
6120n/a{
6121n/a const char *first_invalid_escape;
6122n/a PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6123n/a &first_invalid_escape);
6124n/a if (result == NULL)
6125n/a return NULL;
6126n/a if (first_invalid_escape != NULL) {
6127n/a if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6128n/a "invalid escape sequence '\\%c'",
6129n/a *first_invalid_escape) < 0) {
6130n/a Py_DECREF(result);
6131n/a return NULL;
6132n/a }
6133n/a }
6134n/a return result;
6135n/a}
6136n/a
6137n/a/* Return a Unicode-Escape string version of the Unicode object. */
6138n/a
6139n/aPyObject *
6140n/aPyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6141n/a{
6142n/a Py_ssize_t i, len;
6143n/a PyObject *repr;
6144n/a char *p;
6145n/a enum PyUnicode_Kind kind;
6146n/a void *data;
6147n/a Py_ssize_t expandsize;
6148n/a
6149n/a /* Initial allocation is based on the longest-possible character
6150n/a escape.
6151n/a
6152n/a For UCS1 strings it's '\xxx', 4 bytes per source character.
6153n/a For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6154n/a For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6155n/a */
6156n/a
6157n/a if (!PyUnicode_Check(unicode)) {
6158n/a PyErr_BadArgument();
6159n/a return NULL;
6160n/a }
6161n/a if (PyUnicode_READY(unicode) == -1) {
6162n/a return NULL;
6163n/a }
6164n/a
6165n/a len = PyUnicode_GET_LENGTH(unicode);
6166n/a if (len == 0) {
6167n/a return PyBytes_FromStringAndSize(NULL, 0);
6168n/a }
6169n/a
6170n/a kind = PyUnicode_KIND(unicode);
6171n/a data = PyUnicode_DATA(unicode);
6172n/a /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6173n/a bytes, and 1 byte characters 4. */
6174n/a expandsize = kind * 2 + 2;
6175n/a if (len > PY_SSIZE_T_MAX / expandsize) {
6176n/a return PyErr_NoMemory();
6177n/a }
6178n/a repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6179n/a if (repr == NULL) {
6180n/a return NULL;
6181n/a }
6182n/a
6183n/a p = PyBytes_AS_STRING(repr);
6184n/a for (i = 0; i < len; i++) {
6185n/a Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6186n/a
6187n/a /* U+0000-U+00ff range */
6188n/a if (ch < 0x100) {
6189n/a if (ch >= ' ' && ch < 127) {
6190n/a if (ch != '\\') {
6191n/a /* Copy printable US ASCII as-is */
6192n/a *p++ = (char) ch;
6193n/a }
6194n/a /* Escape backslashes */
6195n/a else {
6196n/a *p++ = '\\';
6197n/a *p++ = '\\';
6198n/a }
6199n/a }
6200n/a
6201n/a /* Map special whitespace to '\t', \n', '\r' */
6202n/a else if (ch == '\t') {
6203n/a *p++ = '\\';
6204n/a *p++ = 't';
6205n/a }
6206n/a else if (ch == '\n') {
6207n/a *p++ = '\\';
6208n/a *p++ = 'n';
6209n/a }
6210n/a else if (ch == '\r') {
6211n/a *p++ = '\\';
6212n/a *p++ = 'r';
6213n/a }
6214n/a
6215n/a /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6216n/a else {
6217n/a *p++ = '\\';
6218n/a *p++ = 'x';
6219n/a *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6220n/a *p++ = Py_hexdigits[ch & 0x000F];
6221n/a }
6222n/a }
6223n/a /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6224n/a else if (ch < 0x10000) {
6225n/a *p++ = '\\';
6226n/a *p++ = 'u';
6227n/a *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6228n/a *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6229n/a *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6230n/a *p++ = Py_hexdigits[ch & 0x000F];
6231n/a }
6232n/a /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6233n/a else {
6234n/a
6235n/a /* Make sure that the first two digits are zero */
6236n/a assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6237n/a *p++ = '\\';
6238n/a *p++ = 'U';
6239n/a *p++ = '0';
6240n/a *p++ = '0';
6241n/a *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6242n/a *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6243n/a *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6244n/a *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6245n/a *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6246n/a *p++ = Py_hexdigits[ch & 0x0000000F];
6247n/a }
6248n/a }
6249n/a
6250n/a assert(p - PyBytes_AS_STRING(repr) > 0);
6251n/a if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6252n/a return NULL;
6253n/a }
6254n/a return repr;
6255n/a}
6256n/a
6257n/aPyObject *
6258n/aPyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6259n/a Py_ssize_t size)
6260n/a{
6261n/a PyObject *result;
6262n/a PyObject *tmp = PyUnicode_FromWideChar(s, size);
6263n/a if (tmp == NULL) {
6264n/a return NULL;
6265n/a }
6266n/a
6267n/a result = PyUnicode_AsUnicodeEscapeString(tmp);
6268n/a Py_DECREF(tmp);
6269n/a return result;
6270n/a}
6271n/a
6272n/a/* --- Raw Unicode Escape Codec ------------------------------------------- */
6273n/a
/* Decode the `size` bytes at `s` with the raw-unicode-escape codec.

   Each input byte is written out as the code point with the same ordinal,
   except that a backslash followed by 'u' or 'U' starts an escape made of
   4 or 8 hexadecimal digits respectively.  A backslash followed by any
   other byte is written out unchanged together with that byte, and a
   backslash that is the very last input byte is copied like an ordinary
   character.  Escapes with too few hex digits, and escape values above
   MAX_UNICODE, are reported to the error handler named by `errors`.

   Returns the result of _PyUnicodeWriter_Finish(), or NULL on failure. */
6274n/aPyObject *
6275n/aPyUnicode_DecodeRawUnicodeEscape(const char *s,
6276n/a Py_ssize_t size,
6277n/a const char *errors)
6278n/a{
6279n/a const char *starts = s;
6280n/a _PyUnicodeWriter writer;
6281n/a const char *end;
6282n/a PyObject *errorHandler = NULL;
6283n/a PyObject *exc = NULL;
6284n/a
6285n/a if (size == 0) {
6286n/a _Py_RETURN_UNICODE_EMPTY();
6287n/a }
6288n/a
6289n/a /* Escaped strings will always be longer than the resulting
6290n/a Unicode string, so we start with size here and then reduce the
6291n/a length after conversion to the true value. (But decoding error
6292n/a handler might have to resize the string) */
6293n/a _PyUnicodeWriter_Init(&writer);
6294n/a writer.min_length = size;
6295n/a if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6296n/a goto onError;
6297n/a }
6298n/a
6299n/a end = s + size;
6300n/a while (s < end) {
6301n/a unsigned char c = (unsigned char) *s++;
6302n/a Py_UCS4 ch;
6303n/a int count;
6304n/a Py_ssize_t startinpos;
6305n/a Py_ssize_t endinpos;
6306n/a const char *message;
6307n/a
        /* WRITE_CHAR stores ch directly into the prepared buffer when it does
           not exceed the writer's current maxchar; otherwise it goes through
           _PyUnicodeWriter_WriteCharInline() and jumps to onError on failure.
           The macro is #undef'ed at the bottom of the loop body. */
6308n/a#define WRITE_CHAR(ch) \
6309n/a do { \
6310n/a if (ch <= writer.maxchar) { \
6311n/a assert(writer.pos < writer.size); \
6312n/a PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6313n/a } \
6314n/a else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6315n/a goto onError; \
6316n/a } \
6317n/a } while(0)
6318n/a
6319n/a /* Non-escape characters are interpreted as Unicode ordinals */
        /* (this branch also takes a backslash that is the last input byte) */
6320n/a if (c != '\\' || s >= end) {
6321n/a WRITE_CHAR(c);
6322n/a continue;
6323n/a }
6324n/a
        /* c was a backslash and at least one more byte follows: inspect it */
6325n/a c = (unsigned char) *s++;
6326n/a if (c == 'u') {
6327n/a count = 4;
6328n/a message = "truncated \\uXXXX escape";
6329n/a }
6330n/a else if (c == 'U') {
6331n/a count = 8;
6332n/a message = "truncated \\UXXXXXXXX escape";
6333n/a }
6334n/a else {
            /* Not an escape: emit the backslash and the following byte as-is */
6335n/a assert(writer.pos < writer.size);
6336n/a PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6337n/a WRITE_CHAR(c);
6338n/a continue;
6339n/a }
        /* s is now past the backslash and the 'u'/'U', so the escape began
           two bytes earlier */
6340n/a startinpos = s - starts - 2;
6341n/a
6342n/a /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
        /* count is decremented per accepted digit; it is still non-zero after
           the loop if input ran out or a non-hex byte was met */
6343n/a for (ch = 0; count && s < end; ++s, --count) {
6344n/a c = (unsigned char)*s;
6345n/a ch <<= 4;
6346n/a if (c >= '0' && c <= '9') {
6347n/a ch += c - '0';
6348n/a }
6349n/a else if (c >= 'a' && c <= 'f') {
6350n/a ch += c - ('a' - 10);
6351n/a }
6352n/a else if (c >= 'A' && c <= 'F') {
6353n/a ch += c - ('A' - 10);
6354n/a }
6355n/a else {
6356n/a break;
6357n/a }
6358n/a }
        /* All digits read: accept the value if it is a valid code point,
           otherwise replace the "truncated" message with "out of range" */
6359n/a if (!count) {
6360n/a if (ch <= MAX_UNICODE) {
6361n/a WRITE_CHAR(ch);
6362n/a continue;
6363n/a }
6364n/a message = "\\Uxxxxxxxx out of range";
6365n/a }
6366n/a
        /* Error path: report the span [startinpos, endinpos) to the handler.
           min_length is reset to the remaining input plus what has already
           been written, and the buffer is re-prepared after the handler
           returns. */
6367n/a endinpos = s-starts;
6368n/a writer.min_length = end - s + writer.pos;
6369n/a if (unicode_decode_call_errorhandler_writer(
6370n/a errors, &errorHandler,
6371n/a "rawunicodeescape", message,
6372n/a &starts, &end, &startinpos, &endinpos, &exc, &s,
6373n/a &writer)) {
6374n/a goto onError;
6375n/a }
6376n/a if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6377n/a goto onError;
6378n/a }
6379n/a
6380n/a#undef WRITE_CHAR
6381n/a }
6382n/a Py_XDECREF(errorHandler);
6383n/a Py_XDECREF(exc);
6384n/a return _PyUnicodeWriter_Finish(&writer);
6385n/a
    /* Failure exit: discard the partially built result and drop references */
6386n/a onError:
6387n/a _PyUnicodeWriter_Dealloc(&writer);
6388n/a Py_XDECREF(errorHandler);
6389n/a Py_XDECREF(exc);
6390n/a return NULL;
6391n/a
6392n/a}
6393n/a
6394n/a
6395n/aPyObject *
6396n/aPyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6397n/a{
6398n/a PyObject *repr;
6399n/a char *p;
6400n/a Py_ssize_t expandsize, pos;
6401n/a int kind;
6402n/a void *data;
6403n/a Py_ssize_t len;
6404n/a
6405n/a if (!PyUnicode_Check(unicode)) {
6406n/a PyErr_BadArgument();
6407n/a return NULL;
6408n/a }
6409n/a if (PyUnicode_READY(unicode) == -1) {
6410n/a return NULL;
6411n/a }
6412n/a kind = PyUnicode_KIND(unicode);
6413n/a data = PyUnicode_DATA(unicode);
6414n/a len = PyUnicode_GET_LENGTH(unicode);
6415n/a if (kind == PyUnicode_1BYTE_KIND) {
6416n/a return PyBytes_FromStringAndSize(data, len);
6417n/a }
6418n/a
6419n/a /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6420n/a bytes, and 1 byte characters 4. */
6421n/a expandsize = kind * 2 + 2;
6422n/a
6423n/a if (len > PY_SSIZE_T_MAX / expandsize) {
6424n/a return PyErr_NoMemory();
6425n/a }
6426n/a repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6427n/a if (repr == NULL) {
6428n/a return NULL;
6429n/a }
6430n/a if (len == 0) {
6431n/a return repr;
6432n/a }
6433n/a
6434n/a p = PyBytes_AS_STRING(repr);
6435n/a for (pos = 0; pos < len; pos++) {
6436n/a Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6437n/a
6438n/a /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6439n/a if (ch < 0x100) {
6440n/a *p++ = (char) ch;
6441n/a }
6442n/a /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6443n/a else if (ch < 0x10000) {
6444n/a *p++ = '\\';
6445n/a *p++ = 'u';
6446n/a *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6447n/a *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6448n/a *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6449n/a *p++ = Py_hexdigits[ch & 15];
6450n/a }
6451n/a /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6452n/a else {
6453n/a assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6454n/a *p++ = '\\';
6455n/a *p++ = 'U';
6456n/a *p++ = '0';
6457n/a *p++ = '0';
6458n/a *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6459n/a *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6460n/a *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6461n/a *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6462n/a *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6463n/a *p++ = Py_hexdigits[ch & 15];
6464n/a }
6465n/a }
6466n/a
6467n/a assert(p > PyBytes_AS_STRING(repr));
6468n/a if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6469n/a return NULL;
6470n/a }
6471n/a return repr;
6472n/a}
6473n/a
6474n/aPyObject *
6475n/aPyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6476n/a Py_ssize_t size)
6477n/a{
6478n/a PyObject *result;
6479n/a PyObject *tmp = PyUnicode_FromWideChar(s, size);
6480n/a if (tmp == NULL)
6481n/a return NULL;
6482n/a result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6483n/a Py_DECREF(tmp);
6484n/a return result;
6485n/a}
6486n/a
6487n/a/* --- Unicode Internal Codec ------------------------------------------- */
6488n/a
/* Decode the `size` bytes at `s` as an array of native Py_UNICODE code
   units (the "unicode_internal" codec).

   A DeprecationWarning is issued first; if PyErr_WarnEx() returns non-zero
   the function returns NULL without decoding anything.  A trailing fragment
   shorter than Py_UNICODE_SIZE bytes ("truncated input") and, when
   Py_UNICODE_WIDE is defined, values above 0x10ffff are handed to the error
   handler named by `errors`.  When Py_UNICODE_WIDE is not defined, a high
   surrogate directly followed by a low surrogate is joined into a single
   code point.

   Returns the result of _PyUnicodeWriter_Finish(), or NULL on failure. */
6489n/aPyObject *
6490n/a_PyUnicode_DecodeUnicodeInternal(const char *s,
6491n/a Py_ssize_t size,
6492n/a const char *errors)
6493n/a{
6494n/a const char *starts = s;
6495n/a Py_ssize_t startinpos;
6496n/a Py_ssize_t endinpos;
6497n/a _PyUnicodeWriter writer;
6498n/a const char *end;
6499n/a const char *reason;
6500n/a PyObject *errorHandler = NULL;
6501n/a PyObject *exc = NULL;
6502n/a
6503n/a if (PyErr_WarnEx(PyExc_DeprecationWarning,
6504n/a "unicode_internal codec has been deprecated",
6505n/a 1))
6506n/a return NULL;
6507n/a
6508n/a if (size == 0)
6509n/a _Py_RETURN_UNICODE_EMPTY();
6510n/a
6511n/a _PyUnicodeWriter_Init(&writer);
6512n/a if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6513n/a PyErr_NoMemory();
6514n/a goto onError;
6515n/a }
    /* Number of code units in the input, rounded up */
6516n/a writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
6517n/a
6518n/a end = s + size;
6519n/a while (s < end) {
6520n/a Py_UNICODE uch;
6521n/a Py_UCS4 ch;
        /* Fewer bytes left than one code unit: report everything up to the
           end of the input as the error span */
6522n/a if (end - s < Py_UNICODE_SIZE) {
6523n/a endinpos = end-starts;
6524n/a reason = "truncated input";
6525n/a goto error;
6526n/a }
6527n/a /* We copy the raw representation one byte at a time because the
6528n/a pointer may be unaligned (see test_codeccallbacks). */
6529n/a ((char *) &uch)[0] = s[0];
6530n/a ((char *) &uch)[1] = s[1];
6531n/a#ifdef Py_UNICODE_WIDE
6532n/a ((char *) &uch)[2] = s[2];
6533n/a ((char *) &uch)[3] = s[3];
6534n/a#endif
6535n/a ch = uch;
6536n/a#ifdef Py_UNICODE_WIDE
6537n/a /* We have to sanity check the raw data, otherwise doom looms for
6538n/a some malformed UCS-4 data. */
6539n/a if (ch > 0x10ffff) {
            /* s has not been advanced yet, so the span covers this unit */
6540n/a endinpos = s - starts + Py_UNICODE_SIZE;
6541n/a reason = "illegal code point (> 0x10FFFF)";
6542n/a goto error;
6543n/a }
6544n/a#endif
6545n/a s += Py_UNICODE_SIZE;
6546n/a#ifndef Py_UNICODE_WIDE
        /* 16-bit code units: peek at the next unit and, if it is a low
           surrogate, combine the pair and consume it as well */
6547n/a if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
6548n/a {
6549n/a Py_UNICODE uch2;
6550n/a ((char *) &uch2)[0] = s[0];
6551n/a ((char *) &uch2)[1] = s[1];
6552n/a if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
6553n/a {
6554n/a ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
6555n/a s += Py_UNICODE_SIZE;
6556n/a }
6557n/a }
6558n/a#endif
6559n/a
6560n/a if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6561n/a goto onError;
6562n/a continue;
6563n/a
        /* Only reached through the gotos above, with endinpos and reason
           already set and s still pointing at the offending code unit */
6564n/a error:
6565n/a startinpos = s - starts;
6566n/a if (unicode_decode_call_errorhandler_writer(
6567n/a errors, &errorHandler,
6568n/a "unicode_internal", reason,
6569n/a &starts, &end, &startinpos, &endinpos, &exc, &s,
6570n/a &writer))
6571n/a goto onError;
6572n/a }
6573n/a
6574n/a Py_XDECREF(errorHandler);
6575n/a Py_XDECREF(exc);
6576n/a return _PyUnicodeWriter_Finish(&writer);
6577n/a
    /* Failure exit: discard the partially built result and drop references */
6578n/a onError:
6579n/a _PyUnicodeWriter_Dealloc(&writer);
6580n/a Py_XDECREF(errorHandler);
6581n/a Py_XDECREF(exc);
6582n/a return NULL;
6583n/a}
6584n/a
6585n/a/* --- Latin-1 Codec ------------------------------------------------------ */
6586n/a
6587n/aPyObject *
6588n/aPyUnicode_DecodeLatin1(const char *s,
6589n/a Py_ssize_t size,
6590n/a const char *errors)
6591n/a{
6592n/a /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6593n/a return _PyUnicode_FromUCS1((unsigned char*)s, size);
6594n/a}
6595n/a
6596n/a/* create or adjust a UnicodeEncodeError */
6597n/astatic void
6598n/amake_encode_exception(PyObject **exceptionObject,
6599n/a const char *encoding,
6600n/a PyObject *unicode,
6601n/a Py_ssize_t startpos, Py_ssize_t endpos,
6602n/a const char *reason)
6603n/a{
6604n/a if (*exceptionObject == NULL) {
6605n/a *exceptionObject = PyObject_CallFunction(
6606n/a PyExc_UnicodeEncodeError, "sOnns",
6607n/a encoding, unicode, startpos, endpos, reason);
6608n/a }
6609n/a else {
6610n/a if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6611n/a goto onError;
6612n/a if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6613n/a goto onError;
6614n/a if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6615n/a goto onError;
6616n/a return;
6617n/a onError:
6618n/a Py_CLEAR(*exceptionObject);
6619n/a }
6620n/a}
6621n/a
6622n/a/* raises a UnicodeEncodeError */
6623n/astatic void
6624n/araise_encode_exception(PyObject **exceptionObject,
6625n/a const char *encoding,
6626n/a PyObject *unicode,
6627n/a Py_ssize_t startpos, Py_ssize_t endpos,
6628n/a const char *reason)
6629n/a{
6630n/a make_encode_exception(exceptionObject,
6631n/a encoding, unicode, startpos, endpos, reason);
6632n/a if (*exceptionObject != NULL)
6633n/a PyCodec_StrictErrors(*exceptionObject);
6634n/a}
6635n/a
6636n/a/* error handling callback helper:
6637n/a build arguments, call the callback and check the arguments,
6638n/a put the result into newpos and return the replacement string, which
6639n/a has to be freed by the caller */
6640n/astatic PyObject *
6641n/aunicode_encode_call_errorhandler(const char *errors,
6642n/a PyObject **errorHandler,
6643n/a const char *encoding, const char *reason,
6644n/a PyObject *unicode, PyObject **exceptionObject,
6645n/a Py_ssize_t startpos, Py_ssize_t endpos,
6646n/a Py_ssize_t *newpos)
6647n/a{
6648n/a static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6649n/a Py_ssize_t len;
6650n/a PyObject *restuple;
6651n/a PyObject *resunicode;
6652n/a
6653n/a if (*errorHandler == NULL) {
6654n/a *errorHandler = PyCodec_LookupError(errors);
6655n/a if (*errorHandler == NULL)
6656n/a return NULL;
6657n/a }
6658n/a
6659n/a if (PyUnicode_READY(unicode) == -1)
6660n/a return NULL;
6661n/a len = PyUnicode_GET_LENGTH(unicode);
6662n/a
6663n/a make_encode_exception(exceptionObject,
6664n/a encoding, unicode, startpos, endpos, reason);
6665n/a if (*exceptionObject == NULL)
6666n/a return NULL;
6667n/a
6668n/a restuple = PyObject_CallFunctionObjArgs(
6669n/a *errorHandler, *exceptionObject, NULL);
6670n/a if (restuple == NULL)
6671n/a return NULL;
6672n/a if (!PyTuple_Check(restuple)) {
6673n/a PyErr_SetString(PyExc_TypeError, &argparse[3]);
6674n/a Py_DECREF(restuple);
6675n/a return NULL;
6676n/a }
6677n/a if (!PyArg_ParseTuple(restuple, argparse,
6678n/a &resunicode, newpos)) {
6679n/a Py_DECREF(restuple);
6680n/a return NULL;
6681n/a }
6682n/a if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6683n/a PyErr_SetString(PyExc_TypeError, &argparse[3]);
6684n/a Py_DECREF(restuple);
6685n/a return NULL;
6686n/a }
6687n/a if (*newpos<0)
6688n/a *newpos = len + *newpos;
6689n/a if (*newpos<0 || *newpos>len) {
6690n/a PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6691n/a Py_DECREF(restuple);
6692n/a return NULL;
6693n/a }
6694n/a Py_INCREF(resunicode);
6695n/a Py_DECREF(restuple);
6696n/a return resunicode;
6697n/a}
6698n/a
6699n/astatic PyObject *
6700n/aunicode_encode_ucs1(PyObject *unicode,
6701n/a const char *errors,
6702n/a const Py_UCS4 limit)
6703n/a{
6704n/a /* input state */
6705n/a Py_ssize_t pos=0, size;
6706n/a int kind;
6707n/a void *data;
6708n/a /* pointer into the output */
6709n/a char *str;
6710n/a const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6711n/a const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6712n/a PyObject *error_handler_obj = NULL;
6713n/a PyObject *exc = NULL;
6714n/a _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6715n/a PyObject *rep = NULL;
6716n/a /* output object */
6717n/a _PyBytesWriter writer;
6718n/a
6719n/a if (PyUnicode_READY(unicode) == -1)
6720n/a return NULL;
6721n/a size = PyUnicode_GET_LENGTH(unicode);
6722n/a kind = PyUnicode_KIND(unicode);
6723n/a data = PyUnicode_DATA(unicode);
6724n/a /* allocate enough for a simple encoding without
6725n/a replacements, if we need more, we'll resize */
6726n/a if (size == 0)
6727n/a return PyBytes_FromStringAndSize(NULL, 0);
6728n/a
6729n/a _PyBytesWriter_Init(&writer);
6730n/a str = _PyBytesWriter_Alloc(&writer, size);
6731n/a if (str == NULL)
6732n/a return NULL;
6733n/a
6734n/a while (pos < size) {
6735n/a Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6736n/a
6737n/a /* can we encode this? */
6738n/a if (ch < limit) {
6739n/a /* no overflow check, because we know that the space is enough */
6740n/a *str++ = (char)ch;
6741n/a ++pos;
6742n/a }
6743n/a else {
6744n/a Py_ssize_t newpos, i;
6745n/a /* startpos for collecting unencodable chars */
6746n/a Py_ssize_t collstart = pos;
6747n/a Py_ssize_t collend = collstart + 1;
6748n/a /* find all unecodable characters */
6749n/a
6750n/a while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6751n/a ++collend;
6752n/a
6753n/a /* Only overallocate the buffer if it's not the last write */
6754n/a writer.overallocate = (collend < size);
6755n/a
6756n/a /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6757n/a if (error_handler == _Py_ERROR_UNKNOWN)
6758n/a error_handler = get_error_handler(errors);
6759n/a
6760n/a switch (error_handler) {
6761n/a case _Py_ERROR_STRICT:
6762n/a raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6763n/a goto onError;
6764n/a
6765n/a case _Py_ERROR_REPLACE:
6766n/a memset(str, '?', collend - collstart);
6767n/a str += (collend - collstart);
6768n/a /* fall through ignore error handler */
6769n/a case _Py_ERROR_IGNORE:
6770n/a pos = collend;
6771n/a break;
6772n/a
6773n/a case _Py_ERROR_BACKSLASHREPLACE:
6774n/a /* subtract preallocated bytes */
6775n/a writer.min_size -= (collend - collstart);
6776n/a str = backslashreplace(&writer, str,
6777n/a unicode, collstart, collend);
6778n/a if (str == NULL)
6779n/a goto onError;
6780n/a pos = collend;
6781n/a break;
6782n/a
6783n/a case _Py_ERROR_XMLCHARREFREPLACE:
6784n/a /* subtract preallocated bytes */
6785n/a writer.min_size -= (collend - collstart);
6786n/a str = xmlcharrefreplace(&writer, str,
6787n/a unicode, collstart, collend);
6788n/a if (str == NULL)
6789n/a goto onError;
6790n/a pos = collend;
6791n/a break;
6792n/a
6793n/a case _Py_ERROR_SURROGATEESCAPE:
6794n/a for (i = collstart; i < collend; ++i) {
6795n/a ch = PyUnicode_READ(kind, data, i);
6796n/a if (ch < 0xdc80 || 0xdcff < ch) {
6797n/a /* Not a UTF-8b surrogate */
6798n/a break;
6799n/a }
6800n/a *str++ = (char)(ch - 0xdc00);
6801n/a ++pos;
6802n/a }
6803n/a if (i >= collend)
6804n/a break;
6805n/a collstart = pos;
6806n/a assert(collstart != collend);
6807n/a /* fallback to general error handling */
6808n/a
6809n/a default:
6810n/a rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6811n/a encoding, reason, unicode, &exc,
6812n/a collstart, collend, &newpos);
6813n/a if (rep == NULL)
6814n/a goto onError;
6815n/a
6816n/a /* subtract preallocated bytes */
6817n/a writer.min_size -= newpos - collstart;
6818n/a
6819n/a if (PyBytes_Check(rep)) {
6820n/a /* Directly copy bytes result to output. */
6821n/a str = _PyBytesWriter_WriteBytes(&writer, str,
6822n/a PyBytes_AS_STRING(rep),
6823n/a PyBytes_GET_SIZE(rep));
6824n/a if (str == NULL)
6825n/a goto onError;
6826n/a }
6827n/a else {
6828n/a assert(PyUnicode_Check(rep));
6829n/a
6830n/a if (PyUnicode_READY(rep) < 0)
6831n/a goto onError;
6832n/a
6833n/a if (limit == 256 ?
6834n/a PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6835n/a !PyUnicode_IS_ASCII(rep))
6836n/a {
6837n/a /* Not all characters are smaller than limit */
6838n/a raise_encode_exception(&exc, encoding, unicode,
6839n/a collstart, collend, reason);
6840n/a goto onError;
6841n/a }
6842n/a assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6843n/a str = _PyBytesWriter_WriteBytes(&writer, str,
6844n/a PyUnicode_DATA(rep),
6845n/a PyUnicode_GET_LENGTH(rep));
6846n/a }
6847n/a pos = newpos;
6848n/a Py_CLEAR(rep);
6849n/a }
6850n/a
6851n/a /* If overallocation was disabled, ensure that it was the last
6852n/a write. Otherwise, we missed an optimization */
6853n/a assert(writer.overallocate || pos == size);
6854n/a }
6855n/a }
6856n/a
6857n/a Py_XDECREF(error_handler_obj);
6858n/a Py_XDECREF(exc);
6859n/a return _PyBytesWriter_Finish(&writer, str);
6860n/a
6861n/a onError:
6862n/a Py_XDECREF(rep);
6863n/a _PyBytesWriter_Dealloc(&writer);
6864n/a Py_XDECREF(error_handler_obj);
6865n/a Py_XDECREF(exc);
6866n/a return NULL;
6867n/a}
6868n/a
6869n/a/* Deprecated */
6870n/aPyObject *
6871n/aPyUnicode_EncodeLatin1(const Py_UNICODE *p,
6872n/a Py_ssize_t size,
6873n/a const char *errors)
6874n/a{
6875n/a PyObject *result;
6876n/a PyObject *unicode = PyUnicode_FromWideChar(p, size);
6877n/a if (unicode == NULL)
6878n/a return NULL;
6879n/a result = unicode_encode_ucs1(unicode, errors, 256);
6880n/a Py_DECREF(unicode);
6881n/a return result;
6882n/a}
6883n/a
6884n/aPyObject *
6885n/a_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6886n/a{
6887n/a if (!PyUnicode_Check(unicode)) {
6888n/a PyErr_BadArgument();
6889n/a return NULL;
6890n/a }
6891n/a if (PyUnicode_READY(unicode) == -1)
6892n/a return NULL;
6893n/a /* Fast path: if it is a one-byte string, construct
6894n/a bytes object directly. */
6895n/a if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6896n/a return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6897n/a PyUnicode_GET_LENGTH(unicode));
6898n/a /* Non-Latin-1 characters present. Defer to above function to
6899n/a raise the exception. */
6900n/a return unicode_encode_ucs1(unicode, errors, 256);
6901n/a}
6902n/a
6903n/aPyObject*
6904n/aPyUnicode_AsLatin1String(PyObject *unicode)
6905n/a{
6906n/a return _PyUnicode_AsLatin1String(unicode, NULL);
6907n/a}
6908n/a
6909n/a/* --- 7-bit ASCII Codec -------------------------------------------------- */
6910n/a
/* Decode the `size` bytes at `s` as 7-bit ASCII.

   Bytes below 128 are copied as code points.  Any other byte is handled
   according to `errors`: "replace" writes U+FFFD, "surrogateescape" writes
   the byte value plus 0xdc00, "ignore" drops the byte, and every other
   handler goes through unicode_decode_call_errorhandler_writer().

   Returns the result of _PyUnicodeWriter_Finish() (or of
   get_latin1_char() for a single ASCII byte), or NULL on failure. */
6911n/aPyObject *
6912n/aPyUnicode_DecodeASCII(const char *s,
6913n/a Py_ssize_t size,
6914n/a const char *errors)
6915n/a{
6916n/a const char *starts = s;
6917n/a _PyUnicodeWriter writer;
6918n/a int kind;
6919n/a void *data;
6920n/a Py_ssize_t startinpos;
6921n/a Py_ssize_t endinpos;
6922n/a Py_ssize_t outpos;
6923n/a const char *e;
6924n/a PyObject *error_handler_obj = NULL;
6925n/a PyObject *exc = NULL;
6926n/a _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6927n/a
6928n/a if (size == 0)
6929n/a _Py_RETURN_UNICODE_EMPTY();
6930n/a
6931n/a /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6932n/a if (size == 1 && (unsigned char)s[0] < 128)
6933n/a return get_latin1_char((unsigned char)s[0]);
6934n/a
6935n/a _PyUnicodeWriter_Init(&writer);
6936n/a writer.min_length = size;
6937n/a if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
6938n/a return NULL;
6939n/a
    /* Bulk step: ascii_decode() writes into the writer's buffer and its
       return value is used as the number of characters produced; if that
       equals size the whole input has been decoded.
       NOTE(review): presumably ascii_decode() stops at the first byte >= 128
       - confirm against its definition. */
6940n/a e = s + size;
6941n/a data = writer.data;
6942n/a outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6943n/a writer.pos = outpos;
6944n/a if (writer.pos == size)
6945n/a return _PyUnicodeWriter_Finish(&writer);
6946n/a
    /* Slow path: continue byte by byte after the part already decoded */
6947n/a s += writer.pos;
6948n/a kind = writer.kind;
6949n/a while (s < e) {
6950n/a unsigned char c = (unsigned char)*s;
6951n/a if (c < 128) {
6952n/a PyUnicode_WRITE(kind, data, writer.pos, c);
6953n/a writer.pos++;
6954n/a ++s;
6955n/a continue;
6956n/a }
6957n/a
6958n/a /* byte outside range 0x00..0x7f: call the error handler */
6959n/a
        /* The handler name is resolved once, on the first error */
6960n/a if (error_handler == _Py_ERROR_UNKNOWN)
6961n/a error_handler = get_error_handler(errors);
6962n/a
6963n/a switch (error_handler)
6964n/a {
6965n/a case _Py_ERROR_REPLACE:
6966n/a case _Py_ERROR_SURROGATEESCAPE:
6967n/a /* Fast-path: the error handler only writes one character,
6968n/a but we may switch to UCS2 at the first write */
6969n/a if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6970n/a goto onError;
            /* Re-read the cached kind/data from the writer after the call */
6971n/a kind = writer.kind;
6972n/a data = writer.data;
6973n/a
6974n/a if (error_handler == _Py_ERROR_REPLACE)
6975n/a PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6976n/a else
6977n/a PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6978n/a writer.pos++;
6979n/a ++s;
6980n/a break;
6981n/a
6982n/a case _Py_ERROR_IGNORE:
6983n/a ++s;
6984n/a break;
6985n/a
6986n/a default:
            /* Generic handler: report the single offending byte; the call
               receives &s, and kind/data are re-read from the writer
               afterwards */
6987n/a startinpos = s-starts;
6988n/a endinpos = startinpos + 1;
6989n/a if (unicode_decode_call_errorhandler_writer(
6990n/a errors, &error_handler_obj,
6991n/a "ascii", "ordinal not in range(128)",
6992n/a &starts, &e, &startinpos, &endinpos, &exc, &s,
6993n/a &writer))
6994n/a goto onError;
6995n/a kind = writer.kind;
6996n/a data = writer.data;
6997n/a }
6998n/a }
6999n/a Py_XDECREF(error_handler_obj);
7000n/a Py_XDECREF(exc);
7001n/a return _PyUnicodeWriter_Finish(&writer);
7002n/a
    /* Failure exit: discard the partially built result and drop references */
7003n/a onError:
7004n/a _PyUnicodeWriter_Dealloc(&writer);
7005n/a Py_XDECREF(error_handler_obj);
7006n/a Py_XDECREF(exc);
7007n/a return NULL;
7008n/a}
7009n/a
7010n/a/* Deprecated */
7011n/aPyObject *
7012n/aPyUnicode_EncodeASCII(const Py_UNICODE *p,
7013n/a Py_ssize_t size,
7014n/a const char *errors)
7015n/a{
7016n/a PyObject *result;
7017n/a PyObject *unicode = PyUnicode_FromWideChar(p, size);
7018n/a if (unicode == NULL)
7019n/a return NULL;
7020n/a result = unicode_encode_ucs1(unicode, errors, 128);
7021n/a Py_DECREF(unicode);
7022n/a return result;
7023n/a}
7024n/a
7025n/aPyObject *
7026n/a_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7027n/a{
7028n/a if (!PyUnicode_Check(unicode)) {
7029n/a PyErr_BadArgument();
7030n/a return NULL;
7031n/a }
7032n/a if (PyUnicode_READY(unicode) == -1)
7033n/a return NULL;
7034n/a /* Fast path: if it is an ASCII-only string, construct bytes object
7035n/a directly. Else defer to above function to raise the exception. */
7036n/a if (PyUnicode_IS_ASCII(unicode))
7037n/a return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7038n/a PyUnicode_GET_LENGTH(unicode));
7039n/a return unicode_encode_ucs1(unicode, errors, 128);
7040n/a}
7041n/a
7042n/aPyObject *
7043n/aPyUnicode_AsASCIIString(PyObject *unicode)
7044n/a{
7045n/a return _PyUnicode_AsASCIIString(unicode, NULL);
7046n/a}
7047n/a
7048n/a#ifdef MS_WINDOWS
7049n/a
7050n/a/* --- MBCS codecs for Windows -------------------------------------------- */
7051n/a
7052n/a#if SIZEOF_INT < SIZEOF_SIZE_T
7053n/a#define NEED_RETRY
7054n/a#endif
7055n/a
7056n/a#ifndef WC_ERR_INVALID_CHARS
7057n/a# define WC_ERR_INVALID_CHARS 0x0080
7058n/a#endif
7059n/a
7060n/astatic const char*
7061n/acode_page_name(UINT code_page, PyObject **obj)
7062n/a{
7063n/a *obj = NULL;
7064n/a if (code_page == CP_ACP)
7065n/a return "mbcs";
7066n/a if (code_page == CP_UTF7)
7067n/a return "CP_UTF7";
7068n/a if (code_page == CP_UTF8)
7069n/a return "CP_UTF8";
7070n/a
7071n/a *obj = PyBytes_FromFormat("cp%u", code_page);
7072n/a if (*obj == NULL)
7073n/a return NULL;
7074n/a return PyBytes_AS_STRING(*obj);
7075n/a}
7076n/a
7077n/astatic DWORD
7078n/adecode_code_page_flags(UINT code_page)
7079n/a{
7080n/a if (code_page == CP_UTF7) {
7081n/a /* The CP_UTF7 decoder only supports flags=0 */
7082n/a return 0;
7083n/a }
7084n/a else
7085n/a return MB_ERR_INVALID_CHARS;
7086n/a}
7087n/a
7088n/a/*
7089n/a * Decode a byte string from a Windows code page into unicode object in strict
7090n/a * mode.
7091n/a *
7092n/a * Returns consumed size if succeed, returns -2 on decode error, or raise an
7093n/a * OSError and returns -1 on other error.
7094n/a */
7095n/astatic int
7096n/adecode_code_page_strict(UINT code_page,
7097n/a PyObject **v,
7098n/a const char *in,
7099n/a int insize)
7100n/a{
7101n/a const DWORD flags = decode_code_page_flags(code_page);
7102n/a wchar_t *out;
7103n/a DWORD outsize;
7104n/a
7105n/a /* First get the size of the result */
7106n/a assert(insize > 0);
7107n/a outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7108n/a if (outsize <= 0)
7109n/a goto error;
7110n/a
7111n/a if (*v == NULL) {
7112n/a /* Create unicode object */
7113n/a /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
7114n/a *v = (PyObject*)_PyUnicode_New(outsize);
7115n/a if (*v == NULL)
7116n/a return -1;
7117n/a out = PyUnicode_AS_UNICODE(*v);
7118n/a }
7119n/a else {
7120n/a /* Extend unicode object */
7121n/a Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7122n/a if (unicode_resize(v, n + outsize) < 0)
7123n/a return -1;
7124n/a out = PyUnicode_AS_UNICODE(*v) + n;
7125n/a }
7126n/a
7127n/a /* Do the conversion */
7128n/a outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7129n/a if (outsize <= 0)
7130n/a goto error;
7131n/a return insize;
7132n/a
7133n/aerror:
7134n/a if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7135n/a return -2;
7136n/a PyErr_SetFromWindowsErr(0);
7137n/a return -1;
7138n/a}
7139n/a
7140n/a/*
7141n/a * Decode a byte string from a code page into unicode object with an error
7142n/a * handler.
7143n/a *
7144n/a * Returns consumed size if succeed, or raise an OSError or
7145n/a * UnicodeDecodeError exception and returns -1 on error.
7146n/a */
/* The input is decoded one character at a time so that the offending bytes
   can be located.  The result is stored in *v (created when *v is NULL,
   extended otherwise).  When `final` is zero, an undecodable sequence that
   reaches the end of the input stops decoding without raising an error, so
   the returned size can be smaller than `size`. */
7147n/astatic int
7148n/adecode_code_page_errors(UINT code_page,
7149n/a PyObject **v,
7150n/a const char *in, const int size,
7151n/a const char *errors, int final)
7152n/a{
7153n/a const char *startin = in;
7154n/a const char *endin = in + size;
7155n/a const DWORD flags = decode_code_page_flags(code_page);
7156n/a /* Ideally, we should get reason from FormatMessage. This is the Windows
7157n/a 2000 English version of the message. */
7158n/a const char *reason = "No mapping for the Unicode character exists "
7159n/a "in the target code page.";
7160n/a /* each step cannot decode more than 1 character, but a character can be
7161n/a represented as a surrogate pair */
7162n/a wchar_t buffer[2], *startout, *out;
7163n/a int insize;
7164n/a Py_ssize_t outsize;
7165n/a PyObject *errorHandler = NULL;
7166n/a PyObject *exc = NULL;
7167n/a PyObject *encoding_obj = NULL;
7168n/a const char *encoding;
7169n/a DWORD err;
    /* ret stays -1 unless the success path at the bottom overwrites it */
7170n/a int ret = -1;
7171n/a
7172n/a assert(size > 0);
7173n/a
7174n/a encoding = code_page_name(code_page, &encoding_obj);
7175n/a if (encoding == NULL)
7176n/a return -1;
7177n/a
7178n/a if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7179n/a /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7180n/a UnicodeDecodeError. */
7181n/a make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7182n/a if (exc != NULL) {
7183n/a PyCodec_StrictErrors(exc);
7184n/a Py_CLEAR(exc);
7185n/a }
7186n/a goto error;
7187n/a }
7188n/a
    /* Reserve room for Py_ARRAY_LENGTH(buffer) wide characters per input
       byte, guarding the multiplication against overflow first */
7189n/a if (*v == NULL) {
7190n/a /* Create unicode object */
7191n/a if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7192n/a PyErr_NoMemory();
7193n/a goto error;
7194n/a }
7195n/a /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
7196n/a *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
7197n/a if (*v == NULL)
7198n/a goto error;
7199n/a startout = PyUnicode_AS_UNICODE(*v);
7200n/a }
7201n/a else {
7202n/a /* Extend unicode object */
7203n/a Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7204n/a if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7205n/a PyErr_NoMemory();
7206n/a goto error;
7207n/a }
7208n/a if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
7209n/a goto error;
7210n/a startout = PyUnicode_AS_UNICODE(*v) + n;
7211n/a }
7212n/a
7213n/a /* Decode the byte string character per character */
7214n/a out = startout;
7215n/a while (in < endin)
7216n/a {
7217n/a /* Decode a character */
        /* Try 1 byte, then 2, ... until MultiByteToWideChar() succeeds, the
           attempt would exceed 4 bytes, or it would run past the input.
           Errors other than "no translation" / "insufficient buffer" are
           raised as OSError. */
7218n/a insize = 1;
7219n/a do
7220n/a {
7221n/a outsize = MultiByteToWideChar(code_page, flags,
7222n/a in, insize,
7223n/a buffer, Py_ARRAY_LENGTH(buffer));
7224n/a if (outsize > 0)
7225n/a break;
7226n/a err = GetLastError();
7227n/a if (err != ERROR_NO_UNICODE_TRANSLATION
7228n/a && err != ERROR_INSUFFICIENT_BUFFER)
7229n/a {
7230n/a PyErr_SetFromWindowsErr(0);
7231n/a goto error;
7232n/a }
7233n/a insize++;
7234n/a }
7235n/a /* 4=maximum length of a UTF-8 sequence */
7236n/a while (insize <= 4 && (in + insize) <= endin);
7237n/a
7238n/a if (outsize <= 0) {
7239n/a Py_ssize_t startinpos, endinpos, outpos;
7240n/a
7241n/a /* last character in partial decode? */
7242n/a if (in + insize >= endin && !final)
7243n/a break;
7244n/a
            /* Report a one-byte error span starting at the current input
               position; `out` is recomputed from outpos afterwards */
7245n/a startinpos = in - startin;
7246n/a endinpos = startinpos + 1;
7247n/a outpos = out - PyUnicode_AS_UNICODE(*v);
7248n/a if (unicode_decode_call_errorhandler_wchar(
7249n/a errors, &errorHandler,
7250n/a encoding, reason,
7251n/a &startin, &endin, &startinpos, &endinpos, &exc, &in,
7252n/a v, &outpos))
7253n/a {
7254n/a goto error;
7255n/a }
7256n/a out = PyUnicode_AS_UNICODE(*v) + outpos;
7257n/a }
7258n/a else {
            /* Success: consume the bytes and append the decoded units */
7259n/a in += insize;
7260n/a memcpy(out, buffer, outsize * sizeof(wchar_t));
7261n/a out += outsize;
7262n/a }
7263n/a }
7264n/a
7265n/a /* write a NUL character at the end */
7266n/a *out = 0;
7267n/a
7268n/a /* Extend unicode object */
    /* (the resize sets the object to the number of units written after
       startout) */
7269n/a outsize = out - startout;
7270n/a assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7271n/a if (unicode_resize(v, outsize) < 0)
7272n/a goto error;
7273n/a /* (in - startin) <= size and size is an int */
7274n/a ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7275n/a
/* Common exit, reached on success by falling through and on failure by goto */
7276n/aerror:
7277n/a Py_XDECREF(encoding_obj);
7278n/a Py_XDECREF(errorHandler);
7279n/a Py_XDECREF(exc);
7280n/a return ret;
7281n/a}
7282n/a
/* Decode the `size` bytes at `s` from Windows code page `code_page`.

   Each chunk is first decoded in strict mode; only when that reports a
   decode error (-2) is it decoded again through decode_code_page_errors()
   with the `errors` handler.  When NEED_RETRY is defined, input larger than
   INT_MAX bytes is processed in INT_MAX-sized chunks.

   If `consumed` is non-NULL it receives the number of input bytes decoded,
   and the last chunk is decoded with final=0; with a NULL `consumed` the
   last chunk is decoded with final=1.

   Raises ValueError and returns NULL for a negative code page.  Returns the
   value of unicode_result() on success, NULL on error. */
7283n/astatic PyObject *
7284n/adecode_code_page_stateful(int code_page,
7285n/a const char *s, Py_ssize_t size,
7286n/a const char *errors, Py_ssize_t *consumed)
7287n/a{
7288n/a PyObject *v = NULL;
7289n/a int chunk_size, final, converted, done;
7290n/a
7291n/a if (code_page < 0) {
7292n/a PyErr_SetString(PyExc_ValueError, "invalid code page number");
7293n/a return NULL;
7294n/a }
7295n/a
7296n/a if (consumed)
7297n/a *consumed = 0;
7298n/a
7299n/a do
7300n/a {
        /* Pick the size of this chunk; an intermediate chunk is never final */
7301n/a#ifdef NEED_RETRY
7302n/a if (size > INT_MAX) {
7303n/a chunk_size = INT_MAX;
7304n/a final = 0;
7305n/a done = 0;
7306n/a }
7307n/a else
7308n/a#endif
7309n/a {
7310n/a chunk_size = (int)size;
7311n/a final = (consumed == NULL);
7312n/a done = 1;
7313n/a }
7314n/a
        /* Nothing left to decode: keep what earlier chunks produced, or
           return through _Py_RETURN_UNICODE_EMPTY() if there is nothing */
7315n/a if (chunk_size == 0 && done) {
7316n/a if (v != NULL)
7317n/a break;
7318n/a _Py_RETURN_UNICODE_EMPTY();
7319n/a }
7320n/a
7321n/a converted = decode_code_page_strict(code_page, &v,
7322n/a s, chunk_size);
        /* -2 means a decode error: retry the chunk with the error handler */
7323n/a if (converted == -2)
7324n/a converted = decode_code_page_errors(code_page, &v,
7325n/a s, chunk_size,
7326n/a errors, final);
7327n/a assert(converted != 0 || done);
7328n/a
7329n/a if (converted < 0) {
7330n/a Py_XDECREF(v);
7331n/a return NULL;
7332n/a }
7333n/a
7334n/a if (consumed)
7335n/a *consumed += converted;
7336n/a
        /* Advance past the bytes this chunk consumed */
7337n/a s += converted;
7338n/a size -= converted;
7339n/a } while (!done);
7340n/a
7341n/a return unicode_result(v);
7342n/a}
7343n/a
7344n/aPyObject *
7345n/aPyUnicode_DecodeCodePageStateful(int code_page,
7346n/a const char *s,
7347n/a Py_ssize_t size,
7348n/a const char *errors,
7349n/a Py_ssize_t *consumed)
7350n/a{
7351n/a return decode_code_page_stateful(code_page, s, size, errors, consumed);
7352n/a}
7353n/a
7354n/aPyObject *
7355n/aPyUnicode_DecodeMBCSStateful(const char *s,
7356n/a Py_ssize_t size,
7357n/a const char *errors,
7358n/a Py_ssize_t *consumed)
7359n/a{
7360n/a return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7361n/a}
7362n/a
7363n/aPyObject *
7364n/aPyUnicode_DecodeMBCS(const char *s,
7365n/a Py_ssize_t size,
7366n/a const char *errors)
7367n/a{
7368n/a return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7369n/a}
7370n/a
7371n/astatic DWORD
7372n/aencode_code_page_flags(UINT code_page, const char *errors)
7373n/a{
7374n/a if (code_page == CP_UTF8) {
7375n/a return WC_ERR_INVALID_CHARS;
7376n/a }
7377n/a else if (code_page == CP_UTF7) {
7378n/a /* CP_UTF7 only supports flags=0 */
7379n/a return 0;
7380n/a }
7381n/a else {
7382n/a if (errors != NULL && strcmp(errors, "replace") == 0)
7383n/a return 0;
7384n/a else
7385n/a return WC_NO_BEST_FIT_CHARS;
7386n/a }
7387n/a}
7388n/a
7389n/a/*
7390n/a * Encode a Unicode string to a Windows code page into a byte string in strict
7391n/a * mode.
7392n/a *
7393n/a * Returns consumed characters if succeed, returns -2 on encode error, or raise
7394n/a * an OSError and returns -1 on other error.
7395n/a */
7396n/astatic int
7397n/aencode_code_page_strict(UINT code_page, PyObject **outbytes,
7398n/a PyObject *unicode, Py_ssize_t offset, int len,
7399n/a const char* errors)
7400n/a{
7401n/a BOOL usedDefaultChar = FALSE;
7402n/a BOOL *pusedDefaultChar = &usedDefaultChar;
7403n/a int outsize;
7404n/a wchar_t *p;
7405n/a Py_ssize_t size;
7406n/a const DWORD flags = encode_code_page_flags(code_page, NULL);
7407n/a char *out;
7408n/a /* Create a substring so that we can get the UTF-16 representation
7409n/a of just the slice under consideration. */
7410n/a PyObject *substring;
7411n/a
7412n/a assert(len > 0);
7413n/a
7414n/a if (code_page != CP_UTF8 && code_page != CP_UTF7)
7415n/a pusedDefaultChar = &usedDefaultChar;
7416n/a else
7417n/a pusedDefaultChar = NULL;
7418n/a
7419n/a substring = PyUnicode_Substring(unicode, offset, offset+len);
7420n/a if (substring == NULL)
7421n/a return -1;
7422n/a p = PyUnicode_AsUnicodeAndSize(substring, &size);
7423n/a if (p == NULL) {
7424n/a Py_DECREF(substring);
7425n/a return -1;
7426n/a }
7427n/a assert(size <= INT_MAX);
7428n/a
7429n/a /* First get the size of the result */
7430n/a outsize = WideCharToMultiByte(code_page, flags,
7431n/a p, (int)size,
7432n/a NULL, 0,
7433n/a NULL, pusedDefaultChar);
7434n/a if (outsize <= 0)
7435n/a goto error;
7436n/a /* If we used a default char, then we failed! */
7437n/a if (pusedDefaultChar && *pusedDefaultChar) {
7438n/a Py_DECREF(substring);
7439n/a return -2;
7440n/a }
7441n/a
7442n/a if (*outbytes == NULL) {
7443n/a /* Create string object */
7444n/a *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7445n/a if (*outbytes == NULL) {
7446n/a Py_DECREF(substring);
7447n/a return -1;
7448n/a }
7449n/a out = PyBytes_AS_STRING(*outbytes);
7450n/a }
7451n/a else {
7452n/a /* Extend string object */
7453n/a const Py_ssize_t n = PyBytes_Size(*outbytes);
7454n/a if (outsize > PY_SSIZE_T_MAX - n) {
7455n/a PyErr_NoMemory();
7456n/a Py_DECREF(substring);
7457n/a return -1;
7458n/a }
7459n/a if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7460n/a Py_DECREF(substring);
7461n/a return -1;
7462n/a }
7463n/a out = PyBytes_AS_STRING(*outbytes) + n;
7464n/a }
7465n/a
7466n/a /* Do the conversion */
7467n/a outsize = WideCharToMultiByte(code_page, flags,
7468n/a p, (int)size,
7469n/a out, outsize,
7470n/a NULL, pusedDefaultChar);
7471n/a Py_CLEAR(substring);
7472n/a if (outsize <= 0)
7473n/a goto error;
7474n/a if (pusedDefaultChar && *pusedDefaultChar)
7475n/a return -2;
7476n/a return 0;
7477n/a
7478n/aerror:
7479n/a Py_XDECREF(substring);
7480n/a if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7481n/a return -2;
7482n/a PyErr_SetFromWindowsErr(0);
7483n/a return -1;
7484n/a}
7485n/a
7486n/a/*
7487n/a * Encode a Unicode string to a Windows code page into a byte string using an
7488n/a * error handler.
7489n/a *
7490n/a * Returns consumed characters if succeed, or raise an OSError and returns
7491n/a * -1 on other error.
7492n/a */
7493n/astatic int
7494n/aencode_code_page_errors(UINT code_page, PyObject **outbytes,
7495n/a PyObject *unicode, Py_ssize_t unicode_offset,
7496n/a Py_ssize_t insize, const char* errors)
7497n/a{
7498n/a const DWORD flags = encode_code_page_flags(code_page, errors);
7499n/a Py_ssize_t pos = unicode_offset;
7500n/a Py_ssize_t endin = unicode_offset + insize;
7501n/a /* Ideally, we should get reason from FormatMessage. This is the Windows
7502n/a 2000 English version of the message. */
7503n/a const char *reason = "invalid character";
7504n/a /* 4=maximum length of a UTF-8 sequence */
7505n/a char buffer[4];
7506n/a BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7507n/a Py_ssize_t outsize;
7508n/a char *out;
7509n/a PyObject *errorHandler = NULL;
7510n/a PyObject *exc = NULL;
7511n/a PyObject *encoding_obj = NULL;
7512n/a const char *encoding;
7513n/a Py_ssize_t newpos, newoutsize;
7514n/a PyObject *rep;
7515n/a int ret = -1;
7516n/a
7517n/a assert(insize > 0);
7518n/a
7519n/a encoding = code_page_name(code_page, &encoding_obj);
7520n/a if (encoding == NULL)
7521n/a return -1;
7522n/a
7523n/a if (errors == NULL || strcmp(errors, "strict") == 0) {
7524n/a /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7525n/a then we raise a UnicodeEncodeError. */
7526n/a make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7527n/a if (exc != NULL) {
7528n/a PyCodec_StrictErrors(exc);
7529n/a Py_DECREF(exc);
7530n/a }
7531n/a Py_XDECREF(encoding_obj);
7532n/a return -1;
7533n/a }
7534n/a
7535n/a if (code_page != CP_UTF8 && code_page != CP_UTF7)
7536n/a pusedDefaultChar = &usedDefaultChar;
7537n/a else
7538n/a pusedDefaultChar = NULL;
7539n/a
7540n/a if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7541n/a PyErr_NoMemory();
7542n/a goto error;
7543n/a }
7544n/a outsize = insize * Py_ARRAY_LENGTH(buffer);
7545n/a
7546n/a if (*outbytes == NULL) {
7547n/a /* Create string object */
7548n/a *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7549n/a if (*outbytes == NULL)
7550n/a goto error;
7551n/a out = PyBytes_AS_STRING(*outbytes);
7552n/a }
7553n/a else {
7554n/a /* Extend string object */
7555n/a Py_ssize_t n = PyBytes_Size(*outbytes);
7556n/a if (n > PY_SSIZE_T_MAX - outsize) {
7557n/a PyErr_NoMemory();
7558n/a goto error;
7559n/a }
7560n/a if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7561n/a goto error;
7562n/a out = PyBytes_AS_STRING(*outbytes) + n;
7563n/a }
7564n/a
7565n/a /* Encode the string character per character */
7566n/a while (pos < endin)
7567n/a {
7568n/a Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7569n/a wchar_t chars[2];
7570n/a int charsize;
7571n/a if (ch < 0x10000) {
7572n/a chars[0] = (wchar_t)ch;
7573n/a charsize = 1;
7574n/a }
7575n/a else {
7576n/a chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7577n/a chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7578n/a charsize = 2;
7579n/a }
7580n/a
7581n/a outsize = WideCharToMultiByte(code_page, flags,
7582n/a chars, charsize,
7583n/a buffer, Py_ARRAY_LENGTH(buffer),
7584n/a NULL, pusedDefaultChar);
7585n/a if (outsize > 0) {
7586n/a if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7587n/a {
7588n/a pos++;
7589n/a memcpy(out, buffer, outsize);
7590n/a out += outsize;
7591n/a continue;
7592n/a }
7593n/a }
7594n/a else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7595n/a PyErr_SetFromWindowsErr(0);
7596n/a goto error;
7597n/a }
7598n/a
7599n/a rep = unicode_encode_call_errorhandler(
7600n/a errors, &errorHandler, encoding, reason,
7601n/a unicode, &exc,
7602n/a pos, pos + 1, &newpos);
7603n/a if (rep == NULL)
7604n/a goto error;
7605n/a pos = newpos;
7606n/a
7607n/a if (PyBytes_Check(rep)) {
7608n/a outsize = PyBytes_GET_SIZE(rep);
7609n/a if (outsize != 1) {
7610n/a Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7611n/a newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7612n/a if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7613n/a Py_DECREF(rep);
7614n/a goto error;
7615n/a }
7616n/a out = PyBytes_AS_STRING(*outbytes) + offset;
7617n/a }
7618n/a memcpy(out, PyBytes_AS_STRING(rep), outsize);
7619n/a out += outsize;
7620n/a }
7621n/a else {
7622n/a Py_ssize_t i;
7623n/a enum PyUnicode_Kind kind;
7624n/a void *data;
7625n/a
7626n/a if (PyUnicode_READY(rep) == -1) {
7627n/a Py_DECREF(rep);
7628n/a goto error;
7629n/a }
7630n/a
7631n/a outsize = PyUnicode_GET_LENGTH(rep);
7632n/a if (outsize != 1) {
7633n/a Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7634n/a newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7635n/a if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7636n/a Py_DECREF(rep);
7637n/a goto error;
7638n/a }
7639n/a out = PyBytes_AS_STRING(*outbytes) + offset;
7640n/a }
7641n/a kind = PyUnicode_KIND(rep);
7642n/a data = PyUnicode_DATA(rep);
7643n/a for (i=0; i < outsize; i++) {
7644n/a Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7645n/a if (ch > 127) {
7646n/a raise_encode_exception(&exc,
7647n/a encoding, unicode,
7648n/a pos, pos + 1,
7649n/a "unable to encode error handler result to ASCII");
7650n/a Py_DECREF(rep);
7651n/a goto error;
7652n/a }
7653n/a *out = (unsigned char)ch;
7654n/a out++;
7655n/a }
7656n/a }
7657n/a Py_DECREF(rep);
7658n/a }
7659n/a /* write a NUL byte */
7660n/a *out = 0;
7661n/a outsize = out - PyBytes_AS_STRING(*outbytes);
7662n/a assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7663n/a if (_PyBytes_Resize(outbytes, outsize) < 0)
7664n/a goto error;
7665n/a ret = 0;
7666n/a
7667n/aerror:
7668n/a Py_XDECREF(encoding_obj);
7669n/a Py_XDECREF(errorHandler);
7670n/a Py_XDECREF(exc);
7671n/a return ret;
7672n/a}
7673n/a
7674n/astatic PyObject *
7675n/aencode_code_page(int code_page,
7676n/a PyObject *unicode,
7677n/a const char *errors)
7678n/a{
7679n/a Py_ssize_t len;
7680n/a PyObject *outbytes = NULL;
7681n/a Py_ssize_t offset;
7682n/a int chunk_len, ret, done;
7683n/a
7684n/a if (!PyUnicode_Check(unicode)) {
7685n/a PyErr_BadArgument();
7686n/a return NULL;
7687n/a }
7688n/a
7689n/a if (PyUnicode_READY(unicode) == -1)
7690n/a return NULL;
7691n/a len = PyUnicode_GET_LENGTH(unicode);
7692n/a
7693n/a if (code_page < 0) {
7694n/a PyErr_SetString(PyExc_ValueError, "invalid code page number");
7695n/a return NULL;
7696n/a }
7697n/a
7698n/a if (len == 0)
7699n/a return PyBytes_FromStringAndSize(NULL, 0);
7700n/a
7701n/a offset = 0;
7702n/a do
7703n/a {
7704n/a#ifdef NEED_RETRY
7705n/a /* UTF-16 encoding may double the size, so use only INT_MAX/2
7706n/a chunks. */
7707n/a if (len > INT_MAX/2) {
7708n/a chunk_len = INT_MAX/2;
7709n/a done = 0;
7710n/a }
7711n/a else
7712n/a#endif
7713n/a {
7714n/a chunk_len = (int)len;
7715n/a done = 1;
7716n/a }
7717n/a
7718n/a ret = encode_code_page_strict(code_page, &outbytes,
7719n/a unicode, offset, chunk_len,
7720n/a errors);
7721n/a if (ret == -2)
7722n/a ret = encode_code_page_errors(code_page, &outbytes,
7723n/a unicode, offset,
7724n/a chunk_len, errors);
7725n/a if (ret < 0) {
7726n/a Py_XDECREF(outbytes);
7727n/a return NULL;
7728n/a }
7729n/a
7730n/a offset += chunk_len;
7731n/a len -= chunk_len;
7732n/a } while (!done);
7733n/a
7734n/a return outbytes;
7735n/a}
7736n/a
7737n/aPyObject *
7738n/aPyUnicode_EncodeMBCS(const Py_UNICODE *p,
7739n/a Py_ssize_t size,
7740n/a const char *errors)
7741n/a{
7742n/a PyObject *unicode, *res;
7743n/a unicode = PyUnicode_FromWideChar(p, size);
7744n/a if (unicode == NULL)
7745n/a return NULL;
7746n/a res = encode_code_page(CP_ACP, unicode, errors);
7747n/a Py_DECREF(unicode);
7748n/a return res;
7749n/a}
7750n/a
7751n/aPyObject *
7752n/aPyUnicode_EncodeCodePage(int code_page,
7753n/a PyObject *unicode,
7754n/a const char *errors)
7755n/a{
7756n/a return encode_code_page(code_page, unicode, errors);
7757n/a}
7758n/a
7759n/aPyObject *
7760n/aPyUnicode_AsMBCSString(PyObject *unicode)
7761n/a{
7762n/a return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7763n/a}
7764n/a
7765n/a#undef NEED_RETRY
7766n/a
7767n/a#endif /* MS_WINDOWS */
7768n/a
7769n/a/* --- Character Mapping Codec -------------------------------------------- */
7770n/a
7771n/astatic int
7772n/acharmap_decode_string(const char *s,
7773n/a Py_ssize_t size,
7774n/a PyObject *mapping,
7775n/a const char *errors,
7776n/a _PyUnicodeWriter *writer)
7777n/a{
7778n/a const char *starts = s;
7779n/a const char *e;
7780n/a Py_ssize_t startinpos, endinpos;
7781n/a PyObject *errorHandler = NULL, *exc = NULL;
7782n/a Py_ssize_t maplen;
7783n/a enum PyUnicode_Kind mapkind;
7784n/a void *mapdata;
7785n/a Py_UCS4 x;
7786n/a unsigned char ch;
7787n/a
7788n/a if (PyUnicode_READY(mapping) == -1)
7789n/a return -1;
7790n/a
7791n/a maplen = PyUnicode_GET_LENGTH(mapping);
7792n/a mapdata = PyUnicode_DATA(mapping);
7793n/a mapkind = PyUnicode_KIND(mapping);
7794n/a
7795n/a e = s + size;
7796n/a
7797n/a if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7798n/a /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7799n/a * is disabled in encoding aliases, latin1 is preferred because
7800n/a * its implementation is faster. */
7801n/a Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7802n/a Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7803n/a Py_UCS4 maxchar = writer->maxchar;
7804n/a
7805n/a assert (writer->kind == PyUnicode_1BYTE_KIND);
7806n/a while (s < e) {
7807n/a ch = *s;
7808n/a x = mapdata_ucs1[ch];
7809n/a if (x > maxchar) {
7810n/a if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7811n/a goto onError;
7812n/a maxchar = writer->maxchar;
7813n/a outdata = (Py_UCS1 *)writer->data;
7814n/a }
7815n/a outdata[writer->pos] = x;
7816n/a writer->pos++;
7817n/a ++s;
7818n/a }
7819n/a return 0;
7820n/a }
7821n/a
7822n/a while (s < e) {
7823n/a if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7824n/a enum PyUnicode_Kind outkind = writer->kind;
7825n/a Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7826n/a if (outkind == PyUnicode_1BYTE_KIND) {
7827n/a Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7828n/a Py_UCS4 maxchar = writer->maxchar;
7829n/a while (s < e) {
7830n/a ch = *s;
7831n/a x = mapdata_ucs2[ch];
7832n/a if (x > maxchar)
7833n/a goto Error;
7834n/a outdata[writer->pos] = x;
7835n/a writer->pos++;
7836n/a ++s;
7837n/a }
7838n/a break;
7839n/a }
7840n/a else if (outkind == PyUnicode_2BYTE_KIND) {
7841n/a Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7842n/a while (s < e) {
7843n/a ch = *s;
7844n/a x = mapdata_ucs2[ch];
7845n/a if (x == 0xFFFE)
7846n/a goto Error;
7847n/a outdata[writer->pos] = x;
7848n/a writer->pos++;
7849n/a ++s;
7850n/a }
7851n/a break;
7852n/a }
7853n/a }
7854n/a ch = *s;
7855n/a
7856n/a if (ch < maplen)
7857n/a x = PyUnicode_READ(mapkind, mapdata, ch);
7858n/a else
7859n/a x = 0xfffe; /* invalid value */
7860n/aError:
7861n/a if (x == 0xfffe)
7862n/a {
7863n/a /* undefined mapping */
7864n/a startinpos = s-starts;
7865n/a endinpos = startinpos+1;
7866n/a if (unicode_decode_call_errorhandler_writer(
7867n/a errors, &errorHandler,
7868n/a "charmap", "character maps to <undefined>",
7869n/a &starts, &e, &startinpos, &endinpos, &exc, &s,
7870n/a writer)) {
7871n/a goto onError;
7872n/a }
7873n/a continue;
7874n/a }
7875n/a
7876n/a if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7877n/a goto onError;
7878n/a ++s;
7879n/a }
7880n/a Py_XDECREF(errorHandler);
7881n/a Py_XDECREF(exc);
7882n/a return 0;
7883n/a
7884n/aonError:
7885n/a Py_XDECREF(errorHandler);
7886n/a Py_XDECREF(exc);
7887n/a return -1;
7888n/a}
7889n/a
7890n/astatic int
7891n/acharmap_decode_mapping(const char *s,
7892n/a Py_ssize_t size,
7893n/a PyObject *mapping,
7894n/a const char *errors,
7895n/a _PyUnicodeWriter *writer)
7896n/a{
7897n/a const char *starts = s;
7898n/a const char *e;
7899n/a Py_ssize_t startinpos, endinpos;
7900n/a PyObject *errorHandler = NULL, *exc = NULL;
7901n/a unsigned char ch;
7902n/a PyObject *key, *item = NULL;
7903n/a
7904n/a e = s + size;
7905n/a
7906n/a while (s < e) {
7907n/a ch = *s;
7908n/a
7909n/a /* Get mapping (char ordinal -> integer, Unicode char or None) */
7910n/a key = PyLong_FromLong((long)ch);
7911n/a if (key == NULL)
7912n/a goto onError;
7913n/a
7914n/a item = PyObject_GetItem(mapping, key);
7915n/a Py_DECREF(key);
7916n/a if (item == NULL) {
7917n/a if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7918n/a /* No mapping found means: mapping is undefined. */
7919n/a PyErr_Clear();
7920n/a goto Undefined;
7921n/a } else
7922n/a goto onError;
7923n/a }
7924n/a
7925n/a /* Apply mapping */
7926n/a if (item == Py_None)
7927n/a goto Undefined;
7928n/a if (PyLong_Check(item)) {
7929n/a long value = PyLong_AS_LONG(item);
7930n/a if (value == 0xFFFE)
7931n/a goto Undefined;
7932n/a if (value < 0 || value > MAX_UNICODE) {
7933n/a PyErr_Format(PyExc_TypeError,
7934n/a "character mapping must be in range(0x%lx)",
7935n/a (unsigned long)MAX_UNICODE + 1);
7936n/a goto onError;
7937n/a }
7938n/a
7939n/a if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7940n/a goto onError;
7941n/a }
7942n/a else if (PyUnicode_Check(item)) {
7943n/a if (PyUnicode_READY(item) == -1)
7944n/a goto onError;
7945n/a if (PyUnicode_GET_LENGTH(item) == 1) {
7946n/a Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7947n/a if (value == 0xFFFE)
7948n/a goto Undefined;
7949n/a if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7950n/a goto onError;
7951n/a }
7952n/a else {
7953n/a writer->overallocate = 1;
7954n/a if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7955n/a goto onError;
7956n/a }
7957n/a }
7958n/a else {
7959n/a /* wrong return value */
7960n/a PyErr_SetString(PyExc_TypeError,
7961n/a "character mapping must return integer, None or str");
7962n/a goto onError;
7963n/a }
7964n/a Py_CLEAR(item);
7965n/a ++s;
7966n/a continue;
7967n/a
7968n/aUndefined:
7969n/a /* undefined mapping */
7970n/a Py_CLEAR(item);
7971n/a startinpos = s-starts;
7972n/a endinpos = startinpos+1;
7973n/a if (unicode_decode_call_errorhandler_writer(
7974n/a errors, &errorHandler,
7975n/a "charmap", "character maps to <undefined>",
7976n/a &starts, &e, &startinpos, &endinpos, &exc, &s,
7977n/a writer)) {
7978n/a goto onError;
7979n/a }
7980n/a }
7981n/a Py_XDECREF(errorHandler);
7982n/a Py_XDECREF(exc);
7983n/a return 0;
7984n/a
7985n/aonError:
7986n/a Py_XDECREF(item);
7987n/a Py_XDECREF(errorHandler);
7988n/a Py_XDECREF(exc);
7989n/a return -1;
7990n/a}
7991n/a
7992n/aPyObject *
7993n/aPyUnicode_DecodeCharmap(const char *s,
7994n/a Py_ssize_t size,
7995n/a PyObject *mapping,
7996n/a const char *errors)
7997n/a{
7998n/a _PyUnicodeWriter writer;
7999n/a
8000n/a /* Default to Latin-1 */
8001n/a if (mapping == NULL)
8002n/a return PyUnicode_DecodeLatin1(s, size, errors);
8003n/a
8004n/a if (size == 0)
8005n/a _Py_RETURN_UNICODE_EMPTY();
8006n/a _PyUnicodeWriter_Init(&writer);
8007n/a writer.min_length = size;
8008n/a if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8009n/a goto onError;
8010n/a
8011n/a if (PyUnicode_CheckExact(mapping)) {
8012n/a if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8013n/a goto onError;
8014n/a }
8015n/a else {
8016n/a if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8017n/a goto onError;
8018n/a }
8019n/a return _PyUnicodeWriter_Finish(&writer);
8020n/a
8021n/a onError:
8022n/a _PyUnicodeWriter_Dealloc(&writer);
8023n/a return NULL;
8024n/a}
8025n/a
8026n/a/* Charmap encoding: the lookup table */
8027n/a
8028n/astruct encoding_map {
8029n/a PyObject_HEAD
8030n/a unsigned char level1[32];
8031n/a int count2, count3;
8032n/a unsigned char level23[1];
8033n/a};
8034n/a
8035n/astatic PyObject*
8036n/aencoding_map_size(PyObject *obj, PyObject* args)
8037n/a{
8038n/a struct encoding_map *map = (struct encoding_map*)obj;
8039n/a return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
8040n/a 128*map->count3);
8041n/a}
8042n/a
8043n/astatic PyMethodDef encoding_map_methods[] = {
8044n/a {"size", encoding_map_size, METH_NOARGS,
8045n/a PyDoc_STR("Return the size (in bytes) of this object") },
8046n/a { 0 }
8047n/a};
8048n/a
8049n/astatic void
8050n/aencoding_map_dealloc(PyObject* o)
8051n/a{
8052n/a PyObject_FREE(o);
8053n/a}
8054n/a
8055n/astatic PyTypeObject EncodingMapType = {
8056n/a PyVarObject_HEAD_INIT(NULL, 0)
8057n/a "EncodingMap", /*tp_name*/
8058n/a sizeof(struct encoding_map), /*tp_basicsize*/
8059n/a 0, /*tp_itemsize*/
8060n/a /* methods */
8061n/a encoding_map_dealloc, /*tp_dealloc*/
8062n/a 0, /*tp_print*/
8063n/a 0, /*tp_getattr*/
8064n/a 0, /*tp_setattr*/
8065n/a 0, /*tp_reserved*/
8066n/a 0, /*tp_repr*/
8067n/a 0, /*tp_as_number*/
8068n/a 0, /*tp_as_sequence*/
8069n/a 0, /*tp_as_mapping*/
8070n/a 0, /*tp_hash*/
8071n/a 0, /*tp_call*/
8072n/a 0, /*tp_str*/
8073n/a 0, /*tp_getattro*/
8074n/a 0, /*tp_setattro*/
8075n/a 0, /*tp_as_buffer*/
8076n/a Py_TPFLAGS_DEFAULT, /*tp_flags*/
8077n/a 0, /*tp_doc*/
8078n/a 0, /*tp_traverse*/
8079n/a 0, /*tp_clear*/
8080n/a 0, /*tp_richcompare*/
8081n/a 0, /*tp_weaklistoffset*/
8082n/a 0, /*tp_iter*/
8083n/a 0, /*tp_iternext*/
8084n/a encoding_map_methods, /*tp_methods*/
8085n/a 0, /*tp_members*/
8086n/a 0, /*tp_getset*/
8087n/a 0, /*tp_base*/
8088n/a 0, /*tp_dict*/
8089n/a 0, /*tp_descr_get*/
8090n/a 0, /*tp_descr_set*/
8091n/a 0, /*tp_dictoffset*/
8092n/a 0, /*tp_init*/
8093n/a 0, /*tp_alloc*/
8094n/a 0, /*tp_new*/
8095n/a 0, /*tp_free*/
8096n/a 0, /*tp_is_gc*/
8097n/a};
8098n/a
8099n/aPyObject*
8100n/aPyUnicode_BuildEncodingMap(PyObject* string)
8101n/a{
8102n/a PyObject *result;
8103n/a struct encoding_map *mresult;
8104n/a int i;
8105n/a int need_dict = 0;
8106n/a unsigned char level1[32];
8107n/a unsigned char level2[512];
8108n/a unsigned char *mlevel1, *mlevel2, *mlevel3;
8109n/a int count2 = 0, count3 = 0;
8110n/a int kind;
8111n/a void *data;
8112n/a Py_ssize_t length;
8113n/a Py_UCS4 ch;
8114n/a
8115n/a if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8116n/a PyErr_BadArgument();
8117n/a return NULL;
8118n/a }
8119n/a kind = PyUnicode_KIND(string);
8120n/a data = PyUnicode_DATA(string);
8121n/a length = PyUnicode_GET_LENGTH(string);
8122n/a length = Py_MIN(length, 256);
8123n/a memset(level1, 0xFF, sizeof level1);
8124n/a memset(level2, 0xFF, sizeof level2);
8125n/a
8126n/a /* If there isn't a one-to-one mapping of NULL to \0,
8127n/a or if there are non-BMP characters, we need to use
8128n/a a mapping dictionary. */
8129n/a if (PyUnicode_READ(kind, data, 0) != 0)
8130n/a need_dict = 1;
8131n/a for (i = 1; i < length; i++) {
8132n/a int l1, l2;
8133n/a ch = PyUnicode_READ(kind, data, i);
8134n/a if (ch == 0 || ch > 0xFFFF) {
8135n/a need_dict = 1;
8136n/a break;
8137n/a }
8138n/a if (ch == 0xFFFE)
8139n/a /* unmapped character */
8140n/a continue;
8141n/a l1 = ch >> 11;
8142n/a l2 = ch >> 7;
8143n/a if (level1[l1] == 0xFF)
8144n/a level1[l1] = count2++;
8145n/a if (level2[l2] == 0xFF)
8146n/a level2[l2] = count3++;
8147n/a }
8148n/a
8149n/a if (count2 >= 0xFF || count3 >= 0xFF)
8150n/a need_dict = 1;
8151n/a
8152n/a if (need_dict) {
8153n/a PyObject *result = PyDict_New();
8154n/a PyObject *key, *value;
8155n/a if (!result)
8156n/a return NULL;
8157n/a for (i = 0; i < length; i++) {
8158n/a key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8159n/a value = PyLong_FromLong(i);
8160n/a if (!key || !value)
8161n/a goto failed1;
8162n/a if (PyDict_SetItem(result, key, value) == -1)
8163n/a goto failed1;
8164n/a Py_DECREF(key);
8165n/a Py_DECREF(value);
8166n/a }
8167n/a return result;
8168n/a failed1:
8169n/a Py_XDECREF(key);
8170n/a Py_XDECREF(value);
8171n/a Py_DECREF(result);
8172n/a return NULL;
8173n/a }
8174n/a
8175n/a /* Create a three-level trie */
8176n/a result = PyObject_MALLOC(sizeof(struct encoding_map) +
8177n/a 16*count2 + 128*count3 - 1);
8178n/a if (!result)
8179n/a return PyErr_NoMemory();
8180n/a PyObject_Init(result, &EncodingMapType);
8181n/a mresult = (struct encoding_map*)result;
8182n/a mresult->count2 = count2;
8183n/a mresult->count3 = count3;
8184n/a mlevel1 = mresult->level1;
8185n/a mlevel2 = mresult->level23;
8186n/a mlevel3 = mresult->level23 + 16*count2;
8187n/a memcpy(mlevel1, level1, 32);
8188n/a memset(mlevel2, 0xFF, 16*count2);
8189n/a memset(mlevel3, 0, 128*count3);
8190n/a count3 = 0;
8191n/a for (i = 1; i < length; i++) {
8192n/a int o1, o2, o3, i2, i3;
8193n/a Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8194n/a if (ch == 0xFFFE)
8195n/a /* unmapped character */
8196n/a continue;
8197n/a o1 = ch>>11;
8198n/a o2 = (ch>>7) & 0xF;
8199n/a i2 = 16*mlevel1[o1] + o2;
8200n/a if (mlevel2[i2] == 0xFF)
8201n/a mlevel2[i2] = count3++;
8202n/a o3 = ch & 0x7F;
8203n/a i3 = 128*mlevel2[i2] + o3;
8204n/a mlevel3[i3] = i;
8205n/a }
8206n/a return result;
8207n/a}
8208n/a
8209n/astatic int
8210n/aencoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8211n/a{
8212n/a struct encoding_map *map = (struct encoding_map*)mapping;
8213n/a int l1 = c>>11;
8214n/a int l2 = (c>>7) & 0xF;
8215n/a int l3 = c & 0x7F;
8216n/a int i;
8217n/a
8218n/a if (c > 0xFFFF)
8219n/a return -1;
8220n/a if (c == 0)
8221n/a return 0;
8222n/a /* level 1*/
8223n/a i = map->level1[l1];
8224n/a if (i == 0xFF) {
8225n/a return -1;
8226n/a }
8227n/a /* level 2*/
8228n/a i = map->level23[16*i+l2];
8229n/a if (i == 0xFF) {
8230n/a return -1;
8231n/a }
8232n/a /* level 3 */
8233n/a i = map->level23[16*map->count2 + 128*i + l3];
8234n/a if (i == 0) {
8235n/a return -1;
8236n/a }
8237n/a return i;
8238n/a}
8239n/a
8240n/a/* Lookup the character ch in the mapping. If the character
8241n/a can't be found, Py_None is returned (or NULL, if another
8242n/a error occurred). */
8243n/astatic PyObject *
8244n/acharmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8245n/a{
8246n/a PyObject *w = PyLong_FromLong((long)c);
8247n/a PyObject *x;
8248n/a
8249n/a if (w == NULL)
8250n/a return NULL;
8251n/a x = PyObject_GetItem(mapping, w);
8252n/a Py_DECREF(w);
8253n/a if (x == NULL) {
8254n/a if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8255n/a /* No mapping found means: mapping is undefined. */
8256n/a PyErr_Clear();
8257n/a Py_RETURN_NONE;
8258n/a } else
8259n/a return NULL;
8260n/a }
8261n/a else if (x == Py_None)
8262n/a return x;
8263n/a else if (PyLong_Check(x)) {
8264n/a long value = PyLong_AS_LONG(x);
8265n/a if (value < 0 || value > 255) {
8266n/a PyErr_SetString(PyExc_TypeError,
8267n/a "character mapping must be in range(256)");
8268n/a Py_DECREF(x);
8269n/a return NULL;
8270n/a }
8271n/a return x;
8272n/a }
8273n/a else if (PyBytes_Check(x))
8274n/a return x;
8275n/a else {
8276n/a /* wrong return value */
8277n/a PyErr_Format(PyExc_TypeError,
8278n/a "character mapping must return integer, bytes or None, not %.400s",
8279n/a x->ob_type->tp_name);
8280n/a Py_DECREF(x);
8281n/a return NULL;
8282n/a }
8283n/a}
8284n/a
8285n/astatic int
8286n/acharmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8287n/a{
8288n/a Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8289n/a /* exponentially overallocate to minimize reallocations */
8290n/a if (requiredsize < 2*outsize)
8291n/a requiredsize = 2*outsize;
8292n/a if (_PyBytes_Resize(outobj, requiredsize))
8293n/a return -1;
8294n/a return 0;
8295n/a}
8296n/a
/* Result of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
8300n/a/* lookup the character, put the result in the output string and adjust
8301n/a various state variables. Resize the output bytes object if not enough
8302n/a space is available. Return a new reference to the object that
8303n/a was put in the output buffer, or Py_None, if the mapping was undefined
8304n/a (in which case no character was written) or NULL, if a
8305n/a reallocation error occurred. The caller must decref the result */
8306n/astatic charmapencode_result
8307n/acharmapencode_output(Py_UCS4 c, PyObject *mapping,
8308n/a PyObject **outobj, Py_ssize_t *outpos)
8309n/a{
8310n/a PyObject *rep;
8311n/a char *outstart;
8312n/a Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8313n/a
8314n/a if (Py_TYPE(mapping) == &EncodingMapType) {
8315n/a int res = encoding_map_lookup(c, mapping);
8316n/a Py_ssize_t requiredsize = *outpos+1;
8317n/a if (res == -1)
8318n/a return enc_FAILED;
8319n/a if (outsize<requiredsize)
8320n/a if (charmapencode_resize(outobj, outpos, requiredsize))
8321n/a return enc_EXCEPTION;
8322n/a outstart = PyBytes_AS_STRING(*outobj);
8323n/a outstart[(*outpos)++] = (char)res;
8324n/a return enc_SUCCESS;
8325n/a }
8326n/a
8327n/a rep = charmapencode_lookup(c, mapping);
8328n/a if (rep==NULL)
8329n/a return enc_EXCEPTION;
8330n/a else if (rep==Py_None) {
8331n/a Py_DECREF(rep);
8332n/a return enc_FAILED;
8333n/a } else {
8334n/a if (PyLong_Check(rep)) {
8335n/a Py_ssize_t requiredsize = *outpos+1;
8336n/a if (outsize<requiredsize)
8337n/a if (charmapencode_resize(outobj, outpos, requiredsize)) {
8338n/a Py_DECREF(rep);
8339n/a return enc_EXCEPTION;
8340n/a }
8341n/a outstart = PyBytes_AS_STRING(*outobj);
8342n/a outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8343n/a }
8344n/a else {
8345n/a const char *repchars = PyBytes_AS_STRING(rep);
8346n/a Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8347n/a Py_ssize_t requiredsize = *outpos+repsize;
8348n/a if (outsize<requiredsize)
8349n/a if (charmapencode_resize(outobj, outpos, requiredsize)) {
8350n/a Py_DECREF(rep);
8351n/a return enc_EXCEPTION;
8352n/a }
8353n/a outstart = PyBytes_AS_STRING(*outobj);
8354n/a memcpy(outstart + *outpos, repchars, repsize);
8355n/a *outpos += repsize;
8356n/a }
8357n/a }
8358n/a Py_DECREF(rep);
8359n/a return enc_SUCCESS;
8360n/a}
8361n/a
8362n/a/* handle an error in PyUnicode_EncodeCharmap
8363n/a Return 0 on success, -1 on error */
8364n/astatic int
8365n/acharmap_encoding_error(
8366n/a PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8367n/a PyObject **exceptionObject,
8368n/a _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8369n/a PyObject **res, Py_ssize_t *respos)
8370n/a{
8371n/a PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8372n/a Py_ssize_t size, repsize;
8373n/a Py_ssize_t newpos;
8374n/a enum PyUnicode_Kind kind;
8375n/a void *data;
8376n/a Py_ssize_t index;
8377n/a /* startpos for collecting unencodable chars */
8378n/a Py_ssize_t collstartpos = *inpos;
8379n/a Py_ssize_t collendpos = *inpos+1;
8380n/a Py_ssize_t collpos;
8381n/a char *encoding = "charmap";
8382n/a char *reason = "character maps to <undefined>";
8383n/a charmapencode_result x;
8384n/a Py_UCS4 ch;
8385n/a int val;
8386n/a
8387n/a if (PyUnicode_READY(unicode) == -1)
8388n/a return -1;
8389n/a size = PyUnicode_GET_LENGTH(unicode);
8390n/a /* find all unencodable characters */
8391n/a while (collendpos < size) {
8392n/a PyObject *rep;
8393n/a if (Py_TYPE(mapping) == &EncodingMapType) {
8394n/a ch = PyUnicode_READ_CHAR(unicode, collendpos);
8395n/a val = encoding_map_lookup(ch, mapping);
8396n/a if (val != -1)
8397n/a break;
8398n/a ++collendpos;
8399n/a continue;
8400n/a }
8401n/a
8402n/a ch = PyUnicode_READ_CHAR(unicode, collendpos);
8403n/a rep = charmapencode_lookup(ch, mapping);
8404n/a if (rep==NULL)
8405n/a return -1;
8406n/a else if (rep!=Py_None) {
8407n/a Py_DECREF(rep);
8408n/a break;
8409n/a }
8410n/a Py_DECREF(rep);
8411n/a ++collendpos;
8412n/a }
8413n/a /* cache callback name lookup
8414n/a * (if not done yet, i.e. it's the first error) */
8415n/a if (*error_handler == _Py_ERROR_UNKNOWN)
8416n/a *error_handler = get_error_handler(errors);
8417n/a
8418n/a switch (*error_handler) {
8419n/a case _Py_ERROR_STRICT:
8420n/a raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8421n/a return -1;
8422n/a
8423n/a case _Py_ERROR_REPLACE:
8424n/a for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8425n/a x = charmapencode_output('?', mapping, res, respos);
8426n/a if (x==enc_EXCEPTION) {
8427n/a return -1;
8428n/a }
8429n/a else if (x==enc_FAILED) {
8430n/a raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8431n/a return -1;
8432n/a }
8433n/a }
8434n/a /* fall through */
8435n/a case _Py_ERROR_IGNORE:
8436n/a *inpos = collendpos;
8437n/a break;
8438n/a
8439n/a case _Py_ERROR_XMLCHARREFREPLACE:
8440n/a /* generate replacement (temporarily (mis)uses p) */
8441n/a for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8442n/a char buffer[2+29+1+1];
8443n/a char *cp;
8444n/a sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8445n/a for (cp = buffer; *cp; ++cp) {
8446n/a x = charmapencode_output(*cp, mapping, res, respos);
8447n/a if (x==enc_EXCEPTION)
8448n/a return -1;
8449n/a else if (x==enc_FAILED) {
8450n/a raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8451n/a return -1;
8452n/a }
8453n/a }
8454n/a }
8455n/a *inpos = collendpos;
8456n/a break;
8457n/a
8458n/a default:
8459n/a repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8460n/a encoding, reason, unicode, exceptionObject,
8461n/a collstartpos, collendpos, &newpos);
8462n/a if (repunicode == NULL)
8463n/a return -1;
8464n/a if (PyBytes_Check(repunicode)) {
8465n/a /* Directly copy bytes result to output. */
8466n/a Py_ssize_t outsize = PyBytes_Size(*res);
8467n/a Py_ssize_t requiredsize;
8468n/a repsize = PyBytes_Size(repunicode);
8469n/a requiredsize = *respos + repsize;
8470n/a if (requiredsize > outsize)
8471n/a /* Make room for all additional bytes. */
8472n/a if (charmapencode_resize(res, respos, requiredsize)) {
8473n/a Py_DECREF(repunicode);
8474n/a return -1;
8475n/a }
8476n/a memcpy(PyBytes_AsString(*res) + *respos,
8477n/a PyBytes_AsString(repunicode), repsize);
8478n/a *respos += repsize;
8479n/a *inpos = newpos;
8480n/a Py_DECREF(repunicode);
8481n/a break;
8482n/a }
8483n/a /* generate replacement */
8484n/a if (PyUnicode_READY(repunicode) == -1) {
8485n/a Py_DECREF(repunicode);
8486n/a return -1;
8487n/a }
8488n/a repsize = PyUnicode_GET_LENGTH(repunicode);
8489n/a data = PyUnicode_DATA(repunicode);
8490n/a kind = PyUnicode_KIND(repunicode);
8491n/a for (index = 0; index < repsize; index++) {
8492n/a Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8493n/a x = charmapencode_output(repch, mapping, res, respos);
8494n/a if (x==enc_EXCEPTION) {
8495n/a Py_DECREF(repunicode);
8496n/a return -1;
8497n/a }
8498n/a else if (x==enc_FAILED) {
8499n/a Py_DECREF(repunicode);
8500n/a raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8501n/a return -1;
8502n/a }
8503n/a }
8504n/a *inpos = newpos;
8505n/a Py_DECREF(repunicode);
8506n/a }
8507n/a return 0;
8508n/a}
8509n/a
8510n/aPyObject *
8511n/a_PyUnicode_EncodeCharmap(PyObject *unicode,
8512n/a PyObject *mapping,
8513n/a const char *errors)
8514n/a{
8515n/a /* output object */
8516n/a PyObject *res = NULL;
8517n/a /* current input position */
8518n/a Py_ssize_t inpos = 0;
8519n/a Py_ssize_t size;
8520n/a /* current output position */
8521n/a Py_ssize_t respos = 0;
8522n/a PyObject *error_handler_obj = NULL;
8523n/a PyObject *exc = NULL;
8524n/a _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8525n/a void *data;
8526n/a int kind;
8527n/a
8528n/a if (PyUnicode_READY(unicode) == -1)
8529n/a return NULL;
8530n/a size = PyUnicode_GET_LENGTH(unicode);
8531n/a data = PyUnicode_DATA(unicode);
8532n/a kind = PyUnicode_KIND(unicode);
8533n/a
8534n/a /* Default to Latin-1 */
8535n/a if (mapping == NULL)
8536n/a return unicode_encode_ucs1(unicode, errors, 256);
8537n/a
8538n/a /* allocate enough for a simple encoding without
8539n/a replacements, if we need more, we'll resize */
8540n/a res = PyBytes_FromStringAndSize(NULL, size);
8541n/a if (res == NULL)
8542n/a goto onError;
8543n/a if (size == 0)
8544n/a return res;
8545n/a
8546n/a while (inpos<size) {
8547n/a Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8548n/a /* try to encode it */
8549n/a charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8550n/a if (x==enc_EXCEPTION) /* error */
8551n/a goto onError;
8552n/a if (x==enc_FAILED) { /* unencodable character */
8553n/a if (charmap_encoding_error(unicode, &inpos, mapping,
8554n/a &exc,
8555n/a &error_handler, &error_handler_obj, errors,
8556n/a &res, &respos)) {
8557n/a goto onError;
8558n/a }
8559n/a }
8560n/a else
8561n/a /* done with this character => adjust input position */
8562n/a ++inpos;
8563n/a }
8564n/a
8565n/a /* Resize if we allocated to much */
8566n/a if (respos<PyBytes_GET_SIZE(res))
8567n/a if (_PyBytes_Resize(&res, respos) < 0)
8568n/a goto onError;
8569n/a
8570n/a Py_XDECREF(exc);
8571n/a Py_XDECREF(error_handler_obj);
8572n/a return res;
8573n/a
8574n/a onError:
8575n/a Py_XDECREF(res);
8576n/a Py_XDECREF(exc);
8577n/a Py_XDECREF(error_handler_obj);
8578n/a return NULL;
8579n/a}
8580n/a
8581n/a/* Deprecated */
8582n/aPyObject *
8583n/aPyUnicode_EncodeCharmap(const Py_UNICODE *p,
8584n/a Py_ssize_t size,
8585n/a PyObject *mapping,
8586n/a const char *errors)
8587n/a{
8588n/a PyObject *result;
8589n/a PyObject *unicode = PyUnicode_FromWideChar(p, size);
8590n/a if (unicode == NULL)
8591n/a return NULL;
8592n/a result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8593n/a Py_DECREF(unicode);
8594n/a return result;
8595n/a}
8596n/a
8597n/aPyObject *
8598n/aPyUnicode_AsCharmapString(PyObject *unicode,
8599n/a PyObject *mapping)
8600n/a{
8601n/a if (!PyUnicode_Check(unicode) || mapping == NULL) {
8602n/a PyErr_BadArgument();
8603n/a return NULL;
8604n/a }
8605n/a return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8606n/a}
8607n/a
8608n/a/* create or adjust a UnicodeTranslateError */
8609n/astatic void
8610n/amake_translate_exception(PyObject **exceptionObject,
8611n/a PyObject *unicode,
8612n/a Py_ssize_t startpos, Py_ssize_t endpos,
8613n/a const char *reason)
8614n/a{
8615n/a if (*exceptionObject == NULL) {
8616n/a *exceptionObject = _PyUnicodeTranslateError_Create(
8617n/a unicode, startpos, endpos, reason);
8618n/a }
8619n/a else {
8620n/a if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8621n/a goto onError;
8622n/a if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8623n/a goto onError;
8624n/a if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8625n/a goto onError;
8626n/a return;
8627n/a onError:
8628n/a Py_CLEAR(*exceptionObject);
8629n/a }
8630n/a}
8631n/a
8632n/a/* error handling callback helper:
8633n/a build arguments, call the callback and check the arguments,
8634n/a put the result into newpos and return the replacement string, which
8635n/a has to be freed by the caller */
8636n/astatic PyObject *
8637n/aunicode_translate_call_errorhandler(const char *errors,
8638n/a PyObject **errorHandler,
8639n/a const char *reason,
8640n/a PyObject *unicode, PyObject **exceptionObject,
8641n/a Py_ssize_t startpos, Py_ssize_t endpos,
8642n/a Py_ssize_t *newpos)
8643n/a{
8644n/a static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8645n/a
8646n/a Py_ssize_t i_newpos;
8647n/a PyObject *restuple;
8648n/a PyObject *resunicode;
8649n/a
8650n/a if (*errorHandler == NULL) {
8651n/a *errorHandler = PyCodec_LookupError(errors);
8652n/a if (*errorHandler == NULL)
8653n/a return NULL;
8654n/a }
8655n/a
8656n/a make_translate_exception(exceptionObject,
8657n/a unicode, startpos, endpos, reason);
8658n/a if (*exceptionObject == NULL)
8659n/a return NULL;
8660n/a
8661n/a restuple = PyObject_CallFunctionObjArgs(
8662n/a *errorHandler, *exceptionObject, NULL);
8663n/a if (restuple == NULL)
8664n/a return NULL;
8665n/a if (!PyTuple_Check(restuple)) {
8666n/a PyErr_SetString(PyExc_TypeError, &argparse[3]);
8667n/a Py_DECREF(restuple);
8668n/a return NULL;
8669n/a }
8670n/a if (!PyArg_ParseTuple(restuple, argparse,
8671n/a &resunicode, &i_newpos)) {
8672n/a Py_DECREF(restuple);
8673n/a return NULL;
8674n/a }
8675n/a if (i_newpos<0)
8676n/a *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8677n/a else
8678n/a *newpos = i_newpos;
8679n/a if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8680n/a PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8681n/a Py_DECREF(restuple);
8682n/a return NULL;
8683n/a }
8684n/a Py_INCREF(resunicode);
8685n/a Py_DECREF(restuple);
8686n/a return resunicode;
8687n/a}
8688n/a
8689n/a/* Lookup the character ch in the mapping and put the result in result,
8690n/a which must be decrefed by the caller.
8691n/a Return 0 on success, -1 on error */
8692n/astatic int
8693n/acharmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8694n/a{
8695n/a PyObject *w = PyLong_FromLong((long)c);
8696n/a PyObject *x;
8697n/a
8698n/a if (w == NULL)
8699n/a return -1;
8700n/a x = PyObject_GetItem(mapping, w);
8701n/a Py_DECREF(w);
8702n/a if (x == NULL) {
8703n/a if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8704n/a /* No mapping found means: use 1:1 mapping. */
8705n/a PyErr_Clear();
8706n/a *result = NULL;
8707n/a return 0;
8708n/a } else
8709n/a return -1;
8710n/a }
8711n/a else if (x == Py_None) {
8712n/a *result = x;
8713n/a return 0;
8714n/a }
8715n/a else if (PyLong_Check(x)) {
8716n/a long value = PyLong_AS_LONG(x);
8717n/a if (value < 0 || value > MAX_UNICODE) {
8718n/a PyErr_Format(PyExc_ValueError,
8719n/a "character mapping must be in range(0x%x)",
8720n/a MAX_UNICODE+1);
8721n/a Py_DECREF(x);
8722n/a return -1;
8723n/a }
8724n/a *result = x;
8725n/a return 0;
8726n/a }
8727n/a else if (PyUnicode_Check(x)) {
8728n/a *result = x;
8729n/a return 0;
8730n/a }
8731n/a else {
8732n/a /* wrong return value */
8733n/a PyErr_SetString(PyExc_TypeError,
8734n/a "character mapping must return integer, None or str");
8735n/a Py_DECREF(x);
8736n/a return -1;
8737n/a }
8738n/a}
8739n/a
8740n/a/* lookup the character, write the result into the writer.
8741n/a Return 1 if the result was written into the writer, return 0 if the mapping
8742n/a was undefined, raise an exception return -1 on error. */
8743n/astatic int
8744n/acharmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8745n/a _PyUnicodeWriter *writer)
8746n/a{
8747n/a PyObject *item;
8748n/a
8749n/a if (charmaptranslate_lookup(ch, mapping, &item))
8750n/a return -1;
8751n/a
8752n/a if (item == NULL) {
8753n/a /* not found => default to 1:1 mapping */
8754n/a if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8755n/a return -1;
8756n/a }
8757n/a return 1;
8758n/a }
8759n/a
8760n/a if (item == Py_None) {
8761n/a Py_DECREF(item);
8762n/a return 0;
8763n/a }
8764n/a
8765n/a if (PyLong_Check(item)) {
8766n/a long ch = (Py_UCS4)PyLong_AS_LONG(item);
8767n/a /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8768n/a used it */
8769n/a if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8770n/a Py_DECREF(item);
8771n/a return -1;
8772n/a }
8773n/a Py_DECREF(item);
8774n/a return 1;
8775n/a }
8776n/a
8777n/a if (!PyUnicode_Check(item)) {
8778n/a Py_DECREF(item);
8779n/a return -1;
8780n/a }
8781n/a
8782n/a if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8783n/a Py_DECREF(item);
8784n/a return -1;
8785n/a }
8786n/a
8787n/a Py_DECREF(item);
8788n/a return 1;
8789n/a}
8790n/a
8791n/astatic int
8792n/aunicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8793n/a Py_UCS1 *translate)
8794n/a{
8795n/a PyObject *item = NULL;
8796n/a int ret = 0;
8797n/a
8798n/a if (charmaptranslate_lookup(ch, mapping, &item)) {
8799n/a return -1;
8800n/a }
8801n/a
8802n/a if (item == Py_None) {
8803n/a /* deletion */
8804n/a translate[ch] = 0xfe;
8805n/a }
8806n/a else if (item == NULL) {
8807n/a /* not found => default to 1:1 mapping */
8808n/a translate[ch] = ch;
8809n/a return 1;
8810n/a }
8811n/a else if (PyLong_Check(item)) {
8812n/a long replace = PyLong_AS_LONG(item);
8813n/a /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8814n/a used it */
8815n/a if (127 < replace) {
8816n/a /* invalid character or character outside ASCII:
8817n/a skip the fast translate */
8818n/a goto exit;
8819n/a }
8820n/a translate[ch] = (Py_UCS1)replace;
8821n/a }
8822n/a else if (PyUnicode_Check(item)) {
8823n/a Py_UCS4 replace;
8824n/a
8825n/a if (PyUnicode_READY(item) == -1) {
8826n/a Py_DECREF(item);
8827n/a return -1;
8828n/a }
8829n/a if (PyUnicode_GET_LENGTH(item) != 1)
8830n/a goto exit;
8831n/a
8832n/a replace = PyUnicode_READ_CHAR(item, 0);
8833n/a if (replace > 127)
8834n/a goto exit;
8835n/a translate[ch] = (Py_UCS1)replace;
8836n/a }
8837n/a else {
8838n/a /* not None, NULL, long or unicode */
8839n/a goto exit;
8840n/a }
8841n/a ret = 1;
8842n/a
8843n/a exit:
8844n/a Py_DECREF(item);
8845n/a return ret;
8846n/a}
8847n/a
8848n/a/* Fast path for ascii => ascii translation. Return 1 if the whole string
8849n/a was translated into writer, return 0 if the input string was partially
8850n/a translated into writer, raise an exception and return -1 on error. */
8851n/astatic int
8852n/aunicode_fast_translate(PyObject *input, PyObject *mapping,
8853n/a _PyUnicodeWriter *writer, int ignore,
8854n/a Py_ssize_t *input_pos)
8855n/a{
8856n/a Py_UCS1 ascii_table[128], ch, ch2;
8857n/a Py_ssize_t len;
8858n/a Py_UCS1 *in, *end, *out;
8859n/a int res = 0;
8860n/a
8861n/a len = PyUnicode_GET_LENGTH(input);
8862n/a
8863n/a memset(ascii_table, 0xff, 128);
8864n/a
8865n/a in = PyUnicode_1BYTE_DATA(input);
8866n/a end = in + len;
8867n/a
8868n/a assert(PyUnicode_IS_ASCII(writer->buffer));
8869n/a assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8870n/a out = PyUnicode_1BYTE_DATA(writer->buffer);
8871n/a
8872n/a for (; in < end; in++) {
8873n/a ch = *in;
8874n/a ch2 = ascii_table[ch];
8875n/a if (ch2 == 0xff) {
8876n/a int translate = unicode_fast_translate_lookup(mapping, ch,
8877n/a ascii_table);
8878n/a if (translate < 0)
8879n/a return -1;
8880n/a if (translate == 0)
8881n/a goto exit;
8882n/a ch2 = ascii_table[ch];
8883n/a }
8884n/a if (ch2 == 0xfe) {
8885n/a if (ignore)
8886n/a continue;
8887n/a goto exit;
8888n/a }
8889n/a assert(ch2 < 128);
8890n/a *out = ch2;
8891n/a out++;
8892n/a }
8893n/a res = 1;
8894n/a
8895n/aexit:
8896n/a writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8897n/a *input_pos = in - PyUnicode_1BYTE_DATA(input);
8898n/a return res;
8899n/a}
8900n/a
8901n/astatic PyObject *
8902n/a_PyUnicode_TranslateCharmap(PyObject *input,
8903n/a PyObject *mapping,
8904n/a const char *errors)
8905n/a{
8906n/a /* input object */
8907n/a char *data;
8908n/a Py_ssize_t size, i;
8909n/a int kind;
8910n/a /* output buffer */
8911n/a _PyUnicodeWriter writer;
8912n/a /* error handler */
8913n/a char *reason = "character maps to <undefined>";
8914n/a PyObject *errorHandler = NULL;
8915n/a PyObject *exc = NULL;
8916n/a int ignore;
8917n/a int res;
8918n/a
8919n/a if (mapping == NULL) {
8920n/a PyErr_BadArgument();
8921n/a return NULL;
8922n/a }
8923n/a
8924n/a if (PyUnicode_READY(input) == -1)
8925n/a return NULL;
8926n/a data = (char*)PyUnicode_DATA(input);
8927n/a kind = PyUnicode_KIND(input);
8928n/a size = PyUnicode_GET_LENGTH(input);
8929n/a
8930n/a if (size == 0)
8931n/a return PyUnicode_FromObject(input);
8932n/a
8933n/a /* allocate enough for a simple 1:1 translation without
8934n/a replacements, if we need more, we'll resize */
8935n/a _PyUnicodeWriter_Init(&writer);
8936n/a if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
8937n/a goto onError;
8938n/a
8939n/a ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8940n/a
8941n/a if (PyUnicode_READY(input) == -1)
8942n/a return NULL;
8943n/a if (PyUnicode_IS_ASCII(input)) {
8944n/a res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8945n/a if (res < 0) {
8946n/a _PyUnicodeWriter_Dealloc(&writer);
8947n/a return NULL;
8948n/a }
8949n/a if (res == 1)
8950n/a return _PyUnicodeWriter_Finish(&writer);
8951n/a }
8952n/a else {
8953n/a i = 0;
8954n/a }
8955n/a
8956n/a while (i<size) {
8957n/a /* try to encode it */
8958n/a int translate;
8959n/a PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8960n/a Py_ssize_t newpos;
8961n/a /* startpos for collecting untranslatable chars */
8962n/a Py_ssize_t collstart;
8963n/a Py_ssize_t collend;
8964n/a Py_UCS4 ch;
8965n/a
8966n/a ch = PyUnicode_READ(kind, data, i);
8967n/a translate = charmaptranslate_output(ch, mapping, &writer);
8968n/a if (translate < 0)
8969n/a goto onError;
8970n/a
8971n/a if (translate != 0) {
8972n/a /* it worked => adjust input pointer */
8973n/a ++i;
8974n/a continue;
8975n/a }
8976n/a
8977n/a /* untranslatable character */
8978n/a collstart = i;
8979n/a collend = i+1;
8980n/a
8981n/a /* find all untranslatable characters */
8982n/a while (collend < size) {
8983n/a PyObject *x;
8984n/a ch = PyUnicode_READ(kind, data, collend);
8985n/a if (charmaptranslate_lookup(ch, mapping, &x))
8986n/a goto onError;
8987n/a Py_XDECREF(x);
8988n/a if (x != Py_None)
8989n/a break;
8990n/a ++collend;
8991n/a }
8992n/a
8993n/a if (ignore) {
8994n/a i = collend;
8995n/a }
8996n/a else {
8997n/a repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8998n/a reason, input, &exc,
8999n/a collstart, collend, &newpos);
9000n/a if (repunicode == NULL)
9001n/a goto onError;
9002n/a if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9003n/a Py_DECREF(repunicode);
9004n/a goto onError;
9005n/a }
9006n/a Py_DECREF(repunicode);
9007n/a i = newpos;
9008n/a }
9009n/a }
9010n/a Py_XDECREF(exc);
9011n/a Py_XDECREF(errorHandler);
9012n/a return _PyUnicodeWriter_Finish(&writer);
9013n/a
9014n/a onError:
9015n/a _PyUnicodeWriter_Dealloc(&writer);
9016n/a Py_XDECREF(exc);
9017n/a Py_XDECREF(errorHandler);
9018n/a return NULL;
9019n/a}
9020n/a
9021n/a/* Deprecated. Use PyUnicode_Translate instead. */
9022n/aPyObject *
9023n/aPyUnicode_TranslateCharmap(const Py_UNICODE *p,
9024n/a Py_ssize_t size,
9025n/a PyObject *mapping,
9026n/a const char *errors)
9027n/a{
9028n/a PyObject *result;
9029n/a PyObject *unicode = PyUnicode_FromWideChar(p, size);
9030n/a if (!unicode)
9031n/a return NULL;
9032n/a result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9033n/a Py_DECREF(unicode);
9034n/a return result;
9035n/a}
9036n/a
9037n/aPyObject *
9038n/aPyUnicode_Translate(PyObject *str,
9039n/a PyObject *mapping,
9040n/a const char *errors)
9041n/a{
9042n/a if (ensure_unicode(str) < 0)
9043n/a return NULL;
9044n/a return _PyUnicode_TranslateCharmap(str, mapping, errors);
9045n/a}
9046n/a
9047n/astatic Py_UCS4
9048n/afix_decimal_and_space_to_ascii(PyObject *self)
9049n/a{
9050n/a /* No need to call PyUnicode_READY(self) because this function is only
9051n/a called as a callback from fixup() which does it already. */
9052n/a const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9053n/a const int kind = PyUnicode_KIND(self);
9054n/a void *data = PyUnicode_DATA(self);
9055n/a Py_UCS4 maxchar = 127, ch, fixed;
9056n/a int modified = 0;
9057n/a Py_ssize_t i;
9058n/a
9059n/a for (i = 0; i < len; ++i) {
9060n/a ch = PyUnicode_READ(kind, data, i);
9061n/a fixed = 0;
9062n/a if (ch > 127) {
9063n/a if (Py_UNICODE_ISSPACE(ch))
9064n/a fixed = ' ';
9065n/a else {
9066n/a const int decimal = Py_UNICODE_TODECIMAL(ch);
9067n/a if (decimal >= 0)
9068n/a fixed = '0' + decimal;
9069n/a }
9070n/a if (fixed != 0) {
9071n/a modified = 1;
9072n/a maxchar = Py_MAX(maxchar, fixed);
9073n/a PyUnicode_WRITE(kind, data, i, fixed);
9074n/a }
9075n/a else
9076n/a maxchar = Py_MAX(maxchar, ch);
9077n/a }
9078n/a }
9079n/a
9080n/a return (modified) ? maxchar : 0;
9081n/a}
9082n/a
9083n/aPyObject *
9084n/a_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9085n/a{
9086n/a if (!PyUnicode_Check(unicode)) {
9087n/a PyErr_BadInternalCall();
9088n/a return NULL;
9089n/a }
9090n/a if (PyUnicode_READY(unicode) == -1)
9091n/a return NULL;
9092n/a if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9093n/a /* If the string is already ASCII, just return the same string */
9094n/a Py_INCREF(unicode);
9095n/a return unicode;
9096n/a }
9097n/a return fixup(unicode, fix_decimal_and_space_to_ascii);
9098n/a}
9099n/a
9100n/aPyObject *
9101n/aPyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9102n/a Py_ssize_t length)
9103n/a{
9104n/a PyObject *decimal;
9105n/a Py_ssize_t i;
9106n/a Py_UCS4 maxchar;
9107n/a enum PyUnicode_Kind kind;
9108n/a void *data;
9109n/a
9110n/a maxchar = 127;
9111n/a for (i = 0; i < length; i++) {
9112n/a Py_UCS4 ch = s[i];
9113n/a if (ch > 127) {
9114n/a int decimal = Py_UNICODE_TODECIMAL(ch);
9115n/a if (decimal >= 0)
9116n/a ch = '0' + decimal;
9117n/a maxchar = Py_MAX(maxchar, ch);
9118n/a }
9119n/a }
9120n/a
9121n/a /* Copy to a new string */
9122n/a decimal = PyUnicode_New(length, maxchar);
9123n/a if (decimal == NULL)
9124n/a return decimal;
9125n/a kind = PyUnicode_KIND(decimal);
9126n/a data = PyUnicode_DATA(decimal);
9127n/a /* Iterate over code points */
9128n/a for (i = 0; i < length; i++) {
9129n/a Py_UCS4 ch = s[i];
9130n/a if (ch > 127) {
9131n/a int decimal = Py_UNICODE_TODECIMAL(ch);
9132n/a if (decimal >= 0)
9133n/a ch = '0' + decimal;
9134n/a }
9135n/a PyUnicode_WRITE(kind, data, i, ch);
9136n/a }
9137n/a return unicode_result(decimal);
9138n/a}
9139n/a/* --- Decimal Encoder ---------------------------------------------------- */
9140n/a
9141n/aint
9142n/aPyUnicode_EncodeDecimal(Py_UNICODE *s,
9143n/a Py_ssize_t length,
9144n/a char *output,
9145n/a const char *errors)
9146n/a{
9147n/a PyObject *unicode;
9148n/a Py_ssize_t i;
9149n/a enum PyUnicode_Kind kind;
9150n/a void *data;
9151n/a
9152n/a if (output == NULL) {
9153n/a PyErr_BadArgument();
9154n/a return -1;
9155n/a }
9156n/a
9157n/a unicode = PyUnicode_FromWideChar(s, length);
9158n/a if (unicode == NULL)
9159n/a return -1;
9160n/a
9161n/a kind = PyUnicode_KIND(unicode);
9162n/a data = PyUnicode_DATA(unicode);
9163n/a
9164n/a for (i=0; i < length; ) {
9165n/a PyObject *exc;
9166n/a Py_UCS4 ch;
9167n/a int decimal;
9168n/a Py_ssize_t startpos;
9169n/a
9170n/a ch = PyUnicode_READ(kind, data, i);
9171n/a
9172n/a if (Py_UNICODE_ISSPACE(ch)) {
9173n/a *output++ = ' ';
9174n/a i++;
9175n/a continue;
9176n/a }
9177n/a decimal = Py_UNICODE_TODECIMAL(ch);
9178n/a if (decimal >= 0) {
9179n/a *output++ = '0' + decimal;
9180n/a i++;
9181n/a continue;
9182n/a }
9183n/a if (0 < ch && ch < 256) {
9184n/a *output++ = (char)ch;
9185n/a i++;
9186n/a continue;
9187n/a }
9188n/a
9189n/a startpos = i;
9190n/a exc = NULL;
9191n/a raise_encode_exception(&exc, "decimal", unicode,
9192n/a startpos, startpos+1,
9193n/a "invalid decimal Unicode string");
9194n/a Py_XDECREF(exc);
9195n/a Py_DECREF(unicode);
9196n/a return -1;
9197n/a }
9198n/a /* 0-terminate the output string */
9199n/a *output++ = '\0';
9200n/a Py_DECREF(unicode);
9201n/a return 0;
9202n/a}
9203n/a
9204n/a/* --- Helpers ------------------------------------------------------------ */
9205n/a
/* Helper macro to fix up start/end slice values in place.

   `end` is clipped to `len`; a negative `start` or `end` is taken relative
   to `len` and clipped to 0 if it is still negative.  `start` and `end`
   must be modifiable lvalues.  `len` is evaluated more than once, so it must
   not have side effects.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and can safely be used in an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9220n/a
9221n/astatic Py_ssize_t
9222n/aany_find_slice(PyObject* s1, PyObject* s2,
9223n/a Py_ssize_t start,
9224n/a Py_ssize_t end,
9225n/a int direction)
9226n/a{
9227n/a int kind1, kind2;
9228n/a void *buf1, *buf2;
9229n/a Py_ssize_t len1, len2, result;
9230n/a
9231n/a kind1 = PyUnicode_KIND(s1);
9232n/a kind2 = PyUnicode_KIND(s2);
9233n/a if (kind1 < kind2)
9234n/a return -1;
9235n/a
9236n/a len1 = PyUnicode_GET_LENGTH(s1);
9237n/a len2 = PyUnicode_GET_LENGTH(s2);
9238n/a ADJUST_INDICES(start, end, len1);
9239n/a if (end - start < len2)
9240n/a return -1;
9241n/a
9242n/a buf1 = PyUnicode_DATA(s1);
9243n/a buf2 = PyUnicode_DATA(s2);
9244n/a if (len2 == 1) {
9245n/a Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9246n/a result = findchar((const char *)buf1 + kind1*start,
9247n/a kind1, end - start, ch, direction);
9248n/a if (result == -1)
9249n/a return -1;
9250n/a else
9251n/a return start + result;
9252n/a }
9253n/a
9254n/a if (kind2 != kind1) {
9255n/a buf2 = _PyUnicode_AsKind(s2, kind1);
9256n/a if (!buf2)
9257n/a return -2;
9258n/a }
9259n/a
9260n/a if (direction > 0) {
9261n/a switch (kind1) {
9262n/a case PyUnicode_1BYTE_KIND:
9263n/a if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9264n/a result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9265n/a else
9266n/a result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9267n/a break;
9268n/a case PyUnicode_2BYTE_KIND:
9269n/a result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9270n/a break;
9271n/a case PyUnicode_4BYTE_KIND:
9272n/a result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9273n/a break;
9274n/a default:
9275n/a assert(0); result = -2;
9276n/a }
9277n/a }
9278n/a else {
9279n/a switch (kind1) {
9280n/a case PyUnicode_1BYTE_KIND:
9281n/a if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9282n/a result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9283n/a else
9284n/a result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9285n/a break;
9286n/a case PyUnicode_2BYTE_KIND:
9287n/a result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9288n/a break;
9289n/a case PyUnicode_4BYTE_KIND:
9290n/a result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9291n/a break;
9292n/a default:
9293n/a assert(0); result = -2;
9294n/a }
9295n/a }
9296n/a
9297n/a if (kind2 != kind1)
9298n/a PyMem_Free(buf2);
9299n/a
9300n/a return result;
9301n/a}
9302n/a
9303n/aPy_ssize_t
9304n/a_PyUnicode_InsertThousandsGrouping(
9305n/a PyObject *unicode, Py_ssize_t index,
9306n/a Py_ssize_t n_buffer,
9307n/a void *digits, Py_ssize_t n_digits,
9308n/a Py_ssize_t min_width,
9309n/a const char *grouping, PyObject *thousands_sep,
9310n/a Py_UCS4 *maxchar)
9311n/a{
9312n/a unsigned int kind, thousands_sep_kind;
9313n/a char *data, *thousands_sep_data;
9314n/a Py_ssize_t thousands_sep_len;
9315n/a Py_ssize_t len;
9316n/a
9317n/a if (unicode != NULL) {
9318n/a kind = PyUnicode_KIND(unicode);
9319n/a data = (char *) PyUnicode_DATA(unicode) + index * kind;
9320n/a }
9321n/a else {
9322n/a kind = PyUnicode_1BYTE_KIND;
9323n/a data = NULL;
9324n/a }
9325n/a thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9326n/a thousands_sep_data = PyUnicode_DATA(thousands_sep);
9327n/a thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9328n/a if (unicode != NULL && thousands_sep_kind != kind) {
9329n/a if (thousands_sep_kind < kind) {
9330n/a thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9331n/a if (!thousands_sep_data)
9332n/a return -1;
9333n/a }
9334n/a else {
9335n/a data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9336n/a if (!data)
9337n/a return -1;
9338n/a }
9339n/a }
9340n/a
9341n/a switch (kind) {
9342n/a case PyUnicode_1BYTE_KIND:
9343n/a if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9344n/a len = asciilib_InsertThousandsGrouping(
9345n/a (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
9346n/a min_width, grouping,
9347n/a (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
9348n/a else
9349n/a len = ucs1lib_InsertThousandsGrouping(
9350n/a (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9351n/a min_width, grouping,
9352n/a (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
9353n/a break;
9354n/a case PyUnicode_2BYTE_KIND:
9355n/a len = ucs2lib_InsertThousandsGrouping(
9356n/a (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
9357n/a min_width, grouping,
9358n/a (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
9359n/a break;
9360n/a case PyUnicode_4BYTE_KIND:
9361n/a len = ucs4lib_InsertThousandsGrouping(
9362n/a (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
9363n/a min_width, grouping,
9364n/a (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
9365n/a break;
9366n/a default:
9367n/a assert(0);
9368n/a return -1;
9369n/a }
9370n/a if (unicode != NULL && thousands_sep_kind != kind) {
9371n/a if (thousands_sep_kind < kind)
9372n/a PyMem_Free(thousands_sep_data);
9373n/a else
9374n/a PyMem_Free(data);
9375n/a }
9376n/a if (unicode == NULL) {
9377n/a *maxchar = 127;
9378n/a if (len != n_digits) {
9379n/a *maxchar = Py_MAX(*maxchar,
9380n/a PyUnicode_MAX_CHAR_VALUE(thousands_sep));
9381n/a }
9382n/a }
9383n/a return len;
9384n/a}
9385n/a
9386n/a
9387n/aPy_ssize_t
9388n/aPyUnicode_Count(PyObject *str,
9389n/a PyObject *substr,
9390n/a Py_ssize_t start,
9391n/a Py_ssize_t end)
9392n/a{
9393n/a Py_ssize_t result;
9394n/a int kind1, kind2;
9395n/a void *buf1 = NULL, *buf2 = NULL;
9396n/a Py_ssize_t len1, len2;
9397n/a
9398n/a if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9399n/a return -1;
9400n/a
9401n/a kind1 = PyUnicode_KIND(str);
9402n/a kind2 = PyUnicode_KIND(substr);
9403n/a if (kind1 < kind2)
9404n/a return 0;
9405n/a
9406n/a len1 = PyUnicode_GET_LENGTH(str);
9407n/a len2 = PyUnicode_GET_LENGTH(substr);
9408n/a ADJUST_INDICES(start, end, len1);
9409n/a if (end - start < len2)
9410n/a return 0;
9411n/a
9412n/a buf1 = PyUnicode_DATA(str);
9413n/a buf2 = PyUnicode_DATA(substr);
9414n/a if (kind2 != kind1) {
9415n/a buf2 = _PyUnicode_AsKind(substr, kind1);
9416n/a if (!buf2)
9417n/a goto onError;
9418n/a }
9419n/a
9420n/a switch (kind1) {
9421n/a case PyUnicode_1BYTE_KIND:
9422n/a if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9423n/a result = asciilib_count(
9424n/a ((Py_UCS1*)buf1) + start, end - start,
9425n/a buf2, len2, PY_SSIZE_T_MAX
9426n/a );
9427n/a else
9428n/a result = ucs1lib_count(
9429n/a ((Py_UCS1*)buf1) + start, end - start,
9430n/a buf2, len2, PY_SSIZE_T_MAX
9431n/a );
9432n/a break;
9433n/a case PyUnicode_2BYTE_KIND:
9434n/a result = ucs2lib_count(
9435n/a ((Py_UCS2*)buf1) + start, end - start,
9436n/a buf2, len2, PY_SSIZE_T_MAX
9437n/a );
9438n/a break;
9439n/a case PyUnicode_4BYTE_KIND:
9440n/a result = ucs4lib_count(
9441n/a ((Py_UCS4*)buf1) + start, end - start,
9442n/a buf2, len2, PY_SSIZE_T_MAX
9443n/a );
9444n/a break;
9445n/a default:
9446n/a assert(0); result = 0;
9447n/a }
9448n/a
9449n/a if (kind2 != kind1)
9450n/a PyMem_Free(buf2);
9451n/a
9452n/a return result;
9453n/a onError:
9454n/a if (kind2 != kind1 && buf2)
9455n/a PyMem_Free(buf2);
9456n/a return -1;
9457n/a}
9458n/a
9459n/aPy_ssize_t
9460n/aPyUnicode_Find(PyObject *str,
9461n/a PyObject *substr,
9462n/a Py_ssize_t start,
9463n/a Py_ssize_t end,
9464n/a int direction)
9465n/a{
9466n/a if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9467n/a return -2;
9468n/a
9469n/a return any_find_slice(str, substr, start, end, direction);
9470n/a}
9471n/a
9472n/aPy_ssize_t
9473n/aPyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9474n/a Py_ssize_t start, Py_ssize_t end,
9475n/a int direction)
9476n/a{
9477n/a int kind;
9478n/a Py_ssize_t len, result;
9479n/a if (PyUnicode_READY(str) == -1)
9480n/a return -2;
9481n/a len = PyUnicode_GET_LENGTH(str);
9482n/a ADJUST_INDICES(start, end, len);
9483n/a if (end - start < 1)
9484n/a return -1;
9485n/a kind = PyUnicode_KIND(str);
9486n/a result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9487n/a kind, end-start, ch, direction);
9488n/a if (result == -1)
9489n/a return -1;
9490n/a else
9491n/a return start + result;
9492n/a}
9493n/a
9494n/astatic int
9495n/atailmatch(PyObject *self,
9496n/a PyObject *substring,
9497n/a Py_ssize_t start,
9498n/a Py_ssize_t end,
9499n/a int direction)
9500n/a{
9501n/a int kind_self;
9502n/a int kind_sub;
9503n/a void *data_self;
9504n/a void *data_sub;
9505n/a Py_ssize_t offset;
9506n/a Py_ssize_t i;
9507n/a Py_ssize_t end_sub;
9508n/a
9509n/a if (PyUnicode_READY(self) == -1 ||
9510n/a PyUnicode_READY(substring) == -1)
9511n/a return -1;
9512n/a
9513n/a ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9514n/a end -= PyUnicode_GET_LENGTH(substring);
9515n/a if (end < start)
9516n/a return 0;
9517n/a
9518n/a if (PyUnicode_GET_LENGTH(substring) == 0)
9519n/a return 1;
9520n/a
9521n/a kind_self = PyUnicode_KIND(self);
9522n/a data_self = PyUnicode_DATA(self);
9523n/a kind_sub = PyUnicode_KIND(substring);
9524n/a data_sub = PyUnicode_DATA(substring);
9525n/a end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9526n/a
9527n/a if (direction > 0)
9528n/a offset = end;
9529n/a else
9530n/a offset = start;
9531n/a
9532n/a if (PyUnicode_READ(kind_self, data_self, offset) ==
9533n/a PyUnicode_READ(kind_sub, data_sub, 0) &&
9534n/a PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9535n/a PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9536n/a /* If both are of the same kind, memcmp is sufficient */
9537n/a if (kind_self == kind_sub) {
9538n/a return ! memcmp((char *)data_self +
9539n/a (offset * PyUnicode_KIND(substring)),
9540n/a data_sub,
9541n/a PyUnicode_GET_LENGTH(substring) *
9542n/a PyUnicode_KIND(substring));
9543n/a }
9544n/a /* otherwise we have to compare each character by first accessing it */
9545n/a else {
9546n/a /* We do not need to compare 0 and len(substring)-1 because
9547n/a the if statement above ensured already that they are equal
9548n/a when we end up here. */
9549n/a for (i = 1; i < end_sub; ++i) {
9550n/a if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9551n/a PyUnicode_READ(kind_sub, data_sub, i))
9552n/a return 0;
9553n/a }
9554n/a return 1;
9555n/a }
9556n/a }
9557n/a
9558n/a return 0;
9559n/a}
9560n/a
9561n/aPy_ssize_t
9562n/aPyUnicode_Tailmatch(PyObject *str,
9563n/a PyObject *substr,
9564n/a Py_ssize_t start,
9565n/a Py_ssize_t end,
9566n/a int direction)
9567n/a{
9568n/a if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9569n/a return -1;
9570n/a
9571n/a return tailmatch(str, substr, start, end, direction);
9572n/a}
9573n/a
9574n/a/* Apply fixfct filter to the Unicode object self and return a
9575n/a reference to the modified object */
9576n/a
9577n/astatic PyObject *
9578n/afixup(PyObject *self,
9579n/a Py_UCS4 (*fixfct)(PyObject *s))
9580n/a{
9581n/a PyObject *u;
9582n/a Py_UCS4 maxchar_old, maxchar_new = 0;
9583n/a PyObject *v;
9584n/a
9585n/a u = _PyUnicode_Copy(self);
9586n/a if (u == NULL)
9587n/a return NULL;
9588n/a maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
9589n/a
9590n/a /* fix functions return the new maximum character in a string,
9591n/a if the kind of the resulting unicode object does not change,
9592n/a everything is fine. Otherwise we need to change the string kind
9593n/a and re-run the fix function. */
9594n/a maxchar_new = fixfct(u);
9595n/a
9596n/a if (maxchar_new == 0) {
9597n/a /* no changes */;
9598n/a if (PyUnicode_CheckExact(self)) {
9599n/a Py_DECREF(u);
9600n/a Py_INCREF(self);
9601n/a return self;
9602n/a }
9603n/a else
9604n/a return u;
9605n/a }
9606n/a
9607n/a maxchar_new = align_maxchar(maxchar_new);
9608n/a
9609n/a if (maxchar_new == maxchar_old)
9610n/a return u;
9611n/a
9612n/a /* In case the maximum character changed, we need to
9613n/a convert the string to the new category. */
9614n/a v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9615n/a if (v == NULL) {
9616n/a Py_DECREF(u);
9617n/a return NULL;
9618n/a }
9619n/a if (maxchar_new > maxchar_old) {
9620n/a /* If the maxchar increased so that the kind changed, not all
9621n/a characters are representable anymore and we need to fix the
9622n/a string again. This only happens in very few cases. */
9623n/a _PyUnicode_FastCopyCharacters(v, 0,
9624n/a self, 0, PyUnicode_GET_LENGTH(self));
9625n/a maxchar_old = fixfct(v);
9626n/a assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9627n/a }
9628n/a else {
9629n/a _PyUnicode_FastCopyCharacters(v, 0,
9630n/a u, 0, PyUnicode_GET_LENGTH(self));
9631n/a }
9632n/a Py_DECREF(u);
9633n/a assert(_PyUnicode_CheckConsistency(v, 1));
9634n/a return v;
9635n/a}
9636n/a
9637n/astatic PyObject *
9638n/aascii_upper_or_lower(PyObject *self, int lower)
9639n/a{
9640n/a Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9641n/a char *resdata, *data = PyUnicode_DATA(self);
9642n/a PyObject *res;
9643n/a
9644n/a res = PyUnicode_New(len, 127);
9645n/a if (res == NULL)
9646n/a return NULL;
9647n/a resdata = PyUnicode_DATA(res);
9648n/a if (lower)
9649n/a _Py_bytes_lower(resdata, data, len);
9650n/a else
9651n/a _Py_bytes_upper(resdata, data, len);
9652n/a return res;
9653n/a}
9654n/a
9655n/astatic Py_UCS4
9656n/ahandle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9657n/a{
9658n/a Py_ssize_t j;
9659n/a int final_sigma;
9660n/a Py_UCS4 c = 0; /* initialize to prevent gcc warning */
9661n/a /* U+03A3 is in the Final_Sigma context when, it is found like this:
9662n/a
9663n/a \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9664n/a
9665n/a where ! is a negation and \p{xxx} is a character with property xxx.
9666n/a */
9667n/a for (j = i - 1; j >= 0; j--) {
9668n/a c = PyUnicode_READ(kind, data, j);
9669n/a if (!_PyUnicode_IsCaseIgnorable(c))
9670n/a break;
9671n/a }
9672n/a final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9673n/a if (final_sigma) {
9674n/a for (j = i + 1; j < length; j++) {
9675n/a c = PyUnicode_READ(kind, data, j);
9676n/a if (!_PyUnicode_IsCaseIgnorable(c))
9677n/a break;
9678n/a }
9679n/a final_sigma = j == length || !_PyUnicode_IsCased(c);
9680n/a }
9681n/a return (final_sigma) ? 0x3C2 : 0x3C3;
9682n/a}
9683n/a
9684n/astatic int
9685n/alower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9686n/a Py_UCS4 c, Py_UCS4 *mapped)
9687n/a{
9688n/a /* Obscure special case. */
9689n/a if (c == 0x3A3) {
9690n/a mapped[0] = handle_capital_sigma(kind, data, length, i);
9691n/a return 1;
9692n/a }
9693n/a return _PyUnicode_ToLowerFull(c, mapped);
9694n/a}
9695n/a
9696n/astatic Py_ssize_t
9697n/ado_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9698n/a{
9699n/a Py_ssize_t i, k = 0;
9700n/a int n_res, j;
9701n/a Py_UCS4 c, mapped[3];
9702n/a
9703n/a c = PyUnicode_READ(kind, data, 0);
9704n/a n_res = _PyUnicode_ToUpperFull(c, mapped);
9705n/a for (j = 0; j < n_res; j++) {
9706n/a *maxchar = Py_MAX(*maxchar, mapped[j]);
9707n/a res[k++] = mapped[j];
9708n/a }
9709n/a for (i = 1; i < length; i++) {
9710n/a c = PyUnicode_READ(kind, data, i);
9711n/a n_res = lower_ucs4(kind, data, length, i, c, mapped);
9712n/a for (j = 0; j < n_res; j++) {
9713n/a *maxchar = Py_MAX(*maxchar, mapped[j]);
9714n/a res[k++] = mapped[j];
9715n/a }
9716n/a }
9717n/a return k;
9718n/a}
9719n/a
9720n/astatic Py_ssize_t
9721n/ado_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9722n/a Py_ssize_t i, k = 0;
9723n/a
9724n/a for (i = 0; i < length; i++) {
9725n/a Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9726n/a int n_res, j;
9727n/a if (Py_UNICODE_ISUPPER(c)) {
9728n/a n_res = lower_ucs4(kind, data, length, i, c, mapped);
9729n/a }
9730n/a else if (Py_UNICODE_ISLOWER(c)) {
9731n/a n_res = _PyUnicode_ToUpperFull(c, mapped);
9732n/a }
9733n/a else {
9734n/a n_res = 1;
9735n/a mapped[0] = c;
9736n/a }
9737n/a for (j = 0; j < n_res; j++) {
9738n/a *maxchar = Py_MAX(*maxchar, mapped[j]);
9739n/a res[k++] = mapped[j];
9740n/a }
9741n/a }
9742n/a return k;
9743n/a}
9744n/a
9745n/astatic Py_ssize_t
9746n/ado_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9747n/a Py_UCS4 *maxchar, int lower)
9748n/a{
9749n/a Py_ssize_t i, k = 0;
9750n/a
9751n/a for (i = 0; i < length; i++) {
9752n/a Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9753n/a int n_res, j;
9754n/a if (lower)
9755n/a n_res = lower_ucs4(kind, data, length, i, c, mapped);
9756n/a else
9757n/a n_res = _PyUnicode_ToUpperFull(c, mapped);
9758n/a for (j = 0; j < n_res; j++) {
9759n/a *maxchar = Py_MAX(*maxchar, mapped[j]);
9760n/a res[k++] = mapped[j];
9761n/a }
9762n/a }
9763n/a return k;
9764n/a}
9765n/a
9766n/astatic Py_ssize_t
9767n/ado_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9768n/a{
9769n/a return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9770n/a}
9771n/a
9772n/astatic Py_ssize_t
9773n/ado_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9774n/a{
9775n/a return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9776n/a}
9777n/a
9778n/astatic Py_ssize_t
9779n/ado_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9780n/a{
9781n/a Py_ssize_t i, k = 0;
9782n/a
9783n/a for (i = 0; i < length; i++) {
9784n/a Py_UCS4 c = PyUnicode_READ(kind, data, i);
9785n/a Py_UCS4 mapped[3];
9786n/a int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9787n/a for (j = 0; j < n_res; j++) {
9788n/a *maxchar = Py_MAX(*maxchar, mapped[j]);
9789n/a res[k++] = mapped[j];
9790n/a }
9791n/a }
9792n/a return k;
9793n/a}
9794n/a
9795n/astatic Py_ssize_t
9796n/ado_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9797n/a{
9798n/a Py_ssize_t i, k = 0;
9799n/a int previous_is_cased;
9800n/a
9801n/a previous_is_cased = 0;
9802n/a for (i = 0; i < length; i++) {
9803n/a const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9804n/a Py_UCS4 mapped[3];
9805n/a int n_res, j;
9806n/a
9807n/a if (previous_is_cased)
9808n/a n_res = lower_ucs4(kind, data, length, i, c, mapped);
9809n/a else
9810n/a n_res = _PyUnicode_ToTitleFull(c, mapped);
9811n/a
9812n/a for (j = 0; j < n_res; j++) {
9813n/a *maxchar = Py_MAX(*maxchar, mapped[j]);
9814n/a res[k++] = mapped[j];
9815n/a }
9816n/a
9817n/a previous_is_cased = _PyUnicode_IsCased(c);
9818n/a }
9819n/a return k;
9820n/a}
9821n/a
9822n/astatic PyObject *
9823n/acase_operation(PyObject *self,
9824n/a Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9825n/a{
9826n/a PyObject *res = NULL;
9827n/a Py_ssize_t length, newlength = 0;
9828n/a int kind, outkind;
9829n/a void *data, *outdata;
9830n/a Py_UCS4 maxchar = 0, *tmp, *tmpend;
9831n/a
9832n/a assert(PyUnicode_IS_READY(self));
9833n/a
9834n/a kind = PyUnicode_KIND(self);
9835n/a data = PyUnicode_DATA(self);
9836n/a length = PyUnicode_GET_LENGTH(self);
9837n/a if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9838n/a PyErr_SetString(PyExc_OverflowError, "string is too long");
9839n/a return NULL;
9840n/a }
9841n/a tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9842n/a if (tmp == NULL)
9843n/a return PyErr_NoMemory();
9844n/a newlength = perform(kind, data, length, tmp, &maxchar);
9845n/a res = PyUnicode_New(newlength, maxchar);
9846n/a if (res == NULL)
9847n/a goto leave;
9848n/a tmpend = tmp + newlength;
9849n/a outdata = PyUnicode_DATA(res);
9850n/a outkind = PyUnicode_KIND(res);
9851n/a switch (outkind) {
9852n/a case PyUnicode_1BYTE_KIND:
9853n/a _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9854n/a break;
9855n/a case PyUnicode_2BYTE_KIND:
9856n/a _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9857n/a break;
9858n/a case PyUnicode_4BYTE_KIND:
9859n/a memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9860n/a break;
9861n/a default:
9862n/a assert(0);
9863n/a break;
9864n/a }
9865n/a leave:
9866n/a PyMem_FREE(tmp);
9867n/a return res;
9868n/a}
9869n/a
9870n/aPyObject *
9871n/aPyUnicode_Join(PyObject *separator, PyObject *seq)
9872n/a{
9873n/a PyObject *res;
9874n/a PyObject *fseq;
9875n/a Py_ssize_t seqlen;
9876n/a PyObject **items;
9877n/a
9878n/a fseq = PySequence_Fast(seq, "can only join an iterable");
9879n/a if (fseq == NULL) {
9880n/a return NULL;
9881n/a }
9882n/a
9883n/a /* NOTE: the following code can't call back into Python code,
9884n/a * so we are sure that fseq won't be mutated.
9885n/a */
9886n/a
9887n/a items = PySequence_Fast_ITEMS(fseq);
9888n/a seqlen = PySequence_Fast_GET_SIZE(fseq);
9889n/a res = _PyUnicode_JoinArray(separator, items, seqlen);
9890n/a Py_DECREF(fseq);
9891n/a return res;
9892n/a}
9893n/a
/* Join the `seqlen` objects in `items`, inserting `separator` between
   consecutive items, and return the result as a new reference.

   - An empty array yields the empty string.
   - A single item that is an exact str is returned itself (with a new
     reference); no separator handling takes place in that case.
   - A NULL `separator` means a single blank space.

   On failure NULL is returned with an exception set: TypeError when the
   separator or an item is not a str, OverflowError when the total length
   does not fit in Py_ssize_t; other failures come from the helpers called
   (readying an object, allocating the separator or the result). */
PyObject *
_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
{
    PyObject *res = NULL; /* the result */
    PyObject *sep = NULL;
    Py_ssize_t seplen;
    PyObject *item;
    Py_ssize_t sz, i, res_offset;
    Py_UCS4 maxchar;
    Py_UCS4 item_maxchar;
    int use_memcpy;
    unsigned char *res_data = NULL, *sep_data = NULL;
    PyObject *last_obj;
    unsigned int kind = 0;

    /* If empty sequence, return u"". */
    if (seqlen == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* If singleton sequence with an exact Unicode, return that. */
    last_obj = NULL;
    if (seqlen == 1) {
        if (PyUnicode_CheckExact(items[0])) {
            res = items[0];
            Py_INCREF(res);
            return res;
        }
        /* A lone non-exact item: no separator is ever emitted, and `sep`
           stays NULL. */
        seplen = 0;
        maxchar = 0;
    }
    else {
        /* Set up sep and seplen */
        if (separator == NULL) {
            /* fall back to a blank space separator */
            sep = PyUnicode_FromOrdinal(' ');
            if (!sep)
                goto onError;
            seplen = 1;
            maxchar = 32;
        }
        else {
            if (!PyUnicode_Check(separator)) {
                PyErr_Format(PyExc_TypeError,
                             "separator: expected str instance,"
                             " %.80s found",
                             Py_TYPE(separator)->tp_name);
                goto onError;
            }
            if (PyUnicode_READY(separator))
                goto onError;
            sep = separator;
            seplen = PyUnicode_GET_LENGTH(separator);
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
            /* inc refcount to keep this code path symmetric with the
               above case of a blank separator */
            Py_INCREF(sep);
        }
        /* Seed the kind comparison below with the separator, so that the
           first item is compared against it. */
        last_obj = sep;
    }

    /* There are at least two things to join, or else we have a subclass
     * of str in the sequence.
     * Do a pre-pass to figure out the total amount of space we'll
     * need (sz), and see whether all argument are strings.
     */
    sz = 0;
    /* The memcpy fast path is never used in debug builds. */
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    use_memcpy = 1;
#endif
    for (i = 0; i < seqlen; i++) {
        size_t add_sz;
        item = items[i];
        if (!PyUnicode_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected str instance,"
                         " %.80s found",
                         i, Py_TYPE(item)->tp_name);
            goto onError;
        }
        if (PyUnicode_READY(item) == -1)
            goto onError;
        add_sz = PyUnicode_GET_LENGTH(item);
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
        maxchar = Py_MAX(maxchar, item_maxchar);
        /* Every item except the first is preceded by a separator. */
        if (i != 0) {
            add_sz += seplen;
        }
        /* Reject the addition before performing it, so `sz` itself never
           overflows. */
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
            PyErr_SetString(PyExc_OverflowError,
                            "join() result is too long for a Python string");
            goto onError;
        }
        sz += add_sz;
        /* The fast path stays enabled only while each object has the same
           kind as the one checked before it. */
        if (use_memcpy && last_obj != NULL) {
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
                use_memcpy = 0;
        }
        last_obj = item;
    }

    res = PyUnicode_New(sz, maxchar);
    if (res == NULL)
        goto onError;

    /* Catenate everything. */
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    if (use_memcpy) {
        res_data = PyUnicode_1BYTE_DATA(res);
        kind = PyUnicode_KIND(res);
        if (seplen != 0)
            sep_data = PyUnicode_1BYTE_DATA(sep);
    }
#endif
    if (use_memcpy) {
        /* Fast path: raw byte copies; `kind` is the number of bytes per
           character, and res_data advances as a write cursor. */
        for (i = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                memcpy(res_data,
                       sep_data,
                       kind * seplen);
                res_data += kind * seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                memcpy(res_data,
                       PyUnicode_DATA(item),
                       kind * itemlen);
                res_data += kind * itemlen;
            }
        }
        assert(res_data == PyUnicode_1BYTE_DATA(res)
                           + kind * PyUnicode_GET_LENGTH(res));
    }
    else {
        /* General path: character-wise copies that can convert between
           kinds; res_offset is the write position in characters. */
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
                res_offset += seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
                res_offset += itemlen;
            }
        }
        assert(res_offset == PyUnicode_GET_LENGTH(res));
    }

    Py_XDECREF(sep);
    assert(_PyUnicode_CheckConsistency(res, 1));
    return res;

  onError:
    /* Both pointers may still be NULL here, hence the X variants. */
    Py_XDECREF(sep);
    Py_XDECREF(res);
    return NULL;
}
10065n/a
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive code units of the buffer `data`,
   beginning at code-unit index `start`.  `kind` selects the width of one
   code unit (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
   PyUnicode_4BYTE_KIND); PyUnicode_WCHAR_KIND is not accepted.

   Every argument is parenthesized wherever it is used, so arbitrary
   expressions may be passed.  Note that `kind`, `value` and `length` may
   be evaluated more than once; do not pass expressions with side
   effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10089n/a
10090n/avoid
10091n/a_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10092n/a Py_UCS4 fill_char)
10093n/a{
10094n/a const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10095n/a const void *data = PyUnicode_DATA(unicode);
10096n/a assert(PyUnicode_IS_READY(unicode));
10097n/a assert(unicode_modifiable(unicode));
10098n/a assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10099n/a assert(start >= 0);
10100n/a assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10101n/a FILL(kind, data, fill_char, start, length);
10102n/a}
10103n/a
10104n/aPy_ssize_t
10105n/aPyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10106n/a Py_UCS4 fill_char)
10107n/a{
10108n/a Py_ssize_t maxlen;
10109n/a
10110n/a if (!PyUnicode_Check(unicode)) {
10111n/a PyErr_BadInternalCall();
10112n/a return -1;
10113n/a }
10114n/a if (PyUnicode_READY(unicode) == -1)
10115n/a return -1;
10116n/a if (unicode_check_modifiable(unicode))
10117n/a return -1;
10118n/a
10119n/a if (start < 0) {
10120n/a PyErr_SetString(PyExc_IndexError, "string index out of range");
10121n/a return -1;
10122n/a }
10123n/a if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10124n/a PyErr_SetString(PyExc_ValueError,
10125n/a "fill character is bigger than "
10126n/a "the string maximum character");
10127n/a return -1;
10128n/a }
10129n/a
10130n/a maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10131n/a length = Py_MIN(maxlen, length);
10132n/a if (length <= 0)
10133n/a return 0;
10134n/a
10135n/a _PyUnicode_FastFill(unicode, start, length, fill_char);
10136n/a return length;
10137n/a}
10138n/a
10139n/astatic PyObject *
10140n/apad(PyObject *self,
10141n/a Py_ssize_t left,
10142n/a Py_ssize_t right,
10143n/a Py_UCS4 fill)
10144n/a{
10145n/a PyObject *u;
10146n/a Py_UCS4 maxchar;
10147n/a int kind;
10148n/a void *data;
10149n/a
10150n/a if (left < 0)
10151n/a left = 0;
10152n/a if (right < 0)
10153n/a right = 0;
10154n/a
10155n/a if (left == 0 && right == 0)
10156n/a return unicode_result_unchanged(self);
10157n/a
10158n/a if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10159n/a right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10160n/a PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10161n/a return NULL;
10162n/a }
10163n/a maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10164n/a maxchar = Py_MAX(maxchar, fill);
10165n/a u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10166n/a if (!u)
10167n/a return NULL;
10168n/a
10169n/a kind = PyUnicode_KIND(u);
10170n/a data = PyUnicode_DATA(u);
10171n/a if (left)
10172n/a FILL(kind, data, fill, 0, left);
10173n/a if (right)
10174n/a FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10175n/a _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10176n/a assert(_PyUnicode_CheckConsistency(u, 1));
10177n/a return u;
10178n/a}
10179n/a
10180n/aPyObject *
10181n/aPyUnicode_Splitlines(PyObject *string, int keepends)
10182n/a{
10183n/a PyObject *list;
10184n/a
10185n/a if (ensure_unicode(string) < 0)
10186n/a return NULL;
10187n/a
10188n/a switch (PyUnicode_KIND(string)) {
10189n/a case PyUnicode_1BYTE_KIND:
10190n/a if (PyUnicode_IS_ASCII(string))
10191n/a list = asciilib_splitlines(
10192n/a string, PyUnicode_1BYTE_DATA(string),
10193n/a PyUnicode_GET_LENGTH(string), keepends);
10194n/a else
10195n/a list = ucs1lib_splitlines(
10196n/a string, PyUnicode_1BYTE_DATA(string),
10197n/a PyUnicode_GET_LENGTH(string), keepends);
10198n/a break;
10199n/a case PyUnicode_2BYTE_KIND:
10200n/a list = ucs2lib_splitlines(
10201n/a string, PyUnicode_2BYTE_DATA(string),
10202n/a PyUnicode_GET_LENGTH(string), keepends);
10203n/a break;
10204n/a case PyUnicode_4BYTE_KIND:
10205n/a list = ucs4lib_splitlines(
10206n/a string, PyUnicode_4BYTE_DATA(string),
10207n/a PyUnicode_GET_LENGTH(string), keepends);
10208n/a break;
10209n/a default:
10210n/a assert(0);
10211n/a list = 0;
10212n/a }
10213n/a return list;
10214n/a}
10215n/a
10216n/astatic PyObject *
10217n/asplit(PyObject *self,
10218n/a PyObject *substring,
10219n/a Py_ssize_t maxcount)
10220n/a{
10221n/a int kind1, kind2;
10222n/a void *buf1, *buf2;
10223n/a Py_ssize_t len1, len2;
10224n/a PyObject* out;
10225n/a
10226n/a if (maxcount < 0)
10227n/a maxcount = PY_SSIZE_T_MAX;
10228n/a
10229n/a if (PyUnicode_READY(self) == -1)
10230n/a return NULL;
10231n/a
10232n/a if (substring == NULL)
10233n/a switch (PyUnicode_KIND(self)) {
10234n/a case PyUnicode_1BYTE_KIND:
10235n/a if (PyUnicode_IS_ASCII(self))
10236n/a return asciilib_split_whitespace(
10237n/a self, PyUnicode_1BYTE_DATA(self),
10238n/a PyUnicode_GET_LENGTH(self), maxcount
10239n/a );
10240n/a else
10241n/a return ucs1lib_split_whitespace(
10242n/a self, PyUnicode_1BYTE_DATA(self),
10243n/a PyUnicode_GET_LENGTH(self), maxcount
10244n/a );
10245n/a case PyUnicode_2BYTE_KIND:
10246n/a return ucs2lib_split_whitespace(
10247n/a self, PyUnicode_2BYTE_DATA(self),
10248n/a PyUnicode_GET_LENGTH(self), maxcount
10249n/a );
10250n/a case PyUnicode_4BYTE_KIND:
10251n/a return ucs4lib_split_whitespace(
10252n/a self, PyUnicode_4BYTE_DATA(self),
10253n/a PyUnicode_GET_LENGTH(self), maxcount
10254n/a );
10255n/a default:
10256n/a assert(0);
10257n/a return NULL;
10258n/a }
10259n/a
10260n/a if (PyUnicode_READY(substring) == -1)
10261n/a return NULL;
10262n/a
10263n/a kind1 = PyUnicode_KIND(self);
10264n/a kind2 = PyUnicode_KIND(substring);
10265n/a len1 = PyUnicode_GET_LENGTH(self);
10266n/a len2 = PyUnicode_GET_LENGTH(substring);
10267n/a if (kind1 < kind2 || len1 < len2) {
10268n/a out = PyList_New(1);
10269n/a if (out == NULL)
10270n/a return NULL;
10271n/a Py_INCREF(self);
10272n/a PyList_SET_ITEM(out, 0, self);
10273n/a return out;
10274n/a }
10275n/a buf1 = PyUnicode_DATA(self);
10276n/a buf2 = PyUnicode_DATA(substring);
10277n/a if (kind2 != kind1) {
10278n/a buf2 = _PyUnicode_AsKind(substring, kind1);
10279n/a if (!buf2)
10280n/a return NULL;
10281n/a }
10282n/a
10283n/a switch (kind1) {
10284n/a case PyUnicode_1BYTE_KIND:
10285n/a if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10286n/a out = asciilib_split(
10287n/a self, buf1, len1, buf2, len2, maxcount);
10288n/a else
10289n/a out = ucs1lib_split(
10290n/a self, buf1, len1, buf2, len2, maxcount);
10291n/a break;
10292n/a case PyUnicode_2BYTE_KIND:
10293n/a out = ucs2lib_split(
10294n/a self, buf1, len1, buf2, len2, maxcount);
10295n/a break;
10296n/a case PyUnicode_4BYTE_KIND:
10297n/a out = ucs4lib_split(
10298n/a self, buf1, len1, buf2, len2, maxcount);
10299n/a break;
10300n/a default:
10301n/a out = NULL;
10302n/a }
10303n/a if (kind2 != kind1)
10304n/a PyMem_Free(buf2);
10305n/a return out;
10306n/a}
10307n/a
10308n/astatic PyObject *
10309n/arsplit(PyObject *self,
10310n/a PyObject *substring,
10311n/a Py_ssize_t maxcount)
10312n/a{
10313n/a int kind1, kind2;
10314n/a void *buf1, *buf2;
10315n/a Py_ssize_t len1, len2;
10316n/a PyObject* out;
10317n/a
10318n/a if (maxcount < 0)
10319n/a maxcount = PY_SSIZE_T_MAX;
10320n/a
10321n/a if (PyUnicode_READY(self) == -1)
10322n/a return NULL;
10323n/a
10324n/a if (substring == NULL)
10325n/a switch (PyUnicode_KIND(self)) {
10326n/a case PyUnicode_1BYTE_KIND:
10327n/a if (PyUnicode_IS_ASCII(self))
10328n/a return asciilib_rsplit_whitespace(
10329n/a self, PyUnicode_1BYTE_DATA(self),
10330n/a PyUnicode_GET_LENGTH(self), maxcount
10331n/a );
10332n/a else
10333n/a return ucs1lib_rsplit_whitespace(
10334n/a self, PyUnicode_1BYTE_DATA(self),
10335n/a PyUnicode_GET_LENGTH(self), maxcount
10336n/a );
10337n/a case PyUnicode_2BYTE_KIND:
10338n/a return ucs2lib_rsplit_whitespace(
10339n/a self, PyUnicode_2BYTE_DATA(self),
10340n/a PyUnicode_GET_LENGTH(self), maxcount
10341n/a );
10342n/a case PyUnicode_4BYTE_KIND:
10343n/a return ucs4lib_rsplit_whitespace(
10344n/a self, PyUnicode_4BYTE_DATA(self),
10345n/a PyUnicode_GET_LENGTH(self), maxcount
10346n/a );
10347n/a default:
10348n/a assert(0);
10349n/a return NULL;
10350n/a }
10351n/a
10352n/a if (PyUnicode_READY(substring) == -1)
10353n/a return NULL;
10354n/a
10355n/a kind1 = PyUnicode_KIND(self);
10356n/a kind2 = PyUnicode_KIND(substring);
10357n/a len1 = PyUnicode_GET_LENGTH(self);
10358n/a len2 = PyUnicode_GET_LENGTH(substring);
10359n/a if (kind1 < kind2 || len1 < len2) {
10360n/a out = PyList_New(1);
10361n/a if (out == NULL)
10362n/a return NULL;
10363n/a Py_INCREF(self);
10364n/a PyList_SET_ITEM(out, 0, self);
10365n/a return out;
10366n/a }
10367n/a buf1 = PyUnicode_DATA(self);
10368n/a buf2 = PyUnicode_DATA(substring);
10369n/a if (kind2 != kind1) {
10370n/a buf2 = _PyUnicode_AsKind(substring, kind1);
10371n/a if (!buf2)
10372n/a return NULL;
10373n/a }
10374n/a
10375n/a switch (kind1) {
10376n/a case PyUnicode_1BYTE_KIND:
10377n/a if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10378n/a out = asciilib_rsplit(
10379n/a self, buf1, len1, buf2, len2, maxcount);
10380n/a else
10381n/a out = ucs1lib_rsplit(
10382n/a self, buf1, len1, buf2, len2, maxcount);
10383n/a break;
10384n/a case PyUnicode_2BYTE_KIND:
10385n/a out = ucs2lib_rsplit(
10386n/a self, buf1, len1, buf2, len2, maxcount);
10387n/a break;
10388n/a case PyUnicode_4BYTE_KIND:
10389n/a out = ucs4lib_rsplit(
10390n/a self, buf1, len1, buf2, len2, maxcount);
10391n/a break;
10392n/a default:
10393n/a out = NULL;
10394n/a }
10395n/a if (kind2 != kind1)
10396n/a PyMem_Free(buf2);
10397n/a return out;
10398n/a}
10399n/a
10400n/astatic Py_ssize_t
10401n/aanylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10402n/a PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10403n/a{
10404n/a switch (kind) {
10405n/a case PyUnicode_1BYTE_KIND:
10406n/a if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10407n/a return asciilib_find(buf1, len1, buf2, len2, offset);
10408n/a else
10409n/a return ucs1lib_find(buf1, len1, buf2, len2, offset);
10410n/a case PyUnicode_2BYTE_KIND:
10411n/a return ucs2lib_find(buf1, len1, buf2, len2, offset);
10412n/a case PyUnicode_4BYTE_KIND:
10413n/a return ucs4lib_find(buf1, len1, buf2, len2, offset);
10414n/a }
10415n/a assert(0);
10416n/a return -1;
10417n/a}
10418n/a
10419n/astatic Py_ssize_t
10420n/aanylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10421n/a PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10422n/a{
10423n/a switch (kind) {
10424n/a case PyUnicode_1BYTE_KIND:
10425n/a if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10426n/a return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10427n/a else
10428n/a return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10429n/a case PyUnicode_2BYTE_KIND:
10430n/a return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10431n/a case PyUnicode_4BYTE_KIND:
10432n/a return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10433n/a }
10434n/a assert(0);
10435n/a return 0;
10436n/a}
10437n/a
10438n/astatic void
10439n/areplace_1char_inplace(PyObject *u, Py_ssize_t pos,
10440n/a Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10441n/a{
10442n/a int kind = PyUnicode_KIND(u);
10443n/a void *data = PyUnicode_DATA(u);
10444n/a Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10445n/a if (kind == PyUnicode_1BYTE_KIND) {
10446n/a ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10447n/a (Py_UCS1 *)data + len,
10448n/a u1, u2, maxcount);
10449n/a }
10450n/a else if (kind == PyUnicode_2BYTE_KIND) {
10451n/a ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10452n/a (Py_UCS2 *)data + len,
10453n/a u1, u2, maxcount);
10454n/a }
10455n/a else {
10456n/a assert(kind == PyUnicode_4BYTE_KIND);
10457n/a ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10458n/a (Py_UCS4 *)data + len,
10459n/a u1, u2, maxcount);
10460n/a }
10461n/a}
10462n/a
10463n/astatic PyObject *
10464n/areplace(PyObject *self, PyObject *str1,
10465n/a PyObject *str2, Py_ssize_t maxcount)
10466n/a{
10467n/a PyObject *u;
10468n/a char *sbuf = PyUnicode_DATA(self);
10469n/a char *buf1 = PyUnicode_DATA(str1);
10470n/a char *buf2 = PyUnicode_DATA(str2);
10471n/a int srelease = 0, release1 = 0, release2 = 0;
10472n/a int skind = PyUnicode_KIND(self);
10473n/a int kind1 = PyUnicode_KIND(str1);
10474n/a int kind2 = PyUnicode_KIND(str2);
10475n/a Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10476n/a Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10477n/a Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10478n/a int mayshrink;
10479n/a Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10480n/a
10481n/a if (maxcount < 0)
10482n/a maxcount = PY_SSIZE_T_MAX;
10483n/a else if (maxcount == 0 || slen == 0)
10484n/a goto nothing;
10485n/a
10486n/a if (str1 == str2)
10487n/a goto nothing;
10488n/a
10489n/a maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10490n/a maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10491n/a if (maxchar < maxchar_str1)
10492n/a /* substring too wide to be present */
10493n/a goto nothing;
10494n/a maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10495n/a /* Replacing str1 with str2 may cause a maxchar reduction in the
10496n/a result string. */
10497n/a mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10498n/a maxchar = Py_MAX(maxchar, maxchar_str2);
10499n/a
10500n/a if (len1 == len2) {
10501n/a /* same length */
10502n/a if (len1 == 0)
10503n/a goto nothing;
10504n/a if (len1 == 1) {
10505n/a /* replace characters */
10506n/a Py_UCS4 u1, u2;
10507n/a Py_ssize_t pos;
10508n/a
10509n/a u1 = PyUnicode_READ(kind1, buf1, 0);
10510n/a pos = findchar(sbuf, skind, slen, u1, 1);
10511n/a if (pos < 0)
10512n/a goto nothing;
10513n/a u2 = PyUnicode_READ(kind2, buf2, 0);
10514n/a u = PyUnicode_New(slen, maxchar);
10515n/a if (!u)
10516n/a goto error;
10517n/a
10518n/a _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10519n/a replace_1char_inplace(u, pos, u1, u2, maxcount);
10520n/a }
10521n/a else {
10522n/a int rkind = skind;
10523n/a char *res;
10524n/a Py_ssize_t i;
10525n/a
10526n/a if (kind1 < rkind) {
10527n/a /* widen substring */
10528n/a buf1 = _PyUnicode_AsKind(str1, rkind);
10529n/a if (!buf1) goto error;
10530n/a release1 = 1;
10531n/a }
10532n/a i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10533n/a if (i < 0)
10534n/a goto nothing;
10535n/a if (rkind > kind2) {
10536n/a /* widen replacement */
10537n/a buf2 = _PyUnicode_AsKind(str2, rkind);
10538n/a if (!buf2) goto error;
10539n/a release2 = 1;
10540n/a }
10541n/a else if (rkind < kind2) {
10542n/a /* widen self and buf1 */
10543n/a rkind = kind2;
10544n/a if (release1) PyMem_Free(buf1);
10545n/a release1 = 0;
10546n/a sbuf = _PyUnicode_AsKind(self, rkind);
10547n/a if (!sbuf) goto error;
10548n/a srelease = 1;
10549n/a buf1 = _PyUnicode_AsKind(str1, rkind);
10550n/a if (!buf1) goto error;
10551n/a release1 = 1;
10552n/a }
10553n/a u = PyUnicode_New(slen, maxchar);
10554n/a if (!u)
10555n/a goto error;
10556n/a assert(PyUnicode_KIND(u) == rkind);
10557n/a res = PyUnicode_DATA(u);
10558n/a
10559n/a memcpy(res, sbuf, rkind * slen);
10560n/a /* change everything in-place, starting with this one */
10561n/a memcpy(res + rkind * i,
10562n/a buf2,
10563n/a rkind * len2);
10564n/a i += len1;
10565n/a
10566n/a while ( --maxcount > 0) {
10567n/a i = anylib_find(rkind, self,
10568n/a sbuf+rkind*i, slen-i,
10569n/a str1, buf1, len1, i);
10570n/a if (i == -1)
10571n/a break;
10572n/a memcpy(res + rkind * i,
10573n/a buf2,
10574n/a rkind * len2);
10575n/a i += len1;
10576n/a }
10577n/a }
10578n/a }
10579n/a else {
10580n/a Py_ssize_t n, i, j, ires;
10581n/a Py_ssize_t new_size;
10582n/a int rkind = skind;
10583n/a char *res;
10584n/a
10585n/a if (kind1 < rkind) {
10586n/a /* widen substring */
10587n/a buf1 = _PyUnicode_AsKind(str1, rkind);
10588n/a if (!buf1) goto error;
10589n/a release1 = 1;
10590n/a }
10591n/a n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10592n/a if (n == 0)
10593n/a goto nothing;
10594n/a if (kind2 < rkind) {
10595n/a /* widen replacement */
10596n/a buf2 = _PyUnicode_AsKind(str2, rkind);
10597n/a if (!buf2) goto error;
10598n/a release2 = 1;
10599n/a }
10600n/a else if (kind2 > rkind) {
10601n/a /* widen self and buf1 */
10602n/a rkind = kind2;
10603n/a sbuf = _PyUnicode_AsKind(self, rkind);
10604n/a if (!sbuf) goto error;
10605n/a srelease = 1;
10606n/a if (release1) PyMem_Free(buf1);
10607n/a release1 = 0;
10608n/a buf1 = _PyUnicode_AsKind(str1, rkind);
10609n/a if (!buf1) goto error;
10610n/a release1 = 1;
10611n/a }
10612n/a /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10613n/a PyUnicode_GET_LENGTH(str1))); */
10614n/a if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10615n/a PyErr_SetString(PyExc_OverflowError,
10616n/a "replace string is too long");
10617n/a goto error;
10618n/a }
10619n/a new_size = slen + n * (len2 - len1);
10620n/a if (new_size == 0) {
10621n/a _Py_INCREF_UNICODE_EMPTY();
10622n/a if (!unicode_empty)
10623n/a goto error;
10624n/a u = unicode_empty;
10625n/a goto done;
10626n/a }
10627n/a if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10628n/a PyErr_SetString(PyExc_OverflowError,
10629n/a "replace string is too long");
10630n/a goto error;
10631n/a }
10632n/a u = PyUnicode_New(new_size, maxchar);
10633n/a if (!u)
10634n/a goto error;
10635n/a assert(PyUnicode_KIND(u) == rkind);
10636n/a res = PyUnicode_DATA(u);
10637n/a ires = i = 0;
10638n/a if (len1 > 0) {
10639n/a while (n-- > 0) {
10640n/a /* look for next match */
10641n/a j = anylib_find(rkind, self,
10642n/a sbuf + rkind * i, slen-i,
10643n/a str1, buf1, len1, i);
10644n/a if (j == -1)
10645n/a break;
10646n/a else if (j > i) {
10647n/a /* copy unchanged part [i:j] */
10648n/a memcpy(res + rkind * ires,
10649n/a sbuf + rkind * i,
10650n/a rkind * (j-i));
10651n/a ires += j - i;
10652n/a }
10653n/a /* copy substitution string */
10654n/a if (len2 > 0) {
10655n/a memcpy(res + rkind * ires,
10656n/a buf2,
10657n/a rkind * len2);
10658n/a ires += len2;
10659n/a }
10660n/a i = j + len1;
10661n/a }
10662n/a if (i < slen)
10663n/a /* copy tail [i:] */
10664n/a memcpy(res + rkind * ires,
10665n/a sbuf + rkind * i,
10666n/a rkind * (slen-i));
10667n/a }
10668n/a else {
10669n/a /* interleave */
10670n/a while (n > 0) {
10671n/a memcpy(res + rkind * ires,
10672n/a buf2,
10673n/a rkind * len2);
10674n/a ires += len2;
10675n/a if (--n <= 0)
10676n/a break;
10677n/a memcpy(res + rkind * ires,
10678n/a sbuf + rkind * i,
10679n/a rkind);
10680n/a ires++;
10681n/a i++;
10682n/a }
10683n/a memcpy(res + rkind * ires,
10684n/a sbuf + rkind * i,
10685n/a rkind * (slen-i));
10686n/a }
10687n/a }
10688n/a
10689n/a if (mayshrink) {
10690n/a unicode_adjust_maxchar(&u);
10691n/a if (u == NULL)
10692n/a goto error;
10693n/a }
10694n/a
10695n/a done:
10696n/a if (srelease)
10697n/a PyMem_FREE(sbuf);
10698n/a if (release1)
10699n/a PyMem_FREE(buf1);
10700n/a if (release2)
10701n/a PyMem_FREE(buf2);
10702n/a assert(_PyUnicode_CheckConsistency(u, 1));
10703n/a return u;
10704n/a
10705n/a nothing:
10706n/a /* nothing to replace; return original string (when possible) */
10707n/a if (srelease)
10708n/a PyMem_FREE(sbuf);
10709n/a if (release1)
10710n/a PyMem_FREE(buf1);
10711n/a if (release2)
10712n/a PyMem_FREE(buf2);
10713n/a return unicode_result_unchanged(self);
10714n/a
10715n/a error:
10716n/a if (srelease && sbuf)
10717n/a PyMem_FREE(sbuf);
10718n/a if (release1 && buf1)
10719n/a PyMem_FREE(buf1);
10720n/a if (release2 && buf2)
10721n/a PyMem_FREE(buf2);
10722n/a return NULL;
10723n/a}
10724n/a
10725n/a/* --- Unicode Object Methods --------------------------------------------- */
10726n/a
10727n/a/*[clinic input]
10728n/astr.title as unicode_title
10729n/a
10730n/aReturn a version of the string where each word is titlecased.
10731n/a
10732n/aMore specifically, words start with uppercased characters and all remaining
10733n/acased characters have lower case.
10734n/a[clinic start generated code]*/
10735n/a
10736n/astatic PyObject *
10737n/aunicode_title_impl(PyObject *self)
10738n/a/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
10739n/a{
10740n/a if (PyUnicode_READY(self) == -1)
10741n/a return NULL;
10742n/a return case_operation(self, do_title);
10743n/a}
10744n/a
10745n/a/*[clinic input]
10746n/astr.capitalize as unicode_capitalize
10747n/a
10748n/aReturn a capitalized version of the string.
10749n/a
10750n/aMore specifically, make the first character have upper case and the rest lower
10751n/acase.
10752n/a[clinic start generated code]*/
10753n/a
10754n/astatic PyObject *
10755n/aunicode_capitalize_impl(PyObject *self)
10756n/a/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
10757n/a{
10758n/a if (PyUnicode_READY(self) == -1)
10759n/a return NULL;
10760n/a if (PyUnicode_GET_LENGTH(self) == 0)
10761n/a return unicode_result_unchanged(self);
10762n/a return case_operation(self, do_capitalize);
10763n/a}
10764n/a
10765n/a/*[clinic input]
10766n/astr.casefold as unicode_casefold
10767n/a
10768n/aReturn a version of the string suitable for caseless comparisons.
10769n/a[clinic start generated code]*/
10770n/a
10771n/astatic PyObject *
10772n/aunicode_casefold_impl(PyObject *self)
10773n/a/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10774n/a{
10775n/a if (PyUnicode_READY(self) == -1)
10776n/a return NULL;
10777n/a if (PyUnicode_IS_ASCII(self))
10778n/a return ascii_upper_or_lower(self, 1);
10779n/a return case_operation(self, do_casefold);
10780n/a}
10781n/a
10782n/a
10783n/a/* Argument converter. Accepts a single Unicode character. */
10784n/a
10785n/astatic int
10786n/aconvert_uc(PyObject *obj, void *addr)
10787n/a{
10788n/a Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10789n/a
10790n/a if (!PyUnicode_Check(obj)) {
10791n/a PyErr_Format(PyExc_TypeError,
10792n/a "The fill character must be a unicode character, "
10793n/a "not %.100s", Py_TYPE(obj)->tp_name);
10794n/a return 0;
10795n/a }
10796n/a if (PyUnicode_READY(obj) < 0)
10797n/a return 0;
10798n/a if (PyUnicode_GET_LENGTH(obj) != 1) {
10799n/a PyErr_SetString(PyExc_TypeError,
10800n/a "The fill character must be exactly one character long");
10801n/a return 0;
10802n/a }
10803n/a *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10804n/a return 1;
10805n/a}
10806n/a
10807n/a/*[clinic input]
10808n/astr.center as unicode_center
10809n/a
10810n/a width: Py_ssize_t
10811n/a fillchar: Py_UCS4 = ' '
10812n/a /
10813n/a
10814n/aReturn a centered string of length width.
10815n/a
10816n/aPadding is done using the specified fill character (default is a space).
10817n/a[clinic start generated code]*/
10818n/a
10819n/astatic PyObject *
10820n/aunicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10821n/a/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10822n/a{
10823n/a Py_ssize_t marg, left;
10824n/a
10825n/a if (PyUnicode_READY(self) == -1)
10826n/a return NULL;
10827n/a
10828n/a if (PyUnicode_GET_LENGTH(self) >= width)
10829n/a return unicode_result_unchanged(self);
10830n/a
10831n/a marg = width - PyUnicode_GET_LENGTH(self);
10832n/a left = marg / 2 + (marg & width & 1);
10833n/a
10834n/a return pad(self, left, marg - left, fillchar);
10835n/a}
10836n/a
10837n/a/* This function assumes that str1 and str2 are readied by the caller. */
10838n/a
10839n/astatic int
10840n/aunicode_compare(PyObject *str1, PyObject *str2)
10841n/a{
10842n/a#define COMPARE(TYPE1, TYPE2) \
10843n/a do { \
10844n/a TYPE1* p1 = (TYPE1 *)data1; \
10845n/a TYPE2* p2 = (TYPE2 *)data2; \
10846n/a TYPE1* end = p1 + len; \
10847n/a Py_UCS4 c1, c2; \
10848n/a for (; p1 != end; p1++, p2++) { \
10849n/a c1 = *p1; \
10850n/a c2 = *p2; \
10851n/a if (c1 != c2) \
10852n/a return (c1 < c2) ? -1 : 1; \
10853n/a } \
10854n/a } \
10855n/a while (0)
10856n/a
10857n/a int kind1, kind2;
10858n/a void *data1, *data2;
10859n/a Py_ssize_t len1, len2, len;
10860n/a
10861n/a kind1 = PyUnicode_KIND(str1);
10862n/a kind2 = PyUnicode_KIND(str2);
10863n/a data1 = PyUnicode_DATA(str1);
10864n/a data2 = PyUnicode_DATA(str2);
10865n/a len1 = PyUnicode_GET_LENGTH(str1);
10866n/a len2 = PyUnicode_GET_LENGTH(str2);
10867n/a len = Py_MIN(len1, len2);
10868n/a
10869n/a switch(kind1) {
10870n/a case PyUnicode_1BYTE_KIND:
10871n/a {
10872n/a switch(kind2) {
10873n/a case PyUnicode_1BYTE_KIND:
10874n/a {
10875n/a int cmp = memcmp(data1, data2, len);
10876n/a /* normalize result of memcmp() into the range [-1; 1] */
10877n/a if (cmp < 0)
10878n/a return -1;
10879n/a if (cmp > 0)
10880n/a return 1;
10881n/a break;
10882n/a }
10883n/a case PyUnicode_2BYTE_KIND:
10884n/a COMPARE(Py_UCS1, Py_UCS2);
10885n/a break;
10886n/a case PyUnicode_4BYTE_KIND:
10887n/a COMPARE(Py_UCS1, Py_UCS4);
10888n/a break;
10889n/a default:
10890n/a assert(0);
10891n/a }
10892n/a break;
10893n/a }
10894n/a case PyUnicode_2BYTE_KIND:
10895n/a {
10896n/a switch(kind2) {
10897n/a case PyUnicode_1BYTE_KIND:
10898n/a COMPARE(Py_UCS2, Py_UCS1);
10899n/a break;
10900n/a case PyUnicode_2BYTE_KIND:
10901n/a {
10902n/a COMPARE(Py_UCS2, Py_UCS2);
10903n/a break;
10904n/a }
10905n/a case PyUnicode_4BYTE_KIND:
10906n/a COMPARE(Py_UCS2, Py_UCS4);
10907n/a break;
10908n/a default:
10909n/a assert(0);
10910n/a }
10911n/a break;
10912n/a }
10913n/a case PyUnicode_4BYTE_KIND:
10914n/a {
10915n/a switch(kind2) {
10916n/a case PyUnicode_1BYTE_KIND:
10917n/a COMPARE(Py_UCS4, Py_UCS1);
10918n/a break;
10919n/a case PyUnicode_2BYTE_KIND:
10920n/a COMPARE(Py_UCS4, Py_UCS2);
10921n/a break;
10922n/a case PyUnicode_4BYTE_KIND:
10923n/a {
10924n/a#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10925n/a int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10926n/a /* normalize result of wmemcmp() into the range [-1; 1] */
10927n/a if (cmp < 0)
10928n/a return -1;
10929n/a if (cmp > 0)
10930n/a return 1;
10931n/a#else
10932n/a COMPARE(Py_UCS4, Py_UCS4);
10933n/a#endif
10934n/a break;
10935n/a }
10936n/a default:
10937n/a assert(0);
10938n/a }
10939n/a break;
10940n/a }
10941n/a default:
10942n/a assert(0);
10943n/a }
10944n/a
10945n/a if (len1 == len2)
10946n/a return 0;
10947n/a if (len1 < len2)
10948n/a return -1;
10949n/a else
10950n/a return 1;
10951n/a
10952n/a#undef COMPARE
10953n/a}
10954n/a
10955n/astatic int
10956n/aunicode_compare_eq(PyObject *str1, PyObject *str2)
10957n/a{
10958n/a int kind;
10959n/a void *data1, *data2;
10960n/a Py_ssize_t len;
10961n/a int cmp;
10962n/a
10963n/a len = PyUnicode_GET_LENGTH(str1);
10964n/a if (PyUnicode_GET_LENGTH(str2) != len)
10965n/a return 0;
10966n/a kind = PyUnicode_KIND(str1);
10967n/a if (PyUnicode_KIND(str2) != kind)
10968n/a return 0;
10969n/a data1 = PyUnicode_DATA(str1);
10970n/a data2 = PyUnicode_DATA(str2);
10971n/a
10972n/a cmp = memcmp(data1, data2, len * kind);
10973n/a return (cmp == 0);
10974n/a}
10975n/a
10976n/a
10977n/aint
10978n/aPyUnicode_Compare(PyObject *left, PyObject *right)
10979n/a{
10980n/a if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10981n/a if (PyUnicode_READY(left) == -1 ||
10982n/a PyUnicode_READY(right) == -1)
10983n/a return -1;
10984n/a
10985n/a /* a string is equal to itself */
10986n/a if (left == right)
10987n/a return 0;
10988n/a
10989n/a return unicode_compare(left, right);
10990n/a }
10991n/a PyErr_Format(PyExc_TypeError,
10992n/a "Can't compare %.100s and %.100s",
10993n/a left->ob_type->tp_name,
10994n/a right->ob_type->tp_name);
10995n/a return -1;
10996n/a}
10997n/a
10998n/aint
10999n/aPyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11000n/a{
11001n/a Py_ssize_t i;
11002n/a int kind;
11003n/a Py_UCS4 chr;
11004n/a const unsigned char *ustr = (const unsigned char *)str;
11005n/a
11006n/a assert(_PyUnicode_CHECK(uni));
11007n/a if (!PyUnicode_IS_READY(uni)) {
11008n/a const wchar_t *ws = _PyUnicode_WSTR(uni);
11009n/a /* Compare Unicode string and source character set string */
11010n/a for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11011n/a if (chr != ustr[i])
11012n/a return (chr < ustr[i]) ? -1 : 1;
11013n/a }
11014n/a /* This check keeps Python strings that end in '\0' from comparing equal
11015n/a to C strings identical up to that point. */
11016n/a if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11017n/a return 1; /* uni is longer */
11018n/a if (ustr[i])
11019n/a return -1; /* str is longer */
11020n/a return 0;
11021n/a }
11022n/a kind = PyUnicode_KIND(uni);
11023n/a if (kind == PyUnicode_1BYTE_KIND) {
11024n/a const void *data = PyUnicode_1BYTE_DATA(uni);
11025n/a size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11026n/a size_t len, len2 = strlen(str);
11027n/a int cmp;
11028n/a
11029n/a len = Py_MIN(len1, len2);
11030n/a cmp = memcmp(data, str, len);
11031n/a if (cmp != 0) {
11032n/a if (cmp < 0)
11033n/a return -1;
11034n/a else
11035n/a return 1;
11036n/a }
11037n/a if (len1 > len2)
11038n/a return 1; /* uni is longer */
11039n/a if (len1 < len2)
11040n/a return -1; /* str is longer */
11041n/a return 0;
11042n/a }
11043n/a else {
11044n/a void *data = PyUnicode_DATA(uni);
11045n/a /* Compare Unicode string and source character set string */
11046n/a for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11047n/a if (chr != (unsigned char)str[i])
11048n/a return (chr < (unsigned char)(str[i])) ? -1 : 1;
11049n/a /* This check keeps Python strings that end in '\0' from comparing equal
11050n/a to C strings identical up to that point. */
11051n/a if (PyUnicode_GET_LENGTH(uni) != i || chr)
11052n/a return 1; /* uni is longer */
11053n/a if (str[i])
11054n/a return -1; /* str is longer */
11055n/a return 0;
11056n/a }
11057n/a}
11058n/a
11059n/astatic int
11060n/anon_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11061n/a{
11062n/a size_t i, len;
11063n/a const wchar_t *p;
11064n/a len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11065n/a if (strlen(str) != len)
11066n/a return 0;
11067n/a p = _PyUnicode_WSTR(unicode);
11068n/a assert(p);
11069n/a for (i = 0; i < len; i++) {
11070n/a unsigned char c = (unsigned char)str[i];
11071n/a if (c >= 128 || p[i] != (wchar_t)c)
11072n/a return 0;
11073n/a }
11074n/a return 1;
11075n/a}
11076n/a
11077n/aint
11078n/a_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11079n/a{
11080n/a size_t len;
11081n/a assert(_PyUnicode_CHECK(unicode));
11082n/a assert(str);
11083n/a#ifndef NDEBUG
11084n/a for (const char *p = str; *p; p++) {
11085n/a assert((unsigned char)*p < 128);
11086n/a }
11087n/a#endif
11088n/a if (PyUnicode_READY(unicode) == -1) {
11089n/a /* Memory error or bad data */
11090n/a PyErr_Clear();
11091n/a return non_ready_unicode_equal_to_ascii_string(unicode, str);
11092n/a }
11093n/a if (!PyUnicode_IS_ASCII(unicode))
11094n/a return 0;
11095n/a len = (size_t)PyUnicode_GET_LENGTH(unicode);
11096n/a return strlen(str) == len &&
11097n/a memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11098n/a}
11099n/a
11100n/aint
11101n/a_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11102n/a{
11103n/a PyObject *right_uni;
11104n/a Py_hash_t hash;
11105n/a
11106n/a assert(_PyUnicode_CHECK(left));
11107n/a assert(right->string);
11108n/a#ifndef NDEBUG
11109n/a for (const char *p = right->string; *p; p++) {
11110n/a assert((unsigned char)*p < 128);
11111n/a }
11112n/a#endif
11113n/a
11114n/a if (PyUnicode_READY(left) == -1) {
11115n/a /* memory error or bad data */
11116n/a PyErr_Clear();
11117n/a return non_ready_unicode_equal_to_ascii_string(left, right->string);
11118n/a }
11119n/a
11120n/a if (!PyUnicode_IS_ASCII(left))
11121n/a return 0;
11122n/a
11123n/a right_uni = _PyUnicode_FromId(right); /* borrowed */
11124n/a if (right_uni == NULL) {
11125n/a /* memory error or bad data */
11126n/a PyErr_Clear();
11127n/a return _PyUnicode_EqualToASCIIString(left, right->string);
11128n/a }
11129n/a
11130n/a if (left == right_uni)
11131n/a return 1;
11132n/a
11133n/a if (PyUnicode_CHECK_INTERNED(left))
11134n/a return 0;
11135n/a
11136n/a assert(_PyUnicode_HASH(right_uni) != 1);
11137n/a hash = _PyUnicode_HASH(left);
11138n/a if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11139n/a return 0;
11140n/a
11141n/a return unicode_compare_eq(left, right_uni);
11142n/a}
11143n/a
11144n/a#define TEST_COND(cond) \
11145n/a ((cond) ? Py_True : Py_False)
11146n/a
11147n/aPyObject *
11148n/aPyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11149n/a{
11150n/a int result;
11151n/a PyObject *v;
11152n/a
11153n/a if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11154n/a Py_RETURN_NOTIMPLEMENTED;
11155n/a
11156n/a if (PyUnicode_READY(left) == -1 ||
11157n/a PyUnicode_READY(right) == -1)
11158n/a return NULL;
11159n/a
11160n/a if (left == right) {
11161n/a switch (op) {
11162n/a case Py_EQ:
11163n/a case Py_LE:
11164n/a case Py_GE:
11165n/a /* a string is equal to itself */
11166n/a v = Py_True;
11167n/a break;
11168n/a case Py_NE:
11169n/a case Py_LT:
11170n/a case Py_GT:
11171n/a v = Py_False;
11172n/a break;
11173n/a default:
11174n/a PyErr_BadArgument();
11175n/a return NULL;
11176n/a }
11177n/a }
11178n/a else if (op == Py_EQ || op == Py_NE) {
11179n/a result = unicode_compare_eq(left, right);
11180n/a result ^= (op == Py_NE);
11181n/a v = TEST_COND(result);
11182n/a }
11183n/a else {
11184n/a result = unicode_compare(left, right);
11185n/a
11186n/a /* Convert the return value to a Boolean */
11187n/a switch (op) {
11188n/a case Py_LE:
11189n/a v = TEST_COND(result <= 0);
11190n/a break;
11191n/a case Py_GE:
11192n/a v = TEST_COND(result >= 0);
11193n/a break;
11194n/a case Py_LT:
11195n/a v = TEST_COND(result == -1);
11196n/a break;
11197n/a case Py_GT:
11198n/a v = TEST_COND(result == 1);
11199n/a break;
11200n/a default:
11201n/a PyErr_BadArgument();
11202n/a return NULL;
11203n/a }
11204n/a }
11205n/a Py_INCREF(v);
11206n/a return v;
11207n/a}
11208n/a
11209n/aint
11210n/a_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11211n/a{
11212n/a return unicode_eq(aa, bb);
11213n/a}
11214n/a
11215n/aint
11216n/aPyUnicode_Contains(PyObject *str, PyObject *substr)
11217n/a{
11218n/a int kind1, kind2;
11219n/a void *buf1, *buf2;
11220n/a Py_ssize_t len1, len2;
11221n/a int result;
11222n/a
11223n/a if (!PyUnicode_Check(substr)) {
11224n/a PyErr_Format(PyExc_TypeError,
11225n/a "'in <string>' requires string as left operand, not %.100s",
11226n/a Py_TYPE(substr)->tp_name);
11227n/a return -1;
11228n/a }
11229n/a if (PyUnicode_READY(substr) == -1)
11230n/a return -1;
11231n/a if (ensure_unicode(str) < 0)
11232n/a return -1;
11233n/a
11234n/a kind1 = PyUnicode_KIND(str);
11235n/a kind2 = PyUnicode_KIND(substr);
11236n/a if (kind1 < kind2)
11237n/a return 0;
11238n/a len1 = PyUnicode_GET_LENGTH(str);
11239n/a len2 = PyUnicode_GET_LENGTH(substr);
11240n/a if (len1 < len2)
11241n/a return 0;
11242n/a buf1 = PyUnicode_DATA(str);
11243n/a buf2 = PyUnicode_DATA(substr);
11244n/a if (len2 == 1) {
11245n/a Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11246n/a result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11247n/a return result;
11248n/a }
11249n/a if (kind2 != kind1) {
11250n/a buf2 = _PyUnicode_AsKind(substr, kind1);
11251n/a if (!buf2)
11252n/a return -1;
11253n/a }
11254n/a
11255n/a switch (kind1) {
11256n/a case PyUnicode_1BYTE_KIND:
11257n/a result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11258n/a break;
11259n/a case PyUnicode_2BYTE_KIND:
11260n/a result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11261n/a break;
11262n/a case PyUnicode_4BYTE_KIND:
11263n/a result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11264n/a break;
11265n/a default:
11266n/a result = -1;
11267n/a assert(0);
11268n/a }
11269n/a
11270n/a if (kind2 != kind1)
11271n/a PyMem_Free(buf2);
11272n/a
11273n/a return result;
11274n/a}
11275n/a
11276n/a/* Concat to string or Unicode object giving a new Unicode object. */
11277n/a
11278n/aPyObject *
11279n/aPyUnicode_Concat(PyObject *left, PyObject *right)
11280n/a{
11281n/a PyObject *result;
11282n/a Py_UCS4 maxchar, maxchar2;
11283n/a Py_ssize_t left_len, right_len, new_len;
11284n/a
11285n/a if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11286n/a return NULL;
11287n/a
11288n/a /* Shortcuts */
11289n/a if (left == unicode_empty)
11290n/a return PyUnicode_FromObject(right);
11291n/a if (right == unicode_empty)
11292n/a return PyUnicode_FromObject(left);
11293n/a
11294n/a left_len = PyUnicode_GET_LENGTH(left);
11295n/a right_len = PyUnicode_GET_LENGTH(right);
11296n/a if (left_len > PY_SSIZE_T_MAX - right_len) {
11297n/a PyErr_SetString(PyExc_OverflowError,
11298n/a "strings are too large to concat");
11299n/a return NULL;
11300n/a }
11301n/a new_len = left_len + right_len;
11302n/a
11303n/a maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11304n/a maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11305n/a maxchar = Py_MAX(maxchar, maxchar2);
11306n/a
11307n/a /* Concat the two Unicode strings */
11308n/a result = PyUnicode_New(new_len, maxchar);
11309n/a if (result == NULL)
11310n/a return NULL;
11311n/a _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11312n/a _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11313n/a assert(_PyUnicode_CheckConsistency(result, 1));
11314n/a return result;
11315n/a}
11316n/a
11317n/avoid
11318n/aPyUnicode_Append(PyObject **p_left, PyObject *right)
11319n/a{
11320n/a PyObject *left, *res;
11321n/a Py_UCS4 maxchar, maxchar2;
11322n/a Py_ssize_t left_len, right_len, new_len;
11323n/a
11324n/a if (p_left == NULL) {
11325n/a if (!PyErr_Occurred())
11326n/a PyErr_BadInternalCall();
11327n/a return;
11328n/a }
11329n/a left = *p_left;
11330n/a if (right == NULL || left == NULL
11331n/a || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11332n/a if (!PyErr_Occurred())
11333n/a PyErr_BadInternalCall();
11334n/a goto error;
11335n/a }
11336n/a
11337n/a if (PyUnicode_READY(left) == -1)
11338n/a goto error;
11339n/a if (PyUnicode_READY(right) == -1)
11340n/a goto error;
11341n/a
11342n/a /* Shortcuts */
11343n/a if (left == unicode_empty) {
11344n/a Py_DECREF(left);
11345n/a Py_INCREF(right);
11346n/a *p_left = right;
11347n/a return;
11348n/a }
11349n/a if (right == unicode_empty)
11350n/a return;
11351n/a
11352n/a left_len = PyUnicode_GET_LENGTH(left);
11353n/a right_len = PyUnicode_GET_LENGTH(right);
11354n/a if (left_len > PY_SSIZE_T_MAX - right_len) {
11355n/a PyErr_SetString(PyExc_OverflowError,
11356n/a "strings are too large to concat");
11357n/a goto error;
11358n/a }
11359n/a new_len = left_len + right_len;
11360n/a
11361n/a if (unicode_modifiable(left)
11362n/a && PyUnicode_CheckExact(right)
11363n/a && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11364n/a /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11365n/a to change the structure size, but characters are stored just after
11366n/a the structure, and so it requires to move all characters which is
11367n/a not so different than duplicating the string. */
11368n/a && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11369n/a {
11370n/a /* append inplace */
11371n/a if (unicode_resize(p_left, new_len) != 0)
11372n/a goto error;
11373n/a
11374n/a /* copy 'right' into the newly allocated area of 'left' */
11375n/a _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11376n/a }
11377n/a else {
11378n/a maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11379n/a maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11380n/a maxchar = Py_MAX(maxchar, maxchar2);
11381n/a
11382n/a /* Concat the two Unicode strings */
11383n/a res = PyUnicode_New(new_len, maxchar);
11384n/a if (res == NULL)
11385n/a goto error;
11386n/a _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11387n/a _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11388n/a Py_DECREF(left);
11389n/a *p_left = res;
11390n/a }
11391n/a assert(_PyUnicode_CheckConsistency(*p_left, 1));
11392n/a return;
11393n/a
11394n/aerror:
11395n/a Py_CLEAR(*p_left);
11396n/a}
11397n/a
11398n/avoid
11399n/aPyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11400n/a{
11401n/a PyUnicode_Append(pleft, right);
11402n/a Py_XDECREF(right);
11403n/a}
11404n/a
11405n/a/*
11406n/aWraps stringlib_parse_args_finds() and additionally ensures that the
11407n/afirst argument is a unicode object.
11408n/a*/
11409n/a
11410n/astatic inline int
11411n/aparse_args_finds_unicode(const char * function_name, PyObject *args,
11412n/a PyObject **substring,
11413n/a Py_ssize_t *start, Py_ssize_t *end)
11414n/a{
11415n/a if(stringlib_parse_args_finds(function_name, args, substring,
11416n/a start, end)) {
11417n/a if (ensure_unicode(*substring) < 0)
11418n/a return 0;
11419n/a return 1;
11420n/a }
11421n/a return 0;
11422n/a}
11423n/a
11424n/aPyDoc_STRVAR(count__doc__,
11425n/a "S.count(sub[, start[, end]]) -> int\n\
11426n/a\n\
11427n/aReturn the number of non-overlapping occurrences of substring sub in\n\
11428n/astring S[start:end]. Optional arguments start and end are\n\
11429n/ainterpreted as in slice notation.");
11430n/a
11431n/astatic PyObject *
11432n/aunicode_count(PyObject *self, PyObject *args)
11433n/a{
11434n/a PyObject *substring = NULL; /* initialize to fix a compiler warning */
11435n/a Py_ssize_t start = 0;
11436n/a Py_ssize_t end = PY_SSIZE_T_MAX;
11437n/a PyObject *result;
11438n/a int kind1, kind2;
11439n/a void *buf1, *buf2;
11440n/a Py_ssize_t len1, len2, iresult;
11441n/a
11442n/a if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11443n/a return NULL;
11444n/a
11445n/a kind1 = PyUnicode_KIND(self);
11446n/a kind2 = PyUnicode_KIND(substring);
11447n/a if (kind1 < kind2)
11448n/a return PyLong_FromLong(0);
11449n/a
11450n/a len1 = PyUnicode_GET_LENGTH(self);
11451n/a len2 = PyUnicode_GET_LENGTH(substring);
11452n/a ADJUST_INDICES(start, end, len1);
11453n/a if (end - start < len2)
11454n/a return PyLong_FromLong(0);
11455n/a
11456n/a buf1 = PyUnicode_DATA(self);
11457n/a buf2 = PyUnicode_DATA(substring);
11458n/a if (kind2 != kind1) {
11459n/a buf2 = _PyUnicode_AsKind(substring, kind1);
11460n/a if (!buf2)
11461n/a return NULL;
11462n/a }
11463n/a switch (kind1) {
11464n/a case PyUnicode_1BYTE_KIND:
11465n/a iresult = ucs1lib_count(
11466n/a ((Py_UCS1*)buf1) + start, end - start,
11467n/a buf2, len2, PY_SSIZE_T_MAX
11468n/a );
11469n/a break;
11470n/a case PyUnicode_2BYTE_KIND:
11471n/a iresult = ucs2lib_count(
11472n/a ((Py_UCS2*)buf1) + start, end - start,
11473n/a buf2, len2, PY_SSIZE_T_MAX
11474n/a );
11475n/a break;
11476n/a case PyUnicode_4BYTE_KIND:
11477n/a iresult = ucs4lib_count(
11478n/a ((Py_UCS4*)buf1) + start, end - start,
11479n/a buf2, len2, PY_SSIZE_T_MAX
11480n/a );
11481n/a break;
11482n/a default:
11483n/a assert(0); iresult = 0;
11484n/a }
11485n/a
11486n/a result = PyLong_FromSsize_t(iresult);
11487n/a
11488n/a if (kind2 != kind1)
11489n/a PyMem_Free(buf2);
11490n/a
11491n/a return result;
11492n/a}
11493n/a
11494n/a/*[clinic input]
11495n/astr.encode as unicode_encode
11496n/a
11497n/a encoding: str(c_default="NULL") = 'utf-8'
11498n/a The encoding in which to encode the string.
11499n/a errors: str(c_default="NULL") = 'strict'
11500n/a The error handling scheme to use for encoding errors.
11501n/a The default is 'strict' meaning that encoding errors raise a
11502n/a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11503n/a 'xmlcharrefreplace' as well as any other name registered with
11504n/a codecs.register_error that can handle UnicodeEncodeErrors.
11505n/a
11506n/aEncode the string using the codec registered for encoding.
11507n/a[clinic start generated code]*/
11508n/a
11509n/astatic PyObject *
11510n/aunicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11511n/a/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
11512n/a{
11513n/a return PyUnicode_AsEncodedString(self, encoding, errors);
11514n/a}
11515n/a
11516n/a/*[clinic input]
11517n/astr.expandtabs as unicode_expandtabs
11518n/a
11519n/a tabsize: int = 8
11520n/a
11521n/aReturn a copy where all tab characters are expanded using spaces.
11522n/a
11523n/aIf tabsize is not given, a tab size of 8 characters is assumed.
11524n/a[clinic start generated code]*/
11525n/a
11526n/astatic PyObject *
11527n/aunicode_expandtabs_impl(PyObject *self, int tabsize)
11528n/a/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
11529n/a{
11530n/a Py_ssize_t i, j, line_pos, src_len, incr;
11531n/a Py_UCS4 ch;
11532n/a PyObject *u;
11533n/a void *src_data, *dest_data;
11534n/a int kind;
11535n/a int found;
11536n/a
11537n/a if (PyUnicode_READY(self) == -1)
11538n/a return NULL;
11539n/a
11540n/a /* First pass: determine size of output string */
11541n/a src_len = PyUnicode_GET_LENGTH(self);
11542n/a i = j = line_pos = 0;
11543n/a kind = PyUnicode_KIND(self);
11544n/a src_data = PyUnicode_DATA(self);
11545n/a found = 0;
11546n/a for (; i < src_len; i++) {
11547n/a ch = PyUnicode_READ(kind, src_data, i);
11548n/a if (ch == '\t') {
11549n/a found = 1;
11550n/a if (tabsize > 0) {
11551n/a incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11552n/a if (j > PY_SSIZE_T_MAX - incr)
11553n/a goto overflow;
11554n/a line_pos += incr;
11555n/a j += incr;
11556n/a }
11557n/a }
11558n/a else {
11559n/a if (j > PY_SSIZE_T_MAX - 1)
11560n/a goto overflow;
11561n/a line_pos++;
11562n/a j++;
11563n/a if (ch == '\n' || ch == '\r')
11564n/a line_pos = 0;
11565n/a }
11566n/a }
11567n/a if (!found)
11568n/a return unicode_result_unchanged(self);
11569n/a
11570n/a /* Second pass: create output string and fill it */
11571n/a u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11572n/a if (!u)
11573n/a return NULL;
11574n/a dest_data = PyUnicode_DATA(u);
11575n/a
11576n/a i = j = line_pos = 0;
11577n/a
11578n/a for (; i < src_len; i++) {
11579n/a ch = PyUnicode_READ(kind, src_data, i);
11580n/a if (ch == '\t') {
11581n/a if (tabsize > 0) {
11582n/a incr = tabsize - (line_pos % tabsize);
11583n/a line_pos += incr;
11584n/a FILL(kind, dest_data, ' ', j, incr);
11585n/a j += incr;
11586n/a }
11587n/a }
11588n/a else {
11589n/a line_pos++;
11590n/a PyUnicode_WRITE(kind, dest_data, j, ch);
11591n/a j++;
11592n/a if (ch == '\n' || ch == '\r')
11593n/a line_pos = 0;
11594n/a }
11595n/a }
11596n/a assert (j == PyUnicode_GET_LENGTH(u));
11597n/a return unicode_result(u);
11598n/a
11599n/a overflow:
11600n/a PyErr_SetString(PyExc_OverflowError, "new string is too long");
11601n/a return NULL;
11602n/a}
11603n/a
11604n/aPyDoc_STRVAR(find__doc__,
11605n/a "S.find(sub[, start[, end]]) -> int\n\
11606n/a\n\
11607n/aReturn the lowest index in S where substring sub is found,\n\
11608n/asuch that sub is contained within S[start:end]. Optional\n\
11609n/aarguments start and end are interpreted as in slice notation.\n\
11610n/a\n\
11611n/aReturn -1 on failure.");
11612n/a
11613n/astatic PyObject *
11614n/aunicode_find(PyObject *self, PyObject *args)
11615n/a{
11616n/a /* initialize variables to prevent gcc warning */
11617n/a PyObject *substring = NULL;
11618n/a Py_ssize_t start = 0;
11619n/a Py_ssize_t end = 0;
11620n/a Py_ssize_t result;
11621n/a
11622n/a if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11623n/a return NULL;
11624n/a
11625n/a if (PyUnicode_READY(self) == -1)
11626n/a return NULL;
11627n/a
11628n/a result = any_find_slice(self, substring, start, end, 1);
11629n/a
11630n/a if (result == -2)
11631n/a return NULL;
11632n/a
11633n/a return PyLong_FromSsize_t(result);
11634n/a}
11635n/a
11636n/astatic PyObject *
11637n/aunicode_getitem(PyObject *self, Py_ssize_t index)
11638n/a{
11639n/a void *data;
11640n/a enum PyUnicode_Kind kind;
11641n/a Py_UCS4 ch;
11642n/a
11643n/a if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11644n/a PyErr_BadArgument();
11645n/a return NULL;
11646n/a }
11647n/a if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11648n/a PyErr_SetString(PyExc_IndexError, "string index out of range");
11649n/a return NULL;
11650n/a }
11651n/a kind = PyUnicode_KIND(self);
11652n/a data = PyUnicode_DATA(self);
11653n/a ch = PyUnicode_READ(kind, data, index);
11654n/a return unicode_char(ch);
11655n/a}
11656n/a
11657n/a/* Believe it or not, this produces the same value for ASCII strings
11658n/a as bytes_hash(). */
11659n/astatic Py_hash_t
11660n/aunicode_hash(PyObject *self)
11661n/a{
11662n/a Py_ssize_t len;
11663n/a Py_uhash_t x; /* Unsigned for defined overflow behavior. */
11664n/a
11665n/a#ifdef Py_DEBUG
11666n/a assert(_Py_HashSecret_Initialized);
11667n/a#endif
11668n/a if (_PyUnicode_HASH(self) != -1)
11669n/a return _PyUnicode_HASH(self);
11670n/a if (PyUnicode_READY(self) == -1)
11671n/a return -1;
11672n/a len = PyUnicode_GET_LENGTH(self);
11673n/a /*
11674n/a We make the hash of the empty string be 0, rather than using
11675n/a (prefix ^ suffix), since this slightly obfuscates the hash secret
11676n/a */
11677n/a if (len == 0) {
11678n/a _PyUnicode_HASH(self) = 0;
11679n/a return 0;
11680n/a }
11681n/a x = _Py_HashBytes(PyUnicode_DATA(self),
11682n/a PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11683n/a _PyUnicode_HASH(self) = x;
11684n/a return x;
11685n/a}
11686n/a
11687n/aPyDoc_STRVAR(index__doc__,
11688n/a "S.index(sub[, start[, end]]) -> int\n\
11689n/a\n\
11690n/aLike S.find() but raise ValueError when the substring is not found.");
11691n/a
11692n/astatic PyObject *
11693n/aunicode_index(PyObject *self, PyObject *args)
11694n/a{
11695n/a /* initialize variables to prevent gcc warning */
11696n/a Py_ssize_t result;
11697n/a PyObject *substring = NULL;
11698n/a Py_ssize_t start = 0;
11699n/a Py_ssize_t end = 0;
11700n/a
11701n/a if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11702n/a return NULL;
11703n/a
11704n/a if (PyUnicode_READY(self) == -1)
11705n/a return NULL;
11706n/a
11707n/a result = any_find_slice(self, substring, start, end, 1);
11708n/a
11709n/a if (result == -2)
11710n/a return NULL;
11711n/a
11712n/a if (result < 0) {
11713n/a PyErr_SetString(PyExc_ValueError, "substring not found");
11714n/a return NULL;
11715n/a }
11716n/a
11717n/a return PyLong_FromSsize_t(result);
11718n/a}
11719n/a
11720n/a/*[clinic input]
11721n/astr.islower as unicode_islower
11722n/a
11723n/aReturn True if the string is a lowercase string, False otherwise.
11724n/a
11725n/aA string is lowercase if all cased characters in the string are lowercase and
11726n/athere is at least one cased character in the string.
11727n/a[clinic start generated code]*/
11728n/a
11729n/astatic PyObject *
11730n/aunicode_islower_impl(PyObject *self)
11731n/a/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
11732n/a{
11733n/a Py_ssize_t i, length;
11734n/a int kind;
11735n/a void *data;
11736n/a int cased;
11737n/a
11738n/a if (PyUnicode_READY(self) == -1)
11739n/a return NULL;
11740n/a length = PyUnicode_GET_LENGTH(self);
11741n/a kind = PyUnicode_KIND(self);
11742n/a data = PyUnicode_DATA(self);
11743n/a
11744n/a /* Shortcut for single character strings */
11745n/a if (length == 1)
11746n/a return PyBool_FromLong(
11747n/a Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11748n/a
11749n/a /* Special case for empty strings */
11750n/a if (length == 0)
11751n/a return PyBool_FromLong(0);
11752n/a
11753n/a cased = 0;
11754n/a for (i = 0; i < length; i++) {
11755n/a const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11756n/a
11757n/a if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11758n/a return PyBool_FromLong(0);
11759n/a else if (!cased && Py_UNICODE_ISLOWER(ch))
11760n/a cased = 1;
11761n/a }
11762n/a return PyBool_FromLong(cased);
11763n/a}
11764n/a
11765n/a/*[clinic input]
11766n/astr.isupper as unicode_isupper
11767n/a
11768n/aReturn True if the string is an uppercase string, False otherwise.
11769n/a
11770n/aA string is uppercase if all cased characters in the string are uppercase and
11771n/athere is at least one cased character in the string.
11772n/a[clinic start generated code]*/
11773n/a
11774n/astatic PyObject *
11775n/aunicode_isupper_impl(PyObject *self)
11776n/a/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
11777n/a{
11778n/a Py_ssize_t i, length;
11779n/a int kind;
11780n/a void *data;
11781n/a int cased;
11782n/a
11783n/a if (PyUnicode_READY(self) == -1)
11784n/a return NULL;
11785n/a length = PyUnicode_GET_LENGTH(self);
11786n/a kind = PyUnicode_KIND(self);
11787n/a data = PyUnicode_DATA(self);
11788n/a
11789n/a /* Shortcut for single character strings */
11790n/a if (length == 1)
11791n/a return PyBool_FromLong(
11792n/a Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11793n/a
11794n/a /* Special case for empty strings */
11795n/a if (length == 0)
11796n/a return PyBool_FromLong(0);
11797n/a
11798n/a cased = 0;
11799n/a for (i = 0; i < length; i++) {
11800n/a const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11801n/a
11802n/a if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11803n/a return PyBool_FromLong(0);
11804n/a else if (!cased && Py_UNICODE_ISUPPER(ch))
11805n/a cased = 1;
11806n/a }
11807n/a return PyBool_FromLong(cased);
11808n/a}
11809n/a
11810n/a/*[clinic input]
11811n/astr.istitle as unicode_istitle
11812n/a
11813n/aReturn True if the string is a title-cased string, False otherwise.
11814n/a
11815n/aIn a title-cased string, upper- and title-case characters may only
11816n/afollow uncased characters and lowercase characters only cased ones.
11817n/a[clinic start generated code]*/
11818n/a
11819n/astatic PyObject *
11820n/aunicode_istitle_impl(PyObject *self)
11821n/a/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
11822n/a{
11823n/a Py_ssize_t i, length;
11824n/a int kind;
11825n/a void *data;
11826n/a int cased, previous_is_cased;
11827n/a
11828n/a if (PyUnicode_READY(self) == -1)
11829n/a return NULL;
11830n/a length = PyUnicode_GET_LENGTH(self);
11831n/a kind = PyUnicode_KIND(self);
11832n/a data = PyUnicode_DATA(self);
11833n/a
11834n/a /* Shortcut for single character strings */
11835n/a if (length == 1) {
11836n/a Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11837n/a return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11838n/a (Py_UNICODE_ISUPPER(ch) != 0));
11839n/a }
11840n/a
11841n/a /* Special case for empty strings */
11842n/a if (length == 0)
11843n/a return PyBool_FromLong(0);
11844n/a
11845n/a cased = 0;
11846n/a previous_is_cased = 0;
11847n/a for (i = 0; i < length; i++) {
11848n/a const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11849n/a
11850n/a if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11851n/a if (previous_is_cased)
11852n/a return PyBool_FromLong(0);
11853n/a previous_is_cased = 1;
11854n/a cased = 1;
11855n/a }
11856n/a else if (Py_UNICODE_ISLOWER(ch)) {
11857n/a if (!previous_is_cased)
11858n/a return PyBool_FromLong(0);
11859n/a previous_is_cased = 1;
11860n/a cased = 1;
11861n/a }
11862n/a else
11863n/a previous_is_cased = 0;
11864n/a }
11865n/a return PyBool_FromLong(cased);
11866n/a}
11867n/a
11868n/a/*[clinic input]
11869n/astr.isspace as unicode_isspace
11870n/a
11871n/aReturn True if the string is a whitespace string, False otherwise.
11872n/a
11873n/aA string is whitespace if all characters in the string are whitespace and there
11874n/ais at least one character in the string.
11875n/a[clinic start generated code]*/
11876n/a
11877n/astatic PyObject *
11878n/aunicode_isspace_impl(PyObject *self)
11879n/a/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
11880n/a{
11881n/a Py_ssize_t i, length;
11882n/a int kind;
11883n/a void *data;
11884n/a
11885n/a if (PyUnicode_READY(self) == -1)
11886n/a return NULL;
11887n/a length = PyUnicode_GET_LENGTH(self);
11888n/a kind = PyUnicode_KIND(self);
11889n/a data = PyUnicode_DATA(self);
11890n/a
11891n/a /* Shortcut for single character strings */
11892n/a if (length == 1)
11893n/a return PyBool_FromLong(
11894n/a Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11895n/a
11896n/a /* Special case for empty strings */
11897n/a if (length == 0)
11898n/a return PyBool_FromLong(0);
11899n/a
11900n/a for (i = 0; i < length; i++) {
11901n/a const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11902n/a if (!Py_UNICODE_ISSPACE(ch))
11903n/a return PyBool_FromLong(0);
11904n/a }
11905n/a return PyBool_FromLong(1);
11906n/a}
11907n/a
11908n/a/*[clinic input]
11909n/astr.isalpha as unicode_isalpha
11910n/a
11911n/aReturn True if the string is an alphabetic string, False otherwise.
11912n/a
11913n/aA string is alphabetic if all characters in the string are alphabetic and there
11914n/ais at least one character in the string.
11915n/a[clinic start generated code]*/
11916n/a
11917n/astatic PyObject *
11918n/aunicode_isalpha_impl(PyObject *self)
11919n/a/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
11920n/a{
11921n/a Py_ssize_t i, length;
11922n/a int kind;
11923n/a void *data;
11924n/a
11925n/a if (PyUnicode_READY(self) == -1)
11926n/a return NULL;
11927n/a length = PyUnicode_GET_LENGTH(self);
11928n/a kind = PyUnicode_KIND(self);
11929n/a data = PyUnicode_DATA(self);
11930n/a
11931n/a /* Shortcut for single character strings */
11932n/a if (length == 1)
11933n/a return PyBool_FromLong(
11934n/a Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11935n/a
11936n/a /* Special case for empty strings */
11937n/a if (length == 0)
11938n/a return PyBool_FromLong(0);
11939n/a
11940n/a for (i = 0; i < length; i++) {
11941n/a if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11942n/a return PyBool_FromLong(0);
11943n/a }
11944n/a return PyBool_FromLong(1);
11945n/a}
11946n/a
11947n/a/*[clinic input]
11948n/astr.isalnum as unicode_isalnum
11949n/a
11950n/aReturn True if the string is an alpha-numeric string, False otherwise.
11951n/a
11952n/aA string is alpha-numeric if all characters in the string are alpha-numeric and
11953n/athere is at least one character in the string.
11954n/a[clinic start generated code]*/
11955n/a
11956n/astatic PyObject *
11957n/aunicode_isalnum_impl(PyObject *self)
11958n/a/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
11959n/a{
11960n/a int kind;
11961n/a void *data;
11962n/a Py_ssize_t len, i;
11963n/a
11964n/a if (PyUnicode_READY(self) == -1)
11965n/a return NULL;
11966n/a
11967n/a kind = PyUnicode_KIND(self);
11968n/a data = PyUnicode_DATA(self);
11969n/a len = PyUnicode_GET_LENGTH(self);
11970n/a
11971n/a /* Shortcut for single character strings */
11972n/a if (len == 1) {
11973n/a const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11974n/a return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11975n/a }
11976n/a
11977n/a /* Special case for empty strings */
11978n/a if (len == 0)
11979n/a return PyBool_FromLong(0);
11980n/a
11981n/a for (i = 0; i < len; i++) {
11982n/a const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11983n/a if (!Py_UNICODE_ISALNUM(ch))
11984n/a return PyBool_FromLong(0);
11985n/a }
11986n/a return PyBool_FromLong(1);
11987n/a}
11988n/a
11989n/a/*[clinic input]
11990n/astr.isdecimal as unicode_isdecimal
11991n/a
11992n/aReturn True if the string is a decimal string, False otherwise.
11993n/a
11994n/aA string is a decimal string if all characters in the string are decimal and
11995n/athere is at least one character in the string.
11996n/a[clinic start generated code]*/
11997n/a
11998n/astatic PyObject *
11999n/aunicode_isdecimal_impl(PyObject *self)
12000n/a/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
12001n/a{
12002n/a Py_ssize_t i, length;
12003n/a int kind;
12004n/a void *data;
12005n/a
12006n/a if (PyUnicode_READY(self) == -1)
12007n/a return NULL;
12008n/a length = PyUnicode_GET_LENGTH(self);
12009n/a kind = PyUnicode_KIND(self);
12010n/a data = PyUnicode_DATA(self);
12011n/a
12012n/a /* Shortcut for single character strings */
12013n/a if (length == 1)
12014n/a return PyBool_FromLong(
12015n/a Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12016n/a
12017n/a /* Special case for empty strings */
12018n/a if (length == 0)
12019n/a return PyBool_FromLong(0);
12020n/a
12021n/a for (i = 0; i < length; i++) {
12022n/a if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12023n/a return PyBool_FromLong(0);
12024n/a }
12025n/a return PyBool_FromLong(1);
12026n/a}
12027n/a
12028n/a/*[clinic input]
12029n/astr.isdigit as unicode_isdigit
12030n/a
12031n/aReturn True if the string is a digit string, False otherwise.
12032n/a
12033n/aA string is a digit string if all characters in the string are digits and there
12034n/ais at least one character in the string.
12035n/a[clinic start generated code]*/
12036n/a
12037n/astatic PyObject *
12038n/aunicode_isdigit_impl(PyObject *self)
12039n/a/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
12040n/a{
12041n/a Py_ssize_t i, length;
12042n/a int kind;
12043n/a void *data;
12044n/a
12045n/a if (PyUnicode_READY(self) == -1)
12046n/a return NULL;
12047n/a length = PyUnicode_GET_LENGTH(self);
12048n/a kind = PyUnicode_KIND(self);
12049n/a data = PyUnicode_DATA(self);
12050n/a
12051n/a /* Shortcut for single character strings */
12052n/a if (length == 1) {
12053n/a const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12054n/a return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12055n/a }
12056n/a
12057n/a /* Special case for empty strings */
12058n/a if (length == 0)
12059n/a return PyBool_FromLong(0);
12060n/a
12061n/a for (i = 0; i < length; i++) {
12062n/a if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12063n/a return PyBool_FromLong(0);
12064n/a }
12065n/a return PyBool_FromLong(1);
12066n/a}
12067n/a
12068n/a/*[clinic input]
12069n/astr.isnumeric as unicode_isnumeric
12070n/a
12071n/aReturn True if the string is a numeric string, False otherwise.
12072n/a
12073n/aA string is numeric if all characters in the string are numeric and there is at
12074n/aleast one character in the string.
12075n/a[clinic start generated code]*/
12076n/a
12077n/astatic PyObject *
12078n/aunicode_isnumeric_impl(PyObject *self)
12079n/a/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
12080n/a{
12081n/a Py_ssize_t i, length;
12082n/a int kind;
12083n/a void *data;
12084n/a
12085n/a if (PyUnicode_READY(self) == -1)
12086n/a return NULL;
12087n/a length = PyUnicode_GET_LENGTH(self);
12088n/a kind = PyUnicode_KIND(self);
12089n/a data = PyUnicode_DATA(self);
12090n/a
12091n/a /* Shortcut for single character strings */
12092n/a if (length == 1)
12093n/a return PyBool_FromLong(
12094n/a Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12095n/a
12096n/a /* Special case for empty strings */
12097n/a if (length == 0)
12098n/a return PyBool_FromLong(0);
12099n/a
12100n/a for (i = 0; i < length; i++) {
12101n/a if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12102n/a return PyBool_FromLong(0);
12103n/a }
12104n/a return PyBool_FromLong(1);
12105n/a}
12106n/a
12107n/aint
12108n/aPyUnicode_IsIdentifier(PyObject *self)
12109n/a{
12110n/a int kind;
12111n/a void *data;
12112n/a Py_ssize_t i;
12113n/a Py_UCS4 first;
12114n/a
12115n/a if (PyUnicode_READY(self) == -1) {
12116n/a Py_FatalError("identifier not ready");
12117n/a return 0;
12118n/a }
12119n/a
12120n/a /* Special case for empty strings */
12121n/a if (PyUnicode_GET_LENGTH(self) == 0)
12122n/a return 0;
12123n/a kind = PyUnicode_KIND(self);
12124n/a data = PyUnicode_DATA(self);
12125n/a
12126n/a /* PEP 3131 says that the first character must be in
12127n/a XID_Start and subsequent characters in XID_Continue,
12128n/a and for the ASCII range, the 2.x rules apply (i.e
12129n/a start with letters and underscore, continue with
12130n/a letters, digits, underscore). However, given the current
12131n/a definition of XID_Start and XID_Continue, it is sufficient
12132n/a to check just for these, except that _ must be allowed
12133n/a as starting an identifier. */
12134n/a first = PyUnicode_READ(kind, data, 0);
12135n/a if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
12136n/a return 0;
12137n/a
12138n/a for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
12139n/a if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
12140n/a return 0;
12141n/a return 1;
12142n/a}
12143n/a
12144n/a/*[clinic input]
12145n/astr.isidentifier as unicode_isidentifier
12146n/a
12147n/aReturn True if the string is a valid Python identifier, False otherwise.
12148n/a
12149n/aUse keyword.iskeyword() to test for reserved identifiers such as "def" and
12150n/a"class".
12151n/a[clinic start generated code]*/
12152n/a
12153n/astatic PyObject *
12154n/aunicode_isidentifier_impl(PyObject *self)
12155n/a/*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
12156n/a{
12157n/a return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12158n/a}
12159n/a
12160n/a/*[clinic input]
12161n/astr.isprintable as unicode_isprintable
12162n/a
12163n/aReturn True if the string is printable, False otherwise.
12164n/a
12165n/aA string is printable if all of its characters are considered printable in
12166n/arepr() or if it is empty.
12167n/a[clinic start generated code]*/
12168n/a
12169n/astatic PyObject *
12170n/aunicode_isprintable_impl(PyObject *self)
12171n/a/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
12172n/a{
12173n/a Py_ssize_t i, length;
12174n/a int kind;
12175n/a void *data;
12176n/a
12177n/a if (PyUnicode_READY(self) == -1)
12178n/a return NULL;
12179n/a length = PyUnicode_GET_LENGTH(self);
12180n/a kind = PyUnicode_KIND(self);
12181n/a data = PyUnicode_DATA(self);
12182n/a
12183n/a /* Shortcut for single character strings */
12184n/a if (length == 1)
12185n/a return PyBool_FromLong(
12186n/a Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12187n/a
12188n/a for (i = 0; i < length; i++) {
12189n/a if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12190n/a Py_RETURN_FALSE;
12191n/a }
12192n/a }
12193n/a Py_RETURN_TRUE;
12194n/a}
12195n/a
12196n/a/*[clinic input]
12197n/astr.join as unicode_join
12198n/a
12199n/a iterable: object
12200n/a /
12201n/a
12202n/aConcatenate any number of strings.
12203n/a
12204n/aThe string whose method is called is inserted in between each given string.
12205n/aThe result is returned as a new string.
12206n/a
12207n/aExample: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12208n/a[clinic start generated code]*/
12209n/a
12210n/astatic PyObject *
12211n/aunicode_join(PyObject *self, PyObject *iterable)
12212n/a/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12213n/a{
12214n/a return PyUnicode_Join(self, iterable);
12215n/a}
12216n/a
12217n/astatic Py_ssize_t
12218n/aunicode_length(PyObject *self)
12219n/a{
12220n/a if (PyUnicode_READY(self) == -1)
12221n/a return -1;
12222n/a return PyUnicode_GET_LENGTH(self);
12223n/a}
12224n/a
12225n/a/*[clinic input]
12226n/astr.ljust as unicode_ljust
12227n/a
12228n/a width: Py_ssize_t
12229n/a fillchar: Py_UCS4 = ' '
12230n/a /
12231n/a
12232n/aReturn a left-justified string of length width.
12233n/a
12234n/aPadding is done using the specified fill character (default is a space).
12235n/a[clinic start generated code]*/
12236n/a
12237n/astatic PyObject *
12238n/aunicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12239n/a/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12240n/a{
12241n/a if (PyUnicode_READY(self) == -1)
12242n/a return NULL;
12243n/a
12244n/a if (PyUnicode_GET_LENGTH(self) >= width)
12245n/a return unicode_result_unchanged(self);
12246n/a
12247n/a return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12248n/a}
12249n/a
12250n/a/*[clinic input]
12251n/astr.lower as unicode_lower
12252n/a
12253n/aReturn a copy of the string converted to lowercase.
12254n/a[clinic start generated code]*/
12255n/a
12256n/astatic PyObject *
12257n/aunicode_lower_impl(PyObject *self)
12258n/a/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12259n/a{
12260n/a if (PyUnicode_READY(self) == -1)
12261n/a return NULL;
12262n/a if (PyUnicode_IS_ASCII(self))
12263n/a return ascii_upper_or_lower(self, 1);
12264n/a return case_operation(self, do_lower);
12265n/a}
12266n/a
/* Selects which end(s) of the string a strip operation works on. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the selector values above.  Neither the strings
   nor the table are ever modified, so both levels are const. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method corresponding to strip selector `i`. */
#define STRIPNAME(i) (stripfuncnames[i])
12275n/a
12276n/a/* externally visible for str.strip(unicode) */
12277n/aPyObject *
12278n/a_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12279n/a{
12280n/a void *data;
12281n/a int kind;
12282n/a Py_ssize_t i, j, len;
12283n/a BLOOM_MASK sepmask;
12284n/a Py_ssize_t seplen;
12285n/a
12286n/a if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12287n/a return NULL;
12288n/a
12289n/a kind = PyUnicode_KIND(self);
12290n/a data = PyUnicode_DATA(self);
12291n/a len = PyUnicode_GET_LENGTH(self);
12292n/a seplen = PyUnicode_GET_LENGTH(sepobj);
12293n/a sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12294n/a PyUnicode_DATA(sepobj),
12295n/a seplen);
12296n/a
12297n/a i = 0;
12298n/a if (striptype != RIGHTSTRIP) {
12299n/a while (i < len) {
12300n/a Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12301n/a if (!BLOOM(sepmask, ch))
12302n/a break;
12303n/a if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12304n/a break;
12305n/a i++;
12306n/a }
12307n/a }
12308n/a
12309n/a j = len;
12310n/a if (striptype != LEFTSTRIP) {
12311n/a j--;
12312n/a while (j >= i) {
12313n/a Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12314n/a if (!BLOOM(sepmask, ch))
12315n/a break;
12316n/a if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12317n/a break;
12318n/a j--;
12319n/a }
12320n/a
12321n/a j++;
12322n/a }
12323n/a
12324n/a return PyUnicode_Substring(self, i, j);
12325n/a}
12326n/a
12327n/aPyObject*
12328n/aPyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12329n/a{
12330n/a unsigned char *data;
12331n/a int kind;
12332n/a Py_ssize_t length;
12333n/a
12334n/a if (PyUnicode_READY(self) == -1)
12335n/a return NULL;
12336n/a
12337n/a length = PyUnicode_GET_LENGTH(self);
12338n/a end = Py_MIN(end, length);
12339n/a
12340n/a if (start == 0 && end == length)
12341n/a return unicode_result_unchanged(self);
12342n/a
12343n/a if (start < 0 || end < 0) {
12344n/a PyErr_SetString(PyExc_IndexError, "string index out of range");
12345n/a return NULL;
12346n/a }
12347n/a if (start >= length || end < start)
12348n/a _Py_RETURN_UNICODE_EMPTY();
12349n/a
12350n/a length = end - start;
12351n/a if (PyUnicode_IS_ASCII(self)) {
12352n/a data = PyUnicode_1BYTE_DATA(self);
12353n/a return _PyUnicode_FromASCII((char*)(data + start), length);
12354n/a }
12355n/a else {
12356n/a kind = PyUnicode_KIND(self);
12357n/a data = PyUnicode_1BYTE_DATA(self);
12358n/a return PyUnicode_FromKindAndData(kind,
12359n/a data + kind * start,
12360n/a length);
12361n/a }
12362n/a}
12363n/a
12364n/astatic PyObject *
12365n/ado_strip(PyObject *self, int striptype)
12366n/a{
12367n/a Py_ssize_t len, i, j;
12368n/a
12369n/a if (PyUnicode_READY(self) == -1)
12370n/a return NULL;
12371n/a
12372n/a len = PyUnicode_GET_LENGTH(self);
12373n/a
12374n/a if (PyUnicode_IS_ASCII(self)) {
12375n/a Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12376n/a
12377n/a i = 0;
12378n/a if (striptype != RIGHTSTRIP) {
12379n/a while (i < len) {
12380n/a Py_UCS1 ch = data[i];
12381n/a if (!_Py_ascii_whitespace[ch])
12382n/a break;
12383n/a i++;
12384n/a }
12385n/a }
12386n/a
12387n/a j = len;
12388n/a if (striptype != LEFTSTRIP) {
12389n/a j--;
12390n/a while (j >= i) {
12391n/a Py_UCS1 ch = data[j];
12392n/a if (!_Py_ascii_whitespace[ch])
12393n/a break;
12394n/a j--;
12395n/a }
12396n/a j++;
12397n/a }
12398n/a }
12399n/a else {
12400n/a int kind = PyUnicode_KIND(self);
12401n/a void *data = PyUnicode_DATA(self);
12402n/a
12403n/a i = 0;
12404n/a if (striptype != RIGHTSTRIP) {
12405n/a while (i < len) {
12406n/a Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12407n/a if (!Py_UNICODE_ISSPACE(ch))
12408n/a break;
12409n/a i++;
12410n/a }
12411n/a }
12412n/a
12413n/a j = len;
12414n/a if (striptype != LEFTSTRIP) {
12415n/a j--;
12416n/a while (j >= i) {
12417n/a Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12418n/a if (!Py_UNICODE_ISSPACE(ch))
12419n/a break;
12420n/a j--;
12421n/a }
12422n/a j++;
12423n/a }
12424n/a }
12425n/a
12426n/a return PyUnicode_Substring(self, i, j);
12427n/a}
12428n/a
12429n/a
12430n/astatic PyObject *
12431n/ado_argstrip(PyObject *self, int striptype, PyObject *sep)
12432n/a{
12433n/a if (sep != NULL && sep != Py_None) {
12434n/a if (PyUnicode_Check(sep))
12435n/a return _PyUnicode_XStrip(self, striptype, sep);
12436n/a else {
12437n/a PyErr_Format(PyExc_TypeError,
12438n/a "%s arg must be None or str",
12439n/a STRIPNAME(striptype));
12440n/a return NULL;
12441n/a }
12442n/a }
12443n/a
12444n/a return do_strip(self, striptype);
12445n/a}
12446n/a
12447n/a
12448n/a/*[clinic input]
12449n/astr.strip as unicode_strip
12450n/a
12451n/a chars: object = None
12452n/a /
12453n/a
12454n/aReturn a copy of the string with leading and trailing whitespace removed.
12455n/a
12456n/aIf chars is given and not None, remove characters in chars instead.
12457n/a[clinic start generated code]*/
12458n/a
12459n/astatic PyObject *
12460n/aunicode_strip_impl(PyObject *self, PyObject *chars)
12461n/a/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
12462n/a{
12463n/a return do_argstrip(self, BOTHSTRIP, chars);
12464n/a}
12465n/a
12466n/a
12467n/a/*[clinic input]
12468n/astr.lstrip as unicode_lstrip
12469n/a
12470n/a chars: object = NULL
12471n/a /
12472n/a
12473n/aReturn a copy of the string with leading whitespace removed.
12474n/a
12475n/aIf chars is given and not None, remove characters in chars instead.
12476n/a[clinic start generated code]*/
12477n/a
12478n/astatic PyObject *
12479n/aunicode_lstrip_impl(PyObject *self, PyObject *chars)
12480n/a/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
12481n/a{
12482n/a return do_argstrip(self, LEFTSTRIP, chars);
12483n/a}
12484n/a
12485n/a
12486n/a/*[clinic input]
12487n/astr.rstrip as unicode_rstrip
12488n/a
12489n/a chars: object = NULL
12490n/a /
12491n/a
12492n/aReturn a copy of the string with trailing whitespace removed.
12493n/a
12494n/aIf chars is given and not None, remove characters in chars instead.
12495n/a[clinic start generated code]*/
12496n/a
12497n/astatic PyObject *
12498n/aunicode_rstrip_impl(PyObject *self, PyObject *chars)
12499n/a/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
12500n/a{
12501n/a return do_argstrip(self, RIGHTSTRIP, chars);
12502n/a}
12503n/a
12504n/a
12505n/astatic PyObject*
12506n/aunicode_repeat(PyObject *str, Py_ssize_t len)
12507n/a{
12508n/a PyObject *u;
12509n/a Py_ssize_t nchars, n;
12510n/a
12511n/a if (len < 1)
12512n/a _Py_RETURN_UNICODE_EMPTY();
12513n/a
12514n/a /* no repeat, return original string */
12515n/a if (len == 1)
12516n/a return unicode_result_unchanged(str);
12517n/a
12518n/a if (PyUnicode_READY(str) == -1)
12519n/a return NULL;
12520n/a
12521n/a if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12522n/a PyErr_SetString(PyExc_OverflowError,
12523n/a "repeated string is too long");
12524n/a return NULL;
12525n/a }
12526n/a nchars = len * PyUnicode_GET_LENGTH(str);
12527n/a
12528n/a u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12529n/a if (!u)
12530n/a return NULL;
12531n/a assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12532n/a
12533n/a if (PyUnicode_GET_LENGTH(str) == 1) {
12534n/a const int kind = PyUnicode_KIND(str);
12535n/a const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12536n/a if (kind == PyUnicode_1BYTE_KIND) {
12537n/a void *to = PyUnicode_DATA(u);
12538n/a memset(to, (unsigned char)fill_char, len);
12539n/a }
12540n/a else if (kind == PyUnicode_2BYTE_KIND) {
12541n/a Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12542n/a for (n = 0; n < len; ++n)
12543n/a ucs2[n] = fill_char;
12544n/a } else {
12545n/a Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12546n/a assert(kind == PyUnicode_4BYTE_KIND);
12547n/a for (n = 0; n < len; ++n)
12548n/a ucs4[n] = fill_char;
12549n/a }
12550n/a }
12551n/a else {
12552n/a /* number of characters copied this far */
12553n/a Py_ssize_t done = PyUnicode_GET_LENGTH(str);
12554n/a const Py_ssize_t char_size = PyUnicode_KIND(str);
12555n/a char *to = (char *) PyUnicode_DATA(u);
12556n/a memcpy(to, PyUnicode_DATA(str),
12557n/a PyUnicode_GET_LENGTH(str) * char_size);
12558n/a while (done < nchars) {
12559n/a n = (done <= nchars-done) ? done : nchars-done;
12560n/a memcpy(to + (done * char_size), to, n * char_size);
12561n/a done += n;
12562n/a }
12563n/a }
12564n/a
12565n/a assert(_PyUnicode_CheckConsistency(u, 1));
12566n/a return u;
12567n/a}
12568n/a
12569n/aPyObject *
12570n/aPyUnicode_Replace(PyObject *str,
12571n/a PyObject *substr,
12572n/a PyObject *replstr,
12573n/a Py_ssize_t maxcount)
12574n/a{
12575n/a if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12576n/a ensure_unicode(replstr) < 0)
12577n/a return NULL;
12578n/a return replace(str, substr, replstr, maxcount);
12579n/a}
12580n/a
12581n/a/*[clinic input]
12582n/astr.replace as unicode_replace
12583n/a
12584n/a old: unicode
12585n/a new: unicode
12586n/a count: Py_ssize_t = -1
12587n/a Maximum number of occurrences to replace.
12588n/a -1 (the default value) means replace all occurrences.
12589n/a /
12590n/a
12591n/aReturn a copy with all occurrences of substring old replaced by new.
12592n/a
12593n/aIf the optional argument count is given, only the first count occurrences are
12594n/areplaced.
12595n/a[clinic start generated code]*/
12596n/a
12597n/astatic PyObject *
12598n/aunicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12599n/a Py_ssize_t count)
12600n/a/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
12601n/a{
12602n/a if (PyUnicode_READY(self) == -1)
12603n/a return NULL;
12604n/a return replace(self, old, new, count);
12605n/a}
12606n/a
12607n/astatic PyObject *
12608n/aunicode_repr(PyObject *unicode)
12609n/a{
12610n/a PyObject *repr;
12611n/a Py_ssize_t isize;
12612n/a Py_ssize_t osize, squote, dquote, i, o;
12613n/a Py_UCS4 max, quote;
12614n/a int ikind, okind, unchanged;
12615n/a void *idata, *odata;
12616n/a
12617n/a if (PyUnicode_READY(unicode) == -1)
12618n/a return NULL;
12619n/a
12620n/a isize = PyUnicode_GET_LENGTH(unicode);
12621n/a idata = PyUnicode_DATA(unicode);
12622n/a
12623n/a /* Compute length of output, quote characters, and
12624n/a maximum character */
12625n/a osize = 0;
12626n/a max = 127;
12627n/a squote = dquote = 0;
12628n/a ikind = PyUnicode_KIND(unicode);
12629n/a for (i = 0; i < isize; i++) {
12630n/a Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12631n/a Py_ssize_t incr = 1;
12632n/a switch (ch) {
12633n/a case '\'': squote++; break;
12634n/a case '"': dquote++; break;
12635n/a case '\\': case '\t': case '\r': case '\n':
12636n/a incr = 2;
12637n/a break;
12638n/a default:
12639n/a /* Fast-path ASCII */
12640n/a if (ch < ' ' || ch == 0x7f)
12641n/a incr = 4; /* \xHH */
12642n/a else if (ch < 0x7f)
12643n/a ;
12644n/a else if (Py_UNICODE_ISPRINTABLE(ch))
12645n/a max = ch > max ? ch : max;
12646n/a else if (ch < 0x100)
12647n/a incr = 4; /* \xHH */
12648n/a else if (ch < 0x10000)
12649n/a incr = 6; /* \uHHHH */
12650n/a else
12651n/a incr = 10; /* \uHHHHHHHH */
12652n/a }
12653n/a if (osize > PY_SSIZE_T_MAX - incr) {
12654n/a PyErr_SetString(PyExc_OverflowError,
12655n/a "string is too long to generate repr");
12656n/a return NULL;
12657n/a }
12658n/a osize += incr;
12659n/a }
12660n/a
12661n/a quote = '\'';
12662n/a unchanged = (osize == isize);
12663n/a if (squote) {
12664n/a unchanged = 0;
12665n/a if (dquote)
12666n/a /* Both squote and dquote present. Use squote,
12667n/a and escape them */
12668n/a osize += squote;
12669n/a else
12670n/a quote = '"';
12671n/a }
12672n/a osize += 2; /* quotes */
12673n/a
12674n/a repr = PyUnicode_New(osize, max);
12675n/a if (repr == NULL)
12676n/a return NULL;
12677n/a okind = PyUnicode_KIND(repr);
12678n/a odata = PyUnicode_DATA(repr);
12679n/a
12680n/a PyUnicode_WRITE(okind, odata, 0, quote);
12681n/a PyUnicode_WRITE(okind, odata, osize-1, quote);
12682n/a if (unchanged) {
12683n/a _PyUnicode_FastCopyCharacters(repr, 1,
12684n/a unicode, 0,
12685n/a isize);
12686n/a }
12687n/a else {
12688n/a for (i = 0, o = 1; i < isize; i++) {
12689n/a Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12690n/a
12691n/a /* Escape quotes and backslashes */
12692n/a if ((ch == quote) || (ch == '\\')) {
12693n/a PyUnicode_WRITE(okind, odata, o++, '\\');
12694n/a PyUnicode_WRITE(okind, odata, o++, ch);
12695n/a continue;
12696n/a }
12697n/a
12698n/a /* Map special whitespace to '\t', \n', '\r' */
12699n/a if (ch == '\t') {
12700n/a PyUnicode_WRITE(okind, odata, o++, '\\');
12701n/a PyUnicode_WRITE(okind, odata, o++, 't');
12702n/a }
12703n/a else if (ch == '\n') {
12704n/a PyUnicode_WRITE(okind, odata, o++, '\\');
12705n/a PyUnicode_WRITE(okind, odata, o++, 'n');
12706n/a }
12707n/a else if (ch == '\r') {
12708n/a PyUnicode_WRITE(okind, odata, o++, '\\');
12709n/a PyUnicode_WRITE(okind, odata, o++, 'r');
12710n/a }
12711n/a
12712n/a /* Map non-printable US ASCII to '\xhh' */
12713n/a else if (ch < ' ' || ch == 0x7F) {
12714n/a PyUnicode_WRITE(okind, odata, o++, '\\');
12715n/a PyUnicode_WRITE(okind, odata, o++, 'x');
12716n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12717n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12718n/a }
12719n/a
12720n/a /* Copy ASCII characters as-is */
12721n/a else if (ch < 0x7F) {
12722n/a PyUnicode_WRITE(okind, odata, o++, ch);
12723n/a }
12724n/a
12725n/a /* Non-ASCII characters */
12726n/a else {
12727n/a /* Map Unicode whitespace and control characters
12728n/a (categories Z* and C* except ASCII space)
12729n/a */
12730n/a if (!Py_UNICODE_ISPRINTABLE(ch)) {
12731n/a PyUnicode_WRITE(okind, odata, o++, '\\');
12732n/a /* Map 8-bit characters to '\xhh' */
12733n/a if (ch <= 0xff) {
12734n/a PyUnicode_WRITE(okind, odata, o++, 'x');
12735n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12736n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12737n/a }
12738n/a /* Map 16-bit characters to '\uxxxx' */
12739n/a else if (ch <= 0xffff) {
12740n/a PyUnicode_WRITE(okind, odata, o++, 'u');
12741n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12742n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12743n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12744n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12745n/a }
12746n/a /* Map 21-bit characters to '\U00xxxxxx' */
12747n/a else {
12748n/a PyUnicode_WRITE(okind, odata, o++, 'U');
12749n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12750n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12751n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12752n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12753n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12754n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12755n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12756n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12757n/a }
12758n/a }
12759n/a /* Copy characters as-is */
12760n/a else {
12761n/a PyUnicode_WRITE(okind, odata, o++, ch);
12762n/a }
12763n/a }
12764n/a }
12765n/a }
12766n/a /* Closing quote already added at the beginning */
12767n/a assert(_PyUnicode_CheckConsistency(repr, 1));
12768n/a return repr;
12769n/a}
12770n/a
/* Docstring text for the str.rfind() method (see unicode_rfind()). */
12771n/aPyDoc_STRVAR(rfind__doc__,
12772n/a "S.rfind(sub[, start[, end]]) -> int\n\
12773n/a\n\
12774n/aReturn the highest index in S where substring sub is found,\n\
12775n/asuch that sub is contained within S[start:end]. Optional\n\
12776n/aarguments start and end are interpreted as in slice notation.\n\
12777n/a\n\
12778n/aReturn -1 on failure.");
12779n/a
12780n/astatic PyObject *
12781n/aunicode_rfind(PyObject *self, PyObject *args)
12782n/a{
12783n/a /* initialize variables to prevent gcc warning */
12784n/a PyObject *substring = NULL;
12785n/a Py_ssize_t start = 0;
12786n/a Py_ssize_t end = 0;
12787n/a Py_ssize_t result;
12788n/a
12789n/a if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
12790n/a return NULL;
12791n/a
12792n/a if (PyUnicode_READY(self) == -1)
12793n/a return NULL;
12794n/a
12795n/a result = any_find_slice(self, substring, start, end, -1);
12796n/a
12797n/a if (result == -2)
12798n/a return NULL;
12799n/a
12800n/a return PyLong_FromSsize_t(result);
12801n/a}
12802n/a
/* Docstring text for the str.rindex() method (see unicode_rindex()). */
12803n/aPyDoc_STRVAR(rindex__doc__,
12804n/a "S.rindex(sub[, start[, end]]) -> int\n\
12805n/a\n\
12806n/aLike S.rfind() but raise ValueError when the substring is not found.");
12807n/a
12808n/astatic PyObject *
12809n/aunicode_rindex(PyObject *self, PyObject *args)
12810n/a{
12811n/a /* initialize variables to prevent gcc warning */
12812n/a PyObject *substring = NULL;
12813n/a Py_ssize_t start = 0;
12814n/a Py_ssize_t end = 0;
12815n/a Py_ssize_t result;
12816n/a
12817n/a if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
12818n/a return NULL;
12819n/a
12820n/a if (PyUnicode_READY(self) == -1)
12821n/a return NULL;
12822n/a
12823n/a result = any_find_slice(self, substring, start, end, -1);
12824n/a
12825n/a if (result == -2)
12826n/a return NULL;
12827n/a
12828n/a if (result < 0) {
12829n/a PyErr_SetString(PyExc_ValueError, "substring not found");
12830n/a return NULL;
12831n/a }
12832n/a
12833n/a return PyLong_FromSsize_t(result);
12834n/a}
12835n/a
12836n/a/*[clinic input]
12837n/astr.rjust as unicode_rjust
12838n/a
12839n/a width: Py_ssize_t
12840n/a fillchar: Py_UCS4 = ' '
12841n/a /
12842n/a
12843n/aReturn a right-justified string of length width.
12844n/a
12845n/aPadding is done using the specified fill character (default is a space).
12846n/a[clinic start generated code]*/
12847n/a
12848n/astatic PyObject *
12849n/aunicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12850n/a/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12851n/a{
12852n/a if (PyUnicode_READY(self) == -1)
12853n/a return NULL;
12854n/a
12855n/a if (PyUnicode_GET_LENGTH(self) >= width)
12856n/a return unicode_result_unchanged(self);
12857n/a
12858n/a return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12859n/a}
12860n/a
12861n/aPyObject *
12862n/aPyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12863n/a{
12864n/a if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12865n/a return NULL;
12866n/a
12867n/a return split(s, sep, maxsplit);
12868n/a}
12869n/a
12870n/a/*[clinic input]
12871n/astr.split as unicode_split
12872n/a
12873n/a sep: object = None
12874n/a The delimiter according which to split the string.
12875n/a None (the default value) means split according to any whitespace,
12876n/a and discard empty strings from the result.
12877n/a maxsplit: Py_ssize_t = -1
12878n/a Maximum number of splits to do.
12879n/a -1 (the default value) means no limit.
12880n/a
12881n/aReturn a list of the words in the string, using sep as the delimiter string.
12882n/a[clinic start generated code]*/
12883n/a
12884n/astatic PyObject *
12885n/aunicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12886n/a/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
12887n/a{
12888n/a if (sep == Py_None)
12889n/a return split(self, NULL, maxsplit);
12890n/a if (PyUnicode_Check(sep))
12891n/a return split(self, sep, maxsplit);
12892n/a
12893n/a PyErr_Format(PyExc_TypeError,
12894n/a "must be str or None, not %.100s",
12895n/a Py_TYPE(sep)->tp_name);
12896n/a return NULL;
12897n/a}
12898n/a
12899n/aPyObject *
12900n/aPyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
12901n/a{
12902n/a PyObject* out;
12903n/a int kind1, kind2;
12904n/a void *buf1, *buf2;
12905n/a Py_ssize_t len1, len2;
12906n/a
12907n/a if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12908n/a return NULL;
12909n/a
12910n/a kind1 = PyUnicode_KIND(str_obj);
12911n/a kind2 = PyUnicode_KIND(sep_obj);
12912n/a len1 = PyUnicode_GET_LENGTH(str_obj);
12913n/a len2 = PyUnicode_GET_LENGTH(sep_obj);
12914n/a if (kind1 < kind2 || len1 < len2) {
12915n/a _Py_INCREF_UNICODE_EMPTY();
12916n/a if (!unicode_empty)
12917n/a out = NULL;
12918n/a else {
12919n/a out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12920n/a Py_DECREF(unicode_empty);
12921n/a }
12922n/a return out;
12923n/a }
12924n/a buf1 = PyUnicode_DATA(str_obj);
12925n/a buf2 = PyUnicode_DATA(sep_obj);
12926n/a if (kind2 != kind1) {
12927n/a buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12928n/a if (!buf2)
12929n/a return NULL;
12930n/a }
12931n/a
12932n/a switch (kind1) {
12933n/a case PyUnicode_1BYTE_KIND:
12934n/a if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12935n/a out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12936n/a else
12937n/a out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12938n/a break;
12939n/a case PyUnicode_2BYTE_KIND:
12940n/a out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12941n/a break;
12942n/a case PyUnicode_4BYTE_KIND:
12943n/a out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12944n/a break;
12945n/a default:
12946n/a assert(0);
12947n/a out = 0;
12948n/a }
12949n/a
12950n/a if (kind2 != kind1)
12951n/a PyMem_Free(buf2);
12952n/a
12953n/a return out;
12954n/a}
12955n/a
12956n/a
12957n/aPyObject *
12958n/aPyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
12959n/a{
12960n/a PyObject* out;
12961n/a int kind1, kind2;
12962n/a void *buf1, *buf2;
12963n/a Py_ssize_t len1, len2;
12964n/a
12965n/a if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12966n/a return NULL;
12967n/a
12968n/a kind1 = PyUnicode_KIND(str_obj);
12969n/a kind2 = PyUnicode_KIND(sep_obj);
12970n/a len1 = PyUnicode_GET_LENGTH(str_obj);
12971n/a len2 = PyUnicode_GET_LENGTH(sep_obj);
12972n/a if (kind1 < kind2 || len1 < len2) {
12973n/a _Py_INCREF_UNICODE_EMPTY();
12974n/a if (!unicode_empty)
12975n/a out = NULL;
12976n/a else {
12977n/a out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12978n/a Py_DECREF(unicode_empty);
12979n/a }
12980n/a return out;
12981n/a }
12982n/a buf1 = PyUnicode_DATA(str_obj);
12983n/a buf2 = PyUnicode_DATA(sep_obj);
12984n/a if (kind2 != kind1) {
12985n/a buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12986n/a if (!buf2)
12987n/a return NULL;
12988n/a }
12989n/a
12990n/a switch (kind1) {
12991n/a case PyUnicode_1BYTE_KIND:
12992n/a if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12993n/a out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12994n/a else
12995n/a out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12996n/a break;
12997n/a case PyUnicode_2BYTE_KIND:
12998n/a out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12999n/a break;
13000n/a case PyUnicode_4BYTE_KIND:
13001n/a out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13002n/a break;
13003n/a default:
13004n/a assert(0);
13005n/a out = 0;
13006n/a }
13007n/a
13008n/a if (kind2 != kind1)
13009n/a PyMem_Free(buf2);
13010n/a
13011n/a return out;
13012n/a}
13013n/a
13014n/a/*[clinic input]
13015n/astr.partition as unicode_partition
13016n/a
13017n/a sep: object
13018n/a /
13019n/a
13020n/aPartition the string into three parts using the given separator.
13021n/a
13022n/aThis will search for the separator in the string. If the separator is found,
13023n/areturns a 3-tuple containing the part before the separator, the separator
13024n/aitself, and the part after it.
13025n/a
13026n/aIf the separator is not found, returns a 3-tuple containing the original string
13027n/aand two empty strings.
13028n/a[clinic start generated code]*/
13029n/a
13030n/astatic PyObject *
13031n/aunicode_partition(PyObject *self, PyObject *sep)
13032n/a/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
13033n/a{
13034n/a return PyUnicode_Partition(self, sep);
13035n/a}
13036n/a
13037n/a/*[clinic input]
13038n/astr.rpartition as unicode_rpartition = str.partition
13039n/a
13040n/aPartition the string into three parts using the given separator.
13041n/a
13042n/aThis will search for the separator in the string, starting and the end. If
13043n/athe separator is found, returns a 3-tuple containing the part before the
13044n/aseparator, the separator itself, and the part after it.
13045n/a
13046n/aIf the separator is not found, returns a 3-tuple containing two empty strings
13047n/aand the original string.
13048n/a[clinic start generated code]*/
13049n/a
13050n/astatic PyObject *
13051n/aunicode_rpartition(PyObject *self, PyObject *sep)
13052n/a/*[clinic end generated code: output=1aa13cf1156572aa input=e77c7acb69bdfca6]*/
13053n/a{
13054n/a return PyUnicode_RPartition(self, sep);
13055n/a}
13056n/a
13057n/aPyObject *
13058n/aPyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13059n/a{
13060n/a if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13061n/a return NULL;
13062n/a
13063n/a return rsplit(s, sep, maxsplit);
13064n/a}
13065n/a
13066n/a/*[clinic input]
13067n/astr.rsplit as unicode_rsplit = str.split
13068n/a
13069n/aReturn a list of the words in the string, using sep as the delimiter string.
13070n/a
13071n/aSplits are done starting at the end of the string and working to the front.
13072n/a[clinic start generated code]*/
13073n/a
13074n/astatic PyObject *
13075n/aunicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13076n/a/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
13077n/a{
13078n/a if (sep == Py_None)
13079n/a return rsplit(self, NULL, maxsplit);
13080n/a if (PyUnicode_Check(sep))
13081n/a return rsplit(self, sep, maxsplit);
13082n/a
13083n/a PyErr_Format(PyExc_TypeError,
13084n/a "must be str or None, not %.100s",
13085n/a Py_TYPE(sep)->tp_name);
13086n/a return NULL;
13087n/a}
13088n/a
13089n/a/*[clinic input]
13090n/astr.splitlines as unicode_splitlines
13091n/a
13092n/a keepends: int(c_default="0") = False
13093n/a
13094n/aReturn a list of the lines in the string, breaking at line boundaries.
13095n/a
13096n/aLine breaks are not included in the resulting list unless keepends is given and
13097n/atrue.
13098n/a[clinic start generated code]*/
13099n/a
13100n/astatic PyObject *
13101n/aunicode_splitlines_impl(PyObject *self, int keepends)
13102n/a/*[clinic end generated code: output=f664dcdad153ec40 input=d6ff99fe43465b0f]*/
13103n/a{
13104n/a return PyUnicode_Splitlines(self, keepends);
13105n/a}
13106n/a
13107n/astatic
13108n/aPyObject *unicode_str(PyObject *self)
13109n/a{
13110n/a return unicode_result_unchanged(self);
13111n/a}
13112n/a
13113n/a/*[clinic input]
13114n/astr.swapcase as unicode_swapcase
13115n/a
13116n/aConvert uppercase characters to lowercase and lowercase characters to uppercase.
13117n/a[clinic start generated code]*/
13118n/a
13119n/astatic PyObject *
13120n/aunicode_swapcase_impl(PyObject *self)
13121n/a/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13122n/a{
13123n/a if (PyUnicode_READY(self) == -1)
13124n/a return NULL;
13125n/a return case_operation(self, do_swapcase);
13126n/a}
13127n/a
13128n/a/*[clinic input]
13129n/a
13130n/a@staticmethod
13131n/astr.maketrans as unicode_maketrans
13132n/a
13133n/a x: object
13134n/a
13135n/a y: unicode=NULL
13136n/a
13137n/a z: unicode=NULL
13138n/a
13139n/a /
13140n/a
13141n/aReturn a translation table usable for str.translate().
13142n/a
13143n/aIf there is only one argument, it must be a dictionary mapping Unicode
13144n/aordinals (integers) or characters to Unicode ordinals, strings or None.
13145n/aCharacter keys will be then converted to ordinals.
13146n/aIf there are two arguments, they must be strings of equal length, and
13147n/ain the resulting dictionary, each character in x will be mapped to the
13148n/acharacter at the same position in y. If there is a third argument, it
13149n/amust be a string, whose characters will be mapped to None in the result.
13150n/a[clinic start generated code]*/
13151n/a
13152n/astatic PyObject *
13153n/aunicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13154n/a/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13155n/a{
13156n/a PyObject *new = NULL, *key, *value;
13157n/a Py_ssize_t i = 0;
13158n/a int res;
13159n/a
13160n/a new = PyDict_New();
13161n/a if (!new)
13162n/a return NULL;
13163n/a if (y != NULL) {
13164n/a int x_kind, y_kind, z_kind;
13165n/a void *x_data, *y_data, *z_data;
13166n/a
13167n/a /* x must be a string too, of equal length */
13168n/a if (!PyUnicode_Check(x)) {
13169n/a PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13170n/a "be a string if there is a second argument");
13171n/a goto err;
13172n/a }
13173n/a if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13174n/a PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13175n/a "arguments must have equal length");
13176n/a goto err;
13177n/a }
13178n/a /* create entries for translating chars in x to those in y */
13179n/a x_kind = PyUnicode_KIND(x);
13180n/a y_kind = PyUnicode_KIND(y);
13181n/a x_data = PyUnicode_DATA(x);
13182n/a y_data = PyUnicode_DATA(y);
13183n/a for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13184n/a key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13185n/a if (!key)
13186n/a goto err;
13187n/a value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13188n/a if (!value) {
13189n/a Py_DECREF(key);
13190n/a goto err;
13191n/a }
13192n/a res = PyDict_SetItem(new, key, value);
13193n/a Py_DECREF(key);
13194n/a Py_DECREF(value);
13195n/a if (res < 0)
13196n/a goto err;
13197n/a }
13198n/a /* create entries for deleting chars in z */
13199n/a if (z != NULL) {
13200n/a z_kind = PyUnicode_KIND(z);
13201n/a z_data = PyUnicode_DATA(z);
13202n/a for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13203n/a key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13204n/a if (!key)
13205n/a goto err;
13206n/a res = PyDict_SetItem(new, key, Py_None);
13207n/a Py_DECREF(key);
13208n/a if (res < 0)
13209n/a goto err;
13210n/a }
13211n/a }
13212n/a } else {
13213n/a int kind;
13214n/a void *data;
13215n/a
13216n/a /* x must be a dict */
13217n/a if (!PyDict_CheckExact(x)) {
13218n/a PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13219n/a "to maketrans it must be a dict");
13220n/a goto err;
13221n/a }
13222n/a /* copy entries into the new dict, converting string keys to int keys */
13223n/a while (PyDict_Next(x, &i, &key, &value)) {
13224n/a if (PyUnicode_Check(key)) {
13225n/a /* convert string keys to integer keys */
13226n/a PyObject *newkey;
13227n/a if (PyUnicode_GET_LENGTH(key) != 1) {
13228n/a PyErr_SetString(PyExc_ValueError, "string keys in translate "
13229n/a "table must be of length 1");
13230n/a goto err;
13231n/a }
13232n/a kind = PyUnicode_KIND(key);
13233n/a data = PyUnicode_DATA(key);
13234n/a newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13235n/a if (!newkey)
13236n/a goto err;
13237n/a res = PyDict_SetItem(new, newkey, value);
13238n/a Py_DECREF(newkey);
13239n/a if (res < 0)
13240n/a goto err;
13241n/a } else if (PyLong_Check(key)) {
13242n/a /* just keep integer keys */
13243n/a if (PyDict_SetItem(new, key, value) < 0)
13244n/a goto err;
13245n/a } else {
13246n/a PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13247n/a "be strings or integers");
13248n/a goto err;
13249n/a }
13250n/a }
13251n/a }
13252n/a return new;
13253n/a err:
13254n/a Py_DECREF(new);
13255n/a return NULL;
13256n/a}
13257n/a
13258n/a/*[clinic input]
13259n/astr.translate as unicode_translate
13260n/a
13261n/a table: object
13262n/a Translation table, which must be a mapping of Unicode ordinals to
13263n/a Unicode ordinals, strings, or None.
13264n/a /
13265n/a
13266n/aReplace each character in the string using the given translation table.
13267n/a
13268n/aThe table must implement lookup/indexing via __getitem__, for instance a
13269n/adictionary or list. If this operation raises LookupError, the character is
13270n/aleft untouched. Characters mapped to None are deleted.
13271n/a[clinic start generated code]*/
13272n/a
13273n/astatic PyObject *
13274n/aunicode_translate(PyObject *self, PyObject *table)
13275n/a/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13276n/a{
13277n/a return _PyUnicode_TranslateCharmap(self, table, "ignore");
13278n/a}
13279n/a
13280n/a/*[clinic input]
13281n/astr.upper as unicode_upper
13282n/a
13283n/aReturn a copy of the string converted to uppercase.
13284n/a[clinic start generated code]*/
13285n/a
13286n/astatic PyObject *
13287n/aunicode_upper_impl(PyObject *self)
13288n/a/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13289n/a{
13290n/a if (PyUnicode_READY(self) == -1)
13291n/a return NULL;
13292n/a if (PyUnicode_IS_ASCII(self))
13293n/a return ascii_upper_or_lower(self, 0);
13294n/a return case_operation(self, do_upper);
13295n/a}
13296n/a
13297n/a/*[clinic input]
13298n/astr.zfill as unicode_zfill
13299n/a
13300n/a width: Py_ssize_t
13301n/a /
13302n/a
13303n/aPad a numeric string with zeros on the left, to fill a field of the given width.
13304n/a
13305n/aThe string is never truncated.
13306n/a[clinic start generated code]*/
13307n/a
13308n/astatic PyObject *
13309n/aunicode_zfill_impl(PyObject *self, Py_ssize_t width)
13310n/a/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13311n/a{
13312n/a Py_ssize_t fill;
13313n/a PyObject *u;
13314n/a int kind;
13315n/a void *data;
13316n/a Py_UCS4 chr;
13317n/a
13318n/a if (PyUnicode_READY(self) == -1)
13319n/a return NULL;
13320n/a
13321n/a if (PyUnicode_GET_LENGTH(self) >= width)
13322n/a return unicode_result_unchanged(self);
13323n/a
13324n/a fill = width - PyUnicode_GET_LENGTH(self);
13325n/a
13326n/a u = pad(self, fill, 0, '0');
13327n/a
13328n/a if (u == NULL)
13329n/a return NULL;
13330n/a
13331n/a kind = PyUnicode_KIND(u);
13332n/a data = PyUnicode_DATA(u);
13333n/a chr = PyUnicode_READ(kind, data, fill);
13334n/a
13335n/a if (chr == '+' || chr == '-') {
13336n/a /* move sign to beginning of string */
13337n/a PyUnicode_WRITE(kind, data, 0, chr);
13338n/a PyUnicode_WRITE(kind, data, fill, '0');
13339n/a }
13340n/a
13341n/a assert(_PyUnicode_CheckConsistency(u, 1));
13342n/a return u;
13343n/a}
13344n/a
#if 0
/* Compiled out by the surrounding #if 0.  Would forward self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and return its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13352n/a
/* Docstring text for the str.startswith() method (see unicode_startswith()). */
13353n/aPyDoc_STRVAR(startswith__doc__,
13354n/a "S.startswith(prefix[, start[, end]]) -> bool\n\
13355n/a\n\
13356n/aReturn True if S starts with the specified prefix, False otherwise.\n\
13357n/aWith optional start, test S beginning at that position.\n\
13358n/aWith optional end, stop comparing S at that position.\n\
13359n/aprefix can also be a tuple of strings to try.");
13360n/a
13361n/astatic PyObject *
13362n/aunicode_startswith(PyObject *self,
13363n/a PyObject *args)
13364n/a{
13365n/a PyObject *subobj;
13366n/a PyObject *substring;
13367n/a Py_ssize_t start = 0;
13368n/a Py_ssize_t end = PY_SSIZE_T_MAX;
13369n/a int result;
13370n/a
13371n/a if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13372n/a return NULL;
13373n/a if (PyTuple_Check(subobj)) {
13374n/a Py_ssize_t i;
13375n/a for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13376n/a substring = PyTuple_GET_ITEM(subobj, i);
13377n/a if (!PyUnicode_Check(substring)) {
13378n/a PyErr_Format(PyExc_TypeError,
13379n/a "tuple for startswith must only contain str, "
13380n/a "not %.100s",
13381n/a Py_TYPE(substring)->tp_name);
13382n/a return NULL;
13383n/a }
13384n/a result = tailmatch(self, substring, start, end, -1);
13385n/a if (result == -1)
13386n/a return NULL;
13387n/a if (result) {
13388n/a Py_RETURN_TRUE;
13389n/a }
13390n/a }
13391n/a /* nothing matched */
13392n/a Py_RETURN_FALSE;
13393n/a }
13394n/a if (!PyUnicode_Check(subobj)) {
13395n/a PyErr_Format(PyExc_TypeError,
13396n/a "startswith first arg must be str or "
13397n/a "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13398n/a return NULL;
13399n/a }
13400n/a result = tailmatch(self, subobj, start, end, -1);
13401n/a if (result == -1)
13402n/a return NULL;
13403n/a return PyBool_FromLong(result);
13404n/a}
13405n/a
13406n/a
/* Docstring text for the str.endswith() method (see unicode_endswith()). */
13407n/aPyDoc_STRVAR(endswith__doc__,
13408n/a "S.endswith(suffix[, start[, end]]) -> bool\n\
13409n/a\n\
13410n/aReturn True if S ends with the specified suffix, False otherwise.\n\
13411n/aWith optional start, test S beginning at that position.\n\
13412n/aWith optional end, stop comparing S at that position.\n\
13413n/asuffix can also be a tuple of strings to try.");
13414n/a
13415n/astatic PyObject *
13416n/aunicode_endswith(PyObject *self,
13417n/a PyObject *args)
13418n/a{
13419n/a PyObject *subobj;
13420n/a PyObject *substring;
13421n/a Py_ssize_t start = 0;
13422n/a Py_ssize_t end = PY_SSIZE_T_MAX;
13423n/a int result;
13424n/a
13425n/a if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13426n/a return NULL;
13427n/a if (PyTuple_Check(subobj)) {
13428n/a Py_ssize_t i;
13429n/a for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13430n/a substring = PyTuple_GET_ITEM(subobj, i);
13431n/a if (!PyUnicode_Check(substring)) {
13432n/a PyErr_Format(PyExc_TypeError,
13433n/a "tuple for endswith must only contain str, "
13434n/a "not %.100s",
13435n/a Py_TYPE(substring)->tp_name);
13436n/a return NULL;
13437n/a }
13438n/a result = tailmatch(self, substring, start, end, +1);
13439n/a if (result == -1)
13440n/a return NULL;
13441n/a if (result) {
13442n/a Py_RETURN_TRUE;
13443n/a }
13444n/a }
13445n/a Py_RETURN_FALSE;
13446n/a }
13447n/a if (!PyUnicode_Check(subobj)) {
13448n/a PyErr_Format(PyExc_TypeError,
13449n/a "endswith first arg must be str or "
13450n/a "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13451n/a return NULL;
13452n/a }
13453n/a result = tailmatch(self, subobj, start, end, +1);
13454n/a if (result == -1)
13455n/a return NULL;
13456n/a return PyBool_FromLong(result);
13457n/a}
13458n/a
13459n/astatic inline void
13460n/a_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13461n/a{
13462n/a writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13463n/a writer->data = PyUnicode_DATA(writer->buffer);
13464n/a
13465n/a if (!writer->readonly) {
13466n/a writer->kind = PyUnicode_KIND(writer->buffer);
13467n/a writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13468n/a }
13469n/a else {
13470n/a /* use a value smaller than PyUnicode_1BYTE_KIND() so
13471n/a _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13472n/a writer->kind = PyUnicode_WCHAR_KIND;
13473n/a assert(writer->kind <= PyUnicode_1BYTE_KIND);
13474n/a
13475n/a /* Copy-on-write mode: set buffer size to 0 so
13476n/a * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13477n/a * next write. */
13478n/a writer->size = 0;
13479n/a }
13480n/a}
13481n/a
13482n/avoid
13483n/a_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13484n/a{
13485n/a memset(writer, 0, sizeof(*writer));
13486n/a
13487n/a /* ASCII is the bare minimum */
13488n/a writer->min_char = 127;
13489n/a
13490n/a /* use a value smaller than PyUnicode_1BYTE_KIND() so
13491n/a _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13492n/a writer->kind = PyUnicode_WCHAR_KIND;
13493n/a assert(writer->kind <= PyUnicode_1BYTE_KIND);
13494n/a}
13495n/a
/* Make sure the writer's buffer can take `length` more characters at
   writer->pos and can store characters up to `maxchar`.

   Three cases are handled: no buffer yet (allocate one), buffer too short
   (allocate a wider copy or resize in place), and buffer long enough but
   too narrow for `maxchar` (allocate a wider copy of the same size).
   The writer's cached fields are refreshed before returning.

   Returns 0 on success.  Returns -1 when writer->pos + length would
   exceed PY_SSIZE_T_MAX (after PyErr_NoMemory()) or when PyUnicode_New()
   or resize_compact() returns NULL. */
13496n/aint
13497n/a_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13498n/a Py_ssize_t length, Py_UCS4 maxchar)
13499n/a{
13500n/a Py_ssize_t newlen;
13501n/a PyObject *newbuffer;
13502n/a
13503n/a assert(maxchar <= MAX_UNICODE);
13504n/a
13505n/a /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13506n/a assert((maxchar > writer->maxchar && length >= 0)
13507n/a || length > 0);
13508n/a
/* Reject a total length that does not fit in Py_ssize_t. */
13509n/a if (length > PY_SSIZE_T_MAX - writer->pos) {
13510n/a PyErr_NoMemory();
13511n/a return -1;
13512n/a }
13513n/a newlen = writer->pos + length;
13514n/a
/* Never allocate narrower than the writer's configured minimum. */
13515n/a maxchar = Py_MAX(maxchar, writer->min_char);
13516n/a
/* Case 1: no buffer yet, allocate a fresh one. */
13517n/a if (writer->buffer == NULL) {
13518n/a assert(!writer->readonly);
13519n/a if (writer->overallocate
13520n/a && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13521n/a /* overallocate to limit the number of realloc() */
13522n/a newlen += newlen / OVERALLOCATE_FACTOR;
13523n/a }
13524n/a if (newlen < writer->min_length)
13525n/a newlen = writer->min_length;
13526n/a
13527n/a writer->buffer = PyUnicode_New(newlen, maxchar);
13528n/a if (writer->buffer == NULL)
13529n/a return -1;
13530n/a }
/* Case 2: the existing buffer is too short. */
13531n/a else if (newlen > writer->size) {
13532n/a if (writer->overallocate
13533n/a && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13534n/a /* overallocate to limit the number of realloc() */
13535n/a newlen += newlen / OVERALLOCATE_FACTOR;
13536n/a }
13537n/a if (newlen < writer->min_length)
13538n/a newlen = writer->min_length;
13539n/a
13540n/a if (maxchar > writer->maxchar || writer->readonly) {
13541n/a /* resize + widen */
13542n/a maxchar = Py_MAX(maxchar, writer->maxchar);
13543n/a newbuffer = PyUnicode_New(newlen, maxchar);
13544n/a if (newbuffer == NULL)
13545n/a return -1;
/* Carry over the writer->pos characters written so far, then drop the
   writer's reference to the old buffer. */
13546n/a _PyUnicode_FastCopyCharacters(newbuffer, 0,
13547n/a writer->buffer, 0, writer->pos);
13548n/a Py_DECREF(writer->buffer);
13549n/a writer->readonly = 0;
13550n/a }
13551n/a else {
/* Same width and writable: resize the existing object. */
13552n/a newbuffer = resize_compact(writer->buffer, newlen);
13553n/a if (newbuffer == NULL)
13554n/a return -1;
13555n/a }
13556n/a writer->buffer = newbuffer;
13557n/a }
/* Case 3: long enough, but too narrow for maxchar. */
13558n/a else if (maxchar > writer->maxchar) {
13559n/a assert(!writer->readonly);
13560n/a newbuffer = PyUnicode_New(writer->size, maxchar);
13561n/a if (newbuffer == NULL)
13562n/a return -1;
13563n/a _PyUnicode_FastCopyCharacters(newbuffer, 0,
13564n/a writer->buffer, 0, writer->pos);
13565n/a Py_SETREF(writer->buffer, newbuffer);
13566n/a }
/* Re-read maxchar, data, kind and size from the (possibly new) buffer. */
13567n/a _PyUnicodeWriter_Update(writer);
13568n/a return 0;
13569n/a
/* NOTE(review): OVERALLOCATE_FACTOR is defined outside this function (not
   visible here); presumably this is its last user -- confirm. */
13570n/a#undef OVERALLOCATE_FACTOR
13571n/a}
13572n/a
13573n/aint
13574n/a_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13575n/a enum PyUnicode_Kind kind)
13576n/a{
13577n/a Py_UCS4 maxchar;
13578n/a
13579n/a /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13580n/a assert(writer->kind < kind);
13581n/a
13582n/a switch (kind)
13583n/a {
13584n/a case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13585n/a case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13586n/a case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13587n/a default:
13588n/a assert(0 && "invalid kind");
13589n/a return -1;
13590n/a }
13591n/a
13592n/a return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13593n/a}
13594n/a
13595n/astatic inline int
13596n/a_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13597n/a{
13598n/a assert(ch <= MAX_UNICODE);
13599n/a if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13600n/a return -1;
13601n/a PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13602n/a writer->pos++;
13603n/a return 0;
13604n/a}
13605n/a
13606n/aint
13607n/a_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13608n/a{
13609n/a return _PyUnicodeWriter_WriteCharInline(writer, ch);
13610n/a}
13611n/a
13612n/aint
13613n/a_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13614n/a{
13615n/a Py_UCS4 maxchar;
13616n/a Py_ssize_t len;
13617n/a
13618n/a if (PyUnicode_READY(str) == -1)
13619n/a return -1;
13620n/a len = PyUnicode_GET_LENGTH(str);
13621n/a if (len == 0)
13622n/a return 0;
13623n/a maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13624n/a if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13625n/a if (writer->buffer == NULL && !writer->overallocate) {
13626n/a assert(_PyUnicode_CheckConsistency(str, 1));
13627n/a writer->readonly = 1;
13628n/a Py_INCREF(str);
13629n/a writer->buffer = str;
13630n/a _PyUnicodeWriter_Update(writer);
13631n/a writer->pos += len;
13632n/a return 0;
13633n/a }
13634n/a if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13635n/a return -1;
13636n/a }
13637n/a _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13638n/a str, 0, len);
13639n/a writer->pos += len;
13640n/a return 0;
13641n/a}
13642n/a
13643n/aint
13644n/a_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13645n/a Py_ssize_t start, Py_ssize_t end)
13646n/a{
13647n/a Py_UCS4 maxchar;
13648n/a Py_ssize_t len;
13649n/a
13650n/a if (PyUnicode_READY(str) == -1)
13651n/a return -1;
13652n/a
13653n/a assert(0 <= start);
13654n/a assert(end <= PyUnicode_GET_LENGTH(str));
13655n/a assert(start <= end);
13656n/a
13657n/a if (end == 0)
13658n/a return 0;
13659n/a
13660n/a if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13661n/a return _PyUnicodeWriter_WriteStr(writer, str);
13662n/a
13663n/a if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13664n/a maxchar = _PyUnicode_FindMaxChar(str, start, end);
13665n/a else
13666n/a maxchar = writer->maxchar;
13667n/a len = end - start;
13668n/a
13669n/a if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13670n/a return -1;
13671n/a
13672n/a _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13673n/a str, start, len);
13674n/a writer->pos += len;
13675n/a return 0;
13676n/a}
13677n/a
13678n/aint
13679n/a_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13680n/a const char *ascii, Py_ssize_t len)
13681n/a{
13682n/a if (len == -1)
13683n/a len = strlen(ascii);
13684n/a
13685n/a assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13686n/a
13687n/a if (writer->buffer == NULL && !writer->overallocate) {
13688n/a PyObject *str;
13689n/a
13690n/a str = _PyUnicode_FromASCII(ascii, len);
13691n/a if (str == NULL)
13692n/a return -1;
13693n/a
13694n/a writer->readonly = 1;
13695n/a writer->buffer = str;
13696n/a _PyUnicodeWriter_Update(writer);
13697n/a writer->pos += len;
13698n/a return 0;
13699n/a }
13700n/a
13701n/a if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13702n/a return -1;
13703n/a
13704n/a switch (writer->kind)
13705n/a {
13706n/a case PyUnicode_1BYTE_KIND:
13707n/a {
13708n/a const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13709n/a Py_UCS1 *data = writer->data;
13710n/a
13711n/a memcpy(data + writer->pos, str, len);
13712n/a break;
13713n/a }
13714n/a case PyUnicode_2BYTE_KIND:
13715n/a {
13716n/a _PyUnicode_CONVERT_BYTES(
13717n/a Py_UCS1, Py_UCS2,
13718n/a ascii, ascii + len,
13719n/a (Py_UCS2 *)writer->data + writer->pos);
13720n/a break;
13721n/a }
13722n/a case PyUnicode_4BYTE_KIND:
13723n/a {
13724n/a _PyUnicode_CONVERT_BYTES(
13725n/a Py_UCS1, Py_UCS4,
13726n/a ascii, ascii + len,
13727n/a (Py_UCS4 *)writer->data + writer->pos);
13728n/a break;
13729n/a }
13730n/a default:
13731n/a assert(0);
13732n/a }
13733n/a
13734n/a writer->pos += len;
13735n/a return 0;
13736n/a}
13737n/a
13738n/aint
13739n/a_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13740n/a const char *str, Py_ssize_t len)
13741n/a{
13742n/a Py_UCS4 maxchar;
13743n/a
13744n/a maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13745n/a if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13746n/a return -1;
13747n/a unicode_write_cstr(writer->buffer, writer->pos, str, len);
13748n/a writer->pos += len;
13749n/a return 0;
13750n/a}
13751n/a
13752n/aPyObject *
13753n/a_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13754n/a{
13755n/a PyObject *str;
13756n/a
13757n/a if (writer->pos == 0) {
13758n/a Py_CLEAR(writer->buffer);
13759n/a _Py_RETURN_UNICODE_EMPTY();
13760n/a }
13761n/a
13762n/a str = writer->buffer;
13763n/a writer->buffer = NULL;
13764n/a
13765n/a if (writer->readonly) {
13766n/a assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13767n/a return str;
13768n/a }
13769n/a
13770n/a if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13771n/a PyObject *str2;
13772n/a str2 = resize_compact(str, writer->pos);
13773n/a if (str2 == NULL) {
13774n/a Py_DECREF(str);
13775n/a return NULL;
13776n/a }
13777n/a str = str2;
13778n/a }
13779n/a
13780n/a assert(_PyUnicode_CheckConsistency(str, 1));
13781n/a return unicode_result_ready(str);
13782n/a}
13783n/a
13784n/avoid
13785n/a_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13786n/a{
13787n/a Py_CLEAR(writer->buffer);
13788n/a}
13789n/a
13790n/a#include "stringlib/unicode_format.h"
13791n/a
13792n/aPyDoc_STRVAR(format__doc__,
13793n/a "S.format(*args, **kwargs) -> str\n\
13794n/a\n\
13795n/aReturn a formatted version of S, using substitutions from args and kwargs.\n\
13796n/aThe substitutions are identified by braces ('{' and '}').");
13797n/a
13798n/aPyDoc_STRVAR(format_map__doc__,
13799n/a "S.format_map(mapping) -> str\n\
13800n/a\n\
13801n/aReturn a formatted version of S, using substitutions from mapping.\n\
13802n/aThe substitutions are identified by braces ('{' and '}').");
13803n/a
13804n/a/*[clinic input]
13805n/astr.__format__ as unicode___format__
13806n/a
13807n/a format_spec: unicode
13808n/a /
13809n/a
13810n/aReturn a formatted version of the string as described by format_spec.
13811n/a[clinic start generated code]*/
13812n/a
13813n/astatic PyObject *
13814n/aunicode___format___impl(PyObject *self, PyObject *format_spec)
13815n/a/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
13816n/a{
13817n/a _PyUnicodeWriter writer;
13818n/a int ret;
13819n/a
13820n/a if (PyUnicode_READY(self) == -1)
13821n/a return NULL;
13822n/a _PyUnicodeWriter_Init(&writer);
13823n/a ret = _PyUnicode_FormatAdvancedWriter(&writer,
13824n/a self, format_spec, 0,
13825n/a PyUnicode_GET_LENGTH(format_spec));
13826n/a if (ret == -1) {
13827n/a _PyUnicodeWriter_Dealloc(&writer);
13828n/a return NULL;
13829n/a }
13830n/a return _PyUnicodeWriter_Finish(&writer);
13831n/a}
13832n/a
13833n/a/*[clinic input]
13834n/astr.__sizeof__ as unicode_sizeof
13835n/a
13836n/aReturn the size of the string in memory, in bytes.
13837n/a[clinic start generated code]*/
13838n/a
13839n/astatic PyObject *
13840n/aunicode_sizeof_impl(PyObject *self)
13841n/a/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
13842n/a{
13843n/a Py_ssize_t size;
13844n/a
13845n/a /* If it's a compact object, account for base structure +
13846n/a character data. */
13847n/a if (PyUnicode_IS_COMPACT_ASCII(self))
13848n/a size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13849n/a else if (PyUnicode_IS_COMPACT(self))
13850n/a size = sizeof(PyCompactUnicodeObject) +
13851n/a (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
13852n/a else {
13853n/a /* If it is a two-block object, account for base object, and
13854n/a for character block if present. */
13855n/a size = sizeof(PyUnicodeObject);
13856n/a if (_PyUnicode_DATA_ANY(self))
13857n/a size += (PyUnicode_GET_LENGTH(self) + 1) *
13858n/a PyUnicode_KIND(self);
13859n/a }
13860n/a /* If the wstr pointer is present, account for it unless it is shared
13861n/a with the data pointer. Check if the data is not shared. */
13862n/a if (_PyUnicode_HAS_WSTR_MEMORY(self))
13863n/a size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13864n/a if (_PyUnicode_HAS_UTF8_MEMORY(self))
13865n/a size += PyUnicode_UTF8_LENGTH(self) + 1;
13866n/a
13867n/a return PyLong_FromSsize_t(size);
13868n/a}
13869n/a
13870n/astatic PyObject *
13871n/aunicode_getnewargs(PyObject *v)
13872n/a{
13873n/a PyObject *copy = _PyUnicode_Copy(v);
13874n/a if (!copy)
13875n/a return NULL;
13876n/a return Py_BuildValue("(N)", copy);
13877n/a}
13878n/a
13879n/astatic PyMethodDef unicode_methods[] = {
13880n/a UNICODE_ENCODE_METHODDEF
13881n/a UNICODE_REPLACE_METHODDEF
13882n/a UNICODE_SPLIT_METHODDEF
13883n/a UNICODE_RSPLIT_METHODDEF
13884n/a UNICODE_JOIN_METHODDEF
13885n/a UNICODE_CAPITALIZE_METHODDEF
13886n/a UNICODE_CASEFOLD_METHODDEF
13887n/a UNICODE_TITLE_METHODDEF
13888n/a UNICODE_CENTER_METHODDEF
13889n/a {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13890n/a UNICODE_EXPANDTABS_METHODDEF
13891n/a {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
13892n/a UNICODE_PARTITION_METHODDEF
13893n/a {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13894n/a UNICODE_LJUST_METHODDEF
13895n/a UNICODE_LOWER_METHODDEF
13896n/a UNICODE_LSTRIP_METHODDEF
13897n/a {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13898n/a {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13899n/a UNICODE_RJUST_METHODDEF
13900n/a UNICODE_RSTRIP_METHODDEF
13901n/a UNICODE_RPARTITION_METHODDEF
13902n/a UNICODE_SPLITLINES_METHODDEF
13903n/a UNICODE_STRIP_METHODDEF
13904n/a UNICODE_SWAPCASE_METHODDEF
13905n/a UNICODE_TRANSLATE_METHODDEF
13906n/a UNICODE_UPPER_METHODDEF
13907n/a {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13908n/a {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13909n/a UNICODE_ISLOWER_METHODDEF
13910n/a UNICODE_ISUPPER_METHODDEF
13911n/a UNICODE_ISTITLE_METHODDEF
13912n/a UNICODE_ISSPACE_METHODDEF
13913n/a UNICODE_ISDECIMAL_METHODDEF
13914n/a UNICODE_ISDIGIT_METHODDEF
13915n/a UNICODE_ISNUMERIC_METHODDEF
13916n/a UNICODE_ISALPHA_METHODDEF
13917n/a UNICODE_ISALNUM_METHODDEF
13918n/a UNICODE_ISIDENTIFIER_METHODDEF
13919n/a UNICODE_ISPRINTABLE_METHODDEF
13920n/a UNICODE_ZFILL_METHODDEF
13921n/a {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
13922n/a {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
13923n/a UNICODE___FORMAT___METHODDEF
13924n/a UNICODE_MAKETRANS_METHODDEF
13925n/a UNICODE_SIZEOF_METHODDEF
13926n/a#if 0
13927n/a /* These methods are just used for debugging the implementation. */
13928n/a {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
13929n/a#endif
13930n/a
13931n/a {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
13932n/a {NULL, NULL}
13933n/a};
13934n/a
13935n/astatic PyObject *
13936n/aunicode_mod(PyObject *v, PyObject *w)
13937n/a{
13938n/a if (!PyUnicode_Check(v))
13939n/a Py_RETURN_NOTIMPLEMENTED;
13940n/a return PyUnicode_Format(v, w);
13941n/a}
13942n/a
13943n/astatic PyNumberMethods unicode_as_number = {
13944n/a 0, /*nb_add*/
13945n/a 0, /*nb_subtract*/
13946n/a 0, /*nb_multiply*/
13947n/a unicode_mod, /*nb_remainder*/
13948n/a};
13949n/a
13950n/astatic PySequenceMethods unicode_as_sequence = {
13951n/a (lenfunc) unicode_length, /* sq_length */
13952n/a PyUnicode_Concat, /* sq_concat */
13953n/a (ssizeargfunc) unicode_repeat, /* sq_repeat */
13954n/a (ssizeargfunc) unicode_getitem, /* sq_item */
13955n/a 0, /* sq_slice */
13956n/a 0, /* sq_ass_item */
13957n/a 0, /* sq_ass_slice */
13958n/a PyUnicode_Contains, /* sq_contains */
13959n/a};
13960n/a
13961n/astatic PyObject*
13962n/aunicode_subscript(PyObject* self, PyObject* item)
13963n/a{
13964n/a if (PyUnicode_READY(self) == -1)
13965n/a return NULL;
13966n/a
13967n/a if (PyIndex_Check(item)) {
13968n/a Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13969n/a if (i == -1 && PyErr_Occurred())
13970n/a return NULL;
13971n/a if (i < 0)
13972n/a i += PyUnicode_GET_LENGTH(self);
13973n/a return unicode_getitem(self, i);
13974n/a } else if (PySlice_Check(item)) {
13975n/a Py_ssize_t start, stop, step, slicelength, cur, i;
13976n/a PyObject *result;
13977n/a void *src_data, *dest_data;
13978n/a int src_kind, dest_kind;
13979n/a Py_UCS4 ch, max_char, kind_limit;
13980n/a
13981n/a if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
13982n/a &start, &stop, &step, &slicelength) < 0) {
13983n/a return NULL;
13984n/a }
13985n/a
13986n/a if (slicelength <= 0) {
13987n/a _Py_RETURN_UNICODE_EMPTY();
13988n/a } else if (start == 0 && step == 1 &&
13989n/a slicelength == PyUnicode_GET_LENGTH(self)) {
13990n/a return unicode_result_unchanged(self);
13991n/a } else if (step == 1) {
13992n/a return PyUnicode_Substring(self,
13993n/a start, start + slicelength);
13994n/a }
13995n/a /* General case */
13996n/a src_kind = PyUnicode_KIND(self);
13997n/a src_data = PyUnicode_DATA(self);
13998n/a if (!PyUnicode_IS_ASCII(self)) {
13999n/a kind_limit = kind_maxchar_limit(src_kind);
14000n/a max_char = 0;
14001n/a for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14002n/a ch = PyUnicode_READ(src_kind, src_data, cur);
14003n/a if (ch > max_char) {
14004n/a max_char = ch;
14005n/a if (max_char >= kind_limit)
14006n/a break;
14007n/a }
14008n/a }
14009n/a }
14010n/a else
14011n/a max_char = 127;
14012n/a result = PyUnicode_New(slicelength, max_char);
14013n/a if (result == NULL)
14014n/a return NULL;
14015n/a dest_kind = PyUnicode_KIND(result);
14016n/a dest_data = PyUnicode_DATA(result);
14017n/a
14018n/a for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14019n/a Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14020n/a PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14021n/a }
14022n/a assert(_PyUnicode_CheckConsistency(result, 1));
14023n/a return result;
14024n/a } else {
14025n/a PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14026n/a return NULL;
14027n/a }
14028n/a}
14029n/a
14030n/astatic PyMappingMethods unicode_as_mapping = {
14031n/a (lenfunc)unicode_length, /* mp_length */
14032n/a (binaryfunc)unicode_subscript, /* mp_subscript */
14033n/a (objobjargproc)0, /* mp_ass_subscript */
14034n/a};
14035n/a
14036n/a
14037n/a/* Helpers for PyUnicode_Format() */
14038n/a
14039n/astruct unicode_formatter_t {
14040n/a PyObject *args;
14041n/a int args_owned;
14042n/a Py_ssize_t arglen, argidx;
14043n/a PyObject *dict;
14044n/a
14045n/a enum PyUnicode_Kind fmtkind;
14046n/a Py_ssize_t fmtcnt, fmtpos;
14047n/a void *fmtdata;
14048n/a PyObject *fmtstr;
14049n/a
14050n/a _PyUnicodeWriter writer;
14051n/a};
14052n/a
14053n/astruct unicode_format_arg_t {
14054n/a Py_UCS4 ch;
14055n/a int flags;
14056n/a Py_ssize_t width;
14057n/a int prec;
14058n/a int sign;
14059n/a};
14060n/a
14061n/astatic PyObject *
14062n/aunicode_format_getnextarg(struct unicode_formatter_t *ctx)
14063n/a{
14064n/a Py_ssize_t argidx = ctx->argidx;
14065n/a
14066n/a if (argidx < ctx->arglen) {
14067n/a ctx->argidx++;
14068n/a if (ctx->arglen < 0)
14069n/a return ctx->args;
14070n/a else
14071n/a return PyTuple_GetItem(ctx->args, argidx);
14072n/a }
14073n/a PyErr_SetString(PyExc_TypeError,
14074n/a "not enough arguments for format string");
14075n/a return NULL;
14076n/a}
14077n/a
14078n/a/* Returns a new reference to a PyUnicode object, or NULL on failure. */
14079n/a
14080n/a/* Format a float into the writer if the writer is not NULL, or into *p_output
14081n/a otherwise.
14082n/a
14083n/a Return 0 on success, raise an exception and return -1 on error. */
14084n/astatic int
14085n/aformatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14086n/a PyObject **p_output,
14087n/a _PyUnicodeWriter *writer)
14088n/a{
14089n/a char *p;
14090n/a double x;
14091n/a Py_ssize_t len;
14092n/a int prec;
14093n/a int dtoa_flags;
14094n/a
14095n/a x = PyFloat_AsDouble(v);
14096n/a if (x == -1.0 && PyErr_Occurred())
14097n/a return -1;
14098n/a
14099n/a prec = arg->prec;
14100n/a if (prec < 0)
14101n/a prec = 6;
14102n/a
14103n/a if (arg->flags & F_ALT)
14104n/a dtoa_flags = Py_DTSF_ALT;
14105n/a else
14106n/a dtoa_flags = 0;
14107n/a p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14108n/a if (p == NULL)
14109n/a return -1;
14110n/a len = strlen(p);
14111n/a if (writer) {
14112n/a if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14113n/a PyMem_Free(p);
14114n/a return -1;
14115n/a }
14116n/a }
14117n/a else
14118n/a *p_output = _PyUnicode_FromASCII(p, len);
14119n/a PyMem_Free(p);
14120n/a return 0;
14121n/a}
14122n/a
14123n/a/* formatlong() emulates the format codes d, u, o, x and X, and
14124n/a * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14125n/a * Python's regular ints.
14126n/a * Return value: a new PyUnicodeObject*, or NULL if error.
14127n/a * The output string is of the form
14128n/a * "-"? ("0x" | "0X")? digit+
14129n/a * "0x"/"0X" are present only for x and X conversions, with F_ALT
14130n/a * set in flags. The case of hex digits will be correct,
14131n/a * There will be at least prec digits, zero-filled on the left if
14132n/a * necessary to get that many.
14133n/a * val object to be converted
14134n/a * flags bitmask of format flags; only F_ALT is looked at
14135n/a * prec minimum number of digits; 0-fill on left if needed
14136n/a * type a character in [duoxX]; u acts the same as d
14137n/a *
14138n/a * CAUTION: o, x and X conversions on regular ints can never
14139n/a * produce a '-' sign, but can for Python's unbounded ints.
14140n/a */
14141n/aPyObject *
14142n/a_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
14143n/a{
14144n/a PyObject *result = NULL;
14145n/a char *buf;
14146n/a Py_ssize_t i;
14147n/a int sign; /* 1 if '-', else 0 */
14148n/a int len; /* number of characters */
14149n/a Py_ssize_t llen;
14150n/a int numdigits; /* len == numnondigits + numdigits */
14151n/a int numnondigits = 0;
14152n/a
14153n/a /* Avoid exceeding SSIZE_T_MAX */
14154n/a if (prec > INT_MAX-3) {
14155n/a PyErr_SetString(PyExc_OverflowError,
14156n/a "precision too large");
14157n/a return NULL;
14158n/a }
14159n/a
14160n/a assert(PyLong_Check(val));
14161n/a
14162n/a switch (type) {
14163n/a default:
14164n/a assert(!"'type' not in [diuoxX]");
14165n/a case 'd':
14166n/a case 'i':
14167n/a case 'u':
14168n/a /* int and int subclasses should print numerically when a numeric */
14169n/a /* format code is used (see issue18780) */
14170n/a result = PyNumber_ToBase(val, 10);
14171n/a break;
14172n/a case 'o':
14173n/a numnondigits = 2;
14174n/a result = PyNumber_ToBase(val, 8);
14175n/a break;
14176n/a case 'x':
14177n/a case 'X':
14178n/a numnondigits = 2;
14179n/a result = PyNumber_ToBase(val, 16);
14180n/a break;
14181n/a }
14182n/a if (!result)
14183n/a return NULL;
14184n/a
14185n/a assert(unicode_modifiable(result));
14186n/a assert(PyUnicode_IS_READY(result));
14187n/a assert(PyUnicode_IS_ASCII(result));
14188n/a
14189n/a /* To modify the string in-place, there can only be one reference. */
14190n/a if (Py_REFCNT(result) != 1) {
14191n/a Py_DECREF(result);
14192n/a PyErr_BadInternalCall();
14193n/a return NULL;
14194n/a }
14195n/a buf = PyUnicode_DATA(result);
14196n/a llen = PyUnicode_GET_LENGTH(result);
14197n/a if (llen > INT_MAX) {
14198n/a Py_DECREF(result);
14199n/a PyErr_SetString(PyExc_ValueError,
14200n/a "string too large in _PyUnicode_FormatLong");
14201n/a return NULL;
14202n/a }
14203n/a len = (int)llen;
14204n/a sign = buf[0] == '-';
14205n/a numnondigits += sign;
14206n/a numdigits = len - numnondigits;
14207n/a assert(numdigits > 0);
14208n/a
14209n/a /* Get rid of base marker unless F_ALT */
14210n/a if (((alt) == 0 &&
14211n/a (type == 'o' || type == 'x' || type == 'X'))) {
14212n/a assert(buf[sign] == '0');
14213n/a assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14214n/a buf[sign+1] == 'o');
14215n/a numnondigits -= 2;
14216n/a buf += 2;
14217n/a len -= 2;
14218n/a if (sign)
14219n/a buf[0] = '-';
14220n/a assert(len == numnondigits + numdigits);
14221n/a assert(numdigits > 0);
14222n/a }
14223n/a
14224n/a /* Fill with leading zeroes to meet minimum width. */
14225n/a if (prec > numdigits) {
14226n/a PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14227n/a numnondigits + prec);
14228n/a char *b1;
14229n/a if (!r1) {
14230n/a Py_DECREF(result);
14231n/a return NULL;
14232n/a }
14233n/a b1 = PyBytes_AS_STRING(r1);
14234n/a for (i = 0; i < numnondigits; ++i)
14235n/a *b1++ = *buf++;
14236n/a for (i = 0; i < prec - numdigits; i++)
14237n/a *b1++ = '0';
14238n/a for (i = 0; i < numdigits; i++)
14239n/a *b1++ = *buf++;
14240n/a *b1 = '\0';
14241n/a Py_DECREF(result);
14242n/a result = r1;
14243n/a buf = PyBytes_AS_STRING(result);
14244n/a len = numnondigits + prec;
14245n/a }
14246n/a
14247n/a /* Fix up case for hex conversions. */
14248n/a if (type == 'X') {
14249n/a /* Need to convert all lower case letters to upper case.
14250n/a and need to convert 0x to 0X (and -0x to -0X). */
14251n/a for (i = 0; i < len; i++)
14252n/a if (buf[i] >= 'a' && buf[i] <= 'x')
14253n/a buf[i] -= 'a'-'A';
14254n/a }
14255n/a if (!PyUnicode_Check(result)
14256n/a || buf != PyUnicode_DATA(result)) {
14257n/a PyObject *unicode;
14258n/a unicode = _PyUnicode_FromASCII(buf, len);
14259n/a Py_DECREF(result);
14260n/a result = unicode;
14261n/a }
14262n/a else if (len != PyUnicode_GET_LENGTH(result)) {
14263n/a if (PyUnicode_Resize(&result, len) < 0)
14264n/a Py_CLEAR(result);
14265n/a }
14266n/a return result;
14267n/a}
14268n/a
14269n/a/* Format an integer or a float as an integer.
14270n/a * Return 1 if the number has been formatted into the writer,
14271n/a * 0 if the number has been formatted into *p_output
14272n/a * -1 and raise an exception on error */
14273n/astatic int
14274n/amainformatlong(PyObject *v,
14275n/a struct unicode_format_arg_t *arg,
14276n/a PyObject **p_output,
14277n/a _PyUnicodeWriter *writer)
14278n/a{
14279n/a PyObject *iobj, *res;
14280n/a char type = (char)arg->ch;
14281n/a
14282n/a if (!PyNumber_Check(v))
14283n/a goto wrongtype;
14284n/a
14285n/a /* make sure number is a type of integer for o, x, and X */
14286n/a if (!PyLong_Check(v)) {
14287n/a if (type == 'o' || type == 'x' || type == 'X') {
14288n/a iobj = PyNumber_Index(v);
14289n/a if (iobj == NULL) {
14290n/a if (PyErr_ExceptionMatches(PyExc_TypeError))
14291n/a goto wrongtype;
14292n/a return -1;
14293n/a }
14294n/a }
14295n/a else {
14296n/a iobj = PyNumber_Long(v);
14297n/a if (iobj == NULL ) {
14298n/a if (PyErr_ExceptionMatches(PyExc_TypeError))
14299n/a goto wrongtype;
14300n/a return -1;
14301n/a }
14302n/a }
14303n/a assert(PyLong_Check(iobj));
14304n/a }
14305n/a else {
14306n/a iobj = v;
14307n/a Py_INCREF(iobj);
14308n/a }
14309n/a
14310n/a if (PyLong_CheckExact(v)
14311n/a && arg->width == -1 && arg->prec == -1
14312n/a && !(arg->flags & (F_SIGN | F_BLANK))
14313n/a && type != 'X')
14314n/a {
14315n/a /* Fast path */
14316n/a int alternate = arg->flags & F_ALT;
14317n/a int base;
14318n/a
14319n/a switch(type)
14320n/a {
14321n/a default:
14322n/a assert(0 && "'type' not in [diuoxX]");
14323n/a case 'd':
14324n/a case 'i':
14325n/a case 'u':
14326n/a base = 10;
14327n/a break;
14328n/a case 'o':
14329n/a base = 8;
14330n/a break;
14331n/a case 'x':
14332n/a case 'X':
14333n/a base = 16;
14334n/a break;
14335n/a }
14336n/a
14337n/a if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14338n/a Py_DECREF(iobj);
14339n/a return -1;
14340n/a }
14341n/a Py_DECREF(iobj);
14342n/a return 1;
14343n/a }
14344n/a
14345n/a res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14346n/a Py_DECREF(iobj);
14347n/a if (res == NULL)
14348n/a return -1;
14349n/a *p_output = res;
14350n/a return 0;
14351n/a
14352n/awrongtype:
14353n/a switch(type)
14354n/a {
14355n/a case 'o':
14356n/a case 'x':
14357n/a case 'X':
14358n/a PyErr_Format(PyExc_TypeError,
14359n/a "%%%c format: an integer is required, "
14360n/a "not %.200s",
14361n/a type, Py_TYPE(v)->tp_name);
14362n/a break;
14363n/a default:
14364n/a PyErr_Format(PyExc_TypeError,
14365n/a "%%%c format: a number is required, "
14366n/a "not %.200s",
14367n/a type, Py_TYPE(v)->tp_name);
14368n/a break;
14369n/a }
14370n/a return -1;
14371n/a}
14372n/a
14373n/astatic Py_UCS4
14374n/aformatchar(PyObject *v)
14375n/a{
14376n/a /* presume that the buffer is at least 3 characters long */
14377n/a if (PyUnicode_Check(v)) {
14378n/a if (PyUnicode_GET_LENGTH(v) == 1) {
14379n/a return PyUnicode_READ_CHAR(v, 0);
14380n/a }
14381n/a goto onError;
14382n/a }
14383n/a else {
14384n/a PyObject *iobj;
14385n/a long x;
14386n/a /* make sure number is a type of integer */
14387n/a if (!PyLong_Check(v)) {
14388n/a iobj = PyNumber_Index(v);
14389n/a if (iobj == NULL) {
14390n/a goto onError;
14391n/a }
14392n/a x = PyLong_AsLong(iobj);
14393n/a Py_DECREF(iobj);
14394n/a }
14395n/a else {
14396n/a x = PyLong_AsLong(v);
14397n/a }
14398n/a if (x == -1 && PyErr_Occurred())
14399n/a goto onError;
14400n/a
14401n/a if (x < 0 || x > MAX_UNICODE) {
14402n/a PyErr_SetString(PyExc_OverflowError,
14403n/a "%c arg not in range(0x110000)");
14404n/a return (Py_UCS4) -1;
14405n/a }
14406n/a
14407n/a return (Py_UCS4) x;
14408n/a }
14409n/a
14410n/a onError:
14411n/a PyErr_SetString(PyExc_TypeError,
14412n/a "%c requires int or char");
14413n/a return (Py_UCS4) -1;
14414n/a}
14415n/a
14416n/a/* Parse options of an argument: flags, width, precision.
14417n/a Handle also "%(name)" syntax.
14418n/a
14419n/a Return 0 if the argument has been formatted into arg->str.
14420n/a Return 1 if the argument has been written into ctx->writer,
14421n/a Raise an exception and return -1 on error. */
14422n/astatic int
14423n/aunicode_format_arg_parse(struct unicode_formatter_t *ctx,
14424n/a struct unicode_format_arg_t *arg)
14425n/a{
14426n/a#define FORMAT_READ(ctx) \
14427n/a PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14428n/a
14429n/a PyObject *v;
14430n/a
14431n/a if (arg->ch == '(') {
14432n/a /* Get argument value from a dictionary. Example: "%(name)s". */
14433n/a Py_ssize_t keystart;
14434n/a Py_ssize_t keylen;
14435n/a PyObject *key;
14436n/a int pcount = 1;
14437n/a
14438n/a if (ctx->dict == NULL) {
14439n/a PyErr_SetString(PyExc_TypeError,
14440n/a "format requires a mapping");
14441n/a return -1;
14442n/a }
14443n/a ++ctx->fmtpos;
14444n/a --ctx->fmtcnt;
14445n/a keystart = ctx->fmtpos;
14446n/a /* Skip over balanced parentheses */
14447n/a while (pcount > 0 && --ctx->fmtcnt >= 0) {
14448n/a arg->ch = FORMAT_READ(ctx);
14449n/a if (arg->ch == ')')
14450n/a --pcount;
14451n/a else if (arg->ch == '(')
14452n/a ++pcount;
14453n/a ctx->fmtpos++;
14454n/a }
14455n/a keylen = ctx->fmtpos - keystart - 1;
14456n/a if (ctx->fmtcnt < 0 || pcount > 0) {
14457n/a PyErr_SetString(PyExc_ValueError,
14458n/a "incomplete format key");
14459n/a return -1;
14460n/a }
14461n/a key = PyUnicode_Substring(ctx->fmtstr,
14462n/a keystart, keystart + keylen);
14463n/a if (key == NULL)
14464n/a return -1;
14465n/a if (ctx->args_owned) {
14466n/a ctx->args_owned = 0;
14467n/a Py_DECREF(ctx->args);
14468n/a }
14469n/a ctx->args = PyObject_GetItem(ctx->dict, key);
14470n/a Py_DECREF(key);
14471n/a if (ctx->args == NULL)
14472n/a return -1;
14473n/a ctx->args_owned = 1;
14474n/a ctx->arglen = -1;
14475n/a ctx->argidx = -2;
14476n/a }
14477n/a
14478n/a /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14479n/a while (--ctx->fmtcnt >= 0) {
14480n/a arg->ch = FORMAT_READ(ctx);
14481n/a ctx->fmtpos++;
14482n/a switch (arg->ch) {
14483n/a case '-': arg->flags |= F_LJUST; continue;
14484n/a case '+': arg->flags |= F_SIGN; continue;
14485n/a case ' ': arg->flags |= F_BLANK; continue;
14486n/a case '#': arg->flags |= F_ALT; continue;
14487n/a case '0': arg->flags |= F_ZERO; continue;
14488n/a }
14489n/a break;
14490n/a }
14491n/a
14492n/a /* Parse width. Example: "%10s" => width=10 */
14493n/a if (arg->ch == '*') {
14494n/a v = unicode_format_getnextarg(ctx);
14495n/a if (v == NULL)
14496n/a return -1;
14497n/a if (!PyLong_Check(v)) {
14498n/a PyErr_SetString(PyExc_TypeError,
14499n/a "* wants int");
14500n/a return -1;
14501n/a }
14502n/a arg->width = PyLong_AsSsize_t(v);
14503n/a if (arg->width == -1 && PyErr_Occurred())
14504n/a return -1;
14505n/a if (arg->width < 0) {
14506n/a arg->flags |= F_LJUST;
14507n/a arg->width = -arg->width;
14508n/a }
14509n/a if (--ctx->fmtcnt >= 0) {
14510n/a arg->ch = FORMAT_READ(ctx);
14511n/a ctx->fmtpos++;
14512n/a }
14513n/a }
14514n/a else if (arg->ch >= '0' && arg->ch <= '9') {
14515n/a arg->width = arg->ch - '0';
14516n/a while (--ctx->fmtcnt >= 0) {
14517n/a arg->ch = FORMAT_READ(ctx);
14518n/a ctx->fmtpos++;
14519n/a if (arg->ch < '0' || arg->ch > '9')
14520n/a break;
14521n/a /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14522n/a mixing signed and unsigned comparison. Since arg->ch is between
14523n/a '0' and '9', casting to int is safe. */
14524n/a if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14525n/a PyErr_SetString(PyExc_ValueError,
14526n/a "width too big");
14527n/a return -1;
14528n/a }
14529n/a arg->width = arg->width*10 + (arg->ch - '0');
14530n/a }
14531n/a }
14532n/a
14533n/a /* Parse precision. Example: "%.3f" => prec=3 */
14534n/a if (arg->ch == '.') {
14535n/a arg->prec = 0;
14536n/a if (--ctx->fmtcnt >= 0) {
14537n/a arg->ch = FORMAT_READ(ctx);
14538n/a ctx->fmtpos++;
14539n/a }
14540n/a if (arg->ch == '*') {
14541n/a v = unicode_format_getnextarg(ctx);
14542n/a if (v == NULL)
14543n/a return -1;
14544n/a if (!PyLong_Check(v)) {
14545n/a PyErr_SetString(PyExc_TypeError,
14546n/a "* wants int");
14547n/a return -1;
14548n/a }
14549n/a arg->prec = _PyLong_AsInt(v);
14550n/a if (arg->prec == -1 && PyErr_Occurred())
14551n/a return -1;
14552n/a if (arg->prec < 0)
14553n/a arg->prec = 0;
14554n/a if (--ctx->fmtcnt >= 0) {
14555n/a arg->ch = FORMAT_READ(ctx);
14556n/a ctx->fmtpos++;
14557n/a }
14558n/a }
14559n/a else if (arg->ch >= '0' && arg->ch <= '9') {
14560n/a arg->prec = arg->ch - '0';
14561n/a while (--ctx->fmtcnt >= 0) {
14562n/a arg->ch = FORMAT_READ(ctx);
14563n/a ctx->fmtpos++;
14564n/a if (arg->ch < '0' || arg->ch > '9')
14565n/a break;
14566n/a if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14567n/a PyErr_SetString(PyExc_ValueError,
14568n/a "precision too big");
14569n/a return -1;
14570n/a }
14571n/a arg->prec = arg->prec*10 + (arg->ch - '0');
14572n/a }
14573n/a }
14574n/a }
14575n/a
14576n/a /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14577n/a if (ctx->fmtcnt >= 0) {
14578n/a if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14579n/a if (--ctx->fmtcnt >= 0) {
14580n/a arg->ch = FORMAT_READ(ctx);
14581n/a ctx->fmtpos++;
14582n/a }
14583n/a }
14584n/a }
14585n/a if (ctx->fmtcnt < 0) {
14586n/a PyErr_SetString(PyExc_ValueError,
14587n/a "incomplete format");
14588n/a return -1;
14589n/a }
14590n/a return 0;
14591n/a
14592n/a#undef FORMAT_READ
14593n/a}
14594n/a
14595n/a/* Format one argument. Supported conversion specifiers:
14596n/a
14597n/a - "s", "r", "a": any type
14598n/a - "i", "d", "u": int or float
14599n/a - "o", "x", "X": int
14600n/a - "e", "E", "f", "F", "g", "G": float
14601n/a - "c": int or str (1 character)
14602n/a
14603n/a When possible, the output is written directly into the Unicode writer
14604n/a (ctx->writer). A string is created when padding is required.
14605n/a
14606n/a Return 0 if the argument has been formatted into *p_str,
14607n/a 1 if the argument has been written into ctx->writer,
14608n/a -1 on error. */
14609n/astatic int
14610n/aunicode_format_arg_format(struct unicode_formatter_t *ctx,
14611n/a struct unicode_format_arg_t *arg,
14612n/a PyObject **p_str)
14613n/a{
14614n/a PyObject *v;
14615n/a _PyUnicodeWriter *writer = &ctx->writer;
14616n/a
14617n/a if (ctx->fmtcnt == 0)
14618n/a ctx->writer.overallocate = 0;
14619n/a
14620n/a if (arg->ch == '%') {
14621n/a if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
14622n/a return -1;
14623n/a return 1;
14624n/a }
14625n/a
14626n/a v = unicode_format_getnextarg(ctx);
14627n/a if (v == NULL)
14628n/a return -1;
14629n/a
14630n/a
14631n/a switch (arg->ch) {
14632n/a case 's':
14633n/a case 'r':
14634n/a case 'a':
14635n/a if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14636n/a /* Fast path */
14637n/a if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14638n/a return -1;
14639n/a return 1;
14640n/a }
14641n/a
14642n/a if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14643n/a *p_str = v;
14644n/a Py_INCREF(*p_str);
14645n/a }
14646n/a else {
14647n/a if (arg->ch == 's')
14648n/a *p_str = PyObject_Str(v);
14649n/a else if (arg->ch == 'r')
14650n/a *p_str = PyObject_Repr(v);
14651n/a else
14652n/a *p_str = PyObject_ASCII(v);
14653n/a }
14654n/a break;
14655n/a
14656n/a case 'i':
14657n/a case 'd':
14658n/a case 'u':
14659n/a case 'o':
14660n/a case 'x':
14661n/a case 'X':
14662n/a {
14663n/a int ret = mainformatlong(v, arg, p_str, writer);
14664n/a if (ret != 0)
14665n/a return ret;
14666n/a arg->sign = 1;
14667n/a break;
14668n/a }
14669n/a
14670n/a case 'e':
14671n/a case 'E':
14672n/a case 'f':
14673n/a case 'F':
14674n/a case 'g':
14675n/a case 'G':
14676n/a if (arg->width == -1 && arg->prec == -1
14677n/a && !(arg->flags & (F_SIGN | F_BLANK)))
14678n/a {
14679n/a /* Fast path */
14680n/a if (formatfloat(v, arg, NULL, writer) == -1)
14681n/a return -1;
14682n/a return 1;
14683n/a }
14684n/a
14685n/a arg->sign = 1;
14686n/a if (formatfloat(v, arg, p_str, NULL) == -1)
14687n/a return -1;
14688n/a break;
14689n/a
14690n/a case 'c':
14691n/a {
14692n/a Py_UCS4 ch = formatchar(v);
14693n/a if (ch == (Py_UCS4) -1)
14694n/a return -1;
14695n/a if (arg->width == -1 && arg->prec == -1) {
14696n/a /* Fast path */
14697n/a if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14698n/a return -1;
14699n/a return 1;
14700n/a }
14701n/a *p_str = PyUnicode_FromOrdinal(ch);
14702n/a break;
14703n/a }
14704n/a
14705n/a default:
14706n/a PyErr_Format(PyExc_ValueError,
14707n/a "unsupported format character '%c' (0x%x) "
14708n/a "at index %zd",
14709n/a (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14710n/a (int)arg->ch,
14711n/a ctx->fmtpos - 1);
14712n/a return -1;
14713n/a }
14714n/a if (*p_str == NULL)
14715n/a return -1;
14716n/a assert (PyUnicode_Check(*p_str));
14717n/a return 0;
14718n/a}
14719n/a
/* Helper of unicode_format_arg(): write the string `str` produced for one
   conversion into ctx->writer, applying the precision (for "s", "r", "a"),
   the sign, the "0x"/"0X"/"0o" prefix and the width padding described by
   `arg`.

   `arg` is used as scratch space: arg->width and arg->sign are modified
   while the output is laid out.

   Return 0 on success, -1 on error. */
14720n/astatic int
14721n/aunicode_format_arg_output(struct unicode_formatter_t *ctx,
14722n/a struct unicode_format_arg_t *arg,
14723n/a PyObject *str)
14724n/a{
14725n/a Py_ssize_t len;
14726n/a enum PyUnicode_Kind kind;
14727n/a void *pbuf;
14728n/a Py_ssize_t pindex;
14729n/a Py_UCS4 signchar;
14730n/a Py_ssize_t buflen;
14731n/a Py_UCS4 maxchar;
14732n/a Py_ssize_t sublen;
14733n/a _PyUnicodeWriter *writer = &ctx->writer;
14734n/a Py_UCS4 fill;
14735n/a
    /* Pad with '0' only when the conversion set arg->sign and the F_ZERO
       flag was given; otherwise pad with spaces. */
14736n/a fill = ' ';
14737n/a if (arg->sign && arg->flags & F_ZERO)
14738n/a fill = '0';
14739n/a
14740n/a if (PyUnicode_READY(str) == -1)
14741n/a return -1;
14742n/a
14743n/a len = PyUnicode_GET_LENGTH(str);
14744n/a if ((arg->width == -1 || arg->width <= len)
14745n/a && (arg->prec == -1 || arg->prec >= len)
14746n/a && !(arg->flags & (F_SIGN | F_BLANK)))
14747n/a {
14748n/a /* Fast path: no padding, no truncation, no forced sign: copy str as is */
14749n/a if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14750n/a return -1;
14751n/a return 0;
14752n/a }
14753n/a
14754n/a /* Truncate the string for "s", "r" and "a" formats
14755n/a if the precision is set */
14756n/a if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14757n/a if (arg->prec >= 0 && len > arg->prec)
14758n/a len = arg->prec;
14759n/a }
14760n/a
14761n/a /* Adjust sign and width: a leading '-' or '+' in str is split off into
       signchar (len and pindex then describe the remaining characters);
       otherwise the sign comes from the F_SIGN/F_BLANK flags, or sign
       handling is switched off for the rest of the function. */
14762n/a kind = PyUnicode_KIND(str);
14763n/a pbuf = PyUnicode_DATA(str);
14764n/a pindex = 0;
14765n/a signchar = '\0';
14766n/a if (arg->sign) {
14767n/a Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14768n/a if (ch == '-' || ch == '+') {
14769n/a signchar = ch;
14770n/a len--;
14771n/a pindex++;
14772n/a }
14773n/a else if (arg->flags & F_SIGN)
14774n/a signchar = '+';
14775n/a else if (arg->flags & F_BLANK)
14776n/a signchar = ' ';
14777n/a else
14778n/a arg->sign = 0;
14779n/a }
    /* From here on the width is never smaller than the text to copy. */
14780n/a if (arg->width < len)
14781n/a arg->width = len;
14782n/a
14783n/a /* Prepare the writer: compute the largest character that will be
       written (fill character and characters of str) and reserve room. */
14784n/a maxchar = writer->maxchar;
14785n/a if (!(arg->flags & F_LJUST)) {
14786n/a if (arg->sign) {
14787n/a if ((arg->width-1) > len)
14788n/a maxchar = Py_MAX(maxchar, fill);
14789n/a }
14790n/a else {
14791n/a if (arg->width > len)
14792n/a maxchar = Py_MAX(maxchar, fill);
14793n/a }
14794n/a }
14795n/a if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14796n/a Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
14797n/a maxchar = Py_MAX(maxchar, strmaxchar);
14798n/a }
14799n/a
    /* When the text already fills the whole width, the sign character
       needs one extra slot. */
14800n/a buflen = arg->width;
14801n/a if (arg->sign && len == arg->width)
14802n/a buflen++;
14803n/a if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
14804n/a return -1;
14805n/a
14806n/a /* Write the sign if needed: with a non-space fill it goes before the
       padding; with space fill it is written after the padding (below).
       Either way it uses up one column of the width, if one is spare. */
14807n/a if (arg->sign) {
14808n/a if (fill != ' ') {
14809n/a PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14810n/a writer->pos += 1;
14811n/a }
14812n/a if (arg->width > len)
14813n/a arg->width--;
14814n/a }
14815n/a
14816n/a /* Write the numeric prefix for "x", "X" and "o" formats
14817n/a if the alternate form is used.
14818n/a For example, write "0x" for the "%#x" format.
       With space fill the prefix is only accounted for here (width and len
       shrink by 2) and is written after the padding, further down. */
14819n/a if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14820n/a assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14821n/a assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14822n/a if (fill != ' ') {
14823n/a PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14824n/a PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14825n/a writer->pos += 2;
14826n/a pindex += 2;
14827n/a }
14828n/a arg->width -= 2;
14829n/a if (arg->width < 0)
14830n/a arg->width = 0;
14831n/a len -= 2;
14832n/a }
14833n/a
14834n/a /* Pad left with the fill character if needed; width is then reset to
       len so that the right-padding step below does nothing. */
14835n/a if (arg->width > len && !(arg->flags & F_LJUST)) {
14836n/a sublen = arg->width - len;
14837n/a FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14838n/a writer->pos += sublen;
14839n/a arg->width = len;
14840n/a }
14841n/a
14842n/a /* If padding with spaces: write sign if needed and/or numeric prefix if
14843n/a the alternate form is used */
14844n/a if (fill == ' ') {
14845n/a if (arg->sign) {
14846n/a PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14847n/a writer->pos += 1;
14848n/a }
14849n/a if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14850n/a assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14851n/a assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14852n/a PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14853n/a PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14854n/a writer->pos += 2;
14855n/a pindex += 2;
14856n/a }
14857n/a }
14858n/a
14859n/a /* Write characters: copy len characters of str starting at pindex */
14860n/a if (len) {
14861n/a _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14862n/a str, pindex, len);
14863n/a writer->pos += len;
14864n/a }
14865n/a
14866n/a /* Pad right if needed. Only reachable with F_LJUST (left padding above
       resets width to len); the right padding is always made of spaces. */
14867n/a if (arg->width > len) {
14868n/a sublen = arg->width - len;
14869n/a FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14870n/a writer->pos += sublen;
14871n/a }
14872n/a return 0;
14873n/a}
14874n/a
14875n/a/* Helper of PyUnicode_Format(): format one arg.
14876n/a Return 0 on success, raise an exception and return -1 on error. */
14877n/astatic int
14878n/aunicode_format_arg(struct unicode_formatter_t *ctx)
14879n/a{
14880n/a struct unicode_format_arg_t arg;
14881n/a PyObject *str;
14882n/a int ret;
14883n/a
14884n/a arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14885n/a arg.flags = 0;
14886n/a arg.width = -1;
14887n/a arg.prec = -1;
14888n/a arg.sign = 0;
14889n/a str = NULL;
14890n/a
14891n/a ret = unicode_format_arg_parse(ctx, &arg);
14892n/a if (ret == -1)
14893n/a return -1;
14894n/a
14895n/a ret = unicode_format_arg_format(ctx, &arg, &str);
14896n/a if (ret == -1)
14897n/a return -1;
14898n/a
14899n/a if (ret != 1) {
14900n/a ret = unicode_format_arg_output(ctx, &arg, str);
14901n/a Py_DECREF(str);
14902n/a if (ret == -1)
14903n/a return -1;
14904n/a }
14905n/a
14906n/a if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14907n/a PyErr_SetString(PyExc_TypeError,
14908n/a "not all arguments converted during string formatting");
14909n/a return -1;
14910n/a }
14911n/a return 0;
14912n/a}
14913n/a
/* Build a new string by applying the "%"-style format string `format` to
   `args`.

   `format` must pass ensure_unicode(). `args` is either a tuple of values,
   a mapping (recorded in ctx.dict when it is neither a tuple nor a str), or
   any other single object.

   Literal runs of the format are copied as they are; each "%" conversion is
   handled by unicode_format_arg().

   Return the result of _PyUnicodeWriter_Finish() on success, or NULL on
   error (a NULL argument triggers PyErr_BadInternalCall()). */
14914n/aPyObject *
14915n/aPyUnicode_Format(PyObject *format, PyObject *args)
14916n/a{
14917n/a struct unicode_formatter_t ctx;
14918n/a
14919n/a if (format == NULL || args == NULL) {
14920n/a PyErr_BadInternalCall();
14921n/a return NULL;
14922n/a }
14923n/a
14924n/a if (ensure_unicode(format) < 0)
14925n/a return NULL;
14926n/a
    /* fmtpos is the index of the next character to examine; fmtcnt starts
       as the format length and is counted down as characters are used. */
14927n/a ctx.fmtstr = format;
14928n/a ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14929n/a ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14930n/a ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14931n/a ctx.fmtpos = 0;
14932n/a
    /* Start with over-allocation enabled and a size hint of the format
       length plus 100 characters. */
14933n/a _PyUnicodeWriter_Init(&ctx.writer);
14934n/a ctx.writer.min_length = ctx.fmtcnt + 100;
14935n/a ctx.writer.overallocate = 1;
14936n/a
14937n/a if (PyTuple_Check(args)) {
14938n/a ctx.arglen = PyTuple_Size(args);
14939n/a ctx.argidx = 0;
14940n/a }
14941n/a else {
    /* Not a tuple: sentinel values. NOTE(review): presumably
       unicode_format_getnextarg() (not shown here) updates argidx once
       the single value has been consumed -- confirm. */
14942n/a ctx.arglen = -1;
14943n/a ctx.argidx = -2;
14944n/a }
14945n/a ctx.args_owned = 0;
14946n/a if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
14947n/a ctx.dict = args;
14948n/a else
14949n/a ctx.dict = NULL;
14950n/a ctx.args = args;
14951n/a
14952n/a while (--ctx.fmtcnt >= 0) {
14953n/a if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
    /* Literal text: scan forward to the next '%' (or the end of the
       format) and copy the whole run with a single write. */
14954n/a Py_ssize_t nonfmtpos;
14955n/a
14956n/a nonfmtpos = ctx.fmtpos++;
14957n/a while (ctx.fmtcnt >= 0 &&
14958n/a PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14959n/a ctx.fmtpos++;
14960n/a ctx.fmtcnt--;
14961n/a }
    /* The scan ran past the end of the format: step back by one so
       the copy stops at the last character. This is also the final
       write, so over-allocation is switched off. */
14962n/a if (ctx.fmtcnt < 0) {
14963n/a ctx.fmtpos--;
14964n/a ctx.writer.overallocate = 0;
14965n/a }
14966n/a
14967n/a if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14968n/a nonfmtpos, ctx.fmtpos) < 0)
14969n/a goto onError;
14970n/a }
14971n/a else {
    /* Skip the '%' and let unicode_format_arg() handle the
       conversion that follows it. */
14972n/a ctx.fmtpos++;
14973n/a if (unicode_format_arg(&ctx) == -1)
14974n/a goto onError;
14975n/a }
14976n/a }
14977n/a
    /* Without a mapping, arguments left over after the last conversion
       are an error. */
14978n/a if (ctx.argidx < ctx.arglen && !ctx.dict) {
14979n/a PyErr_SetString(PyExc_TypeError,
14980n/a "not all arguments converted during string formatting");
14981n/a goto onError;
14982n/a }
14983n/a
    /* args_owned is cleared above; NOTE(review): it is presumably set by
       the specifier parser when it replaces ctx.args -- confirm. */
14984n/a if (ctx.args_owned) {
14985n/a Py_DECREF(ctx.args);
14986n/a }
14987n/a return _PyUnicodeWriter_Finish(&ctx.writer);
14988n/a
    /* Error exit: release the partially built output and any owned args. */
14989n/a onError:
14990n/a _PyUnicodeWriter_Dealloc(&ctx.writer);
14991n/a if (ctx.args_owned) {
14992n/a Py_DECREF(ctx.args);
14993n/a }
14994n/a return NULL;
14995n/a}
14996n/a
14997n/astatic PyObject *
14998n/aunicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14999n/a
15000n/astatic PyObject *
15001n/aunicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15002n/a{
15003n/a PyObject *x = NULL;
15004n/a static char *kwlist[] = {"object", "encoding", "errors", 0};
15005n/a char *encoding = NULL;
15006n/a char *errors = NULL;
15007n/a
15008n/a if (type != &PyUnicode_Type)
15009n/a return unicode_subtype_new(type, args, kwds);
15010n/a if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
15011n/a kwlist, &x, &encoding, &errors))
15012n/a return NULL;
15013n/a if (x == NULL)
15014n/a _Py_RETURN_UNICODE_EMPTY();
15015n/a if (encoding == NULL && errors == NULL)
15016n/a return PyObject_Str(x);
15017n/a else
15018n/a return PyUnicode_FromEncodedObject(x, encoding, errors);
15019n/a}
15020n/a
15021n/astatic PyObject *
15022n/aunicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15023n/a{
15024n/a PyObject *unicode, *self;
15025n/a Py_ssize_t length, char_size;
15026n/a int share_wstr, share_utf8;
15027n/a unsigned int kind;
15028n/a void *data;
15029n/a
15030n/a assert(PyType_IsSubtype(type, &PyUnicode_Type));
15031n/a
15032n/a unicode = unicode_new(&PyUnicode_Type, args, kwds);
15033n/a if (unicode == NULL)
15034n/a return NULL;
15035n/a assert(_PyUnicode_CHECK(unicode));
15036n/a if (PyUnicode_READY(unicode) == -1) {
15037n/a Py_DECREF(unicode);
15038n/a return NULL;
15039n/a }
15040n/a
15041n/a self = type->tp_alloc(type, 0);
15042n/a if (self == NULL) {
15043n/a Py_DECREF(unicode);
15044n/a return NULL;
15045n/a }
15046n/a kind = PyUnicode_KIND(unicode);
15047n/a length = PyUnicode_GET_LENGTH(unicode);
15048n/a
15049n/a _PyUnicode_LENGTH(self) = length;
15050n/a#ifdef Py_DEBUG
15051n/a _PyUnicode_HASH(self) = -1;
15052n/a#else
15053n/a _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15054n/a#endif
15055n/a _PyUnicode_STATE(self).interned = 0;
15056n/a _PyUnicode_STATE(self).kind = kind;
15057n/a _PyUnicode_STATE(self).compact = 0;
15058n/a _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15059n/a _PyUnicode_STATE(self).ready = 1;
15060n/a _PyUnicode_WSTR(self) = NULL;
15061n/a _PyUnicode_UTF8_LENGTH(self) = 0;
15062n/a _PyUnicode_UTF8(self) = NULL;
15063n/a _PyUnicode_WSTR_LENGTH(self) = 0;
15064n/a _PyUnicode_DATA_ANY(self) = NULL;
15065n/a
15066n/a share_utf8 = 0;
15067n/a share_wstr = 0;
15068n/a if (kind == PyUnicode_1BYTE_KIND) {
15069n/a char_size = 1;
15070n/a if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15071n/a share_utf8 = 1;
15072n/a }
15073n/a else if (kind == PyUnicode_2BYTE_KIND) {
15074n/a char_size = 2;
15075n/a if (sizeof(wchar_t) == 2)
15076n/a share_wstr = 1;
15077n/a }
15078n/a else {
15079n/a assert(kind == PyUnicode_4BYTE_KIND);
15080n/a char_size = 4;
15081n/a if (sizeof(wchar_t) == 4)
15082n/a share_wstr = 1;
15083n/a }
15084n/a
15085n/a /* Ensure we won't overflow the length. */
15086n/a if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15087n/a PyErr_NoMemory();
15088n/a goto onError;
15089n/a }
15090n/a data = PyObject_MALLOC((length + 1) * char_size);
15091n/a if (data == NULL) {
15092n/a PyErr_NoMemory();
15093n/a goto onError;
15094n/a }
15095n/a
15096n/a _PyUnicode_DATA_ANY(self) = data;
15097n/a if (share_utf8) {
15098n/a _PyUnicode_UTF8_LENGTH(self) = length;
15099n/a _PyUnicode_UTF8(self) = data;
15100n/a }
15101n/a if (share_wstr) {
15102n/a _PyUnicode_WSTR_LENGTH(self) = length;
15103n/a _PyUnicode_WSTR(self) = (wchar_t *)data;
15104n/a }
15105n/a
15106n/a memcpy(data, PyUnicode_DATA(unicode),
15107n/a kind * (length + 1));
15108n/a assert(_PyUnicode_CheckConsistency(self, 1));
15109n/a#ifdef Py_DEBUG
15110n/a _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15111n/a#endif
15112n/a Py_DECREF(unicode);
15113n/a return self;
15114n/a
15115n/aonError:
15116n/a Py_DECREF(unicode);
15117n/a Py_DECREF(self);
15118n/a return NULL;
15119n/a}
15120n/a
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
15121n/aPyDoc_STRVAR(unicode_doc,
15122n/a"str(object='') -> str\n\
15123n/astr(bytes_or_buffer[, encoding[, errors]]) -> str\n\
15124n/a\n\
15125n/aCreate a new string object from the given object. If encoding or\n\
15126n/aerrors is specified, then the object must expose a data buffer\n\
15127n/athat will be decoded using the given encoding and error handler.\n\
15128n/aOtherwise, returns the result of object.__str__() (if defined)\n\
15129n/aor repr(object).\n\
15130n/aencoding defaults to sys.getdefaultencoding().\n\
15131n/aerrors defaults to 'strict'.");
15132n/a
15133n/astatic PyObject *unicode_iter(PyObject *seq);
15134n/a
/* Type object of str. The initializer is positional: every entry is
   labelled with the PyTypeObject slot it fills. Slots set to 0 are left
   unset in this table. */
15135n/aPyTypeObject PyUnicode_Type = {
15136n/a PyVarObject_HEAD_INIT(&PyType_Type, 0)
15137n/a "str", /* tp_name */
15138n/a sizeof(PyUnicodeObject), /* tp_basicsize */
15139n/a 0, /* tp_itemsize */
15140n/a /* Slots */
15141n/a (destructor)unicode_dealloc, /* tp_dealloc */
15142n/a 0, /* tp_print */
15143n/a 0, /* tp_getattr */
15144n/a 0, /* tp_setattr */
15145n/a 0, /* tp_reserved */
15146n/a unicode_repr, /* tp_repr */
15147n/a &unicode_as_number, /* tp_as_number */
15148n/a &unicode_as_sequence, /* tp_as_sequence */
15149n/a &unicode_as_mapping, /* tp_as_mapping */
15150n/a (hashfunc) unicode_hash, /* tp_hash*/
15151n/a 0, /* tp_call*/
15152n/a (reprfunc) unicode_str, /* tp_str */
15153n/a PyObject_GenericGetAttr, /* tp_getattro */
15154n/a 0, /* tp_setattro */
15155n/a 0, /* tp_as_buffer */
15156n/a Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
15157n/a Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15158n/a unicode_doc, /* tp_doc */
15159n/a 0, /* tp_traverse */
15160n/a 0, /* tp_clear */
15161n/a PyUnicode_RichCompare, /* tp_richcompare */
15162n/a 0, /* tp_weaklistoffset */
15163n/a unicode_iter, /* tp_iter */
15164n/a 0, /* tp_iternext */
15165n/a unicode_methods, /* tp_methods */
15166n/a 0, /* tp_members */
15167n/a 0, /* tp_getset */
15168n/a &PyBaseObject_Type, /* tp_base */
15169n/a 0, /* tp_dict */
15170n/a 0, /* tp_descr_get */
15171n/a 0, /* tp_descr_set */
15172n/a 0, /* tp_dictoffset */
15173n/a 0, /* tp_init */
15174n/a 0, /* tp_alloc */
15175n/a unicode_new, /* tp_new */
15176n/a PyObject_Del, /* tp_free */
15177n/a};
15178n/a
15179n/a/* Initialize the Unicode implementation */
15180n/a
15181n/aint _PyUnicode_Init(void)
15182n/a{
15183n/a /* XXX - move this array to unicodectype.c ? */
15184n/a Py_UCS2 linebreak[] = {
15185n/a 0x000A, /* LINE FEED */
15186n/a 0x000D, /* CARRIAGE RETURN */
15187n/a 0x001C, /* FILE SEPARATOR */
15188n/a 0x001D, /* GROUP SEPARATOR */
15189n/a 0x001E, /* RECORD SEPARATOR */
15190n/a 0x0085, /* NEXT LINE */
15191n/a 0x2028, /* LINE SEPARATOR */
15192n/a 0x2029, /* PARAGRAPH SEPARATOR */
15193n/a };
15194n/a
15195n/a /* Init the implementation */
15196n/a _Py_INCREF_UNICODE_EMPTY();
15197n/a if (!unicode_empty)
15198n/a Py_FatalError("Can't create empty string");
15199n/a Py_DECREF(unicode_empty);
15200n/a
15201n/a if (PyType_Ready(&PyUnicode_Type) < 0)
15202n/a Py_FatalError("Can't initialize 'unicode'");
15203n/a
15204n/a /* initialize the linebreak bloom filter */
15205n/a bloom_linebreak = make_bloom_mask(
15206n/a PyUnicode_2BYTE_KIND, linebreak,
15207n/a Py_ARRAY_LENGTH(linebreak));
15208n/a
15209n/a if (PyType_Ready(&EncodingMapType) < 0)
15210n/a Py_FatalError("Can't initialize encoding map type");
15211n/a
15212n/a if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15213n/a Py_FatalError("Can't initialize field name iterator type");
15214n/a
15215n/a if (PyType_Ready(&PyFormatterIter_Type) < 0)
15216n/a Py_FatalError("Can't initialize formatter iter type");
15217n/a
15218n/a return 0;
15219n/a}
15220n/a
15221n/a/* Finalize the Unicode implementation */
15222n/a
/* Does no work in this implementation and always returns 0.
   Called by _PyUnicode_Fini(). */
15223n/aint
15224n/aPyUnicode_ClearFreeList(void)
15225n/a{
15226n/a return 0;
15227n/a}
15228n/a
15229n/avoid
15230n/a_PyUnicode_Fini(void)
15231n/a{
15232n/a int i;
15233n/a
15234n/a Py_CLEAR(unicode_empty);
15235n/a
15236n/a for (i = 0; i < 256; i++)
15237n/a Py_CLEAR(unicode_latin1[i]);
15238n/a _PyUnicode_ClearStaticStrings();
15239n/a (void)PyUnicode_ClearFreeList();
15240n/a}
15241n/a
15242n/avoid
15243n/aPyUnicode_InternInPlace(PyObject **p)
15244n/a{
15245n/a PyObject *s = *p;
15246n/a PyObject *t;
15247n/a#ifdef Py_DEBUG
15248n/a assert(s != NULL);
15249n/a assert(_PyUnicode_CHECK(s));
15250n/a#else
15251n/a if (s == NULL || !PyUnicode_Check(s))
15252n/a return;
15253n/a#endif
15254n/a /* If it's a subclass, we don't really know what putting
15255n/a it in the interned dict might do. */
15256n/a if (!PyUnicode_CheckExact(s))
15257n/a return;
15258n/a if (PyUnicode_CHECK_INTERNED(s))
15259n/a return;
15260n/a if (interned == NULL) {
15261n/a interned = PyDict_New();
15262n/a if (interned == NULL) {
15263n/a PyErr_Clear(); /* Don't leave an exception */
15264n/a return;
15265n/a }
15266n/a }
15267n/a Py_ALLOW_RECURSION
15268n/a t = PyDict_SetDefault(interned, s, s);
15269n/a Py_END_ALLOW_RECURSION
15270n/a if (t == NULL) {
15271n/a PyErr_Clear();
15272n/a return;
15273n/a }
15274n/a if (t != s) {
15275n/a Py_INCREF(t);
15276n/a Py_SETREF(*p, t);
15277n/a return;
15278n/a }
15279n/a /* The two references in interned are not counted by refcnt.
15280n/a The deallocator will take care of this */
15281n/a Py_REFCNT(s) -= 2;
15282n/a _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15283n/a}
15284n/a
15285n/avoid
15286n/aPyUnicode_InternImmortal(PyObject **p)
15287n/a{
15288n/a PyUnicode_InternInPlace(p);
15289n/a if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15290n/a _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15291n/a Py_INCREF(*p);
15292n/a }
15293n/a}
15294n/a
15295n/aPyObject *
15296n/aPyUnicode_InternFromString(const char *cp)
15297n/a{
15298n/a PyObject *s = PyUnicode_FromString(cp);
15299n/a if (s == NULL)
15300n/a return NULL;
15301n/a PyUnicode_InternInPlace(&s);
15302n/a return s;
15303n/a}
15304n/a
15305n/avoid
15306n/a_Py_ReleaseInternedUnicodeStrings(void)
15307n/a{
15308n/a PyObject *keys;
15309n/a PyObject *s;
15310n/a Py_ssize_t i, n;
15311n/a Py_ssize_t immortal_size = 0, mortal_size = 0;
15312n/a
15313n/a if (interned == NULL || !PyDict_Check(interned))
15314n/a return;
15315n/a keys = PyDict_Keys(interned);
15316n/a if (keys == NULL || !PyList_Check(keys)) {
15317n/a PyErr_Clear();
15318n/a return;
15319n/a }
15320n/a
15321n/a /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15322n/a detector, interned unicode strings are not forcibly deallocated;
15323n/a rather, we give them their stolen references back, and then clear
15324n/a and DECREF the interned dict. */
15325n/a
15326n/a n = PyList_GET_SIZE(keys);
15327n/a fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
15328n/a n);
15329n/a for (i = 0; i < n; i++) {
15330n/a s = PyList_GET_ITEM(keys, i);
15331n/a if (PyUnicode_READY(s) == -1) {
15332n/a assert(0 && "could not ready string");
15333n/a fprintf(stderr, "could not ready string\n");
15334n/a }
15335n/a switch (PyUnicode_CHECK_INTERNED(s)) {
15336n/a case SSTATE_NOT_INTERNED:
15337n/a /* XXX Shouldn't happen */
15338n/a break;
15339n/a case SSTATE_INTERNED_IMMORTAL:
15340n/a Py_REFCNT(s) += 1;
15341n/a immortal_size += PyUnicode_GET_LENGTH(s);
15342n/a break;
15343n/a case SSTATE_INTERNED_MORTAL:
15344n/a Py_REFCNT(s) += 2;
15345n/a mortal_size += PyUnicode_GET_LENGTH(s);
15346n/a break;
15347n/a default:
15348n/a Py_FatalError("Inconsistent interned string state.");
15349n/a }
15350n/a _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15351n/a }
15352n/a fprintf(stderr, "total size of all interned strings: "
15353n/a "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15354n/a "mortal/immortal\n", mortal_size, immortal_size);
15355n/a Py_DECREF(keys);
15356n/a PyDict_Clear(interned);
15357n/a Py_CLEAR(interned);
15358n/a}
15359n/a
15360n/a
15361n/a/********************* Unicode Iterator **************************/
15362n/a
/* Instance layout of the str iterator ("str_iterator"). */
15363n/atypedef struct {
15364n/a PyObject_HEAD
15365n/a Py_ssize_t it_index; /* Index of the next character to return */
15366n/a PyObject *it_seq; /* String being iterated; set to NULL when iterator is exhausted */
15367n/a} unicodeiterobject;
15368n/a
/* tp_dealloc of the str iterator: untrack it from the GC, release the
   reference to the iterated string (if any) and free the object. */
15369n/astatic void
15370n/aunicodeiter_dealloc(unicodeiterobject *it)
15371n/a{
15372n/a _PyObject_GC_UNTRACK(it);
15373n/a Py_XDECREF(it->it_seq);
15374n/a PyObject_GC_Del(it);
15375n/a}
15376n/a
/* tp_traverse of the str iterator: the iterated string is the only object
   it references. */
15377n/astatic int
15378n/aunicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15379n/a{
15380n/a Py_VISIT(it->it_seq);
15381n/a return 0;
15382n/a}
15383n/a
15384n/astatic PyObject *
15385n/aunicodeiter_next(unicodeiterobject *it)
15386n/a{
15387n/a PyObject *seq, *item;
15388n/a
15389n/a assert(it != NULL);
15390n/a seq = it->it_seq;
15391n/a if (seq == NULL)
15392n/a return NULL;
15393n/a assert(_PyUnicode_CHECK(seq));
15394n/a
15395n/a if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15396n/a int kind = PyUnicode_KIND(seq);
15397n/a void *data = PyUnicode_DATA(seq);
15398n/a Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15399n/a item = PyUnicode_FromOrdinal(chr);
15400n/a if (item != NULL)
15401n/a ++it->it_index;
15402n/a return item;
15403n/a }
15404n/a
15405n/a it->it_seq = NULL;
15406n/a Py_DECREF(seq);
15407n/a return NULL;
15408n/a}
15409n/a
15410n/astatic PyObject *
15411n/aunicodeiter_len(unicodeiterobject *it)
15412n/a{
15413n/a Py_ssize_t len = 0;
15414n/a if (it->it_seq)
15415n/a len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15416n/a return PyLong_FromSsize_t(len);
15417n/a}
15418n/a
15419n/aPyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15420n/a
15421n/astatic PyObject *
15422n/aunicodeiter_reduce(unicodeiterobject *it)
15423n/a{
15424n/a if (it->it_seq != NULL) {
15425n/a return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
15426n/a it->it_seq, it->it_index);
15427n/a } else {
15428n/a PyObject *u = (PyObject *)_PyUnicode_New(0);
15429n/a if (u == NULL)
15430n/a return NULL;
15431n/a return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
15432n/a }
15433n/a}
15434n/a
15435n/aPyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15436n/a
15437n/astatic PyObject *
15438n/aunicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15439n/a{
15440n/a Py_ssize_t index = PyLong_AsSsize_t(state);
15441n/a if (index == -1 && PyErr_Occurred())
15442n/a return NULL;
15443n/a if (it->it_seq != NULL) {
15444n/a if (index < 0)
15445n/a index = 0;
15446n/a else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15447n/a index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15448n/a it->it_index = index;
15449n/a }
15450n/a Py_RETURN_NONE;
15451n/a}
15452n/a
15453n/aPyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15454n/a
/* Method table of the str iterator: length hint plus pickling support
   (__reduce__ / __setstate__). */
15455n/astatic PyMethodDef unicodeiter_methods[] = {
15456n/a {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
15457n/a length_hint_doc},
15458n/a {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15459n/a reduce_doc},
15460n/a {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15461n/a setstate_doc},
15462n/a {NULL, NULL} /* sentinel */
15463n/a};
15464n/a
/* Type object of the str iterator ("str_iterator"). The initializer is
   positional; slots after tp_members are not listed. */
15465n/aPyTypeObject PyUnicodeIter_Type = {
15466n/a PyVarObject_HEAD_INIT(&PyType_Type, 0)
15467n/a "str_iterator", /* tp_name */
15468n/a sizeof(unicodeiterobject), /* tp_basicsize */
15469n/a 0, /* tp_itemsize */
15470n/a /* methods */
15471n/a (destructor)unicodeiter_dealloc, /* tp_dealloc */
15472n/a 0, /* tp_print */
15473n/a 0, /* tp_getattr */
15474n/a 0, /* tp_setattr */
15475n/a 0, /* tp_reserved */
15476n/a 0, /* tp_repr */
15477n/a 0, /* tp_as_number */
15478n/a 0, /* tp_as_sequence */
15479n/a 0, /* tp_as_mapping */
15480n/a 0, /* tp_hash */
15481n/a 0, /* tp_call */
15482n/a 0, /* tp_str */
15483n/a PyObject_GenericGetAttr, /* tp_getattro */
15484n/a 0, /* tp_setattro */
15485n/a 0, /* tp_as_buffer */
15486n/a Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15487n/a 0, /* tp_doc */
15488n/a (traverseproc)unicodeiter_traverse, /* tp_traverse */
15489n/a 0, /* tp_clear */
15490n/a 0, /* tp_richcompare */
15491n/a 0, /* tp_weaklistoffset */
15492n/a PyObject_SelfIter, /* tp_iter */
15493n/a (iternextfunc)unicodeiter_next, /* tp_iternext */
15494n/a unicodeiter_methods, /* tp_methods */
15495n/a 0, /* tp_members */
15496n/a};
15497n/a
15498n/astatic PyObject *
15499n/aunicode_iter(PyObject *seq)
15500n/a{
15501n/a unicodeiterobject *it;
15502n/a
15503n/a if (!PyUnicode_Check(seq)) {
15504n/a PyErr_BadInternalCall();
15505n/a return NULL;
15506n/a }
15507n/a if (PyUnicode_READY(seq) == -1)
15508n/a return NULL;
15509n/a it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15510n/a if (it == NULL)
15511n/a return NULL;
15512n/a it->it_index = 0;
15513n/a Py_INCREF(seq);
15514n/a it->it_seq = seq;
15515n/a _PyObject_GC_TRACK(it);
15516n/a return (PyObject *)it;
15517n/a}
15518n/a
15519n/a
/* Length of the null-terminated Py_UNICODE string `u`, not counting the
   terminator. Thin wrapper around wcslen(). */
15520n/asize_t
15521n/aPy_UNICODE_strlen(const Py_UNICODE *u)
15522n/a{
15523n/a return wcslen(u);
15524n/a}
15525n/a
15526n/aPy_UNICODE*
15527n/aPy_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15528n/a{
15529n/a Py_UNICODE *u = s1;
15530n/a while ((*u++ = *s2++));
15531n/a return s1;
15532n/a}
15533n/a
15534n/aPy_UNICODE*
15535n/aPy_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15536n/a{
15537n/a Py_UNICODE *u = s1;
15538n/a while ((*u++ = *s2++))
15539n/a if (n-- == 0)
15540n/a break;
15541n/a return s1;
15542n/a}
15543n/a
15544n/aPy_UNICODE*
15545n/aPy_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15546n/a{
15547n/a Py_UNICODE *u1 = s1;
15548n/a u1 += wcslen(u1);
15549n/a while ((*u1++ = *s2++));
15550n/a return s1;
15551n/a}
15552n/a
15553n/aint
15554n/aPy_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15555n/a{
15556n/a while (*s1 && *s2 && *s1 == *s2)
15557n/a s1++, s2++;
15558n/a if (*s1 && *s2)
15559n/a return (*s1 < *s2) ? -1 : +1;
15560n/a if (*s1)
15561n/a return 1;
15562n/a if (*s2)
15563n/a return -1;
15564n/a return 0;
15565n/a}
15566n/a
15567n/aint
15568n/aPy_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15569n/a{
15570n/a Py_UNICODE u1, u2;
15571n/a for (; n != 0; n--) {
15572n/a u1 = *s1;
15573n/a u2 = *s2;
15574n/a if (u1 != u2)
15575n/a return (u1 < u2) ? -1 : +1;
15576n/a if (u1 == '\0')
15577n/a return 0;
15578n/a s1++;
15579n/a s2++;
15580n/a }
15581n/a return 0;
15582n/a}
15583n/a
15584n/aPy_UNICODE*
15585n/aPy_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15586n/a{
15587n/a const Py_UNICODE *p;
15588n/a for (p = s; *p; p++)
15589n/a if (*p == c)
15590n/a return (Py_UNICODE*)p;
15591n/a return NULL;
15592n/a}
15593n/a
15594n/aPy_UNICODE*
15595n/aPy_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15596n/a{
15597n/a const Py_UNICODE *p;
15598n/a p = s + wcslen(s);
15599n/a while (p != s) {
15600n/a p--;
15601n/a if (*p == c)
15602n/a return (Py_UNICODE*)p;
15603n/a }
15604n/a return NULL;
15605n/a}
15606n/a
15607n/aPy_UNICODE*
15608n/aPyUnicode_AsUnicodeCopy(PyObject *unicode)
15609n/a{
15610n/a Py_UNICODE *u, *copy;
15611n/a Py_ssize_t len, size;
15612n/a
15613n/a if (!PyUnicode_Check(unicode)) {
15614n/a PyErr_BadArgument();
15615n/a return NULL;
15616n/a }
15617n/a u = PyUnicode_AsUnicodeAndSize(unicode, &len);
15618n/a if (u == NULL)
15619n/a return NULL;
15620n/a /* Ensure we won't overflow the size. */
15621n/a if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
15622n/a PyErr_NoMemory();
15623n/a return NULL;
15624n/a }
15625n/a size = len + 1; /* copy the null character */
15626n/a size *= sizeof(Py_UNICODE);
15627n/a copy = PyMem_Malloc(size);
15628n/a if (copy == NULL) {
15629n/a PyErr_NoMemory();
15630n/a return NULL;
15631n/a }
15632n/a memcpy(copy, u, size);
15633n/a return copy;
15634n/a}
15635n/a
15636n/a/* A _string module, to export formatter_parser and formatter_field_name_split
15637n/a to the string.Formatter class implemented in Python. */
15638n/a
/* Functions exported by the _string module; both take a single argument
   (METH_O). */
15639n/astatic PyMethodDef _string_methods[] = {
15640n/a {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15641n/a METH_O, PyDoc_STR("split the argument as a field name")},
15642n/a {"formatter_parser", (PyCFunction) formatter_parser,
15643n/a METH_O, PyDoc_STR("parse the argument as a format string")},
15644n/a {NULL, NULL} /* sentinel */
15645n/a};
15646n/a
/* Definition of the _string module. Field labels follow the documented
   PyModuleDef layout; NOTE(review): the sixth field is m_slots in
   Python >= 3.5 (m_reload before) -- confirm against the targeted version. */
15647n/astatic struct PyModuleDef _string_module = {
15648n/a PyModuleDef_HEAD_INIT,
15649n/a "_string", /* m_name */
15650n/a PyDoc_STR("string helper module"), /* m_doc */
15651n/a 0, /* m_size */
15652n/a _string_methods, /* m_methods */
15653n/a NULL, /* m_slots */
15654n/a NULL, /* m_traverse */
15655n/a NULL, /* m_clear */
15656n/a NULL /* m_free */
15657n/a};
15658n/a
/* Initialization function of the _string module: create the module object
   from _string_module and return it (whatever PyModule_Create() returns). */
15659n/aPyMODINIT_FUNC
15660n/aPyInit__string(void)
15661n/a{
15662n/a return PyModule_Create(&_string_module);
15663n/a}
15664n/a
15665n/a
15666n/a#ifdef __cplusplus
15667n/a}
15668n/a#endif