»Core Development>Code coverage>Modules/unicodedata.c

Python code coverage for Modules/unicodedata.c

#countcontent
1n/a/* ------------------------------------------------------------------------
2n/a
3n/a unicodedata -- Provides access to the Unicode database.
4n/a
5n/a Data was extracted from the UnicodeData.txt file.
6n/a The current version number is reported in the unidata_version constant.
7n/a
8n/a Written by Marc-Andre Lemburg (mal@lemburg.com).
9n/a Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
10n/a Modified by Martin v. Löwis (martin@v.loewis.de)
11n/a
12n/a Copyright (c) Corporation for National Research Initiatives.
13n/a
14n/a ------------------------------------------------------------------------ */
15n/a
16n/a#define PY_SSIZE_T_CLEAN
17n/a
18n/a#include "Python.h"
19n/a#include "ucnhash.h"
20n/a#include "structmember.h"
21n/a
22n/a/*[clinic input]
23n/amodule unicodedata
24n/aclass unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
25n/a[clinic start generated code]*/
26n/a/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
27n/a
28n/a/* character properties */
29n/a
/* Per-character property record.  Every field is a single byte; the
   string-valued properties are stored as indexes into name tables. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* quick-check bits, see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
41n/a
/* Differences between the current database and a previous version for
   one code point.  In the accessors of this file a byte field equal to
   0xFF is treated as "not changed", and category_changed == 0 as
   "unassigned in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    const double        numeric_changed;
} change_record;
51n/a
52n/a/* data file generated by Tools/unicode/makeunicodedata.py */
53n/a#include "unicodedata_db.h"
54n/a
55n/astatic const _PyUnicode_DatabaseRecord*
56n/a_getrecord_ex(Py_UCS4 code)
57n/a{
58n/a int index;
59n/a if (code >= 0x110000)
60n/a index = 0;
61n/a else {
62n/a index = index1[(code>>SHIFT)];
63n/a index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
64n/a }
65n/a
66n/a return &_PyUnicode_Database_Records[index];
67n/a}
68n/a
69n/a/* ------------- Previous-version API ------------------------------------- */
70n/atypedef struct previous_version {
71n/a PyObject_HEAD
72n/a const char *name;
73n/a const change_record* (*getrecord)(Py_UCS4);
74n/a Py_UCS4 (*normalization)(Py_UCS4);
75n/a} PreviousDBVersion;
76n/a
77n/a#include "clinic/unicodedata.c.h"
78n/a
79n/a#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
80n/a
81n/astatic PyMemberDef DB_members[] = {
82n/a {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
83n/a {NULL}
84n/a};
85n/a
86n/a/* forward declaration */
87n/astatic PyTypeObject UCD_Type;
88n/a#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
89n/a
90n/astatic PyObject*
91n/anew_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
92n/a Py_UCS4 (*normalization)(Py_UCS4))
93n/a{
94n/a PreviousDBVersion *self;
95n/a self = PyObject_New(PreviousDBVersion, &UCD_Type);
96n/a if (self == NULL)
97n/a return NULL;
98n/a self->name = name;
99n/a self->getrecord = getrecord;
100n/a self->normalization = normalization;
101n/a return (PyObject*)self;
102n/a}
103n/a
104n/a
105n/a/* --- Module API --------------------------------------------------------- */
106n/a
107n/a/*[clinic input]
108n/aunicodedata.UCD.decimal
109n/a
110n/a self: self
111n/a chr: int(accept={str})
112n/a default: object=NULL
113n/a /
114n/a
115n/aConverts a Unicode character into its equivalent decimal value.
116n/a
117n/aReturns the decimal value assigned to the character chr as integer.
118n/aIf no such value is defined, default is returned, or, if not given,
119n/aValueError is raised.
120n/a[clinic start generated code]*/
121n/a
122n/astatic PyObject *
123n/aunicodedata_UCD_decimal_impl(PyObject *self, int chr,
124n/a PyObject *default_value)
125n/a/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
126n/a{
127n/a int have_old = 0;
128n/a long rc;
129n/a Py_UCS4 c = (Py_UCS4)chr;
130n/a
131n/a if (self && UCD_Check(self)) {
132n/a const change_record *old = get_old_record(self, c);
133n/a if (old->category_changed == 0) {
134n/a /* unassigned */
135n/a have_old = 1;
136n/a rc = -1;
137n/a }
138n/a else if (old->decimal_changed != 0xFF) {
139n/a have_old = 1;
140n/a rc = old->decimal_changed;
141n/a }
142n/a }
143n/a
144n/a if (!have_old)
145n/a rc = Py_UNICODE_TODECIMAL(c);
146n/a if (rc < 0) {
147n/a if (default_value == NULL) {
148n/a PyErr_SetString(PyExc_ValueError,
149n/a "not a decimal");
150n/a return NULL;
151n/a }
152n/a else {
153n/a Py_INCREF(default_value);
154n/a return default_value;
155n/a }
156n/a }
157n/a return PyLong_FromLong(rc);
158n/a}
159n/a
160n/a/*[clinic input]
161n/aunicodedata.UCD.digit
162n/a
163n/a self: self
164n/a chr: int(accept={str})
165n/a default: object=NULL
166n/a /
167n/a
168n/aConverts a Unicode character into its equivalent digit value.
169n/a
170n/aReturns the digit value assigned to the character chr as integer.
171n/aIf no such value is defined, default is returned, or, if not given,
172n/aValueError is raised.
173n/a[clinic start generated code]*/
174n/a
175n/astatic PyObject *
176n/aunicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
177n/a/*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
178n/a{
179n/a long rc;
180n/a Py_UCS4 c = (Py_UCS4)chr;
181n/a rc = Py_UNICODE_TODIGIT(c);
182n/a if (rc < 0) {
183n/a if (default_value == NULL) {
184n/a PyErr_SetString(PyExc_ValueError, "not a digit");
185n/a return NULL;
186n/a }
187n/a else {
188n/a Py_INCREF(default_value);
189n/a return default_value;
190n/a }
191n/a }
192n/a return PyLong_FromLong(rc);
193n/a}
194n/a
195n/a/*[clinic input]
196n/aunicodedata.UCD.numeric
197n/a
198n/a self: self
199n/a chr: int(accept={str})
200n/a default: object=NULL
201n/a /
202n/a
203n/aConverts a Unicode character into its equivalent numeric value.
204n/a
205n/aReturns the numeric value assigned to the character chr as float.
206n/aIf no such value is defined, default is returned, or, if not given,
207n/aValueError is raised.
208n/a[clinic start generated code]*/
209n/a
210n/astatic PyObject *
211n/aunicodedata_UCD_numeric_impl(PyObject *self, int chr,
212n/a PyObject *default_value)
213n/a/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
214n/a{
215n/a int have_old = 0;
216n/a double rc;
217n/a Py_UCS4 c = (Py_UCS4)chr;
218n/a
219n/a if (self && UCD_Check(self)) {
220n/a const change_record *old = get_old_record(self, c);
221n/a if (old->category_changed == 0) {
222n/a /* unassigned */
223n/a have_old = 1;
224n/a rc = -1.0;
225n/a }
226n/a else if (old->decimal_changed != 0xFF) {
227n/a have_old = 1;
228n/a rc = old->decimal_changed;
229n/a }
230n/a }
231n/a
232n/a if (!have_old)
233n/a rc = Py_UNICODE_TONUMERIC(c);
234n/a if (rc == -1.0) {
235n/a if (default_value == NULL) {
236n/a PyErr_SetString(PyExc_ValueError, "not a numeric character");
237n/a return NULL;
238n/a }
239n/a else {
240n/a Py_INCREF(default_value);
241n/a return default_value;
242n/a }
243n/a }
244n/a return PyFloat_FromDouble(rc);
245n/a}
246n/a
247n/a/*[clinic input]
248n/aunicodedata.UCD.category
249n/a
250n/a self: self
251n/a chr: int(accept={str})
252n/a /
253n/a
254n/aReturns the general category assigned to the character chr as string.
255n/a[clinic start generated code]*/
256n/a
257n/astatic PyObject *
258n/aunicodedata_UCD_category_impl(PyObject *self, int chr)
259n/a/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
260n/a{
261n/a int index;
262n/a Py_UCS4 c = (Py_UCS4)chr;
263n/a index = (int) _getrecord_ex(c)->category;
264n/a if (self && UCD_Check(self)) {
265n/a const change_record *old = get_old_record(self, c);
266n/a if (old->category_changed != 0xFF)
267n/a index = old->category_changed;
268n/a }
269n/a return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
270n/a}
271n/a
272n/a/*[clinic input]
273n/aunicodedata.UCD.bidirectional
274n/a
275n/a self: self
276n/a chr: int(accept={str})
277n/a /
278n/a
279n/aReturns the bidirectional class assigned to the character chr as string.
280n/a
281n/aIf no such value is defined, an empty string is returned.
282n/a[clinic start generated code]*/
283n/a
284n/astatic PyObject *
285n/aunicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
286n/a/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
287n/a{
288n/a int index;
289n/a Py_UCS4 c = (Py_UCS4)chr;
290n/a index = (int) _getrecord_ex(c)->bidirectional;
291n/a if (self && UCD_Check(self)) {
292n/a const change_record *old = get_old_record(self, c);
293n/a if (old->category_changed == 0)
294n/a index = 0; /* unassigned */
295n/a else if (old->bidir_changed != 0xFF)
296n/a index = old->bidir_changed;
297n/a }
298n/a return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
299n/a}
300n/a
301n/a/*[clinic input]
302n/aunicodedata.UCD.combining -> int
303n/a
304n/a self: self
305n/a chr: int(accept={str})
306n/a /
307n/a
308n/aReturns the canonical combining class assigned to the character chr as integer.
309n/a
310n/aReturns 0 if no combining class is defined.
311n/a[clinic start generated code]*/
312n/a
313n/astatic int
314n/aunicodedata_UCD_combining_impl(PyObject *self, int chr)
315n/a/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
316n/a{
317n/a int index;
318n/a Py_UCS4 c = (Py_UCS4)chr;
319n/a index = (int) _getrecord_ex(c)->combining;
320n/a if (self && UCD_Check(self)) {
321n/a const change_record *old = get_old_record(self, c);
322n/a if (old->category_changed == 0)
323n/a index = 0; /* unassigned */
324n/a }
325n/a return index;
326n/a}
327n/a
328n/a/*[clinic input]
329n/aunicodedata.UCD.mirrored -> int
330n/a
331n/a self: self
332n/a chr: int(accept={str})
333n/a /
334n/a
335n/aReturns the mirrored property assigned to the character chr as integer.
336n/a
337n/aReturns 1 if the character has been identified as a "mirrored"
338n/acharacter in bidirectional text, 0 otherwise.
339n/a[clinic start generated code]*/
340n/a
341n/astatic int
342n/aunicodedata_UCD_mirrored_impl(PyObject *self, int chr)
343n/a/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
344n/a{
345n/a int index;
346n/a Py_UCS4 c = (Py_UCS4)chr;
347n/a index = (int) _getrecord_ex(c)->mirrored;
348n/a if (self && UCD_Check(self)) {
349n/a const change_record *old = get_old_record(self, c);
350n/a if (old->category_changed == 0)
351n/a index = 0; /* unassigned */
352n/a else if (old->mirrored_changed != 0xFF)
353n/a index = old->mirrored_changed;
354n/a }
355n/a return index;
356n/a}
357n/a
358n/a/*[clinic input]
359n/aunicodedata.UCD.east_asian_width
360n/a
361n/a self: self
362n/a chr: int(accept={str})
363n/a /
364n/a
365n/aReturns the east asian width assigned to the character chr as string.
366n/a[clinic start generated code]*/
367n/a
368n/astatic PyObject *
369n/aunicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
370n/a/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
371n/a{
372n/a int index;
373n/a Py_UCS4 c = (Py_UCS4)chr;
374n/a index = (int) _getrecord_ex(c)->east_asian_width;
375n/a if (self && UCD_Check(self)) {
376n/a const change_record *old = get_old_record(self, c);
377n/a if (old->category_changed == 0)
378n/a index = 0; /* unassigned */
379n/a else if (old->east_asian_width_changed != 0xFF)
380n/a index = old->east_asian_width_changed;
381n/a }
382n/a return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
383n/a}
384n/a
385n/a/*[clinic input]
386n/aunicodedata.UCD.decomposition
387n/a
388n/a self: self
389n/a chr: int(accept={str})
390n/a /
391n/a
392n/aReturns the character decomposition mapping assigned to the character chr as string.
393n/a
394n/aAn empty string is returned in case no such mapping is defined.
395n/a[clinic start generated code]*/
396n/a
397n/astatic PyObject *
398n/aunicodedata_UCD_decomposition_impl(PyObject *self, int chr)
399n/a/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
400n/a{
401n/a char decomp[256];
402n/a int code, index, count;
403n/a size_t i;
404n/a unsigned int prefix_index;
405n/a Py_UCS4 c = (Py_UCS4)chr;
406n/a
407n/a code = (int)c;
408n/a
409n/a if (self && UCD_Check(self)) {
410n/a const change_record *old = get_old_record(self, c);
411n/a if (old->category_changed == 0)
412n/a return PyUnicode_FromString(""); /* unassigned */
413n/a }
414n/a
415n/a if (code < 0 || code >= 0x110000)
416n/a index = 0;
417n/a else {
418n/a index = decomp_index1[(code>>DECOMP_SHIFT)];
419n/a index = decomp_index2[(index<<DECOMP_SHIFT)+
420n/a (code&((1<<DECOMP_SHIFT)-1))];
421n/a }
422n/a
423n/a /* high byte is number of hex bytes (usually one or two), low byte
424n/a is prefix code (from*/
425n/a count = decomp_data[index] >> 8;
426n/a
427n/a /* XXX: could allocate the PyString up front instead
428n/a (strlen(prefix) + 5 * count + 1 bytes) */
429n/a
430n/a /* Based on how index is calculated above and decomp_data is generated
431n/a from Tools/unicode/makeunicodedata.py, it should not be possible
432n/a to overflow decomp_prefix. */
433n/a prefix_index = decomp_data[index] & 255;
434n/a assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
435n/a
436n/a /* copy prefix */
437n/a i = strlen(decomp_prefix[prefix_index]);
438n/a memcpy(decomp, decomp_prefix[prefix_index], i);
439n/a
440n/a while (count-- > 0) {
441n/a if (i)
442n/a decomp[i++] = ' ';
443n/a assert(i < sizeof(decomp));
444n/a PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
445n/a decomp_data[++index]);
446n/a i += strlen(decomp + i);
447n/a }
448n/a return PyUnicode_FromStringAndSize(decomp, i);
449n/a}
450n/a
451n/astatic void
452n/aget_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
453n/a{
454n/a if (code >= 0x110000) {
455n/a *index = 0;
456n/a } else if (self && UCD_Check(self) &&
457n/a get_old_record(self, code)->category_changed==0) {
458n/a /* unassigned in old version */
459n/a *index = 0;
460n/a }
461n/a else {
462n/a *index = decomp_index1[(code>>DECOMP_SHIFT)];
463n/a *index = decomp_index2[(*index<<DECOMP_SHIFT)+
464n/a (code&((1<<DECOMP_SHIFT)-1))];
465n/a }
466n/a
467n/a /* high byte is number of hex bytes (usually one or two), low byte
468n/a is prefix code (from*/
469n/a *count = decomp_data[*index] >> 8;
470n/a *prefix = decomp_data[*index] & 255;
471n/a
472n/a (*index)++;
473n/a}
474n/a
/* Constants for the arithmetic Hangul syllable decomposition and
   composition performed by nfd_nfkd() and nfc_nfkc(). */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount * TCount)
#define SCount  (LCount * NCount)
484n/a
485n/astatic PyObject*
486n/anfd_nfkd(PyObject *self, PyObject *input, int k)
487n/a{
488n/a PyObject *result;
489n/a Py_UCS4 *output;
490n/a Py_ssize_t i, o, osize;
491n/a int kind;
492n/a void *data;
493n/a /* Longest decomposition in Unicode 3.2: U+FDFA */
494n/a Py_UCS4 stack[20];
495n/a Py_ssize_t space, isize;
496n/a int index, prefix, count, stackptr;
497n/a unsigned char prev, cur;
498n/a
499n/a stackptr = 0;
500n/a isize = PyUnicode_GET_LENGTH(input);
501n/a space = isize;
502n/a /* Overallocate at most 10 characters. */
503n/a if (space > 10) {
504n/a if (space <= PY_SSIZE_T_MAX - 10)
505n/a space += 10;
506n/a }
507n/a else {
508n/a space *= 2;
509n/a }
510n/a osize = space;
511n/a output = PyMem_NEW(Py_UCS4, space);
512n/a if (!output) {
513n/a PyErr_NoMemory();
514n/a return NULL;
515n/a }
516n/a i = o = 0;
517n/a kind = PyUnicode_KIND(input);
518n/a data = PyUnicode_DATA(input);
519n/a
520n/a while (i < isize) {
521n/a stack[stackptr++] = PyUnicode_READ(kind, data, i++);
522n/a while(stackptr) {
523n/a Py_UCS4 code = stack[--stackptr];
524n/a /* Hangul Decomposition adds three characters in
525n/a a single step, so we need at least that much room. */
526n/a if (space < 3) {
527n/a Py_UCS4 *new_output;
528n/a osize += 10;
529n/a space += 10;
530n/a new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
531n/a if (new_output == NULL) {
532n/a PyMem_Free(output);
533n/a PyErr_NoMemory();
534n/a return NULL;
535n/a }
536n/a output = new_output;
537n/a }
538n/a /* Hangul Decomposition. */
539n/a if (SBase <= code && code < (SBase+SCount)) {
540n/a int SIndex = code - SBase;
541n/a int L = LBase + SIndex / NCount;
542n/a int V = VBase + (SIndex % NCount) / TCount;
543n/a int T = TBase + SIndex % TCount;
544n/a output[o++] = L;
545n/a output[o++] = V;
546n/a space -= 2;
547n/a if (T != TBase) {
548n/a output[o++] = T;
549n/a space --;
550n/a }
551n/a continue;
552n/a }
553n/a /* normalization changes */
554n/a if (self && UCD_Check(self)) {
555n/a Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
556n/a if (value != 0) {
557n/a stack[stackptr++] = value;
558n/a continue;
559n/a }
560n/a }
561n/a
562n/a /* Other decompositions. */
563n/a get_decomp_record(self, code, &index, &prefix, &count);
564n/a
565n/a /* Copy character if it is not decomposable, or has a
566n/a compatibility decomposition, but we do NFD. */
567n/a if (!count || (prefix && !k)) {
568n/a output[o++] = code;
569n/a space--;
570n/a continue;
571n/a }
572n/a /* Copy decomposition onto the stack, in reverse
573n/a order. */
574n/a while(count) {
575n/a code = decomp_data[index + (--count)];
576n/a stack[stackptr++] = code;
577n/a }
578n/a }
579n/a }
580n/a
581n/a result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
582n/a output, o);
583n/a PyMem_Free(output);
584n/a if (!result)
585n/a return NULL;
586n/a /* result is guaranteed to be ready, as it is compact. */
587n/a kind = PyUnicode_KIND(result);
588n/a data = PyUnicode_DATA(result);
589n/a
590n/a /* Sort canonically. */
591n/a i = 0;
592n/a prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
593n/a for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
594n/a cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
595n/a if (prev == 0 || cur == 0 || prev <= cur) {
596n/a prev = cur;
597n/a continue;
598n/a }
599n/a /* Non-canonical order. Need to switch *i with previous. */
600n/a o = i - 1;
601n/a while (1) {
602n/a Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
603n/a PyUnicode_WRITE(kind, data, o+1,
604n/a PyUnicode_READ(kind, data, o));
605n/a PyUnicode_WRITE(kind, data, o, tmp);
606n/a o--;
607n/a if (o < 0)
608n/a break;
609n/a prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
610n/a if (prev == 0 || prev <= cur)
611n/a break;
612n/a }
613n/a prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
614n/a }
615n/a return result;
616n/a}
617n/a
618n/astatic int
619n/afind_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
620n/a{
621n/a unsigned int index;
622n/a for (index = 0; nfc[index].start; index++) {
623n/a unsigned int start = nfc[index].start;
624n/a if (code < start)
625n/a return -1;
626n/a if (code <= start + nfc[index].count) {
627n/a unsigned int delta = code - start;
628n/a return nfc[index].index + delta;
629n/a }
630n/a }
631n/a return -1;
632n/a}
633n/a
634n/astatic PyObject*
635n/anfc_nfkc(PyObject *self, PyObject *input, int k)
636n/a{
637n/a PyObject *result;
638n/a int kind;
639n/a void *data;
640n/a Py_UCS4 *output;
641n/a Py_ssize_t i, i1, o, len;
642n/a int f,l,index,index1,comb;
643n/a Py_UCS4 code;
644n/a Py_ssize_t skipped[20];
645n/a int cskipped = 0;
646n/a
647n/a result = nfd_nfkd(self, input, k);
648n/a if (!result)
649n/a return NULL;
650n/a /* result will be "ready". */
651n/a kind = PyUnicode_KIND(result);
652n/a data = PyUnicode_DATA(result);
653n/a len = PyUnicode_GET_LENGTH(result);
654n/a
655n/a /* We allocate a buffer for the output.
656n/a If we find that we made no changes, we still return
657n/a the NFD result. */
658n/a output = PyMem_NEW(Py_UCS4, len);
659n/a if (!output) {
660n/a PyErr_NoMemory();
661n/a Py_DECREF(result);
662n/a return 0;
663n/a }
664n/a i = o = 0;
665n/a
666n/a again:
667n/a while (i < len) {
668n/a for (index = 0; index < cskipped; index++) {
669n/a if (skipped[index] == i) {
670n/a /* *i character is skipped.
671n/a Remove from list. */
672n/a skipped[index] = skipped[cskipped-1];
673n/a cskipped--;
674n/a i++;
675n/a goto again; /* continue while */
676n/a }
677n/a }
678n/a /* Hangul Composition. We don't need to check for <LV,T>
679n/a pairs, since we always have decomposed data. */
680n/a code = PyUnicode_READ(kind, data, i);
681n/a if (LBase <= code && code < (LBase+LCount) &&
682n/a i + 1 < len &&
683n/a VBase <= PyUnicode_READ(kind, data, i+1) &&
684n/a PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
685n/a int LIndex, VIndex;
686n/a LIndex = code - LBase;
687n/a VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
688n/a code = SBase + (LIndex*VCount+VIndex)*TCount;
689n/a i+=2;
690n/a if (i < len &&
691n/a TBase <= PyUnicode_READ(kind, data, i) &&
692n/a PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
693n/a code += PyUnicode_READ(kind, data, i)-TBase;
694n/a i++;
695n/a }
696n/a output[o++] = code;
697n/a continue;
698n/a }
699n/a
700n/a /* code is still input[i] here */
701n/a f = find_nfc_index(self, nfc_first, code);
702n/a if (f == -1) {
703n/a output[o++] = code;
704n/a i++;
705n/a continue;
706n/a }
707n/a /* Find next unblocked character. */
708n/a i1 = i+1;
709n/a comb = 0;
710n/a /* output base character for now; might be updated later. */
711n/a output[o] = PyUnicode_READ(kind, data, i);
712n/a while (i1 < len) {
713n/a Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
714n/a int comb1 = _getrecord_ex(code1)->combining;
715n/a if (comb) {
716n/a if (comb1 == 0)
717n/a break;
718n/a if (comb >= comb1) {
719n/a /* Character is blocked. */
720n/a i1++;
721n/a continue;
722n/a }
723n/a }
724n/a l = find_nfc_index(self, nfc_last, code1);
725n/a /* i1 cannot be combined with i. If i1
726n/a is a starter, we don't need to look further.
727n/a Otherwise, record the combining class. */
728n/a if (l == -1) {
729n/a not_combinable:
730n/a if (comb1 == 0)
731n/a break;
732n/a comb = comb1;
733n/a i1++;
734n/a continue;
735n/a }
736n/a index = f*TOTAL_LAST + l;
737n/a index1 = comp_index[index >> COMP_SHIFT];
738n/a code = comp_data[(index1<<COMP_SHIFT)+
739n/a (index&((1<<COMP_SHIFT)-1))];
740n/a if (code == 0)
741n/a goto not_combinable;
742n/a
743n/a /* Replace the original character. */
744n/a output[o] = code;
745n/a /* Mark the second character unused. */
746n/a assert(cskipped < 20);
747n/a skipped[cskipped++] = i1;
748n/a i1++;
749n/a f = find_nfc_index(self, nfc_first, output[o]);
750n/a if (f == -1)
751n/a break;
752n/a }
753n/a /* Output character was already written.
754n/a Just advance the indices. */
755n/a o++; i++;
756n/a }
757n/a if (o == len) {
758n/a /* No changes. Return original string. */
759n/a PyMem_Free(output);
760n/a return result;
761n/a }
762n/a Py_DECREF(result);
763n/a result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
764n/a output, o);
765n/a PyMem_Free(output);
766n/a return result;
767n/a}
768n/a
769n/a/* Return 1 if the input is certainly normalized, 0 if it might not be. */
770n/astatic int
771n/ais_normalized(PyObject *self, PyObject *input, int nfc, int k)
772n/a{
773n/a Py_ssize_t i, len;
774n/a int kind;
775n/a void *data;
776n/a unsigned char prev_combining = 0, quickcheck_mask;
777n/a
778n/a /* An older version of the database is requested, quickchecks must be
779n/a disabled. */
780n/a if (self && UCD_Check(self))
781n/a return 0;
782n/a
783n/a /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
784n/a as described in http://unicode.org/reports/tr15/#Annex8. */
785n/a quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
786n/a
787n/a i = 0;
788n/a kind = PyUnicode_KIND(input);
789n/a data = PyUnicode_DATA(input);
790n/a len = PyUnicode_GET_LENGTH(input);
791n/a while (i < len) {
792n/a Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
793n/a const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
794n/a unsigned char combining = record->combining;
795n/a unsigned char quickcheck = record->normalization_quick_check;
796n/a
797n/a if (quickcheck & quickcheck_mask)
798n/a return 0; /* this string might need normalization */
799n/a if (combining && prev_combining > combining)
800n/a return 0; /* non-canonical sort order, not normalized */
801n/a prev_combining = combining;
802n/a }
803n/a return 1; /* certainly normalized */
804n/a}
805n/a
806n/a/*[clinic input]
807n/aunicodedata.UCD.normalize
808n/a
809n/a self: self
810n/a form: str
811n/a unistr as input: unicode
812n/a /
813n/a
814n/aReturn the normal form 'form' for the Unicode string unistr.
815n/a
816n/aValid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
817n/a[clinic start generated code]*/
818n/a
819n/astatic PyObject *
820n/aunicodedata_UCD_normalize_impl(PyObject *self, const char *form,
821n/a PyObject *input)
822n/a/*[clinic end generated code: output=62d1f8870027efdc input=1744c55f4ab79bf0]*/
823n/a{
824n/a if (PyUnicode_GET_LENGTH(input) == 0) {
825n/a /* Special case empty input strings, since resizing
826n/a them later would cause internal errors. */
827n/a Py_INCREF(input);
828n/a return input;
829n/a }
830n/a
831n/a if (strcmp(form, "NFC") == 0) {
832n/a if (is_normalized(self, input, 1, 0)) {
833n/a Py_INCREF(input);
834n/a return input;
835n/a }
836n/a return nfc_nfkc(self, input, 0);
837n/a }
838n/a if (strcmp(form, "NFKC") == 0) {
839n/a if (is_normalized(self, input, 1, 1)) {
840n/a Py_INCREF(input);
841n/a return input;
842n/a }
843n/a return nfc_nfkc(self, input, 1);
844n/a }
845n/a if (strcmp(form, "NFD") == 0) {
846n/a if (is_normalized(self, input, 0, 0)) {
847n/a Py_INCREF(input);
848n/a return input;
849n/a }
850n/a return nfd_nfkd(self, input, 0);
851n/a }
852n/a if (strcmp(form, "NFKD") == 0) {
853n/a if (is_normalized(self, input, 0, 1)) {
854n/a Py_INCREF(input);
855n/a return input;
856n/a }
857n/a return nfd_nfkd(self, input, 1);
858n/a }
859n/a PyErr_SetString(PyExc_ValueError, "invalid normalization form");
860n/a return NULL;
861n/a}
862n/a
863n/a/* -------------------------------------------------------------------- */
864n/a/* unicode character name tables */
865n/a
866n/a/* data file generated by Tools/unicode/makeunicodedata.py */
867n/a#include "unicodename_db.h"
868n/a
869n/a/* -------------------------------------------------------------------- */
870n/a/* database code (cut and pasted from the unidb package) */
871n/a
/* Case-insensitive hash of the first `len` bytes of `s`, using `scale`
   as the multiplier.  Whenever bits 24..31 become set they are folded
   back into the low byte and the value is cut to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos;

    for (pos = 0; pos < len; pos++) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        high = hash & 0xff000000;
        if (high != 0)
            hash = (hash ^ ((high>>24) & 0xff)) & 0x00ffffff;
    }
    return hash;
}
886n/a
/* Name fragments for Hangul syllables.  Column 0 is indexed by the L
   part, column 1 by the V part and column 2 by the T part of a syllable
   (see _getucname).  Columns shorter than the table are padded with
   null pointers. */
static const char * const hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { NULL, "YI",  "S"  },
    { NULL, "I",   "SS" },
    { NULL, NULL,  "NG" },
    { NULL, NULL,  "J"  },
    { NULL, NULL,  "C"  },
    { NULL, NULL,  "K"  },
    { NULL, NULL,  "T"  },
    { NULL, NULL,  "P"  },
    { NULL, NULL,  "H"  }
};
917n/a
918n/a/* These ranges need to match makeunicodedata.py:cjk_ranges. */
919n/astatic int
920n/ais_unified_ideograph(Py_UCS4 code)
921n/a{
922n/a return
923n/a (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
924n/a (0x4E00 <= code && code <= 0x9FD5) || /* CJK Ideograph */
925n/a (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
926n/a (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
927n/a (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
928n/a (0x2B820 <= code && code <= 0x2CEA1); /* CJK Ideograph Extension E */
929n/a}
930n/a
/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.  Both ranges are
 * half-open: [start, end).  The argument is parenthesized so that an
 * expression can be passed safely; note that it is evaluated twice. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
936n/a
937n/astatic int
938n/a_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
939n/a int with_alias_and_seq)
940n/a{
941n/a /* Find the name associated with the given code point.
942n/a * If with_alias_and_seq is 1, check for names in the Private Use Area 15
943n/a * that we are using for aliases and named sequences. */
944n/a int offset;
945n/a int i;
946n/a int word;
947n/a unsigned char* w;
948n/a
949n/a if (code >= 0x110000)
950n/a return 0;
951n/a
952n/a /* XXX should we just skip all the code points in the PUAs here? */
953n/a if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
954n/a return 0;
955n/a
956n/a if (self && UCD_Check(self)) {
957n/a /* in 3.2.0 there are no aliases and named sequences */
958n/a const change_record *old;
959n/a if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
960n/a return 0;
961n/a old = get_old_record(self, code);
962n/a if (old->category_changed == 0) {
963n/a /* unassigned */
964n/a return 0;
965n/a }
966n/a }
967n/a
968n/a if (SBase <= code && code < SBase+SCount) {
969n/a /* Hangul syllable. */
970n/a int SIndex = code - SBase;
971n/a int L = SIndex / NCount;
972n/a int V = (SIndex % NCount) / TCount;
973n/a int T = SIndex % TCount;
974n/a
975n/a if (buflen < 27)
976n/a /* Worst case: HANGUL SYLLABLE <10chars>. */
977n/a return 0;
978n/a strcpy(buffer, "HANGUL SYLLABLE ");
979n/a buffer += 16;
980n/a strcpy(buffer, hangul_syllables[L][0]);
981n/a buffer += strlen(hangul_syllables[L][0]);
982n/a strcpy(buffer, hangul_syllables[V][1]);
983n/a buffer += strlen(hangul_syllables[V][1]);
984n/a strcpy(buffer, hangul_syllables[T][2]);
985n/a buffer += strlen(hangul_syllables[T][2]);
986n/a *buffer = '\0';
987n/a return 1;
988n/a }
989n/a
990n/a if (is_unified_ideograph(code)) {
991n/a if (buflen < 28)
992n/a /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
993n/a return 0;
994n/a sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
995n/a return 1;
996n/a }
997n/a
998n/a /* get offset into phrasebook */
999n/a offset = phrasebook_offset1[(code>>phrasebook_shift)];
1000n/a offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1001n/a (code&((1<<phrasebook_shift)-1))];
1002n/a if (!offset)
1003n/a return 0;
1004n/a
1005n/a i = 0;
1006n/a
1007n/a for (;;) {
1008n/a /* get word index */
1009n/a word = phrasebook[offset] - phrasebook_short;
1010n/a if (word >= 0) {
1011n/a word = (word << 8) + phrasebook[offset+1];
1012n/a offset += 2;
1013n/a } else
1014n/a word = phrasebook[offset++];
1015n/a if (i) {
1016n/a if (i > buflen)
1017n/a return 0; /* buffer overflow */
1018n/a buffer[i++] = ' ';
1019n/a }
1020n/a /* copy word string from lexicon. the last character in the
1021n/a word has bit 7 set. the last word in a string ends with
1022n/a 0x80 */
1023n/a w = lexicon + lexicon_offset[word];
1024n/a while (*w < 128) {
1025n/a if (i >= buflen)
1026n/a return 0; /* buffer overflow */
1027n/a buffer[i++] = *w++;
1028n/a }
1029n/a if (i >= buflen)
1030n/a return 0; /* buffer overflow */
1031n/a buffer[i++] = *w & 127;
1032n/a if (*w == 128)
1033n/a break; /* end of word */
1034n/a }
1035n/a
1036n/a return 1;
1037n/a}
1038n/a
1039n/astatic int
1040n/a_cmpname(PyObject *self, int code, const char* name, int namelen)
1041n/a{
1042n/a /* check if code corresponds to the given name */
1043n/a int i;
1044n/a char buffer[NAME_MAXLEN+1];
1045n/a if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
1046n/a return 0;
1047n/a for (i = 0; i < namelen; i++) {
1048n/a if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
1049n/a return 0;
1050n/a }
1051n/a return buffer[namelen] == '\0';
1052n/a}
1053n/a
1054n/astatic void
1055n/afind_syllable(const char *str, int *len, int *pos, int count, int column)
1056n/a{
1057n/a int i, len1;
1058n/a *len = -1;
1059n/a for (i = 0; i < count; i++) {
1060n/a const char *s = hangul_syllables[i][column];
1061n/a len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
1062n/a if (len1 <= *len)
1063n/a continue;
1064n/a if (strncmp(str, s, len1) == 0) {
1065n/a *len = len1;
1066n/a *pos = i;
1067n/a }
1068n/a }
1069n/a if (*len == -1) {
1070n/a *len = 0;
1071n/a }
1072n/a}
1073n/a
1074n/astatic int
1075n/a_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
1076n/a{
1077n/a /* check if named sequences are allowed */
1078n/a if (!with_named_seq && IS_NAMED_SEQ(cp))
1079n/a return 0;
1080n/a /* if the code point is in the PUA range that we use for aliases,
1081n/a * convert it to obtain the right code point */
1082n/a if (IS_ALIAS(cp))
1083n/a *code = name_aliases[cp-aliases_start];
1084n/a else
1085n/a *code = cp;
1086n/a return 1;
1087n/a}
1088n/a
1089n/astatic int
1090n/a_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1091n/a int with_named_seq)
1092n/a{
1093n/a /* Return the code point associated with the given name.
1094n/a * Named aliases are resolved too (unless self != NULL (i.e. we are using
1095n/a * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
1096n/a * using for the named sequence, and the caller must then convert it. */
1097n/a unsigned int h, v;
1098n/a unsigned int mask = code_size-1;
1099n/a unsigned int i, incr;
1100n/a
1101n/a /* Check for hangul syllables. */
1102n/a if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1103n/a int len, L = -1, V = -1, T = -1;
1104n/a const char *pos = name + 16;
1105n/a find_syllable(pos, &len, &L, LCount, 0);
1106n/a pos += len;
1107n/a find_syllable(pos, &len, &V, VCount, 1);
1108n/a pos += len;
1109n/a find_syllable(pos, &len, &T, TCount, 2);
1110n/a pos += len;
1111n/a if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1112n/a *code = SBase + (L*VCount+V)*TCount + T;
1113n/a return 1;
1114n/a }
1115n/a /* Otherwise, it's an illegal syllable name. */
1116n/a return 0;
1117n/a }
1118n/a
1119n/a /* Check for unified ideographs. */
1120n/a if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1121n/a /* Four or five hexdigits must follow. */
1122n/a v = 0;
1123n/a name += 22;
1124n/a namelen -= 22;
1125n/a if (namelen != 4 && namelen != 5)
1126n/a return 0;
1127n/a while (namelen--) {
1128n/a v *= 16;
1129n/a if (*name >= '0' && *name <= '9')
1130n/a v += *name - '0';
1131n/a else if (*name >= 'A' && *name <= 'F')
1132n/a v += *name - 'A' + 10;
1133n/a else
1134n/a return 0;
1135n/a name++;
1136n/a }
1137n/a if (!is_unified_ideograph(v))
1138n/a return 0;
1139n/a *code = v;
1140n/a return 1;
1141n/a }
1142n/a
1143n/a /* the following is the same as python's dictionary lookup, with
1144n/a only minor changes. see the makeunicodedata script for more
1145n/a details */
1146n/a
1147n/a h = (unsigned int) _gethash(name, namelen, code_magic);
1148n/a i = (~h) & mask;
1149n/a v = code_hash[i];
1150n/a if (!v)
1151n/a return 0;
1152n/a if (_cmpname(self, v, name, namelen))
1153n/a return _check_alias_and_seq(v, code, with_named_seq);
1154n/a incr = (h ^ (h >> 3)) & mask;
1155n/a if (!incr)
1156n/a incr = mask;
1157n/a for (;;) {
1158n/a i = (i + incr) & mask;
1159n/a v = code_hash[i];
1160n/a if (!v)
1161n/a return 0;
1162n/a if (_cmpname(self, v, name, namelen))
1163n/a return _check_alias_and_seq(v, code, with_named_seq);
1164n/a incr = incr << 1;
1165n/a if (incr > mask)
1166n/a incr = incr ^ code_poly;
1167n/a }
1168n/a}
1169n/a
1170n/astatic const _PyUnicode_Name_CAPI hashAPI =
1171n/a{
1172n/a sizeof(_PyUnicode_Name_CAPI),
1173n/a _getucname,
1174n/a _getcode
1175n/a};
1176n/a
1177n/a/* -------------------------------------------------------------------- */
1178n/a/* Python bindings */
1179n/a
1180n/a/*[clinic input]
1181n/aunicodedata.UCD.name
1182n/a
1183n/a self: self
1184n/a chr: int(accept={str})
1185n/a default: object=NULL
1186n/a /
1187n/a
1188n/aReturns the name assigned to the character chr as a string.
1189n/a
1190n/aIf no name is defined, default is returned, or, if not given,
1191n/aValueError is raised.
1192n/a[clinic start generated code]*/
1193n/a
1194n/astatic PyObject *
1195n/aunicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
1196n/a/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
1197n/a{
1198n/a char name[NAME_MAXLEN+1];
1199n/a Py_UCS4 c = (Py_UCS4)chr;
1200n/a
1201n/a if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
1202n/a if (default_value == NULL) {
1203n/a PyErr_SetString(PyExc_ValueError, "no such name");
1204n/a return NULL;
1205n/a }
1206n/a else {
1207n/a Py_INCREF(default_value);
1208n/a return default_value;
1209n/a }
1210n/a }
1211n/a
1212n/a return PyUnicode_FromString(name);
1213n/a}
1214n/a
1215n/a/*[clinic input]
1216n/aunicodedata.UCD.lookup
1217n/a
1218n/a self: self
1219n/a name: str(accept={str, robuffer}, zeroes=True)
1220n/a /
1221n/a
1222n/aLook up character by name.
1223n/a
1224n/aIf a character with the given name is found, return the
1225n/acorresponding character. If not found, KeyError is raised.
1226n/a[clinic start generated code]*/
1227n/a
1228n/astatic PyObject *
1229n/aunicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1230n/a Py_ssize_clean_t name_length)
1231n/a/*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
1232n/a{
1233n/a Py_UCS4 code;
1234n/a unsigned int index;
1235n/a if (name_length > NAME_MAXLEN) {
1236n/a PyErr_SetString(PyExc_KeyError, "name too long");
1237n/a return NULL;
1238n/a }
1239n/a
1240n/a if (!_getcode(self, name, (int)name_length, &code, 1)) {
1241n/a PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1242n/a return NULL;
1243n/a }
1244n/a /* check if code is in the PUA range that we use for named sequences
1245n/a and convert it */
1246n/a if (IS_NAMED_SEQ(code)) {
1247n/a index = code-named_sequences_start;
1248n/a return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1249n/a named_sequences[index].seq,
1250n/a named_sequences[index].seqlen);
1251n/a }
1252n/a return PyUnicode_FromOrdinal(code);
1253n/a}
1254n/a
1255n/a/* XXX Add doc strings. */
1256n/a
1257n/astatic PyMethodDef unicodedata_functions[] = {
1258n/a UNICODEDATA_UCD_DECIMAL_METHODDEF
1259n/a UNICODEDATA_UCD_DIGIT_METHODDEF
1260n/a UNICODEDATA_UCD_NUMERIC_METHODDEF
1261n/a UNICODEDATA_UCD_CATEGORY_METHODDEF
1262n/a UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1263n/a UNICODEDATA_UCD_COMBINING_METHODDEF
1264n/a UNICODEDATA_UCD_MIRRORED_METHODDEF
1265n/a UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1266n/a UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1267n/a UNICODEDATA_UCD_NAME_METHODDEF
1268n/a UNICODEDATA_UCD_LOOKUP_METHODDEF
1269n/a UNICODEDATA_UCD_NORMALIZE_METHODDEF
1270n/a {NULL, NULL} /* sentinel */
1271n/a};
1272n/a
1273n/astatic PyTypeObject UCD_Type = {
1274n/a /* The ob_type field must be initialized in the module init function
1275n/a * to be portable to Windows without using C++. */
1276n/a PyVarObject_HEAD_INIT(NULL, 0)
1277n/a "unicodedata.UCD", /*tp_name*/
1278n/a sizeof(PreviousDBVersion), /*tp_basicsize*/
1279n/a 0, /*tp_itemsize*/
1280n/a /* methods */
1281n/a (destructor)PyObject_Del, /*tp_dealloc*/
1282n/a 0, /*tp_print*/
1283n/a 0, /*tp_getattr*/
1284n/a 0, /*tp_setattr*/
1285n/a 0, /*tp_reserved*/
1286n/a 0, /*tp_repr*/
1287n/a 0, /*tp_as_number*/
1288n/a 0, /*tp_as_sequence*/
1289n/a 0, /*tp_as_mapping*/
1290n/a 0, /*tp_hash*/
1291n/a 0, /*tp_call*/
1292n/a 0, /*tp_str*/
1293n/a PyObject_GenericGetAttr,/*tp_getattro*/
1294n/a 0, /*tp_setattro*/
1295n/a 0, /*tp_as_buffer*/
1296n/a Py_TPFLAGS_DEFAULT, /*tp_flags*/
1297n/a 0, /*tp_doc*/
1298n/a 0, /*tp_traverse*/
1299n/a 0, /*tp_clear*/
1300n/a 0, /*tp_richcompare*/
1301n/a 0, /*tp_weaklistoffset*/
1302n/a 0, /*tp_iter*/
1303n/a 0, /*tp_iternext*/
1304n/a unicodedata_functions, /*tp_methods*/
1305n/a DB_members, /*tp_members*/
1306n/a 0, /*tp_getset*/
1307n/a 0, /*tp_base*/
1308n/a 0, /*tp_dict*/
1309n/a 0, /*tp_descr_get*/
1310n/a 0, /*tp_descr_set*/
1311n/a 0, /*tp_dictoffset*/
1312n/a 0, /*tp_init*/
1313n/a 0, /*tp_alloc*/
1314n/a 0, /*tp_new*/
1315n/a 0, /*tp_free*/
1316n/a 0, /*tp_is_gc*/
1317n/a};
1318n/a
1319n/aPyDoc_STRVAR(unicodedata_docstring,
1320n/a"This module provides access to the Unicode Character Database which\n\
1321n/adefines character properties for all Unicode characters. The data in\n\
1322n/athis database is based on the UnicodeData.txt file version\n\
1323n/a" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\
1324n/a\n\
1325n/aThe module uses the same names and symbols as defined by the\n\
1326n/aUnicodeData File Format " UNIDATA_VERSION ".");
1327n/a
1328n/astatic struct PyModuleDef unicodedatamodule = {
1329n/a PyModuleDef_HEAD_INIT,
1330n/a "unicodedata",
1331n/a unicodedata_docstring,
1332n/a -1,
1333n/a unicodedata_functions,
1334n/a NULL,
1335n/a NULL,
1336n/a NULL,
1337n/a NULL
1338n/a};
1339n/a
1340n/aPyMODINIT_FUNC
1341n/aPyInit_unicodedata(void)
1342n/a{
1343n/a PyObject *m, *v;
1344n/a
1345n/a Py_TYPE(&UCD_Type) = &PyType_Type;
1346n/a
1347n/a m = PyModule_Create(&unicodedatamodule);
1348n/a if (!m)
1349n/a return NULL;
1350n/a
1351n/a PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1352n/a Py_INCREF(&UCD_Type);
1353n/a PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1354n/a
1355n/a /* Previous versions */
1356n/a v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1357n/a if (v != NULL)
1358n/a PyModule_AddObject(m, "ucd_3_2_0", v);
1359n/a
1360n/a /* Export C API */
1361n/a v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
1362n/a if (v != NULL)
1363n/a PyModule_AddObject(m, "ucnhash_CAPI", v);
1364n/a return m;
1365n/a}
1366n/a
1367n/a/*
1368n/aLocal variables:
1369n/ac-basic-offset: 4
1370n/aindent-tabs-mode: nil
1371n/aEnd:
1372n/a*/