Python code coverage for Modules/unicodedata.c

#	count	content
1	n/a	/* ------------------------------------------------------------------------
2	n/a
3	n/a	unicodedata -- Provides access to the Unicode database.
4	n/a
5	n/a	Data was extracted from the UnicodeData.txt file.
6	n/a	The current version number is reported in the unidata_version constant.
7	n/a
8	n/a	Written by Marc-Andre Lemburg (mal@lemburg.com).
9	n/a	Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
10	n/a	Modified by Martin v. Löwis (martin@v.loewis.de)
11	n/a
12	n/a	Copyright (c) Corporation for National Research Initiatives.
13	n/a
14	n/a	------------------------------------------------------------------------ */
15	n/a
16	n/a	#define PY_SSIZE_T_CLEAN
17	n/a
18	n/a	#include "Python.h"
19	n/a	#include "ucnhash.h"
20	n/a	#include "structmember.h"
21	n/a
22	n/a	/*[clinic input]
23	n/a	module unicodedata
24	n/a	class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
25	n/a	[clinic start generated code]*/
26	n/a	/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
27	n/a
28	n/a	/* character properties */
29	n/a
/* Per-code-point property record.  Records are stored in the
   _PyUnicode_Database_Records table and fetched with _getrecord_ex(). */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;       /* index into
                                                   _PyUnicode_EastAsianWidthNames */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
41	n/a
/* Differences between the current database and an older version for one
   code point.  In the accessors below, a byte field equal to 0xFF is
   treated as "no change", and category_changed == 0 is treated as
   "unassigned in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    const double numeric_changed;
} change_record;
51	n/a
52	n/a	/* data file generated by Tools/unicode/makeunicodedata.py */
53	n/a	#include "unicodedata_db.h"
54	n/a
55	n/a	static const _PyUnicode_DatabaseRecord*
56	n/a	_getrecord_ex(Py_UCS4 code)
57	n/a	{
58	n/a	int index;
59	n/a	if (code >= 0x110000)
60	n/a	index = 0;
61	n/a	else {
62	n/a	index = index1[(code>>SHIFT)];
63	n/a	index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
64	n/a	}
65	n/a
66	n/a	return &_PyUnicode_Database_Records[index];
67	n/a	}
68	n/a
69	n/a	/* ------------- Previous-version API ------------------------------------- */
70	n/a	typedef struct previous_version {
71	n/a	PyObject_HEAD
72	n/a	const char *name;
73	n/a	const change_record* (*getrecord)(Py_UCS4);
74	n/a	Py_UCS4 (*normalization)(Py_UCS4);
75	n/a	} PreviousDBVersion;
76	n/a
77	n/a	#include "clinic/unicodedata.c.h"
78	n/a
79	n/a	#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
80	n/a
81	n/a	static PyMemberDef DB_members[] = {
82	n/a	{"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
83	n/a	{NULL}
84	n/a	};
85	n/a
86	n/a	/* forward declaration */
87	n/a	static PyTypeObject UCD_Type;
88	n/a	#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
89	n/a
90	n/a	static PyObject*
91	n/a	new_previous_version(const charname, const change_record (*getrecord)(Py_UCS4),
92	n/a	Py_UCS4 (*normalization)(Py_UCS4))
93	n/a	{
94	n/a	PreviousDBVersion *self;
95	n/a	self = PyObject_New(PreviousDBVersion, &UCD_Type);
96	n/a	if (self == NULL)
97	n/a	return NULL;
98	n/a	self->name = name;
99	n/a	self->getrecord = getrecord;
100	n/a	self->normalization = normalization;
101	n/a	return (PyObject*)self;
102	n/a	}
103	n/a
104	n/a
105	n/a	/* --- Module API --------------------------------------------------------- */
106	n/a
107	n/a	/*[clinic input]
108	n/a	unicodedata.UCD.decimal
109	n/a
110	n/a	self: self
111	n/a	chr: int(accept={str})
112	n/a	default: object=NULL
113	n/a	/
114	n/a
115	n/a	Converts a Unicode character into its equivalent decimal value.
116	n/a
117	n/a	Returns the decimal value assigned to the character chr as integer.
118	n/a	If no such value is defined, default is returned, or, if not given,
119	n/a	ValueError is raised.
120	n/a	[clinic start generated code]*/
121	n/a
122	n/a	static PyObject *
123	n/a	unicodedata_UCD_decimal_impl(PyObject *self, int chr,
124	n/a	PyObject *default_value)
125	n/a	/[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]/
126	n/a	{
127	n/a	int have_old = 0;
128	n/a	long rc;
129	n/a	Py_UCS4 c = (Py_UCS4)chr;
130	n/a
131	n/a	if (self && UCD_Check(self)) {
132	n/a	const change_record *old = get_old_record(self, c);
133	n/a	if (old->category_changed == 0) {
134	n/a	/* unassigned */
135	n/a	have_old = 1;
136	n/a	rc = -1;
137	n/a	}
138	n/a	else if (old->decimal_changed != 0xFF) {
139	n/a	have_old = 1;
140	n/a	rc = old->decimal_changed;
141	n/a	}
142	n/a	}
143	n/a
144	n/a	if (!have_old)
145	n/a	rc = Py_UNICODE_TODECIMAL(c);
146	n/a	if (rc < 0) {
147	n/a	if (default_value == NULL) {
148	n/a	PyErr_SetString(PyExc_ValueError,
149	n/a	"not a decimal");
150	n/a	return NULL;
151	n/a	}
152	n/a	else {
153	n/a	Py_INCREF(default_value);
154	n/a	return default_value;
155	n/a	}
156	n/a	}
157	n/a	return PyLong_FromLong(rc);
158	n/a	}
159	n/a
160	n/a	/*[clinic input]
161	n/a	unicodedata.UCD.digit
162	n/a
163	n/a	self: self
164	n/a	chr: int(accept={str})
165	n/a	default: object=NULL
166	n/a	/
167	n/a
168	n/a	Converts a Unicode character into its equivalent digit value.
169	n/a
170	n/a	Returns the digit value assigned to the character chr as integer.
171	n/a	If no such value is defined, default is returned, or, if not given,
172	n/a	ValueError is raised.
173	n/a	[clinic start generated code]*/
174	n/a
175	n/a	static PyObject *
176	n/a	unicodedata_UCD_digit_impl(PyObject self, int chr, PyObject default_value)
177	n/a	/[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]/
178	n/a	{
179	n/a	long rc;
180	n/a	Py_UCS4 c = (Py_UCS4)chr;
181	n/a	rc = Py_UNICODE_TODIGIT(c);
182	n/a	if (rc < 0) {
183	n/a	if (default_value == NULL) {
184	n/a	PyErr_SetString(PyExc_ValueError, "not a digit");
185	n/a	return NULL;
186	n/a	}
187	n/a	else {
188	n/a	Py_INCREF(default_value);
189	n/a	return default_value;
190	n/a	}
191	n/a	}
192	n/a	return PyLong_FromLong(rc);
193	n/a	}
194	n/a
195	n/a	/*[clinic input]
196	n/a	unicodedata.UCD.numeric
197	n/a
198	n/a	self: self
199	n/a	chr: int(accept={str})
200	n/a	default: object=NULL
201	n/a	/
202	n/a
203	n/a	Converts a Unicode character into its equivalent numeric value.
204	n/a
205	n/a	Returns the numeric value assigned to the character chr as float.
206	n/a	If no such value is defined, default is returned, or, if not given,
207	n/a	ValueError is raised.
208	n/a	[clinic start generated code]*/
209	n/a
210	n/a	static PyObject *
211	n/a	unicodedata_UCD_numeric_impl(PyObject *self, int chr,
212	n/a	PyObject *default_value)
213	n/a	/[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]/
214	n/a	{
215	n/a	int have_old = 0;
216	n/a	double rc;
217	n/a	Py_UCS4 c = (Py_UCS4)chr;
218	n/a
219	n/a	if (self && UCD_Check(self)) {
220	n/a	const change_record *old = get_old_record(self, c);
221	n/a	if (old->category_changed == 0) {
222	n/a	/* unassigned */
223	n/a	have_old = 1;
224	n/a	rc = -1.0;
225	n/a	}
226	n/a	else if (old->decimal_changed != 0xFF) {
227	n/a	have_old = 1;
228	n/a	rc = old->decimal_changed;
229	n/a	}
230	n/a	}
231	n/a
232	n/a	if (!have_old)
233	n/a	rc = Py_UNICODE_TONUMERIC(c);
234	n/a	if (rc == -1.0) {
235	n/a	if (default_value == NULL) {
236	n/a	PyErr_SetString(PyExc_ValueError, "not a numeric character");
237	n/a	return NULL;
238	n/a	}
239	n/a	else {
240	n/a	Py_INCREF(default_value);
241	n/a	return default_value;
242	n/a	}
243	n/a	}
244	n/a	return PyFloat_FromDouble(rc);
245	n/a	}
246	n/a
247	n/a	/*[clinic input]
248	n/a	unicodedata.UCD.category
249	n/a
250	n/a	self: self
251	n/a	chr: int(accept={str})
252	n/a	/
253	n/a
254	n/a	Returns the general category assigned to the character chr as string.
255	n/a	[clinic start generated code]*/
256	n/a
257	n/a	static PyObject *
258	n/a	unicodedata_UCD_category_impl(PyObject *self, int chr)
259	n/a	/[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]/
260	n/a	{
261	n/a	int index;
262	n/a	Py_UCS4 c = (Py_UCS4)chr;
263	n/a	index = (int) _getrecord_ex(c)->category;
264	n/a	if (self && UCD_Check(self)) {
265	n/a	const change_record *old = get_old_record(self, c);
266	n/a	if (old->category_changed != 0xFF)
267	n/a	index = old->category_changed;
268	n/a	}
269	n/a	return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
270	n/a	}
271	n/a
272	n/a	/*[clinic input]
273	n/a	unicodedata.UCD.bidirectional
274	n/a
275	n/a	self: self
276	n/a	chr: int(accept={str})
277	n/a	/
278	n/a
279	n/a	Returns the bidirectional class assigned to the character chr as string.
280	n/a
281	n/a	If no such value is defined, an empty string is returned.
282	n/a	[clinic start generated code]*/
283	n/a
284	n/a	static PyObject *
285	n/a	unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
286	n/a	/[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]/
287	n/a	{
288	n/a	int index;
289	n/a	Py_UCS4 c = (Py_UCS4)chr;
290	n/a	index = (int) _getrecord_ex(c)->bidirectional;
291	n/a	if (self && UCD_Check(self)) {
292	n/a	const change_record *old = get_old_record(self, c);
293	n/a	if (old->category_changed == 0)
294	n/a	index = 0; /* unassigned */
295	n/a	else if (old->bidir_changed != 0xFF)
296	n/a	index = old->bidir_changed;
297	n/a	}
298	n/a	return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
299	n/a	}
300	n/a
301	n/a	/*[clinic input]
302	n/a	unicodedata.UCD.combining -> int
303	n/a
304	n/a	self: self
305	n/a	chr: int(accept={str})
306	n/a	/
307	n/a
308	n/a	Returns the canonical combining class assigned to the character chr as integer.
309	n/a
310	n/a	Returns 0 if no combining class is defined.
311	n/a	[clinic start generated code]*/
312	n/a
313	n/a	static int
314	n/a	unicodedata_UCD_combining_impl(PyObject *self, int chr)
315	n/a	/[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]/
316	n/a	{
317	n/a	int index;
318	n/a	Py_UCS4 c = (Py_UCS4)chr;
319	n/a	index = (int) _getrecord_ex(c)->combining;
320	n/a	if (self && UCD_Check(self)) {
321	n/a	const change_record *old = get_old_record(self, c);
322	n/a	if (old->category_changed == 0)
323	n/a	index = 0; /* unassigned */
324	n/a	}
325	n/a	return index;
326	n/a	}
327	n/a
328	n/a	/*[clinic input]
329	n/a	unicodedata.UCD.mirrored -> int
330	n/a
331	n/a	self: self
332	n/a	chr: int(accept={str})
333	n/a	/
334	n/a
335	n/a	Returns the mirrored property assigned to the character chr as integer.
336	n/a
337	n/a	Returns 1 if the character has been identified as a "mirrored"
338	n/a	character in bidirectional text, 0 otherwise.
339	n/a	[clinic start generated code]*/
340	n/a
341	n/a	static int
342	n/a	unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
343	n/a	/[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]/
344	n/a	{
345	n/a	int index;
346	n/a	Py_UCS4 c = (Py_UCS4)chr;
347	n/a	index = (int) _getrecord_ex(c)->mirrored;
348	n/a	if (self && UCD_Check(self)) {
349	n/a	const change_record *old = get_old_record(self, c);
350	n/a	if (old->category_changed == 0)
351	n/a	index = 0; /* unassigned */
352	n/a	else if (old->mirrored_changed != 0xFF)
353	n/a	index = old->mirrored_changed;
354	n/a	}
355	n/a	return index;
356	n/a	}
357	n/a
358	n/a	/*[clinic input]
359	n/a	unicodedata.UCD.east_asian_width
360	n/a
361	n/a	self: self
362	n/a	chr: int(accept={str})
363	n/a	/
364	n/a
365	n/a	Returns the east asian width assigned to the character chr as string.
366	n/a	[clinic start generated code]*/
367	n/a
368	n/a	static PyObject *
369	n/a	unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
370	n/a	/[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]/
371	n/a	{
372	n/a	int index;
373	n/a	Py_UCS4 c = (Py_UCS4)chr;
374	n/a	index = (int) _getrecord_ex(c)->east_asian_width;
375	n/a	if (self && UCD_Check(self)) {
376	n/a	const change_record *old = get_old_record(self, c);
377	n/a	if (old->category_changed == 0)
378	n/a	index = 0; /* unassigned */
379	n/a	else if (old->east_asian_width_changed != 0xFF)
380	n/a	index = old->east_asian_width_changed;
381	n/a	}
382	n/a	return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
383	n/a	}
384	n/a
385	n/a	/*[clinic input]
386	n/a	unicodedata.UCD.decomposition
387	n/a
388	n/a	self: self
389	n/a	chr: int(accept={str})
390	n/a	/
391	n/a
392	n/a	Returns the character decomposition mapping assigned to the character chr as string.
393	n/a
394	n/a	An empty string is returned in case no such mapping is defined.
395	n/a	[clinic start generated code]*/
396	n/a
397	n/a	static PyObject *
398	n/a	unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
399	n/a	/[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]/
400	n/a	{
401	n/a	char decomp[256];
402	n/a	int code, index, count;
403	n/a	size_t i;
404	n/a	unsigned int prefix_index;
405	n/a	Py_UCS4 c = (Py_UCS4)chr;
406	n/a
407	n/a	code = (int)c;
408	n/a
409	n/a	if (self && UCD_Check(self)) {
410	n/a	const change_record *old = get_old_record(self, c);
411	n/a	if (old->category_changed == 0)
412	n/a	return PyUnicode_FromString(""); /* unassigned */
413	n/a	}
414	n/a
415	n/a	if (code < 0 \|\| code >= 0x110000)
416	n/a	index = 0;
417	n/a	else {
418	n/a	index = decomp_index1[(code>>DECOMP_SHIFT)];
419	n/a	index = decomp_index2[(index<<DECOMP_SHIFT)+
420	n/a	(code&((1<<DECOMP_SHIFT)-1))];
421	n/a	}
422	n/a
423	n/a	/* high byte is number of hex bytes (usually one or two), low byte
424	n/a	is prefix code (from*/
425	n/a	count = decomp_data[index] >> 8;
426	n/a
427	n/a	/* XXX: could allocate the PyString up front instead
428	n/a	(strlen(prefix) + 5 * count + 1 bytes) */
429	n/a
430	n/a	/* Based on how index is calculated above and decomp_data is generated
431	n/a	from Tools/unicode/makeunicodedata.py, it should not be possible
432	n/a	to overflow decomp_prefix. */
433	n/a	prefix_index = decomp_data[index] & 255;
434	n/a	assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
435	n/a
436	n/a	/* copy prefix */
437	n/a	i = strlen(decomp_prefix[prefix_index]);
438	n/a	memcpy(decomp, decomp_prefix[prefix_index], i);
439	n/a
440	n/a	while (count-- > 0) {
441	n/a	if (i)
442	n/a	decomp[i++] = ' ';
443	n/a	assert(i < sizeof(decomp));
444	n/a	PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
445	n/a	decomp_data[++index]);
446	n/a	i += strlen(decomp + i);
447	n/a	}
448	n/a	return PyUnicode_FromStringAndSize(decomp, i);
449	n/a	}
450	n/a
451	n/a	static void
452	n/a	get_decomp_record(PyObject self, Py_UCS4 code, int index, int prefix, int count)
453	n/a	{
454	n/a	if (code >= 0x110000) {
455	n/a	*index = 0;
456	n/a	} else if (self && UCD_Check(self) &&
457	n/a	get_old_record(self, code)->category_changed==0) {
458	n/a	/* unassigned in old version */
459	n/a	*index = 0;
460	n/a	}
461	n/a	else {
462	n/a	*index = decomp_index1[(code>>DECOMP_SHIFT)];
463	n/a	index = decomp_index2[(index<<DECOMP_SHIFT)+
464	n/a	(code&((1<<DECOMP_SHIFT)-1))];
465	n/a	}
466	n/a
467	n/a	/* high byte is number of hex bytes (usually one or two), low byte
468	n/a	is prefix code (from*/
469	n/a	count = decomp_data[index] >> 8;
470	n/a	prefix = decomp_data[index] & 255;
471	n/a
472	n/a	(*index)++;
473	n/a	}
474	n/a
/* Constants for arithmetic Hangul syllable (de)composition: base code
   points of the syllable block and of the leading (L), vowel (V) and
   trailing (T) jamo, and the number of each. */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)     /* 588 syllables per leading jamo */
#define SCount  (LCount*NCount)     /* 11172 syllables in total */
484	n/a
485	n/a	static PyObject*
486	n/a	nfd_nfkd(PyObject self, PyObject input, int k)
487	n/a	{
488	n/a	PyObject *result;
489	n/a	Py_UCS4 *output;
490	n/a	Py_ssize_t i, o, osize;
491	n/a	int kind;
492	n/a	void *data;
493	n/a	/* Longest decomposition in Unicode 3.2: U+FDFA */
494	n/a	Py_UCS4 stack[20];
495	n/a	Py_ssize_t space, isize;
496	n/a	int index, prefix, count, stackptr;
497	n/a	unsigned char prev, cur;
498	n/a
499	n/a	stackptr = 0;
500	n/a	isize = PyUnicode_GET_LENGTH(input);
501	n/a	space = isize;
502	n/a	/* Overallocate at most 10 characters. */
503	n/a	if (space > 10) {
504	n/a	if (space <= PY_SSIZE_T_MAX - 10)
505	n/a	space += 10;
506	n/a	}
507	n/a	else {
508	n/a	space *= 2;
509	n/a	}
510	n/a	osize = space;
511	n/a	output = PyMem_NEW(Py_UCS4, space);
512	n/a	if (!output) {
513	n/a	PyErr_NoMemory();
514	n/a	return NULL;
515	n/a	}
516	n/a	i = o = 0;
517	n/a	kind = PyUnicode_KIND(input);
518	n/a	data = PyUnicode_DATA(input);
519	n/a
520	n/a	while (i < isize) {
521	n/a	stack[stackptr++] = PyUnicode_READ(kind, data, i++);
522	n/a	while(stackptr) {
523	n/a	Py_UCS4 code = stack[--stackptr];
524	n/a	/* Hangul Decomposition adds three characters in
525	n/a	a single step, so we need at least that much room. */
526	n/a	if (space < 3) {
527	n/a	Py_UCS4 *new_output;
528	n/a	osize += 10;
529	n/a	space += 10;
530	n/a	new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
531	n/a	if (new_output == NULL) {
532	n/a	PyMem_Free(output);
533	n/a	PyErr_NoMemory();
534	n/a	return NULL;
535	n/a	}
536	n/a	output = new_output;
537	n/a	}
538	n/a	/* Hangul Decomposition. */
539	n/a	if (SBase <= code && code < (SBase+SCount)) {
540	n/a	int SIndex = code - SBase;
541	n/a	int L = LBase + SIndex / NCount;
542	n/a	int V = VBase + (SIndex % NCount) / TCount;
543	n/a	int T = TBase + SIndex % TCount;
544	n/a	output[o++] = L;
545	n/a	output[o++] = V;
546	n/a	space -= 2;
547	n/a	if (T != TBase) {
548	n/a	output[o++] = T;
549	n/a	space --;
550	n/a	}
551	n/a	continue;
552	n/a	}
553	n/a	/* normalization changes */
554	n/a	if (self && UCD_Check(self)) {
555	n/a	Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
556	n/a	if (value != 0) {
557	n/a	stack[stackptr++] = value;
558	n/a	continue;
559	n/a	}
560	n/a	}
561	n/a
562	n/a	/* Other decompositions. */
563	n/a	get_decomp_record(self, code, &index, &prefix, &count);
564	n/a
565	n/a	/* Copy character if it is not decomposable, or has a
566	n/a	compatibility decomposition, but we do NFD. */
567	n/a	if (!count \|\| (prefix && !k)) {
568	n/a	output[o++] = code;
569	n/a	space--;
570	n/a	continue;
571	n/a	}
572	n/a	/* Copy decomposition onto the stack, in reverse
573	n/a	order. */
574	n/a	while(count) {
575	n/a	code = decomp_data[index + (--count)];
576	n/a	stack[stackptr++] = code;
577	n/a	}
578	n/a	}
579	n/a	}
580	n/a
581	n/a	result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
582	n/a	output, o);
583	n/a	PyMem_Free(output);
584	n/a	if (!result)
585	n/a	return NULL;
586	n/a	/* result is guaranteed to be ready, as it is compact. */
587	n/a	kind = PyUnicode_KIND(result);
588	n/a	data = PyUnicode_DATA(result);
589	n/a
590	n/a	/* Sort canonically. */
591	n/a	i = 0;
592	n/a	prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
593	n/a	for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
594	n/a	cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
595	n/a	if (prev == 0 \|\| cur == 0 \|\| prev <= cur) {
596	n/a	prev = cur;
597	n/a	continue;
598	n/a	}
599	n/a	/* Non-canonical order. Need to switch i with previous. /
600	n/a	o = i - 1;
601	n/a	while (1) {
602	n/a	Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
603	n/a	PyUnicode_WRITE(kind, data, o+1,
604	n/a	PyUnicode_READ(kind, data, o));
605	n/a	PyUnicode_WRITE(kind, data, o, tmp);
606	n/a	o--;
607	n/a	if (o < 0)
608	n/a	break;
609	n/a	prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
610	n/a	if (prev == 0 \|\| prev <= cur)
611	n/a	break;
612	n/a	}
613	n/a	prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
614	n/a	}
615	n/a	return result;
616	n/a	}
617	n/a
618	n/a	static int
619	n/a	find_nfc_index(PyObject self, struct reindex nfc, Py_UCS4 code)
620	n/a	{
621	n/a	unsigned int index;
622	n/a	for (index = 0; nfc[index].start; index++) {
623	n/a	unsigned int start = nfc[index].start;
624	n/a	if (code < start)
625	n/a	return -1;
626	n/a	if (code <= start + nfc[index].count) {
627	n/a	unsigned int delta = code - start;
628	n/a	return nfc[index].index + delta;
629	n/a	}
630	n/a	}
631	n/a	return -1;
632	n/a	}
633	n/a
634	n/a	static PyObject*
635	n/a	nfc_nfkc(PyObject self, PyObject input, int k)
636	n/a	{
637	n/a	PyObject *result;
638	n/a	int kind;
639	n/a	void *data;
640	n/a	Py_UCS4 *output;
641	n/a	Py_ssize_t i, i1, o, len;
642	n/a	int f,l,index,index1,comb;
643	n/a	Py_UCS4 code;
644	n/a	Py_ssize_t skipped[20];
645	n/a	int cskipped = 0;
646	n/a
647	n/a	result = nfd_nfkd(self, input, k);
648	n/a	if (!result)
649	n/a	return NULL;
650	n/a	/* result will be "ready". */
651	n/a	kind = PyUnicode_KIND(result);
652	n/a	data = PyUnicode_DATA(result);
653	n/a	len = PyUnicode_GET_LENGTH(result);
654	n/a
655	n/a	/* We allocate a buffer for the output.
656	n/a	If we find that we made no changes, we still return
657	n/a	the NFD result. */
658	n/a	output = PyMem_NEW(Py_UCS4, len);
659	n/a	if (!output) {
660	n/a	PyErr_NoMemory();
661	n/a	Py_DECREF(result);
662	n/a	return 0;
663	n/a	}
664	n/a	i = o = 0;
665	n/a
666	n/a	again:
667	n/a	while (i < len) {
668	n/a	for (index = 0; index < cskipped; index++) {
669	n/a	if (skipped[index] == i) {
670	n/a	/* *i character is skipped.
671	n/a	Remove from list. */
672	n/a	skipped[index] = skipped[cskipped-1];
673	n/a	cskipped--;
674	n/a	i++;
675	n/a	goto again; /* continue while */
676	n/a	}
677	n/a	}
678	n/a	/* Hangul Composition. We don't need to check for <LV,T>
679	n/a	pairs, since we always have decomposed data. */
680	n/a	code = PyUnicode_READ(kind, data, i);
681	n/a	if (LBase <= code && code < (LBase+LCount) &&
682	n/a	i + 1 < len &&
683	n/a	VBase <= PyUnicode_READ(kind, data, i+1) &&
684	n/a	PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
685	n/a	int LIndex, VIndex;
686	n/a	LIndex = code - LBase;
687	n/a	VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
688	n/a	code = SBase + (LIndexVCount+VIndex)TCount;
689	n/a	i+=2;
690	n/a	if (i < len &&
691	n/a	TBase <= PyUnicode_READ(kind, data, i) &&
692	n/a	PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
693	n/a	code += PyUnicode_READ(kind, data, i)-TBase;
694	n/a	i++;
695	n/a	}
696	n/a	output[o++] = code;
697	n/a	continue;
698	n/a	}
699	n/a
700	n/a	/* code is still input[i] here */
701	n/a	f = find_nfc_index(self, nfc_first, code);
702	n/a	if (f == -1) {
703	n/a	output[o++] = code;
704	n/a	i++;
705	n/a	continue;
706	n/a	}
707	n/a	/* Find next unblocked character. */
708	n/a	i1 = i+1;
709	n/a	comb = 0;
710	n/a	/* output base character for now; might be updated later. */
711	n/a	output[o] = PyUnicode_READ(kind, data, i);
712	n/a	while (i1 < len) {
713	n/a	Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
714	n/a	int comb1 = _getrecord_ex(code1)->combining;
715	n/a	if (comb) {
716	n/a	if (comb1 == 0)
717	n/a	break;
718	n/a	if (comb >= comb1) {
719	n/a	/* Character is blocked. */
720	n/a	i1++;
721	n/a	continue;
722	n/a	}
723	n/a	}
724	n/a	l = find_nfc_index(self, nfc_last, code1);
725	n/a	/* i1 cannot be combined with i. If i1
726	n/a	is a starter, we don't need to look further.
727	n/a	Otherwise, record the combining class. */
728	n/a	if (l == -1) {
729	n/a	not_combinable:
730	n/a	if (comb1 == 0)
731	n/a	break;
732	n/a	comb = comb1;
733	n/a	i1++;
734	n/a	continue;
735	n/a	}
736	n/a	index = f*TOTAL_LAST + l;
737	n/a	index1 = comp_index[index >> COMP_SHIFT];
738	n/a	code = comp_data[(index1<<COMP_SHIFT)+
739	n/a	(index&((1<<COMP_SHIFT)-1))];
740	n/a	if (code == 0)
741	n/a	goto not_combinable;
742	n/a
743	n/a	/* Replace the original character. */
744	n/a	output[o] = code;
745	n/a	/* Mark the second character unused. */
746	n/a	assert(cskipped < 20);
747	n/a	skipped[cskipped++] = i1;
748	n/a	i1++;
749	n/a	f = find_nfc_index(self, nfc_first, output[o]);
750	n/a	if (f == -1)
751	n/a	break;
752	n/a	}
753	n/a	/* Output character was already written.
754	n/a	Just advance the indices. */
755	n/a	o++; i++;
756	n/a	}
757	n/a	if (o == len) {
758	n/a	/* No changes. Return original string. */
759	n/a	PyMem_Free(output);
760	n/a	return result;
761	n/a	}
762	n/a	Py_DECREF(result);
763	n/a	result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
764	n/a	output, o);
765	n/a	PyMem_Free(output);
766	n/a	return result;
767	n/a	}
768	n/a
769	n/a	/* Return 1 if the input is certainly normalized, 0 if it might not be. */
770	n/a	static int
771	n/a	is_normalized(PyObject self, PyObject input, int nfc, int k)
772	n/a	{
773	n/a	Py_ssize_t i, len;
774	n/a	int kind;
775	n/a	void *data;
776	n/a	unsigned char prev_combining = 0, quickcheck_mask;
777	n/a
778	n/a	/* An older version of the database is requested, quickchecks must be
779	n/a	disabled. */
780	n/a	if (self && UCD_Check(self))
781	n/a	return 0;
782	n/a
783	n/a	/* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
784	n/a	as described in http://unicode.org/reports/tr15/#Annex8. */
785	n/a	quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
786	n/a
787	n/a	i = 0;
788	n/a	kind = PyUnicode_KIND(input);
789	n/a	data = PyUnicode_DATA(input);
790	n/a	len = PyUnicode_GET_LENGTH(input);
791	n/a	while (i < len) {
792	n/a	Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
793	n/a	const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
794	n/a	unsigned char combining = record->combining;
795	n/a	unsigned char quickcheck = record->normalization_quick_check;
796	n/a
797	n/a	if (quickcheck & quickcheck_mask)
798	n/a	return 0; /* this string might need normalization */
799	n/a	if (combining && prev_combining > combining)
800	n/a	return 0; /* non-canonical sort order, not normalized */
801	n/a	prev_combining = combining;
802	n/a	}
803	n/a	return 1; /* certainly normalized */
804	n/a	}
805	n/a
806	n/a	/*[clinic input]
807	n/a	unicodedata.UCD.normalize
808	n/a
809	n/a	self: self
810	n/a	form: str
811	n/a	unistr as input: unicode
812	n/a	/
813	n/a
814	n/a	Return the normal form 'form' for the Unicode string unistr.
815	n/a
816	n/a	Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
817	n/a	[clinic start generated code]*/
818	n/a
819	n/a	static PyObject *
820	n/a	unicodedata_UCD_normalize_impl(PyObject self, const char form,
821	n/a	PyObject *input)
822	n/a	/[clinic end generated code: output=62d1f8870027efdc input=1744c55f4ab79bf0]/
823	n/a	{
824	n/a	if (PyUnicode_GET_LENGTH(input) == 0) {
825	n/a	/* Special case empty input strings, since resizing
826	n/a	them later would cause internal errors. */
827	n/a	Py_INCREF(input);
828	n/a	return input;
829	n/a	}
830	n/a
831	n/a	if (strcmp(form, "NFC") == 0) {
832	n/a	if (is_normalized(self, input, 1, 0)) {
833	n/a	Py_INCREF(input);
834	n/a	return input;
835	n/a	}
836	n/a	return nfc_nfkc(self, input, 0);
837	n/a	}
838	n/a	if (strcmp(form, "NFKC") == 0) {
839	n/a	if (is_normalized(self, input, 1, 1)) {
840	n/a	Py_INCREF(input);
841	n/a	return input;
842	n/a	}
843	n/a	return nfc_nfkc(self, input, 1);
844	n/a	}
845	n/a	if (strcmp(form, "NFD") == 0) {
846	n/a	if (is_normalized(self, input, 0, 0)) {
847	n/a	Py_INCREF(input);
848	n/a	return input;
849	n/a	}
850	n/a	return nfd_nfkd(self, input, 0);
851	n/a	}
852	n/a	if (strcmp(form, "NFKD") == 0) {
853	n/a	if (is_normalized(self, input, 0, 1)) {
854	n/a	Py_INCREF(input);
855	n/a	return input;
856	n/a	}
857	n/a	return nfd_nfkd(self, input, 1);
858	n/a	}
859	n/a	PyErr_SetString(PyExc_ValueError, "invalid normalization form");
860	n/a	return NULL;
861	n/a	}
862	n/a
863	n/a	/* -------------------------------------------------------------------- */
864	n/a	/* unicode character name tables */
865	n/a
866	n/a	/* data file generated by Tools/unicode/makeunicodedata.py */
867	n/a	#include "unicodename_db.h"
868	n/a
869	n/a	/* -------------------------------------------------------------------- */
870	n/a	/* database code (cut and pasted from the unidb package) */
871	n/a
/* Hash the first `len` bytes of `s`, case-insensitively (each byte is
   upper-cased before use).  `scale` is the multiplier applied to the
   running hash at every step.  The result is kept within 24 bits by
   folding any overflow in the top byte back into the low bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[i]));
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
886	n/a
/* Short jamo names used to build Hangul syllable names.  Column 0 is
   indexed by the leading-consonant index, column 1 by the vowel index and
   column 2 by the trailing-consonant index; entries that are 0 lie beyond
   the end of the shorter columns. */
static const char * const hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N", },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};
917	n/a
918	n/a	/* These ranges need to match makeunicodedata.py:cjk_ranges. */
919	n/a	static int
920	n/a	is_unified_ideograph(Py_UCS4 code)
921	n/a	{
922	n/a	return
923	n/a	(0x3400 <= code && code <= 0x4DB5) \|\| /* CJK Ideograph Extension A */
924	n/a	(0x4E00 <= code && code <= 0x9FD5) \|\| /* CJK Ideograph */
925	n/a	(0x20000 <= code && code <= 0x2A6D6) \|\| /* CJK Ideograph Extension B */
926	n/a	(0x2A700 <= code && code <= 0x2B734) \|\| /* CJK Ideograph Extension C */
927	n/a	(0x2B740 <= code && code <= 0x2B81D) \|\| /* CJK Ideograph Extension D */
928	n/a	(0x2B820 <= code && code <= 0x2CEA1); /* CJK Ideograph Extension E */
929	n/a	}
930	n/a
/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences; each range is
 * half-open (start inclusive, end exclusive) */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
936	n/a
937	n/a	static int
938	n/a	_getucname(PyObject self, Py_UCS4 code, char buffer, int buflen,
939	n/a	int with_alias_and_seq)
940	n/a	{
941	n/a	/* Find the name associated with the given code point.
942	n/a	* If with_alias_and_seq is 1, check for names in the Private Use Area 15
943	n/a	* that we are using for aliases and named sequences. */
944	n/a	int offset;
945	n/a	int i;
946	n/a	int word;
947	n/a	unsigned char* w;
948	n/a
949	n/a	if (code >= 0x110000)
950	n/a	return 0;
951	n/a
952	n/a	/* XXX should we just skip all the code points in the PUAs here? */
953	n/a	if (!with_alias_and_seq && (IS_ALIAS(code) \|\| IS_NAMED_SEQ(code)))
954	n/a	return 0;
955	n/a
956	n/a	if (self && UCD_Check(self)) {
957	n/a	/* in 3.2.0 there are no aliases and named sequences */
958	n/a	const change_record *old;
959	n/a	if (IS_ALIAS(code) \|\| IS_NAMED_SEQ(code))
960	n/a	return 0;
961	n/a	old = get_old_record(self, code);
962	n/a	if (old->category_changed == 0) {
963	n/a	/* unassigned */
964	n/a	return 0;
965	n/a	}
966	n/a	}
967	n/a
968	n/a	if (SBase <= code && code < SBase+SCount) {
969	n/a	/* Hangul syllable. */
970	n/a	int SIndex = code - SBase;
971	n/a	int L = SIndex / NCount;
972	n/a	int V = (SIndex % NCount) / TCount;
973	n/a	int T = SIndex % TCount;
974	n/a
975	n/a	if (buflen < 27)
976	n/a	/* Worst case: HANGUL SYLLABLE <10chars>. */
977	n/a	return 0;
978	n/a	strcpy(buffer, "HANGUL SYLLABLE ");
979	n/a	buffer += 16;
980	n/a	strcpy(buffer, hangul_syllables[L][0]);
981	n/a	buffer += strlen(hangul_syllables[L][0]);
982	n/a	strcpy(buffer, hangul_syllables[V][1]);
983	n/a	buffer += strlen(hangul_syllables[V][1]);
984	n/a	strcpy(buffer, hangul_syllables[T][2]);
985	n/a	buffer += strlen(hangul_syllables[T][2]);
986	n/a	*buffer = '\0';
987	n/a	return 1;
988	n/a	}
989	n/a
990	n/a	if (is_unified_ideograph(code)) {
991	n/a	if (buflen < 28)
992	n/a	/* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
993	n/a	return 0;
994	n/a	sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
995	n/a	return 1;
996	n/a	}
997	n/a
998	n/a	/* get offset into phrasebook */
999	n/a	offset = phrasebook_offset1[(code>>phrasebook_shift)];
1000	n/a	offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1001	n/a	(code&((1<<phrasebook_shift)-1))];
1002	n/a	if (!offset)
1003	n/a	return 0;
1004	n/a
1005	n/a	i = 0;
1006	n/a
1007	n/a	for (;;) {
1008	n/a	/* get word index */
1009	n/a	word = phrasebook[offset] - phrasebook_short;
1010	n/a	if (word >= 0) {
1011	n/a	word = (word << 8) + phrasebook[offset+1];
1012	n/a	offset += 2;
1013	n/a	} else
1014	n/a	word = phrasebook[offset++];
1015	n/a	if (i) {
1016	n/a	if (i > buflen)
1017	n/a	return 0; /* buffer overflow */
1018	n/a	buffer[i++] = ' ';
1019	n/a	}
1020	n/a	/* copy word string from lexicon. the last character in the
1021	n/a	word has bit 7 set. the last word in a string ends with
1022	n/a	0x80 */
1023	n/a	w = lexicon + lexicon_offset[word];
1024	n/a	while (*w < 128) {
1025	n/a	if (i >= buflen)
1026	n/a	return 0; /* buffer overflow */
1027	n/a	buffer[i++] = *w++;
1028	n/a	}
1029	n/a	if (i >= buflen)
1030	n/a	return 0; /* buffer overflow */
1031	n/a	buffer[i++] = *w & 127;
1032	n/a	if (*w == 128)
1033	n/a	break; /* end of word */
1034	n/a	}
1035	n/a
1036	n/a	return 1;
1037	n/a	}
1038	n/a
1039	n/a	static int
1040	n/a	_cmpname(PyObject self, int code, const char name, int namelen)
1041	n/a	{
1042	n/a	/* check if code corresponds to the given name */
1043	n/a	int i;
1044	n/a	char buffer[NAME_MAXLEN+1];
1045	n/a	if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
1046	n/a	return 0;
1047	n/a	for (i = 0; i < namelen; i++) {
1048	n/a	if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
1049	n/a	return 0;
1050	n/a	}
1051	n/a	return buffer[namelen] == '\0';
1052	n/a	}
1053	n/a
1054	n/a	static void
1055	n/a	find_syllable(const char str, int len, int *pos, int count, int column)
1056	n/a	{
1057	n/a	int i, len1;
1058	n/a	*len = -1;
1059	n/a	for (i = 0; i < count; i++) {
1060	n/a	const char *s = hangul_syllables[i][column];
1061	n/a	len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
1062	n/a	if (len1 <= *len)
1063	n/a	continue;
1064	n/a	if (strncmp(str, s, len1) == 0) {
1065	n/a	*len = len1;
1066	n/a	*pos = i;
1067	n/a	}
1068	n/a	}
1069	n/a	if (*len == -1) {
1070	n/a	*len = 0;
1071	n/a	}
1072	n/a	}
1073	n/a
1074	n/a	static int
1075	n/a	_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
1076	n/a	{
1077	n/a	/* check if named sequences are allowed */
1078	n/a	if (!with_named_seq && IS_NAMED_SEQ(cp))
1079	n/a	return 0;
1080	n/a	/* if the code point is in the PUA range that we use for aliases,
1081	n/a	* convert it to obtain the right code point */
1082	n/a	if (IS_ALIAS(cp))
1083	n/a	*code = name_aliases[cp-aliases_start];
1084	n/a	else
1085	n/a	*code = cp;
1086	n/a	return 1;
1087	n/a	}
1088	n/a
1089	n/a	static int
1090	n/a	_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1091	n/a	int with_named_seq)
1092	n/a	{
1093	n/a	/* Return the code point associated with the given name.
1094	n/a	* Named aliases are resolved too (unless self != NULL (i.e. we are using
1095	n/a	* 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
1096	n/a	* using for the named sequence, and the caller must then convert it. */
1097	n/a	unsigned int h, v;
1098	n/a	unsigned int mask = code_size-1;
1099	n/a	unsigned int i, incr;
1100	n/a
1101	n/a	/* Check for hangul syllables. */
1102	n/a	if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1103	n/a	int len, L = -1, V = -1, T = -1;
1104	n/a	const char *pos = name + 16;
1105	n/a	find_syllable(pos, &len, &L, LCount, 0);
1106	n/a	pos += len;
1107	n/a	find_syllable(pos, &len, &V, VCount, 1);
1108	n/a	pos += len;
1109	n/a	find_syllable(pos, &len, &T, TCount, 2);
1110	n/a	pos += len;
1111	n/a	if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1112	n/a	code = SBase + (LVCount+V)*TCount + T;
1113	n/a	return 1;
1114	n/a	}
1115	n/a	/* Otherwise, it's an illegal syllable name. */
1116	n/a	return 0;
1117	n/a	}
1118	n/a
1119	n/a	/* Check for unified ideographs. */
1120	n/a	if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1121	n/a	/* Four or five hexdigits must follow. */
1122	n/a	v = 0;
1123	n/a	name += 22;
1124	n/a	namelen -= 22;
1125	n/a	if (namelen != 4 && namelen != 5)
1126	n/a	return 0;
1127	n/a	while (namelen--) {
1128	n/a	v *= 16;
1129	n/a	if (name >= '0' && name <= '9')
1130	n/a	v += *name - '0';
1131	n/a	else if (name >= 'A' && name <= 'F')
1132	n/a	v += *name - 'A' + 10;
1133	n/a	else
1134	n/a	return 0;
1135	n/a	name++;
1136	n/a	}
1137	n/a	if (!is_unified_ideograph(v))
1138	n/a	return 0;
1139	n/a	*code = v;
1140	n/a	return 1;
1141	n/a	}
1142	n/a
1143	n/a	/* the following is the same as python's dictionary lookup, with
1144	n/a	only minor changes. see the makeunicodedata script for more
1145	n/a	details */
1146	n/a
1147	n/a	h = (unsigned int) _gethash(name, namelen, code_magic);
1148	n/a	i = (~h) & mask;
1149	n/a	v = code_hash[i];
1150	n/a	if (!v)
1151	n/a	return 0;
1152	n/a	if (_cmpname(self, v, name, namelen))
1153	n/a	return _check_alias_and_seq(v, code, with_named_seq);
1154	n/a	incr = (h ^ (h >> 3)) & mask;
1155	n/a	if (!incr)
1156	n/a	incr = mask;
1157	n/a	for (;;) {
1158	n/a	i = (i + incr) & mask;
1159	n/a	v = code_hash[i];
1160	n/a	if (!v)
1161	n/a	return 0;
1162	n/a	if (_cmpname(self, v, name, namelen))
1163	n/a	return _check_alias_and_seq(v, code, with_named_seq);
1164	n/a	incr = incr << 1;
1165	n/a	if (incr > mask)
1166	n/a	incr = incr ^ code_poly;
1167	n/a	}
1168	n/a	}
1169	n/a
1170	n/a	static const _PyUnicode_Name_CAPI hashAPI =
1171	n/a	{
1172	n/a	sizeof(_PyUnicode_Name_CAPI),
1173	n/a	_getucname,
1174	n/a	_getcode
1175	n/a	};
1176	n/a
1177	n/a	/* -------------------------------------------------------------------- */
1178	n/a	/* Python bindings */
1179	n/a
1180	n/a	/*[clinic input]
1181	n/a	unicodedata.UCD.name
1182	n/a
1183	n/a	self: self
1184	n/a	chr: int(accept={str})
1185	n/a	default: object=NULL
1186	n/a	/
1187	n/a
1188	n/a	Returns the name assigned to the character chr as a string.
1189	n/a
1190	n/a	If no name is defined, default is returned, or, if not given,
1191	n/a	ValueError is raised.
1192	n/a	[clinic start generated code]*/
1193	n/a
1194	n/a	static PyObject *
1195	n/a	unicodedata_UCD_name_impl(PyObject self, int chr, PyObject default_value)
1196	n/a	/[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]/
1197	n/a	{
1198	n/a	char name[NAME_MAXLEN+1];
1199	n/a	Py_UCS4 c = (Py_UCS4)chr;
1200	n/a
1201	n/a	if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
1202	n/a	if (default_value == NULL) {
1203	n/a	PyErr_SetString(PyExc_ValueError, "no such name");
1204	n/a	return NULL;
1205	n/a	}
1206	n/a	else {
1207	n/a	Py_INCREF(default_value);
1208	n/a	return default_value;
1209	n/a	}
1210	n/a	}
1211	n/a
1212	n/a	return PyUnicode_FromString(name);
1213	n/a	}
1214	n/a
1215	n/a	/*[clinic input]
1216	n/a	unicodedata.UCD.lookup
1217	n/a
1218	n/a	self: self
1219	n/a	name: str(accept={str, robuffer}, zeroes=True)
1220	n/a	/
1221	n/a
1222	n/a	Look up character by name.
1223	n/a
1224	n/a	If a character with the given name is found, return the
1225	n/a	corresponding character. If not found, KeyError is raised.
1226	n/a	[clinic start generated code]*/
1227	n/a
1228	n/a	static PyObject *
1229	n/a	unicodedata_UCD_lookup_impl(PyObject self, const char name,
1230	n/a	Py_ssize_clean_t name_length)
1231	n/a	/[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]/
1232	n/a	{
1233	n/a	Py_UCS4 code;
1234	n/a	unsigned int index;
1235	n/a	if (name_length > NAME_MAXLEN) {
1236	n/a	PyErr_SetString(PyExc_KeyError, "name too long");
1237	n/a	return NULL;
1238	n/a	}
1239	n/a
1240	n/a	if (!_getcode(self, name, (int)name_length, &code, 1)) {
1241	n/a	PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1242	n/a	return NULL;
1243	n/a	}
1244	n/a	/* check if code is in the PUA range that we use for named sequences
1245	n/a	and convert it */
1246	n/a	if (IS_NAMED_SEQ(code)) {
1247	n/a	index = code-named_sequences_start;
1248	n/a	return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1249	n/a	named_sequences[index].seq,
1250	n/a	named_sequences[index].seqlen);
1251	n/a	}
1252	n/a	return PyUnicode_FromOrdinal(code);
1253	n/a	}
1254	n/a
1255	n/a	/* XXX Add doc strings. */
1256	n/a
1257	n/a	static PyMethodDef unicodedata_functions[] = {
1258	n/a	UNICODEDATA_UCD_DECIMAL_METHODDEF
1259	n/a	UNICODEDATA_UCD_DIGIT_METHODDEF
1260	n/a	UNICODEDATA_UCD_NUMERIC_METHODDEF
1261	n/a	UNICODEDATA_UCD_CATEGORY_METHODDEF
1262	n/a	UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1263	n/a	UNICODEDATA_UCD_COMBINING_METHODDEF
1264	n/a	UNICODEDATA_UCD_MIRRORED_METHODDEF
1265	n/a	UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1266	n/a	UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1267	n/a	UNICODEDATA_UCD_NAME_METHODDEF
1268	n/a	UNICODEDATA_UCD_LOOKUP_METHODDEF
1269	n/a	UNICODEDATA_UCD_NORMALIZE_METHODDEF
1270	n/a	{NULL, NULL} /* sentinel */
1271	n/a	};
1272	n/a
1273	n/a	static PyTypeObject UCD_Type = {
1274	n/a	/* The ob_type field must be initialized in the module init function
1275	n/a	* to be portable to Windows without using C++. */
1276	n/a	PyVarObject_HEAD_INIT(NULL, 0)
1277	n/a	"unicodedata.UCD", /tp_name/
1278	n/a	sizeof(PreviousDBVersion), /tp_basicsize/
1279	n/a	0, /tp_itemsize/
1280	n/a	/* methods */
1281	n/a	(destructor)PyObject_Del, /tp_dealloc/
1282	n/a	0, /tp_print/
1283	n/a	0, /tp_getattr/
1284	n/a	0, /tp_setattr/
1285	n/a	0, /tp_reserved/
1286	n/a	0, /tp_repr/
1287	n/a	0, /tp_as_number/
1288	n/a	0, /tp_as_sequence/
1289	n/a	0, /tp_as_mapping/
1290	n/a	0, /tp_hash/
1291	n/a	0, /tp_call/
1292	n/a	0, /tp_str/
1293	n/a	PyObject_GenericGetAttr,/tp_getattro/
1294	n/a	0, /tp_setattro/
1295	n/a	0, /tp_as_buffer/
1296	n/a	Py_TPFLAGS_DEFAULT, /tp_flags/
1297	n/a	0, /tp_doc/
1298	n/a	0, /tp_traverse/
1299	n/a	0, /tp_clear/
1300	n/a	0, /tp_richcompare/
1301	n/a	0, /tp_weaklistoffset/
1302	n/a	0, /tp_iter/
1303	n/a	0, /tp_iternext/
1304	n/a	unicodedata_functions, /tp_methods/
1305	n/a	DB_members, /tp_members/
1306	n/a	0, /tp_getset/
1307	n/a	0, /tp_base/
1308	n/a	0, /tp_dict/
1309	n/a	0, /tp_descr_get/
1310	n/a	0, /tp_descr_set/
1311	n/a	0, /tp_dictoffset/
1312	n/a	0, /tp_init/
1313	n/a	0, /tp_alloc/
1314	n/a	0, /tp_new/
1315	n/a	0, /tp_free/
1316	n/a	0, /tp_is_gc/
1317	n/a	};
1318	n/a
1319	n/a	PyDoc_STRVAR(unicodedata_docstring,
1320	n/a	"This module provides access to the Unicode Character Database which\n\
1321	n/a	defines character properties for all Unicode characters. The data in\n\
1322	n/a	this database is based on the UnicodeData.txt file version\n\
1323	n/a	" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\
1324	n/a	\n\
1325	n/a	The module uses the same names and symbols as defined by the\n\
1326	n/a	UnicodeData File Format " UNIDATA_VERSION ".");
1327	n/a
1328	n/a	static struct PyModuleDef unicodedatamodule = {
1329	n/a	PyModuleDef_HEAD_INIT,
1330	n/a	"unicodedata",
1331	n/a	unicodedata_docstring,
1332	n/a	-1,
1333	n/a	unicodedata_functions,
1334	n/a	NULL,
1335	n/a	NULL,
1336	n/a	NULL,
1337	n/a	NULL
1338	n/a	};
1339	n/a
1340	n/a	PyMODINIT_FUNC
1341	n/a	PyInit_unicodedata(void)
1342	n/a	{
1343	n/a	PyObject m, v;
1344	n/a
1345	n/a	Py_TYPE(&UCD_Type) = &PyType_Type;
1346	n/a
1347	n/a	m = PyModule_Create(&unicodedatamodule);
1348	n/a	if (!m)
1349	n/a	return NULL;
1350	n/a
1351	n/a	PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1352	n/a	Py_INCREF(&UCD_Type);
1353	n/a	PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1354	n/a
1355	n/a	/* Previous versions */
1356	n/a	v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1357	n/a	if (v != NULL)
1358	n/a	PyModule_AddObject(m, "ucd_3_2_0", v);
1359	n/a
1360	n/a	/* Export C API */
1361	n/a	v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
1362	n/a	if (v != NULL)
1363	n/a	PyModule_AddObject(m, "ucnhash_CAPI", v);
1364	n/a	return m;
1365	n/a	}
1366	n/a
1367	n/a	/*
1368	n/a	Local variables:
1369	n/a	c-basic-offset: 4
1370	n/a	indent-tabs-mode: nil
1371	n/a	End:
1372	n/a	*/