Python code coverage for Tools/unicode/makeunicodedata.py

#	count	content
1	n/a	#
2	n/a	# (re)generate unicode property and type databases
3	n/a	#
4	n/a	# this script converts a unicode 3.2 database file to
5	n/a	# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6	n/a	# and Objects/unicodetype_db.h
7	n/a	#
8	n/a	# history:
9	n/a	# 2000-09-24 fl created (based on bits and pieces from unidb)
10	n/a	# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
11	n/a	# 2000-09-25 fl added character type table
12	n/a	# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
13	n/a	# 2000-11-03 fl expand first/last ranges
14	n/a	# 2001-01-19 fl added character name tables (2.1)
15	n/a	# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
16	n/a	# 2002-09-11 wd use string methods
17	n/a	# 2002-10-18 mvl update to Unicode 3.2
18	n/a	# 2002-10-22 mvl generate NFC tables
19	n/a	# 2002-11-24 mvl expand all ranges, sort names version-independently
20	n/a	# 2002-11-25 mvl add UNIDATA_VERSION
21	n/a	# 2004-05-29 perky add east asian width information
22	n/a	# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
23	n/a	# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
24	n/a	# 2011-10-21 ezio add support for name aliases and named sequences
25	n/a	# 2012-01 benjamin add full case mappings
26	n/a	#
27	n/a	# written by Fredrik Lundh (fredrik@pythonware.com)
28	n/a	#
29	n/a
30	n/a	import os
31	n/a	import sys
32	n/a	import zipfile
33	n/a
34	n/a	from textwrap import dedent
35	n/a
# Name of this script and the version string stamped into generated headers.
SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
# --------------------
# When changing UCD version please update
#   * Doc/library/stdtypes.rst, and
#   * Doc/library/unicodedata.rst
#   * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "9.0.0"

# File-name templates; the "%s" slot receives "" or "-<version>".
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- the first one lives in the BMP (plane 0), the other
# two in planes 15 and 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# Older UCD versions for which delta tables are generated.
old_versions = ["3.2.0"]

# NOTE: list positions are used as numeric codes (via .index()), so the
# order -- including the second "Cn" entry -- must not be changed.
CATEGORY_NAMES = [
    "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs",
    "Co", "Cn", "Lm", "Lo", "Pc", "Pd", "Ps", "Pe",
    "Pi", "Pf", "Po", "Sm", "Sc", "Sk", "So",
]

BIDIRECTIONAL_NAMES = [
    "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN",
    "B", "S", "WS", "ON", "LRI", "RLI", "FSI", "PDI",
]

EASTASIANWIDTH_NAMES = ["F", "H", "W", "Na", "A", "N"]

MANDATORY_LINE_BREAKS = ["BK", "CR", "LF", "NL"]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 1 << 0
DECIMAL_MASK = 1 << 1
DIGIT_MASK = 1 << 2
LOWER_MASK = 1 << 3
LINEBREAK_MASK = 1 << 4
SPACE_MASK = 1 << 5
TITLE_MASK = 1 << 6
UPPER_MASK = 1 << 7
XID_START_MASK = 1 << 8
XID_CONTINUE_MASK = 1 << 9
PRINTABLE_MASK = 1 << 10
NUMERIC_MASK = 1 << 11
CASE_IGNORABLE_MASK = 1 << 12
CASED_MASK = 1 << 13
EXTENDED_CASE_MASK = 1 << 14

# these ranges need to match unicodedata.c:is_unified_ideograph
# (first, last) code points, as upper-case hex strings
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FD5'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
]
108	n/a
def maketables(trace=0):
    """Load the current and old Unicode databases and write all tables.

    Reads the database for UNIDATA_VERSION, merges in the differences of
    every version listed in ``old_versions``, and then generates the name,
    property and type headers in that order.

    trace -- passed through unchanged to the three generator functions.
    """
    print("--- Reading", UNICODE_DATA % "", "...")

    unicode = UnicodeData(UNIDATA_VERSION)
    assigned = sum(1 for entry in unicode.table if entry)
    print(assigned, "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        old_assigned = sum(1 for entry in old_unicode.table if entry)
        print(old_assigned, "characters")
        # records the per-code-point differences on unicode.changed
        merge_old_version(version, unicode, old_unicode)

    for generate in (makeunicodename, makeunicodedata, makeunicodetype):
        generate(unicode, trace)
127	n/a
128	n/a	# --------------------------------------------------------------------
129	n/a	# unicode character properties
130	n/a
def makeunicodedata(unicode, trace):
    """Generate Modules/unicodedata_db.h from the loaded database.

    The header contains the unique property records, the NFC reindexing
    and composition tables, the decomposition data, the string tables for
    category / bidirectional / east-asian-width names, and one set of delta
    tables per entry in ``unicode.changed``.

    unicode -- database object; this function reads its ``chars``,
               ``table``, ``exclusions`` and ``changed`` attributes.
    trace   -- passed through to splitbins() and Array.dump().

    Raises Exception if a character has a decomposition of more than 19
    elements.
    """

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    # Maps record -> index in `table`.  Seeded with the dummy record so a
    # real record equal to it reuses slot 0 instead of being stored twice
    # (the seed used to be written with key and value swapped).
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if not record:
            continue
        # extract database properties
        item = (
            CATEGORY_NAMES.index(record[2]),         # category
            int(record[3]),                          # combining
            BIDIRECTIONAL_NAMES.index(record[4]),    # bidirectional
            record[9] == "Y",                        # mirrored
            EASTASIANWIDTH_NAMES.index(record[15]),  # east asian width
            record[17],                              # normalization quick check
        )
        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if not record:
            continue
        if record[5]:
            decomp = record[5].split()
            if len(decomp) > 19:
                raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
            # prefix
            if decomp[0][0] == "<":
                prefix = decomp.pop(0)
            else:
                prefix = ""
            try:
                i = decomp_prefix.index(prefix)
            except ValueError:
                i = len(decomp_prefix)
                decomp_prefix.append(prefix)
            prefix = i
            assert prefix < 256
            # content: header word (prefix index | length << 8), then the
            # code points of the decomposition
            decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
            # Collect NFC pairs
            if not prefix and len(decomp) == 3 and \
               char not in unicode.exclusions and \
               unicode.table[decomp[1]][3] == "0":
                _, l, r = decomp
                comp_first[l] = 1
                comp_last[r] = 1
                comp_pairs.append((l, r, char))
            # NOTE(review): decomp is a list while decomp_data holds ints,
            # so this lookup never succeeds and every decomposition is
            # appended.  Kept as-is to leave the generated data unchanged.
            try:
                i = decomp_data.index(decomp)
            except ValueError:
                i = len(decomp_data)
                decomp_data.extend(decomp)
                decomp_size = decomp_size + len(decomp) * 2
        else:
            i = 0
        decomp_index[char] = i

    # Renumber the first/last characters of NFC pairs densely and collapse
    # runs of consecutive code points into (start, end) ranges.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i, i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0], i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i, i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i, i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0], i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i, i)
    # flush the last open range (none exists if there were no NFC pairs)
    if prev_f is not None:
        comp_first_ranges.append(prev_f)
    if prev_l is not None:
        comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # total_first x total_last matrix, stored row-major, of composed chars
    comp_data = [0] * (total_first * total_last)
    for first, last, char in comp_pairs:
        comp_data[comp_first[first]*total_last + comp_last[last]] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    # the with-block guarantees the header is closed even if a helper raises
    with open(FILE, "w") as fp:
        print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
        print(file=fp)
        print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
        print("/* a list of unique database records */", file=fp)
        print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
        for item in table:
            print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
        print("};", file=fp)
        print(file=fp)

        print("/* Reindexing of NFC first characters. */", file=fp)
        print("#define TOTAL_FIRST",total_first, file=fp)
        print("#define TOTAL_LAST",total_last, file=fp)
        print("struct reindex{int start;short count,index;};", file=fp)
        print("static struct reindex nfc_first[] = {", file=fp)
        for start, end in comp_first_ranges:
            print(" { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
        print(" {0,0,0}", file=fp)
        print("};\n", file=fp)
        print("static struct reindex nfc_last[] = {", file=fp)
        for start, end in comp_last_ranges:
            print(" { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
        print(" {0,0,0}", file=fp)
        print("};\n", file=fp)

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        print("/* string literals */", file=fp)
        string_tables = (
            ("const char *_PyUnicode_CategoryNames[] = {", CATEGORY_NAMES),
            ("const char *_PyUnicode_BidirectionalNames[] = {", BIDIRECTIONAL_NAMES),
            ("const char *_PyUnicode_EastAsianWidthNames[] = {", EASTASIANWIDTH_NAMES),
            ("static const char *decomp_prefix[] = {", decomp_prefix),
        )
        for declaration, names in string_tables:
            print(declaration, file=fp)
            for name in names:
                print(" \"%s\"," % name, file=fp)
            print(" NULL", file=fp)
            print("};", file=fp)

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        print("/* index tables for the database records */", file=fp)
        print("#define SHIFT", shift, file=fp)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        print("/* decomposition data */", file=fp)
        Array("decomp_data", decomp_data).dump(fp, trace)

        print("/* index tables for the decomposition data */", file=fp)
        print("#define DECOMP_SHIFT", shift, file=fp)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        index, index2, shift = splitbins(comp_data, trace)
        print("/* NFC pairs */", file=fp)
        print("#define COMP_SHIFT", shift, file=fp)
        Array("comp_index", index).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, table, normalization in unicode.changed:
            cversion = version.replace(".","_")
            records = [table[0]]
            cache = {table[0]:0}
            index = [0] * len(table)
            for i, record in enumerate(table):
                try:
                    index[i] = cache[record]
                except KeyError:
                    index[i] = cache[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(index, trace)
            print("static const change_record change_records_%s[] = {" % cversion, file=fp)
            for record in records:
                print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
            print("};", file=fp)
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
            print("{", file=fp)
            print("\tint index;", file=fp)
            print("\tif (n >= 0x110000) index = 0;", file=fp)
            print("\telse {", file=fp)
            print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
            print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
                  (cversion, shift, ((1<<shift)-1)), file=fp)
            print("\t}", file=fp)
            print("\treturn change_records_%s+index;" % cversion, file=fp)
            print("}\n", file=fp)
            print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
            print("{", file=fp)
            print("\tswitch(n) {", file=fp)
            for k, v in normalization:
                print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
            print("\tdefault: return 0;", file=fp)
            print("\t}\n}\n", file=fp)
379	n/a
380	n/a	# --------------------------------------------------------------------
381	n/a	# unicode character type tables
382	n/a
def makeunicodetype(unicode, trace):
    """Generate Objects/unicodetype_db.h from the loaded database.

    The header contains the unique type records (case mappings, decimal and
    digit values, flag bits), the extended case array, the two-level index
    tables, and the C functions _PyUnicode_ToNumeric(),
    _PyUnicode_IsWhitespace() and _PyUnicode_IsLinebreak().

    unicode -- database object; this function reads its ``chars``,
               ``table``, ``special_casing`` and ``case_folding`` attributes.
    trace   -- passed through to splitbins() and Array.dump().
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    # Maps record -> index in `table`.  Seeded with the dummy record so a
    # real record equal to it reuses slot 0 instead of being stored twice
    # (the seed used to be written with key and value swapped).
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []
    extra_casing = []

    for char in unicode.chars:
        record = unicode.table[char]
        if not record:
            continue
        # extract database properties
        category = record[2]
        bidirectional = record[4]
        properties = record[16]
        flags = 0
        if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
            flags |= ALPHA_MASK
        if "Lowercase" in properties:
            flags |= LOWER_MASK
        if 'Line_Break' in properties or bidirectional == "B":
            flags |= LINEBREAK_MASK
            linebreaks.append(char)
        if category == "Zs" or bidirectional in ("WS", "B", "S"):
            flags |= SPACE_MASK
            spaces.append(char)
        if category == "Lt":
            flags |= TITLE_MASK
        if "Uppercase" in properties:
            flags |= UPPER_MASK
        if char == ord(" ") or category[0] not in ("C", "Z"):
            flags |= PRINTABLE_MASK
        if "XID_Start" in properties:
            flags |= XID_START_MASK
        if "XID_Continue" in properties:
            flags |= XID_CONTINUE_MASK
        if "Cased" in properties:
            flags |= CASED_MASK
        if "Case_Ignorable" in properties:
            flags |= CASE_IGNORABLE_MASK
        sc = unicode.special_casing.get(char)
        cf = unicode.case_folding.get(char, [char])
        # simple case mappings; an empty field means "maps to itself"
        # (titlecase falls back to the uppercase mapping)
        upper = int(record[12], 16) if record[12] else char
        lower = int(record[13], 16) if record[13] else char
        title = int(record[14], 16) if record[14] else upper
        if sc is None and cf != [lower]:
            sc = ([lower], [title], [upper])
        if sc is None:
            if upper == lower == title:
                upper = lower = title = 0
            else:
                # store the mappings as deltas from the code point
                upper = upper - char
                lower = lower - char
                title = title - char
                assert (abs(upper) <= 2147483647 and
                        abs(lower) <= 2147483647 and
                        abs(title) <= 2147483647)
        else:
            # This happens either when some character maps to more than one
            # character in uppercase, lowercase, or titlecase or the
            # casefolded version of the character is different from the
            # lowercase. The extra characters are stored in a different
            # array.
            flags |= EXTENDED_CASE_MASK
            # each field packs: offset into extra_casing | length << 24
            # (for lower, additionally casefold length << 20)
            lower = len(extra_casing) | (len(sc[0]) << 24)
            extra_casing.extend(sc[0])
            if cf != sc[0]:
                lower |= len(cf) << 20
                extra_casing.extend(cf)
            upper = len(extra_casing) | (len(sc[2]) << 24)
            extra_casing.extend(sc[2])
            # Title is probably equal to upper.
            if sc[1] == sc[2]:
                title = upper
            else:
                title = len(extra_casing) | (len(sc[1]) << 24)
                extra_casing.extend(sc[1])
        # decimal digit, integer digit
        decimal = 0
        if record[6]:
            flags |= DECIMAL_MASK
            decimal = int(record[6])
        digit = 0
        if record[7]:
            flags |= DIGIT_MASK
            digit = int(record[7])
        if record[8]:
            flags |= NUMERIC_MASK
            numeric.setdefault(record[8], []).append(char)
        item = (upper, lower, title, decimal, digit, flags)
        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    # the with-block guarantees the header is closed even if a helper raises
    with open(FILE, "w") as fp:
        print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
        print(file=fp)
        print("/* a list of unique character type descriptors */", file=fp)
        print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
        for item in table:
            print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
        print("};", file=fp)
        print(file=fp)

        print("/* extended case mappings */", file=fp)
        print(file=fp)
        print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
        for c in extra_casing:
            print(" %d," % c, file=fp)
        print("};", file=fp)
        print(file=fp)

        # split type index table
        index1, index2, shift = splitbins(index, trace)

        print("/* type indexes */", file=fp)
        print("#define SHIFT", shift, file=fp)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        print('/* Returns the numeric value as double for Unicode characters', file=fp)
        print(' * having this property, -1.0 otherwise.', file=fp)
        print(' */', file=fp)
        print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
        print('{', file=fp)
        print(' switch (ch) {', file=fp)
        for value, codepoints in sorted(numeric.items()):
            # Turn text into float literals ("1/2" -> "1.0/2.0")
            value = '/'.join(repr(float(part)) for part in value.split('/'))

            # all code points sharing a value fall through to one return
            for codepoint in sorted(codepoints):
                print(' case 0x%04X:' % (codepoint,), file=fp)
            print(' return (double) %s;' % (value,), file=fp)
        print(' }', file=fp)
        print(' return -1.0;', file=fp)
        print('}', file=fp)
        print(file=fp)

        # Generate code for _PyUnicode_IsWhitespace()
        print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
        print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
        print(" */", file=fp)
        print('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)', file=fp)
        print('{', file=fp)
        print(' switch (ch) {', file=fp)

        for codepoint in sorted(spaces):
            print(' case 0x%04X:' % (codepoint,), file=fp)
        print(' return 1;', file=fp)

        print(' }', file=fp)
        print(' return 0;', file=fp)
        print('}', file=fp)
        print(file=fp)

        # Generate code for _PyUnicode_IsLinebreak()
        print("/* Returns 1 for Unicode characters having the line break", file=fp)
        print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
        print(" * type 'B', 0 otherwise.", file=fp)
        print(" */", file=fp)
        print('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)', file=fp)
        print('{', file=fp)
        print(' switch (ch) {', file=fp)
        for codepoint in sorted(linebreaks):
            print(' case 0x%04X:' % (codepoint,), file=fp)
        print(' return 1;', file=fp)

        print(' }', file=fp)
        print(' return 0;', file=fp)
        print('}', file=fp)
        print(file=fp)
592	n/a
593	n/a	# --------------------------------------------------------------------
594	n/a	# unicode name database
595	n/a
def makeunicodename(unicode, trace):
    """Generate Modules/unicodename_db.h from the loaded database.

    Character names are split into words; the words are stored once in a
    lexicon and each name is stored as a phrasebook of lexicon indexes.
    The header also contains the name->code hash table, the name-alias
    table and the named-sequence table.

    unicode -- database object; this function reads its ``chars``,
               ``table``, ``aliases`` and ``named_sequences`` attributes.
    trace   -- passed through to splitbins(), Array.dump() and Hash.dump().
    """

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            # names in angle brackets ("<control>", ...) are skipped
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(sum(1 for n in names if n is not None), "distinct names")

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence. the
    # latter includes the trailing null byte.)

    # word -> list whose length is the number of occurrences of the word
    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            parts = name.split()
            b = b + len(name)
            n = n + len(parts)
            for word in parts:
                occurrences = words.get(word)
                if occurrences:
                    occurrences.append(None)
                else:
                    words[word] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics (slicing tolerates fewer than `short` distinct words)
    n = sum(len(occurrences) for _, occurrences in wordlist[:short])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    # from here on: word -> 1-based index into lexicon_offset
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            phrasebook_offset[char] = len(phrasebook)
            for word in name.split():
                i = words[word]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    # the with-block guarantees the header is closed even if a helper raises
    with open(FILE, "w") as fp:
        print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
        print(file=fp)
        print("#define NAME_MAXLEN", 256, file=fp)
        print(file=fp)
        print("/* lexicon */", file=fp)
        Array("lexicon", lexicon).dump(fp, trace)
        Array("lexicon_offset", lexicon_offset).dump(fp, trace)

        # split phrasebook offset table
        offset1, offset2, shift = splitbins(phrasebook_offset, trace)

        print("/* code->name phrasebook */", file=fp)
        print("#define phrasebook_shift", shift, file=fp)
        print("#define phrasebook_short", short, file=fp)

        Array("phrasebook", phrasebook).dump(fp, trace)
        Array("phrasebook_offset1", offset1).dump(fp, trace)
        Array("phrasebook_offset2", offset2).dump(fp, trace)

        print("/* name->code dictionary */", file=fp)
        codehash.dump(fp, trace)

        print(file=fp)
        print('static const unsigned int aliases_start = %#x;' %
              NAME_ALIASES_START, file=fp)
        print('static const unsigned int aliases_end = %#x;' %
              (NAME_ALIASES_START + len(unicode.aliases)), file=fp)

        print('static const unsigned int name_aliases[] = {', file=fp)
        for name, codepoint in unicode.aliases:
            print(' 0x%04X,' % codepoint, file=fp)
        print('};', file=fp)

        # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
        # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
        # sequences or sequences with non-BMP chars are added.
        # unicodedata_lookup should be adapted too.
        print(dedent("""
            typedef struct NamedSequence {
            int seqlen;
            Py_UCS2 seq[4];
            } named_sequence;
            """), file=fp)

        print('static const unsigned int named_sequences_start = %#x;' %
              NAMED_SEQUENCES_START, file=fp)
        print('static const unsigned int named_sequences_end = %#x;' %
              (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

        print('static const named_sequence named_sequences[] = {', file=fp)
        for name, sequence in unicode.named_sequences:
            seq_str = ', '.join('0x%04X' % cp for cp in sequence)
            print(' {%d, {%s}},' % (len(sequence), seq_str), file=fp)
        print('};', file=fp)
787	n/a
788	n/a
def merge_old_version(version, new, old):
    """Record how database `old` differs from database `new`.

    Appends a ``(version, change_records, normalization_changes)`` tuple to
    ``new.changed``.  ``change_records`` has one 6-tuple per code point
    (bidir, category, decimal, mirrored, east asian width, numeric) and
    ``normalization_changes`` is a list of ``(code point, old value)`` pairs.

    Raises NotImplementedError if the exclusion sets differ, and a local
    ``Difference`` exception for a changed field that is not handled.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    class Difference(Exception):
        pass

    code_space = 0x110000
    # In these change records, 0xFF means "no change"
    no_change = 0xFF
    bidir_changes = [no_change] * code_space
    category_changes = [no_change] * code_space
    decimal_changes = [no_change] * code_space
    mirrored_changes = [no_change] * code_space
    east_asian_width_changes = [no_change] * code_space
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * code_space
    # normalization_changes is a list of key-value pairs
    normalization_changes = []

    # Fields whose changes are deliberately ignored:
    # 11 ISO comment; 12/13/14 simple upper/lower/title case mappings;
    # 16 derived properties (not yet); 17 normalization quickchecks (not
    # performed for older versions)
    ignored_fields = (11, 12, 13, 14, 16, 17)

    for i in range(code_space):
        new_record = new.table[i]
        if new_record is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        old_record = old.table[i]
        # check characters unassigned in the old version
        if old_record is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old_record == new_record:
            continue
        for k, value in enumerate(old_record):
            if value == new_record[k]:
                continue
            if k == 1 and i in PUA_15:
                # the name is not set in the old.table, but in the
                # new.table we are using it for aliases and named seq
                assert value == ''
            elif k == 2:
                category_changes[i] = CATEGORY_NAMES.index(value)
            elif k == 4:
                bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
            elif k == 5:
                # We assume that all normalization changes are in 1:1 mappings
                assert " " not in value
                normalization_changes.append((i, value))
            elif k == 6:
                # we only support changes where the old value is a single digit
                assert value in "0123456789"
                decimal_changes[i] = int(value)
            elif k == 8:
                # Since 0 encodes "no change", the old value is better not 0
                if value:
                    numeric_changes[i] = float(value)
                    assert numeric_changes[i] not in (0, -1)
                else:
                    numeric_changes[i] = -1
            elif k == 9:
                mirrored_changes[i] = '1' if value == 'Y' else '0'
            elif k == 15:
                # change to east asian width
                east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
            elif k in ignored_fields:
                pass
            else:
                raise Difference(hex(i), k, old.table[i], new.table[i])

    change_records = list(zip(bidir_changes, category_changes,
                              decimal_changes, mirrored_changes,
                              east_asian_width_changes,
                              numeric_changes))
    new.changed.append((version, change_records, normalization_changes))
884	n/a
def open_data(template, version):
    """Open a Unicode Character Database file, downloading it if needed.

    *template* is a file name pattern with one ``%s`` slot (for example
    ``"UnicodeData%s.txt"``) and *version* is the UCD version string.
    The file is looked up in the current directory under the name
    ``template % ('-' + version)``; when it does not exist there it is
    first retrieved from www.unicode.org and stored under that name.

    Returns an open file object: text mode decoded as UTF-8 when the local
    name ends in ``.txt``, binary mode otherwise (e.g. Unihan.zip).  The
    caller is responsible for closing it.
    """
    local = template % ('-' + version,)
    if not os.path.exists(local):
        import urllib.request
        # Fetch over HTTPS: the downloaded data ends up in generated C
        # tables, so it should not travel over an unauthenticated channel.
        if version == '3.2.0':
            # irregular url structure
            url = 'https://www.unicode.org/Public/3.2-Update/' + local
        else:
            url = ('https://www.unicode.org/Public/%s/ucd/' + template) % (version, '')
        urllib.request.urlretrieve(url, filename=local)
    if local.endswith('.txt'):
        return open(local, encoding='utf-8')
    # Unihan.zip
    return open(local, 'rb')
900	n/a
901	n/a	# --------------------------------------------------------------------
902	n/a	# the following support code is taken from the unidb utilities
903	n/a	# Copyright (c) 1999-2000 by Secret Labs AB
904	n/a
905	n/a	# load a unicode-data file from disk
906	n/a
class UnicodeData:
    """In-memory view of one version of the Unicode Character Database.

    ``self.table`` is a list with one slot per code point (0..0x10FFFF);
    a slot is either None or a list of fields laid out as described below.
    """
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, version,
                 linebreakprops=False,
                 expand=1,
                 cjk_check=True):
        """Load all data files for UCD *version* via open_data().

        version        -- UCD version string; '3.2.0' is special-cased
                          (no aliases, named sequences or case folding).
        linebreakprops -- accepted for compatibility; not used here.
        expand         -- when true, "<..., First>"/"<..., Last>" pairs in
                          UnicodeData.txt are expanded to one record per
                          code point.
        cjk_check      -- when true (and *expand* is true), raise ValueError
                          if the CJK ideograph ranges found differ from the
                          module-level ``cjk_ranges``.

        Sets ``changed``, ``filename``, ``table``, ``chars``,
        ``exclusions``, ``special_casing`` and ``case_folding``; for
        versions other than '3.2.0' also ``aliases`` and
        ``named_sequences``.
        """
        self.changed = []
        table = [None] * 0x110000
        with open_data(UNICODE_DATA, version) as file:
            for line in file:
                record = line.strip().split(";")
                table[int(record[0], 16)] = record

        cjk_ranges_found = []

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        if s[1].startswith("<CJK Ideograph"):
                            cjk_ranges_found.append((field[0],
                                                     s[0]))
                        s[1] = ""
                        field = None
                elif field:
                    # inside a First..Last range: clone the First record
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2
            if cjk_check and cjk_ranges != cjk_ranges_found:
                raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            with open_data(NAME_ALIASES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    char, name, abbrev = s.split(';')
                    char = int(char, 16)
                    self.aliases.append((name, char))
                    # also store the name in the PUA 1
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in the PUA 1, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            with open_data(NAMED_SEQUENCES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    name, chars = s.split(';')
                    chars = tuple(int(char, 16) for char in chars.split())
                    # check that the structure defined in makeunicodename is OK
                    assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                    assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                        "the NamedSequence struct and in unicodedata_lookup")
                    self.named_sequences.append((name, chars))
                    # also store these in the PUA 1
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        self.exclusions = {}
        with open_data(COMPOSITION_EXCLUSIONS, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                char = int(s.split()[0],16)
                self.exclusions[char] = 1

        widths = [None] * 0x110000
        with open_data(EASTASIAN_WIDTH, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                s = s.split()[0].split(';')
                if '..' in s[0]:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(s[0], 16)]
                for char in chars:
                    widths[char] = s[1]

        # field 15: east asian width
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        # field 16: set of derived property names, filled in below
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())

        with open_data(DERIVED_CORE_PROPERTIES, version) as file:
            for s in file:
                s = s.split('#', 1)[0].strip()
                if not s:
                    continue

                r, p = s.split(";")
                r = r.strip()
                p = p.strip()
                if ".." in r:
                    first, last = [int(c, 16) for c in r.split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(r, 16)]
                for char in chars:
                    if table[char]:
                        # Some properties (e.g. Default_Ignorable_Code_Point)
                        # apply to unassigned code points; ignore them
                        table[char][-1].add(p)

        with open_data(LINE_BREAK, version) as file:
            for s in file:
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
            for s in file:
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                # each property owns a 2-bit slot, in qc_order order
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    # the slot must not have been set by an earlier line
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
        # field 17: packed normalization quickcheck bits
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(quickchecks[i])

        with open_data(UNIHAN, version) as file:
            # the context manager closes the archive once the member is read
            with zipfile.ZipFile(file) as archive:
                if version == '3.2.0':
                    data = archive.read('Unihan-3.2.0.txt')
                else:
                    data = archive.read('Unihan_NumericValues.txt')
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value
        sc = self.special_casing = {}
        with open_data(SPECIAL_CASING, version) as file:
            for s in file:
                s = s[:-1].split('#', 1)[0]
                if not s:
                    continue
                data = s.split("; ")
                if data[4]:
                    # We ignore all conditionals (since they depend on
                    # languages) except for one, which is hardcoded. See
                    # handle_capital_sigma in unicodeobject.c.
                    continue
                c = int(data[0], 16)
                lower = [int(char, 16) for char in data[1].split()]
                title = [int(char, 16) for char in data[2].split()]
                upper = [int(char, 16) for char in data[3].split()]
                sc[c] = (lower, title, upper)
        cf = self.case_folding = {}
        if version != '3.2.0':
            with open_data(CASE_FOLDING, version) as file:
                for s in file:
                    s = s[:-1].split('#', 1)[0]
                    if not s:
                        continue
                    data = s.split("; ")
                    # keep only status C and F entries; tuple membership
                    # (rather than a substring test against "CF") rejects
                    # an empty or multi-letter status field
                    if data[1] in ("C", "F"):
                        c = int(data[0], 16)
                        cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))
1152	n/a
1153	n/a	# hash table tools
1154	n/a
1155	n/a	# this is a straight-forward reimplementation of Python's built-in
1156	n/a	# dictionary type, using a static data structure, and a custom string
1157	n/a	# hash algorithm.
1158	n/a
def myhash(s, magic):
    """Return the hash of *s* (compared case-insensitively) for multiplier *magic*."""
    acc = 0
    for ch in s.upper():
        acc = acc * magic + ord(ch)
        high = acc & 0xff000000
        if high:
            # fold bits 24..31 back into the low byte and keep the low 24 bits
            acc = (acc ^ ((high >> 24) & 0xff)) & 0x00ffffff
    return acc
1167	n/a
# Candidate hash table sizes, in increasing order, as (size, poly) pairs.
# Hash.__init__ picks the first entry whose size exceeds the number of items
# and uses size + poly as the value XORed into the probe increment once that
# increment grows past the table mask.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1174	n/a
class Hash:
    """Static hash table built from a list of (key, value) pairs."""

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: first entry of SIZES bigger than the data
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        slots = [None] * size
        mask = size - 1
        collisions = 0

        # initialize hash table
        for key, value in data:
            h = myhash(key, magic)
            pos = (~h) & mask
            if slots[pos] is None:
                slots[pos] = value
                continue
            # slot taken: probe with an increment derived from the hash,
            # falling back to mask when that increment would be zero
            step = ((h ^ (h >> 3)) & mask) or mask
            while True:
                collisions += 1
                pos = (pos + step) & mask
                if slots[pos] is None:
                    slots[pos] = value
                    break
                step <<= 1
                if step > mask:
                    step ^= poly

        print(collisions, "collisions")
        self.collisions = collisions

        # unused slots are emitted as 0
        filled = [0 if entry is None else entry for entry in slots]

        self.data = Array(name + "_hash", filled)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array followed by the table parameters
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1238	n/a
1239	n/a	# stuff to deal with arrays of unsigned integers
1240	n/a
class Array:
    """A named sequence of unsigned integers that can be written as a C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        width = getsize(self.data)
        if trace:
            print(self.name+":", width*len(self.data), "bytes", file=sys.stderr)
        # element type is chosen from the byte width reported by getsize()
        if width == 1:
            ctype = "unsigned char"
        elif width == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            # NOTE(review): the line-indent literal is a single space in this
            # copy of the source; whitespace may have been collapsed, so
            # confirm against the upstream file.
            line = " "
            for item in self.data:
                cell = str(item) + ", "
                if len(line) + len(cell) > 78:
                    # current line is full: flush it and start a new one
                    file.write(line + "\n")
                    line = " " + cell
                else:
                    line = line + cell
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1272	n/a
def getsize(data):
    """Return the smallest integer size in bytes (1, 2 or 4) for *data*.

    *data* is an iterable of non-negative ints.  The size is chosen from
    its largest element: 1 below 256, 2 below 65536, otherwise 4.  An
    empty *data* yields 1 instead of raising ValueError from max().
    """
    maxdata = max(data, default=0)
    if maxdata < 256:
        return 1
    if maxdata < 65536:
        return 2
    return 4
1282	n/a
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    if trace:
        def dump(t1, t2, shift, nbytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, nbytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t),
              "bytes", file=sys.stderr)
    last_index = len(t) - 1
    # the most we can shift the last valid index and still have something left
    maxshift = last_index.bit_length() - 1 if last_index > 0 else 0
    best_size = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for start in range(0, len(t), size):
            chunk = t[start:start+size]
            index = bincache.get(chunk)
            if index is None:
                # first time this chunk is seen: store it at the end of t2
                index = len(t2)
                bincache[chunk] = index
                t2.extend(chunk)
            t1.append(index >> shift)
        # determine memory size: element count times element width
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < best_size:
            best = t1, t2, shift
            best_size = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, best_size)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i, expected in enumerate(t):
            assert expected == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
1343	n/a
# Script entry point: only runs when the file is executed directly.
# NOTE(review): maketables is defined earlier in this file; the argument 1
# is presumably its trace flag -- confirm against its signature.
if __name__ == "__main__":
    maketables(1)