» Core Development » Code coverage » Tools/unicode/makeunicodedata.py

Python code coverage for Tools/unicode/makeunicodedata.py

#countcontent
1n/a#
2n/a# (re)generate unicode property and type databases
3n/a#
4n/a# this script converts a unicode 3.2 database file to
5n/a# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6n/a# and Objects/unicodetype_db.h
7n/a#
8n/a# history:
9n/a# 2000-09-24 fl created (based on bits and pieces from unidb)
10n/a# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
11n/a# 2000-09-25 fl added character type table
12n/a# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
13n/a# 2000-11-03 fl expand first/last ranges
14n/a# 2001-01-19 fl added character name tables (2.1)
15n/a# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
16n/a# 2002-09-11 wd use string methods
17n/a# 2002-10-18 mvl update to Unicode 3.2
18n/a# 2002-10-22 mvl generate NFC tables
19n/a# 2002-11-24 mvl expand all ranges, sort names version-independently
20n/a# 2002-11-25 mvl add UNIDATA_VERSION
21n/a# 2004-05-29 perky add east asian width information
22n/a# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
23n/a# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
24n/a# 2011-10-21 ezio add support for name aliases and named sequences
25n/a# 2012-01 benjamin add full case mappings
26n/a#
27n/a# written by Fredrik Lundh (fredrik@pythonware.com)
28n/a#
29n/a
30n/aimport os
31n/aimport sys
32n/aimport zipfile
33n/a
34n/afrom textwrap import dedent
35n/a
# Path of this script (used in the "generated by" header of output files)
# and the generator's own version stamp.
SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
# --------------------
# When changing UCD version please update
# * Doc/library/stdtypes.rst, and
# * Doc/library/unicodedata.rst
# * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "9.0.0"
# Filename templates for the UCD input files; the "%s" slot receives
# "-<version>" (or "" for the display name) -- see open_data().
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# Older UCD releases for which delta (change) records are generated.
old_versions = ["3.2.0"]

# General-category names; a record's category is stored as an index into
# this list (index 0, "Cn", doubles as "unassigned").
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class names, likewise stored by index (0 = empty/none).
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

# East-Asian-width category names, stored by index.
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Line_Break classes that force a mandatory break.
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
# (hex codepoint strings as they appear in UnicodeData.txt ranges)
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FD5'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
]
108n/a
def maketables(trace=0):
    """Drive the whole generation: load the current UCD, merge deltas
    for each supported older release, then emit all three headers."""

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    current = UnicodeData(UNIDATA_VERSION)

    print(sum(1 for rec in current.table if rec), "characters")

    # Fold each older release into the current data as change records.
    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-" + version), "...")
        previous = UnicodeData(version, cjk_check=False)
        print(sum(1 for rec in previous.table if rec), "characters")
        merge_old_version(version, current, previous)

    makeunicodename(current, trace)
    makeunicodedata(current, trace)
    makeunicodetype(current, trace)
127n/a
128n/a# --------------------------------------------------------------------
129n/a# unicode character properties
130n/a
def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h.

    Emits: the deduplicated per-character property records plus a
    two-level index, the decomposition data and its index, the NFC
    composition pair tables, and per-old-version change records.
    *unicode* is a UnicodeData instance; *trace* is passed to splitbins.
    """

    # Record 0 is a sentinel shared by all unassigned code points.
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables (deduplicated via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # nfd_nfkd in unicodedata.c has a fixed-size buffer;
                # guard against decompositions that would overflow it.
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix: a leading "<...>" tag marks a compatibility mapping
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: first word packs prefix index (low byte) and
                # length (high bits); the rest are the target code points.
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical (no prefix) two-character
                # decompositions, minus exclusions and non-starters.
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp is a list while decomp_data holds
                # flat ints, so this index() lookup never matches and every
                # decomposition is appended -- dedup is ineffective here.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Renumber NFC first/last characters densely and collect the
    # contiguous code-point ranges they occupy.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # Dense (first x last) -> composed-character matrix.
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("    { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("    {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("    { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("    {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions (filled by merge_old_version)
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()
379n/a
380n/a# --------------------------------------------------------------------
381n/a# unicode character type tables
382n/a
def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h.

    Emits the deduplicated character type records (case mappings, digit
    values, and the *_MASK flag bits), the extended-case array for
    multi-character mappings, a two-level index, and the generated
    _PyUnicode_ToNumeric / _PyUnicode_IsWhitespace / _PyUnicode_IsLinebreak
    C functions.
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types; record 0 is the shared "unassigned" dummy
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []
    extra_casing = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            # NOTE(review): 'delta' is assigned but never used below.
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # printable: everything except other control/separator
            # categories, with U+0020 SPACE as the single exception
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            # simple one-to-one case mappings from UnicodeData.txt;
            # empty fields mean "maps to itself" (title falls back to upper)
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            # a case folding different from the lowercase mapping forces
            # the extended-case representation below
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    # store deltas relative to the character itself
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                # pack offset into extra_casing (low bits) with the mapping
                # length in bits 24+ (and folding length in bits 20+)
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables (deduplicated via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* extended case mappings */", file=fp)
    print(file=fp)
    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
    for c in extra_casing:
        print("    %d," % c, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals (fractions become "a/b" divisions)
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()
592n/a
593n/a# --------------------------------------------------------------------
594n/a# unicode name database
595n/a
def makeunicodename(unicode, trace):
    """Write Modules/unicodename_db.h.

    Builds a compressed code->name phrasebook (names split into words,
    words stored once in a shared lexicon with tail reuse), the reverse
    name->code hash table, and the name-alias / named-sequence tables.
    """

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names (skip "<...>" placeholder names; append a NUL
    # terminator so end-of-name words differ from mid-name words)
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence.  the
    # latter includes the trailing null byte.

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                # the list length doubles as a per-word use count;
                # the first element records the insertion order
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon; word indexes >= short
    # are stored as a two-byte escape sequence
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # every phrasebook entry must fit in one byte
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split decomposition index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    print(file=fp)
    print('static const unsigned int aliases_start = %#x;' %
          NAME_ALIASES_START, file=fp)
    print('static const unsigned int aliases_end = %#x;' %
          (NAME_ALIASES_START + len(unicode.aliases)), file=fp)

    print('static const unsigned int name_aliases[] = {', file=fp)
    for name, codepoint in unicode.aliases:
        print('    0x%04X,' % codepoint, file=fp)
    print('};', file=fp)

    # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
    # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
    # sequences or sequences with non-BMP chars are added.
    # unicodedata_lookup should be adapted too.
    print(dedent("""
        typedef struct NamedSequence {
            int seqlen;
            Py_UCS2 seq[4];
        } named_sequence;
        """), file=fp)

    print('static const unsigned int named_sequences_start = %#x;' %
          NAMED_SEQUENCES_START, file=fp)
    print('static const unsigned int named_sequences_end = %#x;' %
          (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

    print('static const named_sequence named_sequences[] = {', file=fp)
    for name, sequence in unicode.named_sequences:
        seq_str = ', '.join('0x%04X' % cp for cp in sequence)
        print('    {%d, {%s}},' % (len(sequence), seq_str), file=fp)
    print('};', file=fp)

    fp.close()
787n/a
788n/a
def merge_old_version(version, new, old):
    """Record the differences between an *old* UnicodeData and the *new*
    one as per-codepoint change arrays, appended to new.changed.

    Each field index k of a UnicodeData record is handled explicitly;
    an unrecognized differing field raises an exception so new UCD
    fields cannot be silently dropped.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    east_asian_width_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 15:
                        # change to east asian width
                        east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          east_asian_width_changes,
                                          numeric_changes)),
                        normalization_changes))
884n/a
def open_data(template, version):
    """Open the UCD file *template* for *version*, downloading it from
    unicode.org first if it is not already present locally.

    Returns a UTF-8 text stream for .txt files, a binary stream
    otherwise (i.e. Unihan.zip).
    """
    local = template % ('-' + version,)
    if not os.path.exists(local):
        import urllib.request
        if version == '3.2.0':
            # irregular url structure
            url = 'http://www.unicode.org/Public/3.2-Update/' + local
        else:
            url = ('http://www.unicode.org/Public/%s/ucd/' + template) % (version, '')
        urllib.request.urlretrieve(url, filename=local)
    if local.endswith('.txt'):
        return open(local, encoding='utf-8')
    # Unihan.zip
    return open(local, 'rb')
900n/a
901n/a# --------------------------------------------------------------------
902n/a# the following support code is taken from the unidb utilities
903n/a# Copyright (c) 1999-2000 by Secret Labs AB
904n/a
905n/a# load a unicode-data file from disk
906n/a
class UnicodeData:
    """In-memory representation of the Unicode Character Database.

    __init__ opens the UCD data files (via open_data) and builds
    ``self.table``: a list of 0x110000 per-code-point records -- the
    ';'-separated fields of UnicodeData.txt as a list of strings, or
    None for unassigned code points.  Three extra fields are appended
    to every record below: the east-asian width, a set of derived
    properties, and the packed normalization quickcheck bits.
    """

    # Record structure:
    # [ID, name, category, combining, bidi, decomp, (6)
    # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    # derived-props] (17)

    def __init__(self, version,
                 linebreakprops=False,
                 expand=1,
                 cjk_check=True):
        # delta records for older UCD versions; filled in elsewhere,
        # not in this constructor
        self.changed = []
        table = [None] * 0x110000
        # parse UnicodeData.txt: one ';'-separated record per line,
        # keyed by the code point in field 0
        with open_data(UNICODE_DATA, version) as file:
            while 1:
                s = file.readline()
                if not s:
                    break
                s = s.strip().split(";")
                char = int(s[0], 16)
                table[char] = s

        cjk_ranges_found = []

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        if s[1].startswith("<CJK Ideograph"):
                            cjk_ranges_found.append((field[0],
                                                     s[0]))
                        s[1] = ""
                        field = None
                elif field:
                    # unlisted code point inside a First>..Last> range:
                    # clone the range's record for it
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2
        if cjk_check and cjk_ranges != cjk_ranges_found:
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            with open_data(NAME_ALIASES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    char, name, abbrev = s.split(';')
                    char = int(char, 16)
                    self.aliases.append((name, char))
                    # also store the name in the PUA 1
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in the PUA 1, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            with open_data(NAMED_SEQUENCES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    name, chars = s.split(';')
                    chars = tuple(int(char, 16) for char in chars.split())
                    # check that the structure defined in makeunicodename is OK
                    assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                    assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                        "the NamedSequence struct and in unicodedata_lookup")
                    self.named_sequences.append((name, chars))
                    # also store these in the PUA 1
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        # composition exclusions: code points listed in
        # CompositionExclusions.txt (first token of each data line)
        self.exclusions = {}
        with open_data(COMPOSITION_EXCLUSIONS, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                char = int(s.split()[0],16)
                self.exclusions[char] = 1

        # east asian width, from lines like "3400..4DB5;W"
        widths = [None] * 0x110000
        with open_data(EASTASIAN_WIDTH, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                s = s.split()[0].split(';')
                if '..' in s[0]:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(s[0], 16)]
                for char in chars:
                    widths[char] = s[1]

        # append the east asian width as an extra field on each record
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        # append a set that collects the derived properties (and the
        # 'Line_Break' marker added below) for each assigned code point
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())

        with open_data(DERIVED_CORE_PROPERTIES, version) as file:
            for s in file:
                s = s.split('#', 1)[0].strip()
                if not s:
                    continue

                r, p = s.split(";")
                r = r.strip()
                p = p.strip()
                if ".." in r:
                    first, last = [int(c, 16) for c in r.split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(r, 16)]
                for char in chars:
                    if table[char]:
                        # Some properties (e.g. Default_Ignorable_Code_Point)
                        # apply to unassigned code points; ignore them
                        table[char][-1].add(p)

        # mark code points whose line-break class is mandatory
        with open_data(LINE_BREAK, version) as file:
            for s in file:
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
            for s in file:
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                # two bits per property, packed in qc_order order
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    # each property's two bits must not be set twice
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(quickchecks[i])

        # patch the numeric field (index 8) of CJK ideographs from the
        # Unihan numeric-value tags
        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)  # NOTE(review): shadows builtin 'zip'
            if version == '3.2.0':
                data = zip.open('Unihan-3.2.0.txt').read()
            else:
                data = zip.open('Unihan_NumericValues.txt').read()
            for line in data.decode("utf-8").splitlines():
                if not line.startswith('U+'):
                    continue
                code, tag, value = line.split(None, 3)[:3]
                if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                               'kOtherNumeric'):
                    continue
                value = value.strip().replace(',', '')
                i = int(code[2:], 16)
                # Patch the numeric field
                if table[i] is not None:
                    table[i][8] = value
        # full case mappings: unconditional SpecialCasing.txt entries only
        sc = self.special_casing = {}
        with open_data(SPECIAL_CASING, version) as file:
            for s in file:
                s = s[:-1].split('#', 1)[0]
                if not s:
                    continue
                data = s.split("; ")
                if data[4]:
                    # We ignore all conditionals (since they depend on
                    # languages) except for one, which is hardcoded. See
                    # handle_capital_sigma in unicodeobject.c.
                    continue
                c = int(data[0], 16)
                lower = [int(char, 16) for char in data[1].split()]
                title = [int(char, 16) for char in data[2].split()]
                upper = [int(char, 16) for char in data[3].split()]
                sc[c] = (lower, title, upper)
        # case folding: keep entries whose status field is 'C' or 'F' only
        cf = self.case_folding = {}
        if version != '3.2.0':
            with open_data(CASE_FOLDING, version) as file:
                for s in file:
                    s = s[:-1].split('#', 1)[0]
                    if not s:
                        continue
                    data = s.split("; ")
                    if data[1] in "CF":
                        c = int(data[0], 16)
                        cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))
1152n/a
1153n/a# hash table tools
1154n/a
1155n/a# this is a straight-forward reimplementation of Python's built-in
1156n/a# dictionary type, using a static data structure, and a custom string
1157n/a# hash algorithm.
1158n/a
def myhash(s, magic):
    # Case-insensitive 24-bit string hash used when building the static
    # name-lookup tables.
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        overflow = h & 0xff000000
        if overflow:
            # fold the overflowing byte back into the low 24 bits
            h = (h ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return h
1167n/a
# Candidate hash-table sizes: (power-of-two size, poly) pairs.  Hash below
# picks the first size larger than the data set and uses the associated
# polynomial when a probing increment overflows the table mask.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1174n/a
class Hash:
    """Static open-addressing hash table, dumpable as C definitions."""

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # pick the smallest power-of-two size that exceeds the data set,
        # together with its probing polynomial
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        slots = [None] * size
        mask = size - 1
        collisions = 0

        # insert every entry, probing with a key-derived step on collision
        for key, value in data:
            h = myhash(key, magic)
            slot = (~h) & mask
            if slots[slot] is None:
                slots[slot] = value
                continue
            step = (h ^ (h >> 3)) & mask
            if not step:
                step = mask
            while 1:
                collisions = collisions + 1
                slot = (slot + step) & mask
                if slots[slot] is None:
                    slots[slot] = value
                    break
                step = step << 1
                if step > mask:
                    # reduce the step with the table's polynomial
                    step = step ^ poly

        print(collisions, "collisions")
        self.collisions = collisions

        # empty slots become 0 in the emitted C array
        for slot in range(len(slots)):
            if slots[slot] is None:
                slots[slot] = 0

        self.data = Array(name + "_hash", slots)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1238n/a
1239n/a# stuff to deal with arrays of unsigned integers
1240n/a
class Array:
    """An array of unsigned integers, dumpable as a static C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        itemsize = getsize(self.data)
        if trace:
            print(self.name+":", itemsize*len(self.data), "bytes", file=sys.stderr)
        file.write("static ")
        if itemsize == 1:
            file.write("unsigned char")
        elif itemsize == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            line = " "
            for item in self.data:
                text = str(item) + ", "
                if len(line) + len(text) > 78:
                    # keep generated lines below 78 columns
                    file.write(line + "\n")
                    line = " " + text
                else:
                    line = line + text
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1272n/a
def getsize(data):
    """Return the smallest unsigned C integer size (1, 2 or 4 bytes)
    able to hold every value in *data*.

    An empty sequence yields 1: Array.dump() calls getsize()
    unconditionally, before checking whether the array is empty, so a
    plain max(data) would raise ValueError for an empty table.
    """
    maxdata = max(data, default=0)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
1282n/a
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t),
              "bytes", file=sys.stderr)

    # largest shift that still leaves a non-trivial first-level table
    hi = len(t) - 1
    maxshift = 0
    if hi > 0:
        while hi >> 1:
            hi >>= 1
            maxshift += 1

    bytes = sys.maxsize       # smallest total size seen so far
    t = tuple(t)              # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        binsize = 2**shift
        bincache = {}
        for start in range(0, len(t), binsize):
            bin = t[start:start+binsize]
            index = bincache.get(bin)
            if index is None:
                # first occurrence of this bin: append it to t2
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift)   # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
1343n/a
# Script entry point: regenerate the database files.  maketables is
# defined earlier in the file; the argument 1 is presumably a
# trace/verbosity flag -- confirm against maketables' signature.
if __name__ == "__main__":
    maketables(1)