» Core Development > Code coverage > Tools/unicode/mkstringprep.py

Python code coverage for Tools/unicode/mkstringprep.py

# | count | content
import re
import sys
from unicodedata import ucd_3_2_0 as unicodedata

# The loops below walk every code point up to 0x10FFFF, so refuse to run on
# an interpreter whose largest code point is 0xFFFF.
if sys.maxunicode == 65535:
    raise RuntimeError("need UCS-4 Python")

def gen_category(cats):
    """Yield every code point whose general category is in *cats*.

    Code points are visited in ascending order over ``range(0, 0x110000)``
    and looked up through the module-level ``unicodedata`` object.

    *cats* is tested with ``in``; pass a list, tuple or set of category
    names (a plain string would be matched by substring instead).
    """
    for i in range(0, 0x110000):
        if unicodedata.category(chr(i)) in cats:
            yield i

def gen_bidirectional(cats):
    """Yield every code point whose bidirectional class is in *cats*.

    Code points are visited in ascending order over ``range(0, 0x110000)``
    and looked up through the module-level ``unicodedata`` object.

    *cats* is tested with ``in``; pass a list, tuple or set of class names
    (a plain string would be matched by substring instead).
    """
    for i in range(0, 0x110000):
        if unicodedata.bidirectional(chr(i)) in cats:
            yield i

def compact_set(l):
    """Return Python source text for a ``set`` holding the integers in *l*.

    *l* must be an iterable of integers in ascending order.  Runs of
    consecutive values are collapsed into ``range(a,b)`` expressions and
    the remaining values are listed literally, e.g.::

        compact_set([1, 3, 4, 5, 6, 10])
        -> "set([1, 10] + list(range(3,7)))"

    A run that is followed by further values is only collapsed when it has
    at least four members (``span > 2``); the final run is collapsed as soon
    as it has two.  That asymmetry is kept on purpose so the generated text
    does not change.

    An empty *l* yields ``"set()"``.
    """
    singles = []
    ranges = []
    prev = None     # first value of the run currently being collected
    span = 0        # number of values in the run beyond ``prev``
    for e in l:
        if prev is None:
            prev = e
            span = 0
            continue
        if prev + span + 1 != e:
            # ``e`` does not extend the current run: flush it, start anew.
            if span > 2:
                ranges.append((prev, prev + span + 1))
            else:
                singles.extend(range(prev, prev + span + 1))
            prev = e
            span = 0
        else:
            span += 1

    if prev is None:
        # Nothing was iterated; without this guard ``None`` would be
        # emitted as a set member.
        return "set()"

    # Flush the final run.
    if span:
        ranges.append((prev, prev + span + 1))
    else:
        singles.append(prev)

    if not singles and len(ranges) == 1:
        # A lone range can be handed to set() directly.
        range_src = "range(%d,%d)" % ranges[0]
    else:
        range_src = " + ".join("list(range(%d,%d))" % t for t in ranges)

    if not singles:
        return "set(%s)" % range_src
    if not range_src:
        return "set(%r)" % (singles,)
    return "set(%r + %s)" % (singles, range_src)

############## Read the tables in the RFC #######################

with open("rfc3454.txt") as f:
    data = f.readlines()

# Marker lines look like "----- Start Table C.1.1 -----".  The dot between
# the name components is escaped so that only a literal "." is accepted.
table_marker = re.compile(r"----- (Start|End) Table ([A-Z](\.[0-9])+) -----")

# ``tables`` collects (name, dict) pairs in the order the tables appear in
# the RFC; the generation code below consumes them in that same order.
tables = []
curname = None
for line in data:
    line = line.strip()
    if not line:
        continue
    # Skip RFC page breaks
    if line.startswith(("Hoffman & Blanchet", "RFC 3454")):
        continue
    # Find start/end lines
    m = table_marker.match(line)
    if m:
        if m.group(1) == "Start":
            if curname:
                raise RuntimeError("Double Start", (curname, line))
            curname = m.group(2)
            table = {}
            tables.append((curname, table))
            continue
        else:
            if not curname:
                raise RuntimeError("End without start", line)
            if curname != m.group(2):
                raise RuntimeError("Unexpected end", line)
            curname = None
            continue
    if not curname:
        continue
    # Now we are in a table
    fields = line.split(";")
    if len(fields) > 1:
        # Drop comment field
        fields = fields[:-1]
    if len(fields) == 1:
        # Set-style entry: a single code point or a "start-end" range.
        # Every covered code point is mapped to itself.
        fields = fields[0].split("-")
        if len(fields) > 1:
            # range
            try:
                start, end = fields
            except ValueError:
                raise RuntimeError("Unpacking problem", line)
        else:
            start = end = fields[0]
        start = int(start, 16)
        end = int(end, 16)
        for i in range(start, end+1):
            table[i] = i
    else:
        # Mapping-style entry: "code; value..." where value is a
        # space-separated list of hex code points, possibly empty.
        code, value = fields
        value = value.strip()
        if value:
            value = [int(v, 16) for v in value.split(" ")]
        else:
            # table B.1
            value = None
        table[int(code, 16)] = value

########### Generate compact Python versions of the tables #############

# Everything printed from here on is the source text of the generated module.
print("""# This file is generated by mkstringprep.py. DO NOT EDIT.
\"\"\"Library that exposes various tables found in the StringPrep RFC 3454.

There are two kinds of tables: sets, for which a member test is provided,
and mappings, for which a mapping function is provided.
\"\"\"

from unicodedata import ucd_3_2_0 as unicodedata
""")

# Make the generated module assert the database version it was built from.
print("assert unicodedata.unidata_version == %r" % (unicodedata.unidata_version,))

# A.1 is the table of unassigned characters
# XXX Plane 15 PUA is listed as unassigned in Python.
name, table = tables.pop(0)
assert name == "A.1"
table = set(table)
Cn = set(gen_category(["Cn"]))

# FDD0..FDEF are process internal codes
Cn -= set(range(0xFDD0, 0xFDF0))
# not a character
Cn -= set(range(0xFFFE, 0x110000, 0x10000))
Cn -= set(range(0xFFFF, 0x110000, 0x10000))

# The equality check is disabled -- presumably because of the XXX
# discrepancy noted above (TODO confirm).
# assert table == Cn

print("""
def in_table_a1(code):
    if unicodedata.category(code) != 'Cn': return False
    c = ord(code)
    if 0xFDD0 <= c < 0xFDF0: return False
    return (c & 0xFFFF) not in (0xFFFE, 0xFFFF)
""")

# B.1 cannot easily be derived, so its code points are emitted literally
# (compacted into ranges where possible).
name, table = tables.pop(0)
assert name == "B.1"
table = sorted(table)
print("""
b1_set = """ + compact_set(table) + """
def in_table_b1(code):
    return ord(code) in b1_set
""")

# B.2 and B.3 is case folding.
# It takes CaseFolding.txt into account, which is
# not available in the Python database. Since
# B.2 is derived from B.3, we process B.3 first.
# B.3 supposedly *is* CaseFolding-3.2.0.txt.

name, table_b2 = tables.pop(0)
assert name == "B.2"

name, table_b3 = tables.pop(0)
assert name == "B.3"

# B.3 is mostly Python's .lower, except for a number
# of special cases, e.g. considering canonical forms.
#
# NOTE(review): the exceptions are computed from table_b2; table_b3 is
# read above but not used in this part of the script -- confirm intent
# before changing, since it determines the generated output.

b3_exceptions = {
    k: "".join(map(chr, v))
    for k, v in table_b2.items()
    if list(map(ord, chr(k).lower())) != v
}

b3 = sorted(b3_exceptions.items())

# Emit the exception dict four entries per output line.
print("""
b3_exceptions = {""")
for i, kv in enumerate(b3):
    print("0x%x:%a," % kv, end=' ')
    if i % 4 == 3:
        print()
print("}")

print("""
def map_table_b3(code):
    r = b3_exceptions.get(ord(code))
    if r is not None: return r
    return code.lower()
""")

def map_table_b3(code):
    """Return the B.3 mapping of the one-character string *code*.

    The module-level ``b3_exceptions`` dict (keyed by code point) takes
    precedence; any character without an entry maps to ``code.lower()``.
    This is the same function whose source the script prints into the
    generated module; it is defined here too so the script can call it.
    """
    r = b3_exceptions.get(ord(code))
    if r is not None:
        return r
    return code.lower()

# B.2 is case folding for NFKC. This is the same as B.3,
# except where NormalizeWithKC(Fold(a)) !=
# NormalizeWithKC(Fold(NormalizeWithKC(Fold(a))))

def map_table_b2(a):
    """Return the B.2 mapping of the one-character string *a*.

    Folds *a* with ``map_table_b3``, NFKC-normalizes the result, then
    folds and normalizes once more.  If the second pass changes the
    string, that twice-processed string is returned; otherwise the plain
    B.3 folding of *a* is returned.
    """
    al = map_table_b3(a)
    b = unicodedata.normalize("NFKC", al)
    bl = "".join([map_table_b3(ch) for ch in b])
    c = unicodedata.normalize("NFKC", bl)
    if b != c:
        return c
    return al

# Collect every B.2 entry that map_table_b2() fails to reproduce.
specials = {
    k: v
    for k, v in table_b2.items()
    if list(map(ord, map_table_b2(chr(k)))) != v
}

# B.3 should not add any additional special cases
assert specials == {}

print("""
def map_table_b2(a):
    al = map_table_b3(a)
    b = unicodedata.normalize("NFKC", al)
    bl = "".join([map_table_b3(ch) for ch in b])
    c = unicodedata.normalize("NFKC", bl)
    if b != c:
        return c
    else:
        return al
""")

# C.1.1 is a table with a single character
name, table = tables.pop(0)
assert name == "C.1.1"
assert table == {0x20:0x20}

print("""
def in_table_c11(code):
    return code == " "
""")

# C.1.2 is the rest of all space characters
name, table = tables.pop(0)
assert name == "C.1.2"

# The cross-check against category Zs is left disabled, as in the original:
# table = set(table.keys())
# Zs = set(gen_category(["Zs"])) - {0x20}
# assert Zs == table

print("""
def in_table_c12(code):
    return unicodedata.category(code) == "Zs" and code != " "

def in_table_c11_c12(code):
    return unicodedata.category(code) == "Zs"
""")

# C.2.1 ASCII control characters
name, table_c21 = tables.pop(0)
assert name == "C.2.1"

# The RFC table must be exactly the category-Cc code points below 128.
Cc = set(gen_category(["Cc"]))
Cc_ascii = Cc & set(range(128))
table_c21 = set(table_c21)
assert Cc_ascii == table_c21

print("""
def in_table_c21(code):
    return ord(code) < 128 and unicodedata.category(code) == "Cc"
""")

# C.2.2 Non-ASCII control characters. It also includes
# a number of characters in category Cf.
name, table_c22 = tables.pop(0)
assert name == "C.2.2"

# Every non-ASCII Cc code point must appear in the RFC table ...
Cc_nonascii = Cc - Cc_ascii
table_c22 = set(table_c22)
assert Cc_nonascii <= table_c22

# ... and whatever the table lists beyond those is emitted literally.
specials = sorted(table_c22 - Cc_nonascii)

print("""c22_specials = """ + compact_set(specials) + """
def in_table_c22(code):
    c = ord(code)
    if c < 128: return False
    if unicodedata.category(code) == "Cc": return True
    return c in c22_specials

def in_table_c21_c22(code):
    return unicodedata.category(code) == "Cc" or \\
           ord(code) in c22_specials
""")

# C.3 Private use
name, table = tables.pop(0)
assert name == "C.3"

# The RFC table must match category Co exactly.
Co = set(gen_category(["Co"]))
assert set(table) == Co

print("""
def in_table_c3(code):
    return unicodedata.category(code) == "Co"
""")

# C.4 Non-character code points, xFFFE, xFFFF
# plus process internal codes
name, table = tables.pop(0)
assert name == "C.4"

# FDD0..FDEF plus the last two code points of each of the 17 planes.
nonchar = set(range(0xFDD0,0xFDF0))
nonchar.update(range(0xFFFE,0x110000,0x10000))
nonchar.update(range(0xFFFF,0x110000,0x10000))
table = set(table)
assert table == nonchar

print("""
def in_table_c4(code):
    c = ord(code)
    if c < 0xFDD0: return False
    if c < 0xFDF0: return True
    return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF)
""")

# C.5 Surrogate codes
name, table = tables.pop(0)
assert name == "C.5"

# The RFC table must match category Cs exactly.
Cs = set(gen_category(["Cs"]))
assert set(table) == Cs

print("""
def in_table_c5(code):
    return unicodedata.category(code) == "Cs"
""")

# C.6 Inappropriate for plain text
name, table = tables.pop(0)
assert name == "C.6"

# Emitted literally as a compacted set of code points.
table = sorted(table)

print("""
c6_set = """ + compact_set(table) + """
def in_table_c6(code):
    return ord(code) in c6_set
""")

# C.7 Inappropriate for canonical representation
name, table = tables.pop(0)
assert name == "C.7"

# Emitted literally as a compacted set of code points.
table = sorted(table)

print("""
c7_set = """ + compact_set(table) + """
def in_table_c7(code):
    return ord(code) in c7_set
""")

# C.8 Change display properties or are deprecated
name, table = tables.pop(0)
assert name == "C.8"

# Emitted literally as a compacted set of code points.
table = sorted(table)

print("""
c8_set = """ + compact_set(table) + """
def in_table_c8(code):
    return ord(code) in c8_set
""")

# C.9 Tagging characters
name, table = tables.pop(0)
assert name == "C.9"

# Emitted literally as a compacted set of code points.
table = sorted(table)

print("""
c9_set = """ + compact_set(table) + """
def in_table_c9(code):
    return ord(code) in c9_set
""")

# D.1 Characters with bidirectional property "R" or "AL"
name, table = tables.pop(0)
assert name == "D.1"

# The RFC table must match the database's R/AL code points exactly.
RandAL = set(gen_bidirectional(["R","AL"]))
assert set(table) == RandAL

print("""
def in_table_d1(code):
    return unicodedata.bidirectional(code) in ("R","AL")
""")

# D.2 Characters with bidirectional property "L"
name, table = tables.pop(0)
assert name == "D.2"

# The RFC table must match the database's L code points exactly.
L = set(gen_bidirectional(["L"]))
assert set(table) == L

print("""
def in_table_d2(code):
    return unicodedata.bidirectional(code) == "L"
""")