» Core Development > Code coverage > Tools/unicode/gencodec.py

Python code coverage for Tools/unicode/gencodec.py

# | count | content
1n/a""" Unicode Mapping Parser and Codec Generator.
2n/a
3n/aThis script parses Unicode mapping files as available from the Unicode
4n/asite (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
5n/amodules from them. The codecs use the standard character mapping codec
6n/ato actually apply the mapping.
7n/a
8n/aSynopsis: gencodec.py dir codec_prefix
9n/a
10n/aAll files in dir are scanned and those producing non-empty mappings
11n/awill be written to <codec_prefix><mapname>.py with <mapname> being the
12n/afirst part of the map's filename ('a' in a.b.c.txt) converted to
13n/alowercase with hyphens replaced by underscores.
14n/a
15n/aThe tool also writes marshalled versions of the mapping tables to the
16n/asame location (with .mapping extension).
17n/a
18n/aWritten by Marc-Andre Lemburg (mal@lemburg.com).
19n/a
20n/a(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
21n/a(c) Copyright Guido van Rossum, 2000.
22n/a
23n/aTable generation:
24n/a(c) Copyright Marc-Andre Lemburg, 2005.
25n/a Licensed to PSF under a Contributor Agreement.
26n/a
27n/a"""#"
28n/a
29n/aimport re, os, marshal, codecs
30n/a
# Largest key a generated charmap decoding table may contain
MAX_TABLE_SIZE = 8192

# Code point written into decoding tables for undefined entries
UNI_UNDEFINED = '\ufffe'

# Marker used in place of a code point that is missing
MISSING_CODE = -1

# Pattern for one line of a mapping file:
#   group 1 - one or more '+'-joined hex codes (the encoded value)
#   group 2 - zero or more '+'-joined hex codes or <meta> codes
#             (the Unicode value; may be empty)
#   group 3 - optional trailing comment, including the leading '#'
_ENCODED_CODES = r'((?:0x[0-9a-fA-F]+\+?)+)'
_UNICODE_CODES = r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
_COMMENT = r'(#.+)?'

mapRE = re.compile(_ENCODED_CODES + r'\s+' + _UNICODE_CODES + r'\s*' + _COMMENT)
45n/a
def parsecodes(codes, len=len, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal codes joined by '+', e.g.
        '0x0041' or '0x0041+0x0301'.

        meta-codes (in angular brackets, e.g. <LR> and <RL>) and any
        other part that is not a valid hexadecimal number are ignored.

        Returns MISSING_CODE if codes is empty or contains no valid
        code at all, a plain integer if exactly one valid code is
        left, and a tuple of integers otherwise.

        The len and range parameters are only kept for backward
        compatibility of the signature.

    """
    if not codes:
        return MISSING_CODE
    values = []
    for part in codes.split('+'):
        try:
            values.append(int(part, 16))
        except ValueError:
            # Meta-code, empty part (trailing '+') or malformed number:
            # ignore it, whether or not it is the only part given
            continue
    if not values:
        # Nothing usable was found; report the code as missing instead
        # of raising or returning an empty tuple
        return MISSING_CODE
    if len(values) == 1:
        return values[0]
    return tuple(values)
72n/a
def readmap(filename):

    """ Reads the mapping file filename and returns a decoding
        dictionary.

        Keys are the encoded codes as returned by parsecodes() (an
        integer or a tuple of integers), values are tuples
        (unicode_code, comment).

        If at least as many single-byte codes are identity-mapped as
        are left unmapped, the unmapped single-byte codes are added
        with MISSING_CODE as value and the special key 'IDENTITY' is
        set to 256.

        Blank lines, comment lines and lines not matching mapRE are
        skipped.

    """
    enc2uni = {}
    identity = []
    unmapped = list(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity.append(i)
        unmapped.remove(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    # The with-block guarantees the file is closed even if parsing
    # one of the lines raises
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if not line or line[0] == '#':
                continue
            m = mapRE.match(line)
            if not m:
                continue
            enc, uni, comment = m.groups()
            enc = parsecodes(enc)
            uni = parsecodes(uni)
            if comment is None:
                comment = ''
            else:
                # Drop the leading '#'
                comment = comment[1:].strip()
            if not isinstance(enc, tuple) and enc < 256:
                if enc in unmapped:
                    unmapped.remove(enc)
                if enc == uni:
                    identity.append(enc)
            enc2uni[enc] = (uni, comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (MISSING_CODE, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni
123n/a
def hexrepr(t, precision=4):

    """ Returns a hexadecimal source representation of t.

        None is rendered as 'None', a value without a length as a
        single zero-padded hex literal with at least precision digits,
        and a sequence as a parenthesized, comma separated list of
        such literals.

        A TypeError raised while formatting the items of a sequence
        is reported on stdout and re-raised.

    """
    if t is None:
        return 'None'
    fmt = '0x%0*X'
    try:
        len(t)
    except TypeError:
        # No length: treat t as a single code
        return fmt % (precision, t)
    pieces = []
    try:
        for item in t:
            pieces.append(fmt % (precision, item))
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise
    return '(' + ', '.join(pieces) + ')'
138n/a
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns a list of source code lines defining the dictionary
        varname from map.

        Keys and values of map may be plain codes or tuples
        (code, comment); the comment is emitted as trailing comment
        if comments is true. precisions gives the minimum number of
        hex digits used for keys and values.

        If map has an 'IDENTITY' entry, the dictionary is seeded with
        codecs.make_identity_dict() and identity mappings below 256
        are left out. The 'IDENTITY' entry is removed from map in
        place; codegen() passes the same dictionary on to further
        steps which expect only code entries.

        Entries with a missing key (None or MISSING_CODE) are
        skipped; a MISSING_CODE value is written as None.

    """
    l = []
    append = l.append
    if "IDENTITY" in map:
        append("%s = codecs.make_identity_dict(range(%d))" %
               (varname, map["IDENTITY"]))
        append("%s.update({" % varname)
        splits = 1
        del map["IDENTITY"]
        identity = 1
    else:
        append("%s = {" % varname)
        splits = 0
        identity = 0

    mappings = sorted(map.items())
    i = 0
    key_precision, value_precision = precisions
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None or mapkey == MISSING_CODE:
            # A missing code cannot be used as dictionary key
            continue
        if mapvalue == MISSING_CODE:
            # Undefined mappings are written as None; formatting the
            # placeholder itself would not give a valid hex literal
            mapvalue = None
        if (identity and
            mapkey == mapvalue and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            append('    %s: %s,\t# %s' % (key, value, mapcomment))
        else:
            append('    %s: %s,' % (key, value))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('%s.update({' % varname)
            i = 0
            splits = splits + 1
    if splits == 0:
        append('}')
    else:
        append('})')

    return l
195n/a
def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns a list of source code lines defining the decoding
        table varname from map, or None if no table can be built.

        The table is a parenthesized sequence of adjacent string
        literals, one character per key from 0 up to the highest key
        found in map (at least 255). Keys without a mapping and keys
        mapped to MISSING_CODE get UNI_UNDEFINED.

        None is returned if the highest key exceeds MAX_TABLE_SIZE or
        if a key maps to more than one code point.

        An 'IDENTITY' entry in map pre-fills keys 0-255 with identity
        mappings and is removed from map in place.

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 255
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        del map['IDENTITY']
    # Sort only after the string key 'IDENTITY' has been removed:
    # it cannot be ordered against the integer keys
    mappings = sorted(map.items())
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey == MISSING_CODE:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    maxchar = 0
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = MISSING_CODE
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue == MISSING_CODE:
            mapchar = UNI_UNDEFINED
        else:
            if isinstance(mapvalue, tuple):
                # 1-n mappings not supported
                return None
            else:
                mapchar = chr(mapvalue)
        maxchar = max(maxchar, ord(mapchar))
        if mapcomment and comments:
            append('    %a \t# %s -> %s' % (mapchar,
                                            hexrepr(key, key_precision),
                                            mapcomment))
        else:
            append('    %a' % mapchar)

    if maxchar < 256:
        # All characters fit into one byte: add one wide character
        append('    %a \t## Widen to UCS2 for optimization' % UNI_UNDEFINED)
    append(')')
    return l
253n/a
def codegen(name, map, encodingname, comments=1):

    """ Returns Python source for the given map.

        name is the name of the mapping source mentioned in the
        module docstring, encodingname the codec name; underscores in
        it are replaced by hyphens for the CodecInfo name.

        Comments are included in the source, if comments is true (default).

        The generated module uses a decoding table together with
        codecs.charmap_build() if a table can be generated for map,
        and a decoding map plus an encoding map otherwise.

    """
    # Generate code
    #
    # The order of these calls matters: python_mapdef_code() removes
    # the 'IDENTITY' entry from map in place, and the steps below
    # operate on the same dictionary.
    decoding_map_code = python_mapdef_code(
        'decoding_map',
        map,
        comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table',
        map,
        comments=comments)

    if decoding_table_code:
        suffix = 'table'
    else:
        suffix = 'map'

    l = [
        '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self, input, errors='strict'):
        return codecs.charmap_encode(input, errors, encoding_%s)

    def decode(self, input, errors='strict'):
        return codecs.charmap_decode(input, errors, decoding_%s)
''' % (encodingname, name, suffix, suffix)]
    l.append('''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input, self.errors, encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input, self.errors, decoding_%s)[0]''' %
        (suffix, suffix))

    l.append('''
class StreamWriter(Codec, codecs.StreamWriter):
    pass

class StreamReader(Codec, codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-'))

    # Add decoding table or map (with preference to the table),
    # followed by the matching encoding table or map
    if decoding_table_code:
        l.append('''
### Decoding Table
''')
        l.extend(decoding_table_code)
        l.append('''
### Encoding table
encoding_table = codecs.charmap_build(decoding_table)
''')
    else:
        l.append('''
### Decoding Map
''')
        l.extend(decoding_map_code)
        l.append('''
### Encoding Map
''')
        # The encoding map is only needed (and thus only built) when
        # no decoding table is available
        l.extend(python_mapdef_code(
            'encoding_map',
            codecs.make_encoding_map(map),
            comments=comments,
            precisions=(4, 2)))

    # Final new-line
    l.append('')

    return '\n'.join(l).expandtabs()
358n/a
def pymap(name,map,pyfile,encodingname,comments=1):

    """ Writes the codec source generated by codegen() for map to
        the file pyfile.

        name, map, encodingname and comments are passed on to
        codegen().

    """
    code = codegen(name,map,encodingname,comments)
    # The with-block closes the file even if writing fails
    with open(pyfile,'w') as f:
        f.write(code)
365n/a
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        Every value of map has to be a pair (unicode, comment);
        name is not used.

    """
    # Unpacking each value makes sure that only (unicode, comment)
    # pairs end up in the file
    d = {e: (u, c) for e, (u, c) in map.items()}
    # The with-block closes the file even if marshal.dump() fails
    with open(marshalfile,'wb') as f:
        marshal.dump(d,f)
374n/a
def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts all mapping files found in the directory dir.

        For each regular file a codec name is derived from the part
        of the file name before the first dot (hyphens replaced by
        underscores, lowercased, prefixed with nameprefix). The codec
        module is written to dirprefix + <name>.py and the marshalled
        mapping to dirprefix + <name>.mapping.

        Files producing an empty map are skipped. A ValueError raised
        during conversion is reported on stdout and re-raised.

    """
    for entry in os.listdir(dir):
        source_path = os.path.join(dir, entry)
        if not os.path.isfile(source_path):
            continue

        stem = os.path.split(entry)[1].replace('-','_').split('.')[0]
        name = nameprefix + stem.lower()
        py_target = dirprefix + name + '.py'
        marshal_target = dirprefix + name + '.mapping'
        print('converting %s to %s and %s' % (entry,
                                              py_target,
                                              marshal_target))
        try:
            mapping = readmap(source_path)
            if mapping:
                pymap(source_path, mapping, py_target, name, comments)
                marshalmap(source_path, mapping, marshal_target)
            else:
                print('* map is empty; skipping')
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise
402n/a
def rewritepythondir(dir, dirprefix='', comments=1):

    """ Regenerates the codec modules from the marshalled mappings
        found in the directory dir.

        Every file <name>.mapping in dir is loaded and the codec
        source is written to dirprefix + <name>.py; other files are
        ignored and empty maps are skipped.

        A ValueError raised during conversion is reported on stdout;
        processing then continues with the next file.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        if not mapname.endswith('.mapping'):
            continue
        name = mapname[:-len('.mapping')]
        codefile = name + '.py'
        print('converting %s to %s' % (mapname,
                                       dirprefix + codefile))
        try:
            # NOTE: marshal is not safe against erroneous or
            # maliciously constructed data; only use this on .mapping
            # files from a trusted source.
            # The with-block makes sure the file is closed again.
            with open(os.path.join(dir,mapname), 'rb') as f:
                map = marshal.load(f)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, dirprefix + codefile,name,comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)
422n/a
if __name__ == '__main__':

    import sys

    # Command line arguments are passed on positionally.
    cli_args = sys.argv[1:]

    # Manual switch: the first branch converts mapping files, the
    # second one regenerates the modules from .mapping files.
    if True:
        convertdir(*cli_args)
    else:
        rewritepythondir(*cli_args)