» Core Development > Code coverage > Modules/cjkcodecs/_codecs_cn.c

Python code coverage for Modules/cjkcodecs/_codecs_cn.c

#  count  content
1n/a/*
2n/a * _codecs_cn.c: Codecs collection for Mainland Chinese encodings
3n/a *
4n/a * Written by Hye-Shik Chang <perky@FreeBSD.org>
5n/a */
6n/a
7n/a#include "cjkcodecs.h"
8n/a#include "mappings_cn.h"
9n/a
/**
 * hz is predefined as 100 on AIX, so we undefine it to avoid a
 * conflict with the hz codec's identifiers.
 */
/* Drop any `hz` macro on AIX so the identifier can be used for the codec. */
#if defined(_AIX)
#  undef hz
#endif
17n/a
/* GBK and GB2312 map a few code points differently, as listed below:
 *
 *         gb2312                        gbk
 * A1A4    U+30FB KATAKANA MIDDLE DOT    U+00B7 MIDDLE DOT
 * A1AA    U+2015 HORIZONTAL BAR         U+2014 EM DASH
 * A844    undefined                     U+2015 HORIZONTAL BAR
 */
25n/a
/*
 * GBK_DECODE(dc1, dc2, writer)
 *
 * Decode the two-byte sequence (dc1, dc2) as GBK and emit the result with
 * OUTCHAR.  The three pairs that GBK maps differently from GB2312 are tested
 * first; after that the gb2312 table is tried with the high bit of both
 * bytes flipped off, and finally the gbkext table with the raw bytes.
 *
 * The expansion is deliberately an open if / else-if chain: the call site
 * appends its own trailing `else` for the "not decodable" case, so this
 * must not be wrapped in do { } while (0).  A local named `decoded` has to
 * be in scope where the macro is used.  The `writer` parameter is not
 * referenced by the expansion.
 *
 * Both byte arguments are parenthesized before the XOR so that an
 * expression argument such as `a | b` keeps its intended grouping.
 */
#define GBK_DECODE(dc1, dc2, writer)                                        \
    if ((dc1) == 0xa1 && (dc2) == 0xaa) {                                   \
        OUTCHAR(0x2014);                                                    \
    }                                                                       \
    else if ((dc1) == 0xa8 && (dc2) == 0x44) {                              \
        OUTCHAR(0x2015);                                                    \
    }                                                                       \
    else if ((dc1) == 0xa1 && (dc2) == 0xa4) {                              \
        OUTCHAR(0x00b7);                                                    \
    }                                                                       \
    else if (TRYMAP_DEC(gb2312, decoded, (dc1) ^ 0x80, (dc2) ^ 0x80)) {     \
        OUTCHAR(decoded);                                                   \
    }                                                                       \
    else if (TRYMAP_DEC(gbkext, decoded, (dc1), (dc2))) {                   \
        OUTCHAR(decoded);                                                   \
    }
42n/a
/*
 * GBK_ENCODE(code, assi)
 *
 * Store in `assi` the GBK double-byte code for the code point `code`.
 * U+2014, U+2015 and U+00B7 are assigned fixed values; every other code
 * point except U+30FB is looked up in the gbcommon table.
 *
 * Like GBK_DECODE this expands to an open if / else-if chain, so the call
 * site supplies the trailing `else` taken when nothing matched.
 */
#define GBK_ENCODE(code, assi)                                              \
    if (0x2014 == (code)) {                                                 \
        (assi) = 0xa1aa;                                                    \
    }                                                                       \
    else if (0x2015 == (code)) {                                            \
        (assi) = 0xa844;                                                    \
    }                                                                       \
    else if (0x00b7 == (code)) {                                            \
        (assi) = 0xa1a4;                                                    \
    }                                                                       \
    else if ((code) != 0x30fb && TRYMAP_ENC(gbcommon, assi, code)) {        \
        /* assi was filled in by TRYMAP_ENC */                              \
    }
53n/a
54n/a/*
55n/a * GB2312 codec
56n/a */
57n/a
58n/aENCODER(gb2312)
59n/a{
60n/a while (*inpos < inlen) {
61n/a Py_UCS4 c = INCHAR1;
62n/a DBCHAR code;
63n/a
64n/a if (c < 0x80) {
65n/a WRITEBYTE1((unsigned char)c);
66n/a NEXT(1, 1);
67n/a continue;
68n/a }
69n/a
70n/a if (c > 0xFFFF)
71n/a return 1;
72n/a
73n/a REQUIRE_OUTBUF(2);
74n/a if (TRYMAP_ENC(gbcommon, code, c))
75n/a ;
76n/a else
77n/a return 1;
78n/a
79n/a if (code & 0x8000) /* MSB set: GBK */
80n/a return 1;
81n/a
82n/a OUTBYTE1((code >> 8) | 0x80);
83n/a OUTBYTE2((code & 0xFF) | 0x80);
84n/a NEXT(1, 2);
85n/a }
86n/a
87n/a return 0;
88n/a}
89n/a
90n/aDECODER(gb2312)
91n/a{
92n/a while (inleft > 0) {
93n/a unsigned char c = **inbuf;
94n/a Py_UCS4 decoded;
95n/a
96n/a if (c < 0x80) {
97n/a OUTCHAR(c);
98n/a NEXT_IN(1);
99n/a continue;
100n/a }
101n/a
102n/a REQUIRE_INBUF(2);
103n/a if (TRYMAP_DEC(gb2312, decoded, c ^ 0x80, INBYTE2 ^ 0x80)) {
104n/a OUTCHAR(decoded);
105n/a NEXT_IN(2);
106n/a }
107n/a else
108n/a return 1;
109n/a }
110n/a
111n/a return 0;
112n/a}
113n/a
114n/a
115n/a/*
116n/a * GBK codec
117n/a */
118n/a
119n/aENCODER(gbk)
120n/a{
121n/a while (*inpos < inlen) {
122n/a Py_UCS4 c = INCHAR1;
123n/a DBCHAR code;
124n/a
125n/a if (c < 0x80) {
126n/a WRITEBYTE1((unsigned char)c);
127n/a NEXT(1, 1);
128n/a continue;
129n/a }
130n/a
131n/a if (c > 0xFFFF)
132n/a return 1;
133n/a
134n/a REQUIRE_OUTBUF(2);
135n/a
136n/a GBK_ENCODE(c, code)
137n/a else
138n/a return 1;
139n/a
140n/a OUTBYTE1((code >> 8) | 0x80);
141n/a if (code & 0x8000)
142n/a OUTBYTE2((code & 0xFF)); /* MSB set: GBK */
143n/a else
144n/a OUTBYTE2((code & 0xFF) | 0x80); /* MSB unset: GB2312 */
145n/a NEXT(1, 2);
146n/a }
147n/a
148n/a return 0;
149n/a}
150n/a
151n/aDECODER(gbk)
152n/a{
153n/a while (inleft > 0) {
154n/a unsigned char c = INBYTE1;
155n/a Py_UCS4 decoded;
156n/a
157n/a if (c < 0x80) {
158n/a OUTCHAR(c);
159n/a NEXT_IN(1);
160n/a continue;
161n/a }
162n/a
163n/a REQUIRE_INBUF(2);
164n/a
165n/a GBK_DECODE(c, INBYTE2, writer)
166n/a else
167n/a return 1;
168n/a
169n/a NEXT_IN(2);
170n/a }
171n/a
172n/a return 0;
173n/a}
174n/a
175n/a
176n/a/*
177n/a * GB18030 codec
178n/a */
179n/a
180n/aENCODER(gb18030)
181n/a{
182n/a while (*inpos < inlen) {
183n/a Py_UCS4 c = INCHAR1;
184n/a DBCHAR code;
185n/a
186n/a if (c < 0x80) {
187n/a WRITEBYTE1(c);
188n/a NEXT(1, 1);
189n/a continue;
190n/a }
191n/a
192n/a if (c >= 0x10000) {
193n/a Py_UCS4 tc = c - 0x10000;
194n/a assert (c <= 0x10FFFF);
195n/a
196n/a REQUIRE_OUTBUF(4);
197n/a
198n/a OUTBYTE4((unsigned char)(tc % 10) + 0x30);
199n/a tc /= 10;
200n/a OUTBYTE3((unsigned char)(tc % 126) + 0x81);
201n/a tc /= 126;
202n/a OUTBYTE2((unsigned char)(tc % 10) + 0x30);
203n/a tc /= 10;
204n/a OUTBYTE1((unsigned char)(tc + 0x90));
205n/a
206n/a NEXT(1, 4);
207n/a continue;
208n/a }
209n/a
210n/a REQUIRE_OUTBUF(2);
211n/a
212n/a GBK_ENCODE(c, code)
213n/a else if (TRYMAP_ENC(gb18030ext, code, c))
214n/a ;
215n/a else {
216n/a const struct _gb18030_to_unibmp_ranges *utrrange;
217n/a
218n/a REQUIRE_OUTBUF(4);
219n/a
220n/a for (utrrange = gb18030_to_unibmp_ranges;
221n/a utrrange->first != 0;
222n/a utrrange++)
223n/a if (utrrange->first <= c &&
224n/a c <= utrrange->last) {
225n/a Py_UCS4 tc;
226n/a
227n/a tc = c - utrrange->first +
228n/a utrrange->base;
229n/a
230n/a OUTBYTE4((unsigned char)(tc % 10) + 0x30);
231n/a tc /= 10;
232n/a OUTBYTE3((unsigned char)(tc % 126) + 0x81);
233n/a tc /= 126;
234n/a OUTBYTE2((unsigned char)(tc % 10) + 0x30);
235n/a tc /= 10;
236n/a OUTBYTE1((unsigned char)tc + 0x81);
237n/a
238n/a NEXT(1, 4);
239n/a break;
240n/a }
241n/a
242n/a if (utrrange->first == 0)
243n/a return 1;
244n/a continue;
245n/a }
246n/a
247n/a OUTBYTE1((code >> 8) | 0x80);
248n/a if (code & 0x8000)
249n/a OUTBYTE2((code & 0xFF)); /* MSB set: GBK or GB18030ext */
250n/a else
251n/a OUTBYTE2((code & 0xFF) | 0x80); /* MSB unset: GB2312 */
252n/a
253n/a NEXT(1, 2);
254n/a }
255n/a
256n/a return 0;
257n/a}
258n/a
259n/aDECODER(gb18030)
260n/a{
261n/a while (inleft > 0) {
262n/a unsigned char c = INBYTE1, c2;
263n/a Py_UCS4 decoded;
264n/a
265n/a if (c < 0x80) {
266n/a OUTCHAR(c);
267n/a NEXT_IN(1);
268n/a continue;
269n/a }
270n/a
271n/a REQUIRE_INBUF(2);
272n/a
273n/a c2 = INBYTE2;
274n/a if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
275n/a const struct _gb18030_to_unibmp_ranges *utr;
276n/a unsigned char c3, c4;
277n/a Py_UCS4 lseq;
278n/a
279n/a REQUIRE_INBUF(4);
280n/a c3 = INBYTE3;
281n/a c4 = INBYTE4;
282n/a if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39)
283n/a return 1;
284n/a c -= 0x81; c2 -= 0x30;
285n/a c3 -= 0x81; c4 -= 0x30;
286n/a
287n/a if (c < 4) { /* U+0080 - U+FFFF */
288n/a lseq = ((Py_UCS4)c * 10 + c2) * 1260 +
289n/a (Py_UCS4)c3 * 10 + c4;
290n/a if (lseq < 39420) {
291n/a for (utr = gb18030_to_unibmp_ranges;
292n/a lseq >= (utr + 1)->base;
293n/a utr++) ;
294n/a OUTCHAR(utr->first - utr->base + lseq);
295n/a NEXT_IN(4);
296n/a continue;
297n/a }
298n/a }
299n/a else if (c >= 15) { /* U+10000 - U+10FFFF */
300n/a lseq = 0x10000 + (((Py_UCS4)c-15) * 10 + c2)
301n/a * 1260 + (Py_UCS4)c3 * 10 + c4;
302n/a if (lseq <= 0x10FFFF) {
303n/a OUTCHAR(lseq);
304n/a NEXT_IN(4);
305n/a continue;
306n/a }
307n/a }
308n/a return 1;
309n/a }
310n/a
311n/a GBK_DECODE(c, c2, writer)
312n/a else if (TRYMAP_DEC(gb18030ext, decoded, c, c2))
313n/a OUTCHAR(decoded);
314n/a else
315n/a return 1;
316n/a
317n/a NEXT_IN(2);
318n/a }
319n/a
320n/a return 0;
321n/a}
322n/a
323n/a
324n/a/*
325n/a * HZ codec
326n/a */
327n/a
328n/aENCODER_INIT(hz)
329n/a{
330n/a state->i = 0;
331n/a return 0;
332n/a}
333n/a
334n/aENCODER_RESET(hz)
335n/a{
336n/a if (state->i != 0) {
337n/a WRITEBYTE2('~', '}');
338n/a state->i = 0;
339n/a NEXT_OUT(2);
340n/a }
341n/a return 0;
342n/a}
343n/a
344n/aENCODER(hz)
345n/a{
346n/a while (*inpos < inlen) {
347n/a Py_UCS4 c = INCHAR1;
348n/a DBCHAR code;
349n/a
350n/a if (c < 0x80) {
351n/a if (state->i == 0) {
352n/a WRITEBYTE1((unsigned char)c);
353n/a NEXT(1, 1);
354n/a }
355n/a else {
356n/a WRITEBYTE3('~', '}', (unsigned char)c);
357n/a NEXT(1, 3);
358n/a state->i = 0;
359n/a }
360n/a continue;
361n/a }
362n/a
363n/a if (c > 0xFFFF)
364n/a return 1;
365n/a
366n/a if (TRYMAP_ENC(gbcommon, code, c))
367n/a ;
368n/a else
369n/a return 1;
370n/a
371n/a if (code & 0x8000) /* MSB set: GBK */
372n/a return 1;
373n/a
374n/a if (state->i == 0) {
375n/a WRITEBYTE4('~', '{', code >> 8, code & 0xff);
376n/a NEXT(1, 4);
377n/a state->i = 1;
378n/a }
379n/a else {
380n/a WRITEBYTE2(code >> 8, code & 0xff);
381n/a NEXT(1, 2);
382n/a }
383n/a }
384n/a
385n/a return 0;
386n/a}
387n/a
388n/aDECODER_INIT(hz)
389n/a{
390n/a state->i = 0;
391n/a return 0;
392n/a}
393n/a
394n/aDECODER_RESET(hz)
395n/a{
396n/a state->i = 0;
397n/a return 0;
398n/a}
399n/a
400n/aDECODER(hz)
401n/a{
402n/a while (inleft > 0) {
403n/a unsigned char c = INBYTE1;
404n/a Py_UCS4 decoded;
405n/a
406n/a if (c == '~') {
407n/a unsigned char c2 = INBYTE2;
408n/a
409n/a REQUIRE_INBUF(2);
410n/a if (c2 == '~') {
411n/a OUTCHAR('~');
412n/a NEXT_IN(2);
413n/a continue;
414n/a }
415n/a else if (c2 == '{' && state->i == 0)
416n/a state->i = 1; /* set GB */
417n/a else if (c2 == '}' && state->i == 1)
418n/a state->i = 0; /* set ASCII */
419n/a else if (c2 == '\n')
420n/a ; /* line-continuation */
421n/a else
422n/a return 1;
423n/a NEXT_IN(2);
424n/a continue;
425n/a }
426n/a
427n/a if (c & 0x80)
428n/a return 1;
429n/a
430n/a if (state->i == 0) { /* ASCII mode */
431n/a OUTCHAR(c);
432n/a NEXT_IN(1);
433n/a }
434n/a else { /* GB mode */
435n/a REQUIRE_INBUF(2);
436n/a if (TRYMAP_DEC(gb2312, decoded, c, INBYTE2)) {
437n/a OUTCHAR(decoded);
438n/a NEXT_IN(2);
439n/a }
440n/a else
441n/a return 1;
442n/a }
443n/a }
444n/a
445n/a return 0;
446n/a}
447n/a
448n/a
449n/aBEGIN_MAPPINGS_LIST
450n/a MAPPING_DECONLY(gb2312)
451n/a MAPPING_DECONLY(gbkext)
452n/a MAPPING_ENCONLY(gbcommon)
453n/a MAPPING_ENCDEC(gb18030ext)
454n/aEND_MAPPINGS_LIST
455n/a
456n/aBEGIN_CODECS_LIST
457n/a CODEC_STATELESS(gb2312)
458n/a CODEC_STATELESS(gbk)
459n/a CODEC_STATELESS(gb18030)
460n/a CODEC_STATEFUL(hz)
461n/aEND_CODECS_LIST
462n/a
463n/aI_AM_A_MODULE_FOR(cn)