» Core Development > Code coverage > Modules/expat/xmltok.c

Python code coverage for Modules/expat/xmltok.c

# | count | content
1n/a/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2n/a See the file COPYING for copying permission.
3n/a*/
4n/a
5n/a#include <stddef.h>
6n/a
7n/a#ifdef COMPILED_FROM_DSP
8n/a#include "winconfig.h"
9n/a#elif defined(MACOS_CLASSIC)
10n/a#include "macconfig.h"
11n/a#elif defined(__amigaos__)
12n/a#include "amigaconfig.h"
13n/a#elif defined(__WATCOMC__)
14n/a#include "watcomconfig.h"
15n/a#else
16n/a#ifdef HAVE_EXPAT_CONFIG_H
17n/a#include <expat_config.h>
18n/a#endif
19n/a#endif /* ndef COMPILED_FROM_DSP */
20n/a
21n/a#include "expat_external.h"
22n/a#include "internal.h"
23n/a#include "xmltok.h"
24n/a#include "nametab.h"
25n/a
/* Optional extra entry for the first brace group of VTABLE1.  With XML_DTD
   it expands to a leading comma plus the PREFIX()-named ignoreSectionTok
   function; without XML_DTD it expands to nothing, so the initializer list
   stays well-formed in both configurations. */
26n/a#ifdef XML_DTD
27n/a#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
28n/a#else
29n/a#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
30n/a#endif
31n/a
/* VTABLE1 expands to a positional initializer list of functions whose names
   are built with PREFIX(): two brace groups of tokenizer functions followed
   by nine individual functions.  It is used below as the leading part of
   the initializer of the `enc` member of struct normal_encoding.
   NOTE(review): the order presumably has to match the member order of
   ENCODING in xmltok.h - confirm before reordering anything here. */
32n/a#define VTABLE1 \
33n/a { PREFIX(prologTok), PREFIX(contentTok), \
34n/a PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
35n/a { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
36n/a PREFIX(sameName), \
37n/a PREFIX(nameMatchesAscii), \
38n/a PREFIX(nameLength), \
39n/a PREFIX(skipS), \
40n/a PREFIX(getAtts), \
41n/a PREFIX(charRefNumber), \
42n/a PREFIX(predefinedEntityName), \
43n/a PREFIX(updatePosition), \
44n/a PREFIX(isPublicId)
45n/a
/* VTABLE is VTABLE1 followed by the PREFIX()-named toUtf8 and toUtf16
   converters.  PREFIX is resolved where VTABLE is expanded, not here. */
46n/a#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
47n/a
/* Test whether the UCS-2 character with high byte `hi` and low byte `lo`
   has its bit set in namingBitmap.

   (pages)[(hi)] selects a group of eight bitmap words, the top three bits
   of `lo` select the word inside that group and the bottom five bits of
   `lo` select the bit inside the word.  The macro evaluates to non-zero
   when that bit is set and to zero otherwise.

   Every parameter is parenthesized so arbitrary expressions (for example
   `table + 1`) can be passed, and the mask is built from an unsigned 1 so
   that selecting bit 31 does not shift into the sign bit of an int.
   `lo` is evaluated twice, so it must not have side effects. */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
50n/a
/* The UTF8_GET_NAMING* macros look a multi-byte UTF-8 character up in
   namingBitmap through the page table `pages` and evaluate to non-zero
   when the character's bit is set.  `byte` is indexed as byte[0]..byte[n-1]
   and is evaluated several times; the wrappers in this file pass a
   const unsigned char pointer. */
51n/a/* A 2 byte UTF-8 representation splits the characters 11 bits between
52n/a the bottom 5 and 6 bits of the bytes. We need 8 bits to index into
53n/a pages, 3 bits to add to that index and 5 bits to generate the mask.
54n/a*/
55n/a#define UTF8_GET_NAMING2(pages, byte) \
56n/a (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
57n/a + ((((byte)[0]) & 3) << 1) \
58n/a + ((((byte)[1]) >> 5) & 1)] \
59n/a & (1 << (((byte)[1]) & 0x1F)))
60n/a
61n/a/* A 3 byte UTF-8 representation splits the characters 16 bits between
62n/a the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index
63n/a into pages, 3 bits to add to that index and 5 bits to generate the
64n/a mask.
65n/a*/
66n/a#define UTF8_GET_NAMING3(pages, byte) \
67n/a (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
68n/a + ((((byte)[1]) >> 2) & 0xF)] \
69n/a << 3) \
70n/a + ((((byte)[1]) & 3) << 1) \
71n/a + ((((byte)[2]) >> 5) & 1)] \
72n/a & (1 << (((byte)[2]) & 0x1F)))
73n/a
/* Dispatch on the sequence length n: 2 and 3 byte sequences are looked up
   with the macros above (p is cast to const unsigned char *); every other
   length evaluates to 0. */
74n/a#define UTF8_GET_NAMING(pages, p, n) \
75n/a ((n) == 2 \
76n/a ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
77n/a : ((n) == 3 \
78n/a ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
79n/a : 0))
80n/a
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).

   UTF8_INVALIDn(p) evaluates to non-zero when the n bytes starting at p
   are not an acceptable n-byte sequence.  p must point to unsigned char:
   the comparisons against 0xC2, 0xA0, 0x9F, ... rely on byte values in
   the range 0..255.  Only UTF8_INVALID2 looks at the range of the lead
   byte (it rejects leads below 0xC2); the 3 and 4 byte macros only
   special-case the lead values that narrow the range of the second byte.

   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF

   The argument is parenthesized on every use, including the access to
   the lead byte, so pointer expressions such as `buf + 1` are handled
   correctly.  It is evaluated several times and must not have side
   effects.
*/

#define UTF8_INVALID2(p) \
  ((p)[0] < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
  || \
  ((p)[0] == 0xEF && (p)[1] == 0xBF \
    ? \
    (p)[2] > 0xBD \
    : \
    ((p)[2] & 0xC0) == 0xC0) \
  || \
  ((p)[0] == 0xE0 \
    ? \
    (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((p)[0] == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))

#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
  || \
  ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
  || \
  ((p)[0] == 0xF0 \
    ? \
    (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((p)[0] == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
123n/a
124n/astatic int PTRFASTCALL
125n/aisNever(const ENCODING *enc, const char *p)
126n/a{
127n/a return 0;
128n/a}
129n/a
130n/astatic int PTRFASTCALL
131n/autf8_isName2(const ENCODING *enc, const char *p)
132n/a{
133n/a return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
134n/a}
135n/a
136n/astatic int PTRFASTCALL
137n/autf8_isName3(const ENCODING *enc, const char *p)
138n/a{
139n/a return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
140n/a}
141n/a
142n/a#define utf8_isName4 isNever
143n/a
144n/astatic int PTRFASTCALL
145n/autf8_isNmstrt2(const ENCODING *enc, const char *p)
146n/a{
147n/a return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
148n/a}
149n/a
150n/astatic int PTRFASTCALL
151n/autf8_isNmstrt3(const ENCODING *enc, const char *p)
152n/a{
153n/a return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
154n/a}
155n/a
156n/a#define utf8_isNmstrt4 isNever
157n/a
158n/astatic int PTRFASTCALL
159n/autf8_isInvalid2(const ENCODING *enc, const char *p)
160n/a{
161n/a return UTF8_INVALID2((const unsigned char *)p);
162n/a}
163n/a
164n/astatic int PTRFASTCALL
165n/autf8_isInvalid3(const ENCODING *enc, const char *p)
166n/a{
167n/a return UTF8_INVALID3((const unsigned char *)p);
168n/a}
169n/a
170n/astatic int PTRFASTCALL
171n/autf8_isInvalid4(const ENCODING *enc, const char *p)
172n/a{
173n/a return UTF8_INVALID4((const unsigned char *)p);
174n/a}
175n/a
/* An ENCODING extended with a per-byte classification table and a set of
   character predicates.  All instances in this file are filled in with
   positional initializers, so the member order below must not change. */
176n/astruct normal_encoding {
  /* Base object.  It is the first member, which is what lets code cast an
     ENCODING pointer back to struct normal_encoding (AS_NORMAL_ENCODING). */
177n/a ENCODING enc;
  /* Byte type for each of the 256 possible byte values; the tables are
     pulled in from the *tab.h headers in the initializers further down. */
178n/a unsigned char type[256];
179n/a#ifdef XML_MIN_SIZE
  /* Only present in XML_MIN_SIZE builds, where BYTE_TYPE, BYTE_TO_ASCII,
     CHAR_MATCHES and the *_MINBPC tests are dispatched through these
     pointers instead of being expanded inline. */
180n/a int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
181n/a int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
182n/a int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
183n/a int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
184n/a int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
185n/a#endif /* XML_MIN_SIZE */
  /* Predicates for 2, 3 and 4 byte characters: name character, name-start
     character and invalid sequence.  They are reached through the
     IS_NAME_CHAR / IS_NMSTRT_CHAR / IS_INVALID_CHAR macros. */
186n/a int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
187n/a int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
188n/a int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
189n/a int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
190n/a int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
191n/a int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
192n/a int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
193n/a int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
194n/a int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
195n/a};
196n/a
/* View an ENCODING pointer as a pointer to the (const) normal_encoding that
   contains it.  Only valid for encodings that really are embedded in a
   struct normal_encoding. */
197n/a#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc))
198n/a
/* STANDARD_VTABLE(E) supplies the initializers for the XML_MIN_SIZE-only
   members of struct normal_encoding, naming the functions by pasting the
   prefix E onto each member name.  It ends with a comma so it can be
   followed directly by NORMAL_VTABLE.  Without XML_MIN_SIZE those members
   do not exist and the macro expands to nothing. */
199n/a#ifdef XML_MIN_SIZE
200n/a
201n/a#define STANDARD_VTABLE(E) \
202n/a E ## byteType, \
203n/a E ## isNameMin, \
204n/a E ## isNmstrtMin, \
205n/a E ## byteToAscii, \
206n/a E ## charMatches,
207n/a
208n/a#else
209n/a
210n/a#define STANDARD_VTABLE(E) /* as nothing */
211n/a
212n/a#endif
213n/a
/* NORMAL_VTABLE(E) supplies the initializers for the nine multi-byte
   predicate members (isName2..isInvalid4), in member order, again by
   pasting the prefix E onto each member name. */
214n/a#define NORMAL_VTABLE(E) \
215n/a E ## isName2, \
216n/a E ## isName3, \
217n/a E ## isName4, \
218n/a E ## isNmstrt2, \
219n/a E ## isNmstrt3, \
220n/a E ## isNmstrt4, \
221n/a E ## isInvalid2, \
222n/a E ## isInvalid3, \
223n/a E ## isInvalid4
224n/a
225n/astatic int FASTCALL checkCharRefNumber(int);
226n/a
227n/a#include "xmltok_impl.h"
228n/a#include "ascii.h"
229n/a
/* Instantiation of the tokenizer for encodings with one byte per minimum
   character unit.  The macros defined in this section parameterize
   xmltok_impl.c, which is included near the end with PREFIX producing
   normal_* names; afterwards the macros are #undef'd so that a later
   section can redefine them. */

/* In XML_MIN_SIZE builds the sb_ (single byte) slots for the *_MINBPC
   predicates are filled with isNever. */
230n/a#ifdef XML_MIN_SIZE
231n/a#define sb_isNameMin isNever
232n/a#define sb_isNmstrtMin isNever
233n/a#endif
234n/a
235n/a#ifdef XML_MIN_SIZE
236n/a#define MINBPC(enc) ((enc)->minBytesPerChar)
237n/a#else
238n/a/* minimum bytes per character */
239n/a#define MINBPC(enc) 1
240n/a#endif
241n/a
/* Classify the byte at p through the encoding's type[] table. */
242n/a#define SB_BYTE_TYPE(enc, p) \
243n/a (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
244n/a
/* BYTE_TYPE is either an indirect call through the byteType member
   (XML_MIN_SIZE) or a direct table lookup. */
245n/a#ifdef XML_MIN_SIZE
246n/astatic int PTRFASTCALL
247n/asb_byteType(const ENCODING *enc, const char *p)
248n/a{
249n/a return SB_BYTE_TYPE(enc, p);
250n/a}
251n/a#define BYTE_TYPE(enc, p) \
252n/a (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
253n/a#else
254n/a#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
255n/a#endif
256n/a
/* BYTE_TO_ASCII yields the byte at p unchanged in the single-byte case. */
257n/a#ifdef XML_MIN_SIZE
258n/a#define BYTE_TO_ASCII(enc, p) \
259n/a (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
260n/astatic int PTRFASTCALL
261n/asb_byteToAscii(const ENCODING *enc, const char *p)
262n/a{
263n/a return *p;
264n/a}
265n/a#else
266n/a#define BYTE_TO_ASCII(enc, p) (*(p))
267n/a#endif
268n/a
/* Multi-byte predicates always go through the function pointers stored in
   struct normal_encoding; n (2, 3 or 4) is pasted onto the member name. */
269n/a#define IS_NAME_CHAR(enc, p, n) \
270n/a (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
271n/a#define IS_NMSTRT_CHAR(enc, p, n) \
272n/a (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
273n/a#define IS_INVALID_CHAR(enc, p, n) \
274n/a (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
275n/a
/* The *_MINBPC tests are constant 0 here unless XML_MIN_SIZE routes them
   through the isNameMin / isNmstrtMin members. */
276n/a#ifdef XML_MIN_SIZE
277n/a#define IS_NAME_CHAR_MINBPC(enc, p) \
278n/a (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
279n/a#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
280n/a (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
281n/a#else
282n/a#define IS_NAME_CHAR_MINBPC(enc, p) (0)
283n/a#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
284n/a#endif
285n/a
/* NOTE(review): in the non-XML_MIN_SIZE CHAR_MATCHES below the parameter c
   is not parenthesized, so callers should pass a simple expression. */
286n/a#ifdef XML_MIN_SIZE
287n/a#define CHAR_MATCHES(enc, p, c) \
288n/a (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
289n/astatic int PTRCALL
290n/asb_charMatches(const ENCODING *enc, const char *p, int c)
291n/a{
292n/a return *p == c;
293n/a}
294n/a#else
295n/a/* c is an ASCII character */
296n/a#define CHAR_MATCHES(enc, p, c) (*(p) == c)
297n/a#endif
298n/a
/* Generate the normal_* functions by including the implementation file with
   the macros above in effect. */
299n/a#define PREFIX(ident) normal_ ## ident
300n/a#define XML_TOK_IMPL_C
301n/a#include "xmltok_impl.c"
302n/a#undef XML_TOK_IMPL_C
303n/a
/* Drop the parameter macros so later sections can define their own.
   PREFIX is intentionally left defined at this point. */
304n/a#undef MINBPC
305n/a#undef BYTE_TYPE
306n/a#undef BYTE_TO_ASCII
307n/a#undef CHAR_MATCHES
308n/a#undef IS_NAME_CHAR
309n/a#undef IS_NAME_CHAR_MINBPC
310n/a#undef IS_NMSTRT_CHAR
311n/a#undef IS_NMSTRT_CHAR_MINBPC
312n/a#undef IS_INVALID_CHAR
313n/a
/* Constants OR-ed into the first byte when a UTF-8 sequence is produced by
   the converters in this file (for example `(c >> 6) | UTF8_cval2` in
   latin1_toUtf8).  In binary they are 0xxxxxxx, 110xxxxx, 1110xxxx and
   11110xxx with the payload bits cleared. */
314n/aenum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
315n/a UTF8_cval1 = 0x00,
316n/a UTF8_cval2 = 0xc0,
317n/a UTF8_cval3 = 0xe0,
318n/a UTF8_cval4 = 0xf0
319n/a};
320n/a
321n/astatic void PTRCALL
322n/autf8_toUtf8(const ENCODING *enc,
323n/a const char **fromP, const char *fromLim,
324n/a char **toP, const char *toLim)
325n/a{
326n/a char *to;
327n/a const char *from;
328n/a if (fromLim - *fromP > toLim - *toP) {
329n/a /* Avoid copying partial characters. */
330n/a for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
331n/a if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
332n/a break;
333n/a }
334n/a for (to = *toP, from = *fromP; from != fromLim; from++, to++)
335n/a *to = *from;
336n/a *fromP = from;
337n/a *toP = to;
338n/a}
339n/a
340n/astatic void PTRCALL
341n/autf8_toUtf16(const ENCODING *enc,
342n/a const char **fromP, const char *fromLim,
343n/a unsigned short **toP, const unsigned short *toLim)
344n/a{
345n/a unsigned short *to = *toP;
346n/a const char *from = *fromP;
347n/a while (from != fromLim && to != toLim) {
348n/a switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
349n/a case BT_LEAD2:
350n/a *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
351n/a from += 2;
352n/a break;
353n/a case BT_LEAD3:
354n/a *to++ = (unsigned short)(((from[0] & 0xf) << 12)
355n/a | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
356n/a from += 3;
357n/a break;
358n/a case BT_LEAD4:
359n/a {
360n/a unsigned long n;
361n/a if (to + 1 == toLim)
362n/a goto after;
363n/a n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
364n/a | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
365n/a n -= 0x10000;
366n/a to[0] = (unsigned short)((n >> 10) | 0xD800);
367n/a to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
368n/a to += 2;
369n/a from += 4;
370n/a }
371n/a break;
372n/a default:
373n/a *to++ = *from++;
374n/a break;
375n/a }
376n/a }
377n/aafter:
378n/a *fromP = from;
379n/a *toP = to;
380n/a}
381n/a
/* The four UTF-8 encoding objects.  Each one is initialized positionally:
   first the embedded ENCODING (VTABLE1 with PREFIX still meaning normal_,
   the two UTF-8 converters and three trailing integers), then the 256-entry
   type[] table assembled from two included headers, then the predicate
   function pointers.
   NOTE(review): the trailing `1, 1, 0` presumably initializes
   minBytesPerChar and two flag members of ENCODING - confirm in xmltok.h.

   The *_ns variants exist only with XML_NS and include the ASCII table as
   is.  The non-ns variants define BT_COLON as BT_NMSTRT while the ASCII
   table is included, so every entry the header spells BT_COLON ends up as
   BT_NMSTRT.  The internal_* variants use iasciitab.h instead of
   asciitab.h. */
382n/a#ifdef XML_NS
383n/astatic const struct normal_encoding utf8_encoding_ns = {
384n/a { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
385n/a {
386n/a#include "asciitab.h"
387n/a#include "utf8tab.h"
388n/a },
389n/a STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
390n/a};
391n/a#endif
392n/a
393n/astatic const struct normal_encoding utf8_encoding = {
394n/a { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
395n/a {
396n/a#define BT_COLON BT_NMSTRT
397n/a#include "asciitab.h"
398n/a#undef BT_COLON
399n/a#include "utf8tab.h"
400n/a },
401n/a STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
402n/a};
403n/a
404n/a#ifdef XML_NS
405n/a
406n/astatic const struct normal_encoding internal_utf8_encoding_ns = {
407n/a { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
408n/a {
409n/a#include "iasciitab.h"
410n/a#include "utf8tab.h"
411n/a },
412n/a STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
413n/a};
414n/a
415n/a#endif
416n/a
417n/astatic const struct normal_encoding internal_utf8_encoding = {
418n/a { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
419n/a {
420n/a#define BT_COLON BT_NMSTRT
421n/a#include "iasciitab.h"
422n/a#undef BT_COLON
423n/a#include "utf8tab.h"
424n/a },
425n/a STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
426n/a};
427n/a
428n/astatic void PTRCALL
429n/alatin1_toUtf8(const ENCODING *enc,
430n/a const char **fromP, const char *fromLim,
431n/a char **toP, const char *toLim)
432n/a{
433n/a for (;;) {
434n/a unsigned char c;
435n/a if (*fromP == fromLim)
436n/a break;
437n/a c = (unsigned char)**fromP;
438n/a if (c & 0x80) {
439n/a if (toLim - *toP < 2)
440n/a break;
441n/a *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
442n/a *(*toP)++ = (char)((c & 0x3f) | 0x80);
443n/a (*fromP)++;
444n/a }
445n/a else {
446n/a if (*toP == toLim)
447n/a break;
448n/a *(*toP)++ = *(*fromP)++;
449n/a }
450n/a }
451n/a}
452n/a
453n/astatic void PTRCALL
454n/alatin1_toUtf16(const ENCODING *enc,
455n/a const char **fromP, const char *fromLim,
456n/a unsigned short **toP, const unsigned short *toLim)
457n/a{
458n/a while (*fromP != fromLim && *toP != toLim)
459n/a *(*toP)++ = (unsigned char)*(*fromP)++;
460n/a}
461n/a
/* Latin-1 encoding objects.  The type[] table is the ASCII table followed
   by latin1tab.h.  Only the STANDARD_VTABLE(sb_) part of the predicate
   members is given; the multi-byte predicate pointers are left to default
   initialization.  The non-ns variant maps BT_COLON to BT_NMSTRT while the
   ASCII table is included; the _ns variant exists only with XML_NS.
   NOTE(review): the trailing `1, 0, 0` presumably initializes
   minBytesPerChar and two flag members of ENCODING - confirm in xmltok.h. */
462n/a#ifdef XML_NS
463n/a
464n/astatic const struct normal_encoding latin1_encoding_ns = {
465n/a { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
466n/a {
467n/a#include "asciitab.h"
468n/a#include "latin1tab.h"
469n/a },
470n/a STANDARD_VTABLE(sb_)
471n/a};
472n/a
473n/a#endif
474n/a
475n/astatic const struct normal_encoding latin1_encoding = {
476n/a { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
477n/a {
478n/a#define BT_COLON BT_NMSTRT
479n/a#include "asciitab.h"
480n/a#undef BT_COLON
481n/a#include "latin1tab.h"
482n/a },
483n/a STANDARD_VTABLE(sb_)
484n/a};
485n/a
486n/astatic void PTRCALL
487n/aascii_toUtf8(const ENCODING *enc,
488n/a const char **fromP, const char *fromLim,
489n/a char **toP, const char *toLim)
490n/a{
491n/a while (*fromP != fromLim && *toP != toLim)
492n/a *(*toP)++ = *(*fromP)++;
493n/a}
494n/a
/* US-ASCII encoding objects.  Only the ASCII table is included in type[];
   the remaining entries are zero-initialized, which per the comment inside
   the initializer is the value of BT_NONXML.  UTF-16 output reuses
   latin1_toUtf16.  The non-ns variant maps BT_COLON to BT_NMSTRT while the
   ASCII table is included; the _ns variant exists only with XML_NS. */
495n/a#ifdef XML_NS
496n/a
497n/astatic const struct normal_encoding ascii_encoding_ns = {
498n/a { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
499n/a {
500n/a#include "asciitab.h"
501n/a/* BT_NONXML == 0 */
502n/a },
503n/a STANDARD_VTABLE(sb_)
504n/a};
505n/a
506n/a#endif
507n/a
508n/astatic const struct normal_encoding ascii_encoding = {
509n/a { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
510n/a {
511n/a#define BT_COLON BT_NMSTRT
512n/a#include "asciitab.h"
513n/a#undef BT_COLON
514n/a/* BT_NONXML == 0 */
515n/a },
516n/a STANDARD_VTABLE(sb_)
517n/a};
518n/a
519n/astatic int PTRFASTCALL
520n/aunicode_byte_type(char hi, char lo)
521n/a{
522n/a switch ((unsigned char)hi) {
523n/a case 0xD8: case 0xD9: case 0xDA: case 0xDB:
524n/a return BT_LEAD4;
525n/a case 0xDC: case 0xDD: case 0xDE: case 0xDF:
526n/a return BT_TRAIL;
527n/a case 0xFF:
528n/a switch ((unsigned char)lo) {
529n/a case 0xFF:
530n/a case 0xFE:
531n/a return BT_NONXML;
532n/a }
533n/a break;
534n/a }
535n/a return BT_NONASCII;
536n/a}
537n/a
/* DEFINE_UTF16_TO_UTF8(E) defines the function E##toUtf8, which converts
   UTF-16 code units in [*fromP, fromLim) to UTF-8 written to [*toP, toLim).
   The byte order is abstracted by GET_LO / GET_HI, which must be defined
   where the macro is expanded.

   Per code unit:
     - high byte 0 and low byte < 0x80: one output byte;
     - high byte 0..7 otherwise: two output bytes;
     - high byte 0xD8..0xDB: four output bytes, built from this unit and
       the following one (`from` is advanced an extra 2 bytes);
     - anything else (the `default` label): three output bytes.
   Before each character the remaining output space is checked; when it is
   too small the function stores the position of that character in *fromP
   and returns, so no partial character is written.  *toP is advanced in
   place.
   NOTE(review): in the 0xD8..0xDB case the second unit is read without a
   comparison against fromLim, and the loop ends only when from == fromLim;
   both appear to rely on the caller passing a whole number of complete
   characters - confirm. */
538n/a#define DEFINE_UTF16_TO_UTF8(E) \
539n/astatic void PTRCALL \
540n/aE ## toUtf8(const ENCODING *enc, \
541n/a const char **fromP, const char *fromLim, \
542n/a char **toP, const char *toLim) \
543n/a{ \
544n/a const char *from; \
545n/a for (from = *fromP; from != fromLim; from += 2) { \
546n/a int plane; \
547n/a unsigned char lo2; \
548n/a unsigned char lo = GET_LO(from); \
549n/a unsigned char hi = GET_HI(from); \
550n/a switch (hi) { \
551n/a case 0: \
552n/a if (lo < 0x80) { \
553n/a if (*toP == toLim) { \
554n/a *fromP = from; \
555n/a return; \
556n/a } \
557n/a *(*toP)++ = lo; \
558n/a break; \
559n/a } \
560n/a /* fall through */ \
561n/a case 0x1: case 0x2: case 0x3: \
562n/a case 0x4: case 0x5: case 0x6: case 0x7: \
563n/a if (toLim - *toP < 2) { \
564n/a *fromP = from; \
565n/a return; \
566n/a } \
567n/a *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
568n/a *(*toP)++ = ((lo & 0x3f) | 0x80); \
569n/a break; \
570n/a default: \
571n/a if (toLim - *toP < 3) { \
572n/a *fromP = from; \
573n/a return; \
574n/a } \
575n/a /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
576n/a *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
577n/a *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
578n/a *(*toP)++ = ((lo & 0x3f) | 0x80); \
579n/a break; \
580n/a case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
581n/a if (toLim - *toP < 4) { \
582n/a *fromP = from; \
583n/a return; \
584n/a } \
585n/a plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
586n/a *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
587n/a *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
588n/a from += 2; \
589n/a lo2 = GET_LO(from); \
590n/a *(*toP)++ = (((lo & 0x3) << 4) \
591n/a | ((GET_HI(from) & 0x3) << 2) \
592n/a | (lo2 >> 6) \
593n/a | 0x80); \
594n/a *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
595n/a break; \
596n/a } \
597n/a } \
598n/a *fromP = from; \
599n/a}
600n/a
/* DEFINE_UTF16_TO_UTF16(E) defines the function E##toUtf16, which copies
   UTF-16 code units from the byte range [*fromP, fromLim) into the
   unsigned short range [*toP, toLim), assembling each unit from GET_HI and
   GET_LO (so the byte order is whatever those macros encode at the point
   of expansion).

   If the input holds more bytes than twice the number of free output
   units and the high byte of the last input unit satisfies
   (x & 0xF8) == 0xD8, fromLim is pulled back by one unit before copying.
   The loop then runs until either range is exhausted, advancing *fromP
   and *toP in place.
   NOTE(review): the surrogate test looks at the last unit of the whole
   input rather than at the unit where the output runs out - confirm that
   this is the intended guard. */
601n/a#define DEFINE_UTF16_TO_UTF16(E) \
602n/astatic void PTRCALL \
603n/aE ## toUtf16(const ENCODING *enc, \
604n/a const char **fromP, const char *fromLim, \
605n/a unsigned short **toP, const unsigned short *toLim) \
606n/a{ \
607n/a /* Avoid copying first half only of surrogate */ \
608n/a if (fromLim - *fromP > ((toLim - *toP) << 1) \
609n/a && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
610n/a fromLim -= 2; \
611n/a for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
612n/a *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
613n/a}
614n/a
/* Instantiate the UTF-16 converters twice, once per byte order.  For each
   pass SET2 / GET_LO / GET_HI are defined for that byte order, the two
   DEFINE_ macros are expanded (producing little2_toUtf8 / little2_toUtf16
   and big2_toUtf8 / big2_toUtf16), and the helper macros are removed
   again.  SET2 is defined for symmetry; the converters generated here only
   use GET_LO and GET_HI. */

/* Little-endian: low byte first. */
615n/a#define SET2(ptr, ch) \
616n/a (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
617n/a#define GET_LO(ptr) ((unsigned char)(ptr)[0])
618n/a#define GET_HI(ptr) ((unsigned char)(ptr)[1])
619n/a
620n/aDEFINE_UTF16_TO_UTF8(little2_)
621n/aDEFINE_UTF16_TO_UTF16(little2_)
622n/a
623n/a#undef SET2
624n/a#undef GET_LO
625n/a#undef GET_HI
626n/a
/* Big-endian: high byte first. */
627n/a#define SET2(ptr, ch) \
628n/a (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
629n/a#define GET_LO(ptr) ((unsigned char)(ptr)[1])
630n/a#define GET_HI(ptr) ((unsigned char)(ptr)[0])
631n/a
632n/aDEFINE_UTF16_TO_UTF8(big2_)
633n/aDEFINE_UTF16_TO_UTF16(big2_)
634n/a
635n/a#undef SET2
636n/a#undef GET_LO
637n/a#undef GET_HI
638n/a
/* Character access macros for little-endian 2-byte encodings: (p)[0] is
   the low byte and (p)[1] the high byte of the code unit at p.

   LITTLE2_BYTE_TYPE        byte type of the unit: the encoding's type[]
                            table when the high byte is 0, otherwise
                            unicode_byte_type(high, low).
   LITTLE2_BYTE_TO_ASCII    the low byte when the high byte is 0, else -1.
   LITTLE2_CHAR_MATCHES     non-zero when the unit equals the character c
                            (high byte 0 and low byte equal to c).
   LITTLE2_IS_NAME_CHAR_MINBPC / LITTLE2_IS_NMSTRT_CHAR_MINBPC
                            namingBitmap lookup of the unit through
                            namePages / nmstrtPages.

   Every use of p and c is parenthesized so that expressions can be passed
   safely.  p is evaluated more than once and must not have side effects;
   enc is only used by LITTLE2_BYTE_TYPE. */
#define LITTLE2_BYTE_TYPE(enc, p) \
  ((p)[1] == 0 \
   ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
   : unicode_byte_type((p)[1], (p)[0]))
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
649n/a
/* Little-endian 2-byte tokenizer.

   With XML_MIN_SIZE no second copy of the tokenizer is generated: the
   LITTLE2_* macros are wrapped in small functions (to be stored in the
   XML_MIN_SIZE members of struct normal_encoding through
   STANDARD_VTABLE(little2_)) and VTABLE is redefined to VTABLE1 plus the
   little2 converters, with PREFIX left as it was.

   Without XML_MIN_SIZE, PREFIX is switched to little2_, the parameter
   macros are bound to the LITTLE2_* macros with MINBPC fixed at 2, and
   xmltok_impl.c is included again to generate the little2_* functions.
   IS_NAME_CHAR and IS_NMSTRT_CHAR are constant 0 here; the real tests are
   the *_MINBPC ones.
   NOTE(review): IS_INVALID_CHAR is #undef'd at the end although this
   section never defines it; presumably xmltok_impl.c supplies a default -
   confirm. */
650n/a#ifdef XML_MIN_SIZE
651n/a
652n/astatic int PTRFASTCALL
653n/alittle2_byteType(const ENCODING *enc, const char *p)
654n/a{
655n/a return LITTLE2_BYTE_TYPE(enc, p);
656n/a}
657n/a
658n/astatic int PTRFASTCALL
659n/alittle2_byteToAscii(const ENCODING *enc, const char *p)
660n/a{
661n/a return LITTLE2_BYTE_TO_ASCII(enc, p);
662n/a}
663n/a
664n/astatic int PTRCALL
665n/alittle2_charMatches(const ENCODING *enc, const char *p, int c)
666n/a{
667n/a return LITTLE2_CHAR_MATCHES(enc, p, c);
668n/a}
669n/a
670n/astatic int PTRFASTCALL
671n/alittle2_isNameMin(const ENCODING *enc, const char *p)
672n/a{
673n/a return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
674n/a}
675n/a
676n/astatic int PTRFASTCALL
677n/alittle2_isNmstrtMin(const ENCODING *enc, const char *p)
678n/a{
679n/a return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
680n/a}
681n/a
682n/a#undef VTABLE
683n/a#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
684n/a
685n/a#else /* not XML_MIN_SIZE */
686n/a
687n/a#undef PREFIX
688n/a#define PREFIX(ident) little2_ ## ident
689n/a#define MINBPC(enc) 2
690n/a/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
691n/a#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
692n/a#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
693n/a#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
694n/a#define IS_NAME_CHAR(enc, p, n) 0
695n/a#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
696n/a#define IS_NMSTRT_CHAR(enc, p, n) (0)
697n/a#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
698n/a
699n/a#define XML_TOK_IMPL_C
700n/a#include "xmltok_impl.c"
701n/a#undef XML_TOK_IMPL_C
702n/a
703n/a#undef MINBPC
704n/a#undef BYTE_TYPE
705n/a#undef BYTE_TO_ASCII
706n/a#undef CHAR_MATCHES
707n/a#undef IS_NAME_CHAR
708n/a#undef IS_NAME_CHAR_MINBPC
709n/a#undef IS_NMSTRT_CHAR
710n/a#undef IS_NMSTRT_CHAR_MINBPC
711n/a#undef IS_INVALID_CHAR
712n/a
713n/a#endif /* not XML_MIN_SIZE */
714n/a
/* Little-endian 2-byte encoding objects.  VTABLE is expanded here, so it
   picks up the little2 functions selected in the section above.  The
   type[] table (ASCII plus Latin-1) classifies code units whose high byte
   is 0, see LITTLE2_BYTE_TYPE.  Only STANDARD_VTABLE(little2_) is given
   for the predicate members.

   In the two external objects the last integer of the ENCODING initializer
   is 1 when BYTEORDER == 1234 and 0 otherwise; the internal_* objects are
   only compiled when BYTEORDER != 4321 and always use 1.
   NOTE(review): the integers `2, 0, <flag>` presumably initialize
   minBytesPerChar and two flag members of ENCODING - confirm in xmltok.h.

   The *_ns variants exist only with XML_NS; the others map BT_COLON to
   BT_NMSTRT while the ASCII table is included. */
715n/a#ifdef XML_NS
716n/a
717n/astatic const struct normal_encoding little2_encoding_ns = {
718n/a { VTABLE, 2, 0,
719n/a#if BYTEORDER == 1234
720n/a 1
721n/a#else
722n/a 0
723n/a#endif
724n/a },
725n/a {
726n/a#include "asciitab.h"
727n/a#include "latin1tab.h"
728n/a },
729n/a STANDARD_VTABLE(little2_)
730n/a};
731n/a
732n/a#endif
733n/a
734n/astatic const struct normal_encoding little2_encoding = {
735n/a { VTABLE, 2, 0,
736n/a#if BYTEORDER == 1234
737n/a 1
738n/a#else
739n/a 0
740n/a#endif
741n/a },
742n/a {
743n/a#define BT_COLON BT_NMSTRT
744n/a#include "asciitab.h"
745n/a#undef BT_COLON
746n/a#include "latin1tab.h"
747n/a },
748n/a STANDARD_VTABLE(little2_)
749n/a};
750n/a
751n/a#if BYTEORDER != 4321
752n/a
753n/a#ifdef XML_NS
754n/a
755n/astatic const struct normal_encoding internal_little2_encoding_ns = {
756n/a { VTABLE, 2, 0, 1 },
757n/a {
758n/a#include "iasciitab.h"
759n/a#include "latin1tab.h"
760n/a },
761n/a STANDARD_VTABLE(little2_)
762n/a};
763n/a
764n/a#endif
765n/a
766n/astatic const struct normal_encoding internal_little2_encoding = {
767n/a { VTABLE, 2, 0, 1 },
768n/a {
769n/a#define BT_COLON BT_NMSTRT
770n/a#include "iasciitab.h"
771n/a#undef BT_COLON
772n/a#include "latin1tab.h"
773n/a },
774n/a STANDARD_VTABLE(little2_)
775n/a};
776n/a
777n/a#endif
778n/a
779n/a
/* Character access macros for big-endian 2-byte encodings: (p)[0] is the
   high byte and (p)[1] the low byte of the code unit at p.

   BIG2_BYTE_TYPE        byte type of the unit: the encoding's type[] table
                         (indexed by the low byte) when the high byte is 0,
                         otherwise unicode_byte_type(high, low).
   BIG2_BYTE_TO_ASCII    the low byte when the high byte is 0, else -1.
   BIG2_CHAR_MATCHES     non-zero when the unit equals the character c
                         (high byte 0 and low byte equal to c).
   BIG2_IS_NAME_CHAR_MINBPC / BIG2_IS_NMSTRT_CHAR_MINBPC
                         namingBitmap lookup of the unit through
                         namePages / nmstrtPages.

   Every use of p and c is parenthesized so that expressions can be passed
   safely.  p is evaluated more than once and must not have side effects;
   enc is only used by BIG2_BYTE_TYPE. */
#define BIG2_BYTE_TYPE(enc, p) \
  ((p)[0] == 0 \
   ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
   : unicode_byte_type((p)[0], (p)[1]))
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
790n/a
/* Big-endian 2-byte tokenizer.

   With XML_MIN_SIZE the BIG2_* macros are wrapped in small functions (to
   be stored in the XML_MIN_SIZE members of struct normal_encoding through
   STANDARD_VTABLE(big2_)) and VTABLE is redefined to VTABLE1 plus the big2
   converters.

   Without XML_MIN_SIZE, PREFIX is switched to big2_, the parameter macros
   are bound to the BIG2_* macros with MINBPC fixed at 2, and xmltok_impl.c
   is included again to generate the big2_* functions.  IS_NAME_CHAR and
   IS_NMSTRT_CHAR are constant 0 here; the real tests are the *_MINBPC
   ones.
   NOTE(review): IS_INVALID_CHAR is #undef'd at the end although this
   section never defines it; presumably xmltok_impl.c supplies a default -
   confirm. */
791n/a#ifdef XML_MIN_SIZE
792n/a
793n/astatic int PTRFASTCALL
794n/abig2_byteType(const ENCODING *enc, const char *p)
795n/a{
796n/a return BIG2_BYTE_TYPE(enc, p);
797n/a}
798n/a
799n/astatic int PTRFASTCALL
800n/abig2_byteToAscii(const ENCODING *enc, const char *p)
801n/a{
802n/a return BIG2_BYTE_TO_ASCII(enc, p);
803n/a}
804n/a
805n/astatic int PTRCALL
806n/abig2_charMatches(const ENCODING *enc, const char *p, int c)
807n/a{
808n/a return BIG2_CHAR_MATCHES(enc, p, c);
809n/a}
810n/a
811n/astatic int PTRFASTCALL
812n/abig2_isNameMin(const ENCODING *enc, const char *p)
813n/a{
814n/a return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
815n/a}
816n/a
817n/astatic int PTRFASTCALL
818n/abig2_isNmstrtMin(const ENCODING *enc, const char *p)
819n/a{
820n/a return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
821n/a}
822n/a
823n/a#undef VTABLE
824n/a#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
825n/a
826n/a#else /* not XML_MIN_SIZE */
827n/a
828n/a#undef PREFIX
829n/a#define PREFIX(ident) big2_ ## ident
830n/a#define MINBPC(enc) 2
831n/a/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
832n/a#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
833n/a#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
834n/a#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
835n/a#define IS_NAME_CHAR(enc, p, n) 0
836n/a#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
837n/a#define IS_NMSTRT_CHAR(enc, p, n) (0)
838n/a#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
839n/a
840n/a#define XML_TOK_IMPL_C
841n/a#include "xmltok_impl.c"
842n/a#undef XML_TOK_IMPL_C
843n/a
844n/a#undef MINBPC
845n/a#undef BYTE_TYPE
846n/a#undef BYTE_TO_ASCII
847n/a#undef CHAR_MATCHES
848n/a#undef IS_NAME_CHAR
849n/a#undef IS_NAME_CHAR_MINBPC
850n/a#undef IS_NMSTRT_CHAR
851n/a#undef IS_NMSTRT_CHAR_MINBPC
852n/a#undef IS_INVALID_CHAR
853n/a
854n/a#endif /* not XML_MIN_SIZE */
855n/a
/* Big-endian 2-byte encoding objects.  VTABLE is expanded here, so it
   picks up the big2 functions selected in the section above.  The type[]
   table (ASCII plus Latin-1) classifies code units whose high byte is 0,
   see BIG2_BYTE_TYPE.  Only STANDARD_VTABLE(big2_) is given for the
   predicate members.

   In the two external objects the last integer of the ENCODING initializer
   is 1 when BYTEORDER == 4321 and 0 otherwise; the internal_* objects are
   only compiled when BYTEORDER != 1234 and always use 1.
   NOTE(review): the integers `2, 0, <flag>` presumably initialize
   minBytesPerChar and two flag members of ENCODING - confirm in xmltok.h.

   The *_ns variants exist only with XML_NS; the others map BT_COLON to
   BT_NMSTRT while the ASCII table is included. */
856n/a#ifdef XML_NS
857n/a
858n/astatic const struct normal_encoding big2_encoding_ns = {
859n/a { VTABLE, 2, 0,
860n/a#if BYTEORDER == 4321
861n/a 1
862n/a#else
863n/a 0
864n/a#endif
865n/a },
866n/a {
867n/a#include "asciitab.h"
868n/a#include "latin1tab.h"
869n/a },
870n/a STANDARD_VTABLE(big2_)
871n/a};
872n/a
873n/a#endif
874n/a
875n/astatic const struct normal_encoding big2_encoding = {
876n/a { VTABLE, 2, 0,
877n/a#if BYTEORDER == 4321
878n/a 1
879n/a#else
880n/a 0
881n/a#endif
882n/a },
883n/a {
884n/a#define BT_COLON BT_NMSTRT
885n/a#include "asciitab.h"
886n/a#undef BT_COLON
887n/a#include "latin1tab.h"
888n/a },
889n/a STANDARD_VTABLE(big2_)
890n/a};
891n/a
892n/a#if BYTEORDER != 1234
893n/a
894n/a#ifdef XML_NS
895n/a
896n/astatic const struct normal_encoding internal_big2_encoding_ns = {
897n/a { VTABLE, 2, 0, 1 },
898n/a {
899n/a#include "iasciitab.h"
900n/a#include "latin1tab.h"
901n/a },
902n/a STANDARD_VTABLE(big2_)
903n/a};
904n/a
905n/a#endif
906n/a
907n/astatic const struct normal_encoding internal_big2_encoding = {
908n/a { VTABLE, 2, 0, 1 },
909n/a {
910n/a#define BT_COLON BT_NMSTRT
911n/a#include "iasciitab.h"
912n/a#undef BT_COLON
913n/a#include "latin1tab.h"
914n/a },
915n/a STANDARD_VTABLE(big2_)
916n/a};
917n/a
918n/a#endif
919n/a
920n/a#undef PREFIX
921n/a
922n/astatic int FASTCALL
923n/astreqci(const char *s1, const char *s2)
924n/a{
925n/a for (;;) {
926n/a char c1 = *s1++;
927n/a char c2 = *s2++;
928n/a if (ASCII_a <= c1 && c1 <= ASCII_z)
929n/a c1 += ASCII_A - ASCII_a;
930n/a if (ASCII_a <= c2 && c2 <= ASCII_z)
931n/a c2 += ASCII_A - ASCII_a;
932n/a if (c1 != c2)
933n/a return 0;
934n/a if (!c1)
935n/a break;
936n/a }
937n/a return 1;
938n/a}
939n/a
940n/astatic void PTRCALL
941n/ainitUpdatePosition(const ENCODING *enc, const char *ptr,
942n/a const char *end, POSITION *pos)
943n/a{
944n/a normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
945n/a}
946n/a
947n/astatic int
948n/atoAscii(const ENCODING *enc, const char *ptr, const char *end)
949n/a{
950n/a char buf[1];
951n/a char *p = buf;
952n/a XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
953n/a if (p == buf)
954n/a return -1;
955n/a else
956n/a return buf[0];
957n/a}
958n/a
959n/astatic int FASTCALL
960n/aisSpace(int c)
961n/a{
962n/a switch (c) {
963n/a case 0x20:
964n/a case 0xD:
965n/a case 0xA:
966n/a case 0x9:
967n/a return 1;
968n/a }
969n/a return 0;
970n/a}
971n/a
/* Parse one `name = "value"` pseudo-attribute from [ptr, end).

   Characters are examined one at a time through toAscii(), stepping by
   enc->minBytesPerChar.

   Outputs:
     *namePtr     start of the name, or NULL when nothing but (optional)
                  white space was found;
     *nameEndPtr  position just past the name;
     *valPtr      first character after the opening quote;
     *nextTokPtr  on success, position just past the closing quote; on
                  failure, position of the offending character.  It is not
                  written on the two "nothing found" success paths.

   Returns 1 on success (including the "nothing found" case) and 0 on a
   syntax error. */
972n/a/* Return 1 if there's just optional white space or there's an S
973n/a followed by name=val.
974n/a*/
975n/astatic int
976n/aparsePseudoAttribute(const ENCODING *enc,
977n/a const char *ptr,
978n/a const char *end,
979n/a const char **namePtr,
980n/a const char **nameEndPtr,
981n/a const char **valPtr,
982n/a const char **nextTokPtr)
983n/a{
984n/a int c;
985n/a char open;
  /* Empty input: no attribute. */
986n/a if (ptr == end) {
987n/a *namePtr = NULL;
988n/a return 1;
989n/a }
  /* An attribute has to be preceded by white space. */
990n/a if (!isSpace(toAscii(enc, ptr, end))) {
991n/a *nextTokPtr = ptr;
992n/a return 0;
993n/a }
  /* Skip the white space run; toAscii yields -1 at end, which stops it. */
994n/a do {
995n/a ptr += enc->minBytesPerChar;
996n/a } while (isSpace(toAscii(enc, ptr, end)));
997n/a if (ptr == end) {
998n/a *namePtr = NULL;
999n/a return 1;
1000n/a }
  /* Scan the name up to '=' or to white space that is followed by '='. */
1001n/a *namePtr = ptr;
1002n/a for (;;) {
1003n/a c = toAscii(enc, ptr, end);
1004n/a if (c == -1) {
1005n/a *nextTokPtr = ptr;
1006n/a return 0;
1007n/a }
1008n/a if (c == ASCII_EQUALS) {
1009n/a *nameEndPtr = ptr;
1010n/a break;
1011n/a }
1012n/a if (isSpace(c)) {
1013n/a *nameEndPtr = ptr;
1014n/a do {
1015n/a ptr += enc->minBytesPerChar;
1016n/a } while (isSpace(c = toAscii(enc, ptr, end)));
1017n/a if (c != ASCII_EQUALS) {
1018n/a *nextTokPtr = ptr;
1019n/a return 0;
1020n/a }
1021n/a break;
1022n/a }
1023n/a ptr += enc->minBytesPerChar;
1024n/a }
  /* ptr is on '='; an empty name ('=' right at the name start) is an error. */
1025n/a if (ptr == *namePtr) {
1026n/a *nextTokPtr = ptr;
1027n/a return 0;
1028n/a }
  /* Step over '=' and any white space before the opening quote. */
1029n/a ptr += enc->minBytesPerChar;
1030n/a c = toAscii(enc, ptr, end);
1031n/a while (isSpace(c)) {
1032n/a ptr += enc->minBytesPerChar;
1033n/a c = toAscii(enc, ptr, end);
1034n/a }
1035n/a if (c != ASCII_QUOT && c != ASCII_APOS) {
1036n/a *nextTokPtr = ptr;
1037n/a return 0;
1038n/a }
  /* Remember which quote opened the value; the same one must close it. */
1039n/a open = (char)c;
1040n/a ptr += enc->minBytesPerChar;
1041n/a *valPtr = ptr;
  /* The value may only contain ASCII letters, digits, '.', '-' and '_'.
     Anything else, including the -1 returned at end of input, fails. */
1042n/a for (;; ptr += enc->minBytesPerChar) {
1043n/a c = toAscii(enc, ptr, end);
1044n/a if (c == open)
1045n/a break;
1046n/a if (!(ASCII_a <= c && c <= ASCII_z)
1047n/a && !(ASCII_A <= c && c <= ASCII_Z)
1048n/a && !(ASCII_0 <= c && c <= ASCII_9)
1049n/a && c != ASCII_PERIOD
1050n/a && c != ASCII_MINUS
1051n/a && c != ASCII_UNDERSCORE) {
1052n/a *nextTokPtr = ptr;
1053n/a return 0;
1054n/a }
1055n/a }
1056n/a *nextTokPtr = ptr + enc->minBytesPerChar;
1057n/a return 1;
1058n/a}
1059n/a
/* Keywords recognized by doParseXmlDecl, spelled with the ASCII_* character
   constants and NUL-terminated: the pseudo-attribute names "version",
   "encoding" and "standalone", and the two values accepted for
   "standalone", "yes" and "no". */
1060n/astatic const char KW_version[] = {
1061n/a ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
1062n/a};
1063n/a
1064n/astatic const char KW_encoding[] = {
1065n/a ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
1066n/a};
1067n/a
1068n/astatic const char KW_standalone[] = {
1069n/a ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
1070n/a ASCII_n, ASCII_e, '\0'
1071n/a};
1072n/a
1073n/astatic const char KW_yes[] = {
1074n/a ASCII_y, ASCII_e, ASCII_s, '\0'
1075n/a};
1076n/a
1077n/astatic const char KW_no[] = {
1078n/a ASCII_n, ASCII_o, '\0'
1079n/a};
1080n/a
/* Parse the pseudo-attributes of an XML or text declaration.

   [ptr, end) is the whole declaration; the first 5 and the last 2
   characters are skipped before parsing (presumably the "<?xml" and "?>"
   delimiters - the caller is expected to have matched them).

   Accepted order: version, then encoding, then standalone.
     - version may only be missing when isGeneralTextEntity is non-zero;
     - when isGeneralTextEntity is non-zero, something must follow version
       (the code comment says a TextDecl must have an EncodingDecl) and
       standalone is rejected;
     - the encoding value must start with an ASCII letter;
     - standalone must be "yes" or "no" and may only be followed by white
       space.

   Outputs (each optional pointer is only written when non-NULL, and only
   when the corresponding attribute is present):
     *versionPtr / *versionEndPtr  start of the version value / position
                                   just past its closing quote;
     *encodingName                 start of the encoding value;
     *encoding                     result of encodingFinder(enc, value
                                   start, value end);
     *standalone                   1 for "yes", 0 for "no";
     *badPtr                       position of the error when 0 is returned.

   Returns 1 on success, 0 on a syntax error. */
1081n/astatic int
1082n/adoParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
1083n/a const char *,
1084n/a const char *),
1085n/a int isGeneralTextEntity,
1086n/a const ENCODING *enc,
1087n/a const char *ptr,
1088n/a const char *end,
1089n/a const char **badPtr,
1090n/a const char **versionPtr,
1091n/a const char **versionEndPtr,
1092n/a const char **encodingName,
1093n/a const ENCODING **encoding,
1094n/a int *standalone)
1095n/a{
1096n/a const char *val = NULL;
1097n/a const char *name = NULL;
1098n/a const char *nameEnd = NULL;
  /* Narrow the range to what lies between the opening and closing
     delimiters. */
1099n/a ptr += 5 * enc->minBytesPerChar;
1100n/a end -= 2 * enc->minBytesPerChar;
  /* The first pseudo-attribute is mandatory. */
1101n/a if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1102n/a || !name) {
1103n/a *badPtr = ptr;
1104n/a return 0;
1105n/a }
1106n/a if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
  /* Not "version": tolerated only for a text declaration, in which case
     the attribute just parsed is examined again below. */
1107n/a if (!isGeneralTextEntity) {
1108n/a *badPtr = name;
1109n/a return 0;
1110n/a }
1111n/a }
1112n/a else {
1113n/a if (versionPtr)
1114n/a *versionPtr = val;
1115n/a if (versionEndPtr)
1116n/a *versionEndPtr = ptr;
  /* Fetch the attribute after version, if any. */
1117n/a if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1118n/a *badPtr = ptr;
1119n/a return 0;
1120n/a }
1121n/a if (!name) {
1122n/a if (isGeneralTextEntity) {
1123n/a /* a TextDecl must have an EncodingDecl */
1124n/a *badPtr = ptr;
1125n/a return 0;
1126n/a }
1127n/a return 1;
1128n/a }
1129n/a }
1130n/a if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
  /* The encoding name has to start with an ASCII letter. */
1131n/a int c = toAscii(enc, val, end);
1132n/a if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
1133n/a *badPtr = val;
1134n/a return 0;
1135n/a }
1136n/a if (encodingName)
1137n/a *encodingName = val;
  /* ptr is just past the closing quote, so one character back is the end
     of the value. */
1138n/a if (encoding)
1139n/a *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1140n/a if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1141n/a *badPtr = ptr;
1142n/a return 0;
1143n/a }
1144n/a if (!name)
1145n/a return 1;
1146n/a }
  /* Whatever is left must be "standalone", and only outside a text
     declaration. */
1147n/a if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1148n/a || isGeneralTextEntity) {
1149n/a *badPtr = name;
1150n/a return 0;
1151n/a }
1152n/a if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1153n/a if (standalone)
1154n/a *standalone = 1;
1155n/a }
1156n/a else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1157n/a if (standalone)
1158n/a *standalone = 0;
1159n/a }
1160n/a else {
1161n/a *badPtr = val;
1162n/a return 0;
1163n/a }
  /* Only trailing white space may follow the standalone attribute. */
1164n/a while (isSpace(toAscii(enc, ptr, end)))
1165n/a ptr += enc->minBytesPerChar;
1166n/a if (ptr != end) {
1167n/a *badPtr = ptr;
1168n/a return 0;
1169n/a }
1170n/a return 1;
1171n/a}
1172n/a
1173n/astatic int FASTCALL
1174n/acheckCharRefNumber(int result)
1175n/a{
1176n/a switch (result >> 8) {
1177n/a case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1178n/a case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1179n/a return -1;
1180n/a case 0:
1181n/a if (latin1_encoding.type[result] == BT_NONXML)
1182n/a return -1;
1183n/a break;
1184n/a case 0xFF:
1185n/a if (result == 0xFFFE || result == 0xFFFF)
1186n/a return -1;
1187n/a break;
1188n/a }
1189n/a return result;
1190n/a}
1191n/a
1192n/aint FASTCALL
1193n/aXmlUtf8Encode(int c, char *buf)
1194n/a{
1195n/a enum {
1196n/a /* minN is minimum legal resulting value for N byte sequence */
1197n/a min2 = 0x80,
1198n/a min3 = 0x800,
1199n/a min4 = 0x10000
1200n/a };
1201n/a
1202n/a if (c < 0)
1203n/a return 0;
1204n/a if (c < min2) {
1205n/a buf[0] = (char)(c | UTF8_cval1);
1206n/a return 1;
1207n/a }
1208n/a if (c < min3) {
1209n/a buf[0] = (char)((c >> 6) | UTF8_cval2);
1210n/a buf[1] = (char)((c & 0x3f) | 0x80);
1211n/a return 2;
1212n/a }
1213n/a if (c < min4) {
1214n/a buf[0] = (char)((c >> 12) | UTF8_cval3);
1215n/a buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1216n/a buf[2] = (char)((c & 0x3f) | 0x80);
1217n/a return 3;
1218n/a }
1219n/a if (c < 0x110000) {
1220n/a buf[0] = (char)((c >> 18) | UTF8_cval4);
1221n/a buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1222n/a buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1223n/a buf[3] = (char)((c & 0x3f) | 0x80);
1224n/a return 4;
1225n/a }
1226n/a return 0;
1227n/a}
1228n/a
/* Encode the code point charNum as UTF-16 into buf.

   Returns 1 for a code point below 0x10000 (stored as is), 2 for a
   code point below 0x110000 (stored as a high/low surrogate pair), and
   0 -- writing nothing -- for anything else, including negative values.
*/
int FASTCALL
XmlUtf16Encode(int charNum, unsigned short *buf)
{
  int offset;

  if (charNum < 0 || charNum >= 0x110000)
    return 0;

  if (charNum < 0x10000) {
    buf[0] = (unsigned short)charNum;
    return 1;
  }

  /* Supplementary plane: split the 20-bit offset into two 10-bit halves. */
  offset = charNum - 0x10000;
  buf[0] = (unsigned short)(0xD800 + (offset >> 10));
  buf[1] = (unsigned short)(0xDC00 + (offset & 0x3FF));
  return 2;
}
1246n/a
/* An encoding described by a caller-supplied byte table, set up by
   XmlInitUnknownEncoding().
*/
struct unknown_encoding {
  /* Base encoding object.  It is the first member, and the rest of the
     file converts between ENCODING pointers and this struct by casting
     (see AS_UNKNOWN_ENCODING below). */
  struct normal_encoding normal;
  /* Callback that decodes a multi-byte sequence; it is called with
     userData and a pointer to the first byte of the sequence. */
  CONVERTER convert;
  /* Opaque value handed back to convert. */
  void *userData;
  /* UTF-16 code unit for each byte value.  0 marks a byte that has to
     go through convert. */
  unsigned short utf16[256];
  /* UTF-8 form for each byte value: element [0] is the sequence length
     and the following elements are the bytes.  Length 0 marks a byte
     that has to go through convert. */
  char utf8[256][4];
};

/* View an ENCODING pointer as the unknown_encoding that contains it. */
#define AS_UNKNOWN_ENCODING(enc)  ((const struct unknown_encoding *) (enc))
1256n/a
1257n/aint
1258n/aXmlSizeOfUnknownEncoding(void)
1259n/a{
1260n/a return sizeof(struct unknown_encoding);
1261n/a}
1262n/a
1263n/astatic int PTRFASTCALL
1264n/aunknown_isName(const ENCODING *enc, const char *p)
1265n/a{
1266n/a const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1267n/a int c = uenc->convert(uenc->userData, p);
1268n/a if (c & ~0xFFFF)
1269n/a return 0;
1270n/a return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1271n/a}
1272n/a
1273n/astatic int PTRFASTCALL
1274n/aunknown_isNmstrt(const ENCODING *enc, const char *p)
1275n/a{
1276n/a const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1277n/a int c = uenc->convert(uenc->userData, p);
1278n/a if (c & ~0xFFFF)
1279n/a return 0;
1280n/a return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1281n/a}
1282n/a
1283n/astatic int PTRFASTCALL
1284n/aunknown_isInvalid(const ENCODING *enc, const char *p)
1285n/a{
1286n/a const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1287n/a int c = uenc->convert(uenc->userData, p);
1288n/a return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1289n/a}
1290n/a
1291n/astatic void PTRCALL
1292n/aunknown_toUtf8(const ENCODING *enc,
1293n/a const char **fromP, const char *fromLim,
1294n/a char **toP, const char *toLim)
1295n/a{
1296n/a const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1297n/a char buf[XML_UTF8_ENCODE_MAX];
1298n/a for (;;) {
1299n/a const char *utf8;
1300n/a int n;
1301n/a if (*fromP == fromLim)
1302n/a break;
1303n/a utf8 = uenc->utf8[(unsigned char)**fromP];
1304n/a n = *utf8++;
1305n/a if (n == 0) {
1306n/a int c = uenc->convert(uenc->userData, *fromP);
1307n/a n = XmlUtf8Encode(c, buf);
1308n/a if (n > toLim - *toP)
1309n/a break;
1310n/a utf8 = buf;
1311n/a *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1312n/a - (BT_LEAD2 - 2));
1313n/a }
1314n/a else {
1315n/a if (n > toLim - *toP)
1316n/a break;
1317n/a (*fromP)++;
1318n/a }
1319n/a do {
1320n/a *(*toP)++ = *utf8++;
1321n/a } while (--n != 0);
1322n/a }
1323n/a}
1324n/a
1325n/astatic void PTRCALL
1326n/aunknown_toUtf16(const ENCODING *enc,
1327n/a const char **fromP, const char *fromLim,
1328n/a unsigned short **toP, const unsigned short *toLim)
1329n/a{
1330n/a const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1331n/a while (*fromP != fromLim && *toP != toLim) {
1332n/a unsigned short c = uenc->utf16[(unsigned char)**fromP];
1333n/a if (c == 0) {
1334n/a c = (unsigned short)
1335n/a uenc->convert(uenc->userData, *fromP);
1336n/a *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1337n/a - (BT_LEAD2 - 2));
1338n/a }
1339n/a else
1340n/a (*fromP)++;
1341n/a *(*toP)++ = c;
1342n/a }
1343n/a}
1344n/a
1345n/aENCODING *
1346n/aXmlInitUnknownEncoding(void *mem,
1347n/a int *table,
1348n/a CONVERTER convert,
1349n/a void *userData)
1350n/a{
1351n/a int i;
1352n/a struct unknown_encoding *e = (struct unknown_encoding *)mem;
1353n/a for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1354n/a ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1355n/a for (i = 0; i < 128; i++)
1356n/a if (latin1_encoding.type[i] != BT_OTHER
1357n/a && latin1_encoding.type[i] != BT_NONXML
1358n/a && table[i] != i)
1359n/a return 0;
1360n/a for (i = 0; i < 256; i++) {
1361n/a int c = table[i];
1362n/a if (c == -1) {
1363n/a e->normal.type[i] = BT_MALFORM;
1364n/a /* This shouldn't really get used. */
1365n/a e->utf16[i] = 0xFFFF;
1366n/a e->utf8[i][0] = 1;
1367n/a e->utf8[i][1] = 0;
1368n/a }
1369n/a else if (c < 0) {
1370n/a if (c < -4)
1371n/a return 0;
1372n/a e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1373n/a e->utf8[i][0] = 0;
1374n/a e->utf16[i] = 0;
1375n/a }
1376n/a else if (c < 0x80) {
1377n/a if (latin1_encoding.type[c] != BT_OTHER
1378n/a && latin1_encoding.type[c] != BT_NONXML
1379n/a && c != i)
1380n/a return 0;
1381n/a e->normal.type[i] = latin1_encoding.type[c];
1382n/a e->utf8[i][0] = 1;
1383n/a e->utf8[i][1] = (char)c;
1384n/a e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1385n/a }
1386n/a else if (checkCharRefNumber(c) < 0) {
1387n/a e->normal.type[i] = BT_NONXML;
1388n/a /* This shouldn't really get used. */
1389n/a e->utf16[i] = 0xFFFF;
1390n/a e->utf8[i][0] = 1;
1391n/a e->utf8[i][1] = 0;
1392n/a }
1393n/a else {
1394n/a if (c > 0xFFFF)
1395n/a return 0;
1396n/a if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1397n/a e->normal.type[i] = BT_NMSTRT;
1398n/a else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1399n/a e->normal.type[i] = BT_NAME;
1400n/a else
1401n/a e->normal.type[i] = BT_OTHER;
1402n/a e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1403n/a e->utf16[i] = (unsigned short)c;
1404n/a }
1405n/a }
1406n/a e->userData = userData;
1407n/a e->convert = convert;
1408n/a if (convert) {
1409n/a e->normal.isName2 = unknown_isName;
1410n/a e->normal.isName3 = unknown_isName;
1411n/a e->normal.isName4 = unknown_isName;
1412n/a e->normal.isNmstrt2 = unknown_isNmstrt;
1413n/a e->normal.isNmstrt3 = unknown_isNmstrt;
1414n/a e->normal.isNmstrt4 = unknown_isNmstrt;
1415n/a e->normal.isInvalid2 = unknown_isInvalid;
1416n/a e->normal.isInvalid3 = unknown_isInvalid;
1417n/a e->normal.isInvalid4 = unknown_isInvalid;
1418n/a }
1419n/a e->normal.enc.utf8Convert = unknown_toUtf8;
1420n/a e->normal.enc.utf16Convert = unknown_toUtf16;
1421n/a return &(e->normal.enc);
1422n/a}
1423n/a
/* Indices of the built-in encodings.

   If this enumeration is changed, getEncodingIndex and encodings
   must also be changed: the values from ISO_8859_1_ENC to UTF_16LE_ENC
   are positions in getEncodingIndex()'s encodingNames array.
   getEncodingIndex() returns UNKNOWN_ENC for a name it does not
   recognise and NO_ENC when it is given no name at all.
*/
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  NO_ENC
};
1437n/a
/* Names of the built-in encodings as NUL-terminated arrays of ASCII_*
   character constants.  getEncodingIndex() compares a candidate name
   against each of them with streqci().
*/

/* "ISO-8859-1" */
static const char KW_ISO_8859_1[] = {
  ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
  ASCII_MINUS, ASCII_1, '\0'
};
/* "US-ASCII" */
static const char KW_US_ASCII[] = {
  ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
  '\0'
};
/* "UTF-8" */
static const char KW_UTF_8[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
};
/* "UTF-16" */
static const char KW_UTF_16[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
};
/* "UTF-16BE" */
static const char KW_UTF_16BE[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
  '\0'
};
/* "UTF-16LE" */
static const char KW_UTF_16LE[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
  '\0'
};
1460n/a
1461n/astatic int FASTCALL
1462n/agetEncodingIndex(const char *name)
1463n/a{
1464n/a static const char * const encodingNames[] = {
1465n/a KW_ISO_8859_1,
1466n/a KW_US_ASCII,
1467n/a KW_UTF_8,
1468n/a KW_UTF_16,
1469n/a KW_UTF_16BE,
1470n/a KW_UTF_16LE,
1471n/a };
1472n/a int i;
1473n/a if (name == NULL)
1474n/a return NO_ENC;
1475n/a for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
1476n/a if (streqci(name, encodingNames[i]))
1477n/a return i;
1478n/a return UNKNOWN_ENC;
1479n/a}
1480n/a
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int.
   SET_INIT_ENC_INDEX(enc, i) stores i, narrowed to char.  The argument
   is parenthesized so that the cast applies to the whole expression
   passed in, not just to its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1487n/a
/* This is what detects the encoding.  encodingTable maps from
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
   the external (protocol) specified encoding; state is
   XML_CONTENT_STATE if we're parsing an external text entity, and
   XML_PROLOG_STATE otherwise.

   ptr/end delimit the bytes available so far.  The function looks at
   the first one to three bytes and:
     - returns XML_TOK_NONE when there is no input at all;
     - returns XML_TOK_PARTIAL when more bytes are needed to decide;
     - on a byte order mark, stores the detected encoding through
       enc->encPtr, sets *nextTokPtr just past the mark and returns
       XML_TOK_BOM;
     - otherwise stores the chosen encoding through enc->encPtr and
       returns whatever XmlTok() yields for that encoding on the same
       input.
*/


static int
initScan(const ENCODING * const *encodingTable,
         const INIT_ENCODING *enc,
         int state,
         const char *ptr,
         const char *end,
         const char **nextTokPtr)
{
  const ENCODING **encPtr;

  if (ptr == end)
    return XML_TOK_NONE;
  encPtr = enc->encPtr;
  if (ptr + 1 == end) {
    /* only a single byte available for auto-detection */
#ifndef XML_DTD /* FIXME */
    /* a well-formed document entity must have more than one byte */
    if (state != XML_CONTENT_STATE)
      return XML_TOK_PARTIAL;
#endif
    /* so we're parsing an external text entity... */
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
    switch (INIT_ENC_INDEX(enc)) {
    case UTF_16_ENC:
    case UTF_16LE_ENC:
    case UTF_16BE_ENC:
      return XML_TOK_PARTIAL;
    }
    switch ((unsigned char)*ptr) {
    case 0xFE:
    case 0xFF:
    case 0xEF: /* possibly first byte of UTF-8 BOM */
      /* With an external ISO-8859-1 label in an external text entity
         these bytes are treated as data: leave the switch and go on
         to the externally specified encoding. */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      /* fall through */
    case 0x00:
    case 0x3C:
      /* could be the first half of a BOM or of a UTF-16 character;
         wait for the second byte */
      return XML_TOK_PARTIAL;
    }
  }
  else {
    /* at least two bytes: examine them as one big-endian 16-bit value */
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
    case 0xFEFF:
      /* UTF-16 big-endian byte order mark */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16BE_ENC];
      return XML_TOK_BOM;
    /* 00 3C is handled in the default case */
    case 0x3C00:
      /* '<' followed by a zero byte: little-endian UTF-16 without a
         byte order mark */
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
          && state == XML_CONTENT_STATE)
        break;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    case 0xFFFE:
      /* UTF-16 little-endian byte order mark */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XML_TOK_BOM;
    case 0xEFBB:
      /* Maybe a UTF-8 BOM (EF BB BF) */
      /* If there's an explicitly specified (external) encoding
         of ISO-8859-1 or some flavour of UTF-16
         and this is an external text entity,
         don't look for the BOM,
         because it might be legal data.
      */
      if (state == XML_CONTENT_STATE) {
        int e = INIT_ENC_INDEX(enc);
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
            || e == UTF_16LE_ENC || e == UTF_16_ENC)
          break;
      }
      /* the third byte of the BOM has not arrived yet */
      if (ptr + 2 == end)
        return XML_TOK_PARTIAL;
      if ((unsigned char)ptr[2] == 0xBF) {
        *nextTokPtr = ptr + 3;
        *encPtr = encodingTable[UTF_8_ENC];
        return XML_TOK_BOM;
      }
      break;
    default:
      if (ptr[0] == '\0') {
        /* 0 isn't a legal data character. Furthermore a document
           entity can only start with ASCII characters.  So the only
           way this can fail to be big-endian UTF-16 is if it's an
           external parsed general entity that's labelled as
           UTF-16LE.
        */
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
          break;
        *encPtr = encodingTable[UTF_16BE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      else if (ptr[1] == '\0') {
        /* We could recover here in the case:
            - parsing an external entity
            - second byte is 0
            - no externally specified encoding
            - no encoding declaration
           by assuming UTF-16LE.  But we don't, because this would mean when
           presented just with a single byte, we couldn't reliably determine
           whether we needed further bytes.
        */
        if (state == XML_CONTENT_STATE)
          break;
        *encPtr = encodingTable[UTF_16LE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      break;
    }
  }
  /* nothing was detected: use the externally specified encoding */
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
}
1617n/a
1618n/a
/* xmltok_ns.c is included as a template.  For this first inclusion
   NS() and ns() expand to their argument unchanged, so whatever
   identifiers the template wraps in them keep their plain names.
   XML_TOK_NS_C is defined only around the #include (presumably
   xmltok_ns.c checks it to make sure it is not compiled on its own --
   confirm in that file).
*/
#define NS(x) x
#define ns(x) x
#define XML_TOK_NS_C
#include "xmltok_ns.c"
#undef XML_TOK_NS_C
#undef NS
#undef ns

/* Everything from here to the matching #endif is compiled only when
   XML_NS is defined. */
#ifdef XML_NS

/* Second inclusion of the same template: this time NS() appends "NS"
   and ns() appends "_ns" to the wrapped identifiers, giving a second,
   separately named set of definitions. */
#define NS(x) x ## NS
#define ns(x) x ## _ns

#define XML_TOK_NS_C
#include "xmltok_ns.c"
#undef XML_TOK_NS_C

#undef NS
#undef ns
1638n/a
1639n/aENCODING *
1640n/aXmlInitUnknownEncodingNS(void *mem,
1641n/a int *table,
1642n/a CONVERTER convert,
1643n/a void *userData)
1644n/a{
1645n/a ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1646n/a if (enc)
1647n/a ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1648n/a return enc;
1649n/a}
1650n/a
1651n/a#endif /* XML_NS */