» Core Development > Code coverage > Parser/tokenizer.c

Python code coverage for Parser/tokenizer.c

#  count  content
1n/a
2n/a/* Tokenizer implementation */
3n/a
4n/a#include "Python.h"
5n/a#include "pgenheaders.h"
6n/a
7n/a#include <ctype.h>
8n/a#include <assert.h>
9n/a
10n/a#include "tokenizer.h"
11n/a#include "errcode.h"
12n/a
13n/a#ifndef PGEN
14n/a#include "unicodeobject.h"
15n/a#include "bytesobject.h"
16n/a#include "fileobject.h"
17n/a#include "codecs.h"
18n/a#include "abstract.h"
19n/a#endif /* PGEN */
20n/a
/* Character-class tests used while scanning identifiers.
   Any byte >= 128 is treated as a *potential* identifier character;
   real validation of non-ASCII identifiers happens later.
   Arguments are now fully parenthesized so callers may pass compound
   expressions safely; note `c` is still evaluated multiple times, so
   do not pass expressions with side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))

#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
33n/a
34n/aextern char *PyOS_Readline(FILE *, FILE *, const char *);
35n/a/* Return malloc'ed string including trailing \n;
36n/a empty malloc'ed string for EOF;
37n/a NULL if interrupted */
38n/a
39n/a/* Don't ever change this -- it would break the portability of Python code */
40n/a#define TABSIZE 8
41n/a
42n/a/* Forward */
43n/astatic struct tok_state *tok_new(void);
44n/astatic int tok_nextc(struct tok_state *tok);
45n/astatic void tok_backup(struct tok_state *tok, int c);
46n/a
47n/a
48n/a/* Token names */
49n/a
/* Printable names for token codes, indexed by the token's numeric
   value; the order of entries therefore must mirror token.h. */
const char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    "AT",
    "ATEQUAL",
    "RARROW",
    "ELLIPSIS",
    /* This table must match the #defines in token.h! */
    "OP",
    "AWAIT",
    "ASYNC",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
111n/a
112n/a
113n/a/* Create and initialize a new tok_state structure */
114n/a
/* Allocate a tok_state and initialize every field to its neutral
   starting value.  Returns NULL on allocation failure (no Python
   error is set; callers translate this to E_NOMEM themselves). */
static struct tok_state *
tok_new(void)
{
    struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
                                            sizeof(struct tok_state));
    if (tok == NULL)
        return NULL;
    /* No buffer yet: input-source setup functions fill these in. */
    tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
    tok->done = E_OK;
    tok->fp = NULL;
    tok->input = NULL;
    tok->tabsize = TABSIZE;
    tok->indent = 0;
    tok->indstack[0] = 0;

    tok->atbol = 1;              /* start of file counts as begin-of-line */
    tok->pendin = 0;
    tok->prompt = tok->nextprompt = NULL;
    tok->lineno = 0;
    tok->level = 0;              /* () [] {} nesting depth */
    tok->altwarning = 1;
    tok->alterror = 1;
    tok->alttabsize = 1;
    tok->altindstack[0] = 0;
    tok->decoding_state = STATE_INIT;   /* encoding not yet determined */
    tok->decoding_erred = 0;
    tok->read_coding_spec = 0;
    tok->enc = NULL;
    tok->encoding = NULL;
    tok->cont_line = 0;
#ifndef PGEN
    tok->filename = NULL;
    tok->decoding_readline = NULL;
    tok->decoding_buffer = NULL;
#endif

    tok->async_def = 0;
    tok->async_def_indent = 0;
    tok->async_def_nl = 0;

    return tok;
}
157n/a
158n/astatic char *
159n/anew_string(const char *s, Py_ssize_t len, struct tok_state *tok)
160n/a{
161n/a char* result = (char *)PyMem_MALLOC(len + 1);
162n/a if (!result) {
163n/a tok->done = E_NOMEM;
164n/a return NULL;
165n/a }
166n/a memcpy(result, s, len);
167n/a result[len] = '\0';
168n/a return result;
169n/a}
170n/a
171n/a#ifdef PGEN
172n/a
/* PGEN build: no encoding handling, just read a raw line. */
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    return fgets(s, size, tok->fp);
}
178n/a
/* PGEN build: end-of-file test is plain stdio feof. */
static int
decoding_feof(struct tok_state *tok)
{
    return feof(tok->fp);
}
184n/a
/* PGEN build: no decoding; return a plain copy of STR.
   exec_input is unused in this configuration. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    return new_string(str, strlen(str), tok);
}
190n/a
191n/a#else /* PGEN */
192n/a
/* Record a decoding error in TOK, release the line buffer, and return
   NULL so callers treat the failure like EOF.  XXX */
static char *
error_ret(struct tok_state *tok) /* XXX */
{
    tok->decoding_erred = 1;
    if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
        PyMem_FREE(tok->buf);
    tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
    tok->done = E_DECODE;
    return NULL; /* as if it were EOF */
}
203n/a
204n/a
/* Map the first 12 characters of encoding name S to a canonical
   spelling: any utf-8 variant becomes "utf-8", any latin-1 /
   iso-8859-1 variant becomes "iso-8859-1"; otherwise S itself is
   returned unchanged.  Underscores are folded to hyphens and ASCII
   letters lowercased before comparison. */
static const char *
get_normal_name(const char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            /* Cast through unsigned char: passing a negative value
               (from a high-bit char on platforms where char is
               signed) to tolower() is undefined behavior. */
            buf[i] = tolower((unsigned char)c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
233n/a
234n/a/* Return the coding spec in S, or NULL if none is found. */
235n/a
/* Scan line S (SIZE bytes) for a "coding[:=] <name>" declaration.
   On return *spec is a malloc'ed copy of the canonicalized encoding
   name, or NULL when the line carries no coding spec.  Returns 0 only
   on allocation failure, 1 otherwise (including "no spec found"). */
static int
get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
{
    Py_ssize_t i;
    *spec = NULL;
    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return 1;
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char* t = s + i;
        if (strncmp(t, "coding", 6) == 0) {
            const char* begin = NULL;
            t += 6;
            if (t[0] != ':' && t[0] != '=')
                continue;
            /* Skip blanks between "coding:" and the name itself. */
            do {
                t++;
            } while (t[0] == '\x20' || t[0] == '\t');

            begin = t;
            /* Encoding names are alphanumerics plus '-', '_', '.'. */
            while (Py_ISALNUM(t[0]) ||
                   t[0] == '-' || t[0] == '_' || t[0] == '.')
                t++;

            if (begin < t) {
                char* r = new_string(begin, t - begin, tok);
                const char* q;
                if (!r)
                    return 0;
                q = get_normal_name(r);
                if (r != q) {
                    /* Replace the spelled form with the canonical name. */
                    PyMem_FREE(r);
                    r = new_string(q, strlen(q), tok);
                    if (!r)
                        return 0;
                }
                *spec = r;
                break;
            }
        }
    }
    return 1;
}
284n/a
285n/a/* Check whether the line contains a coding spec. If it does,
286n/a invoke the set_readline function for the new encoding.
287n/a This function receives the tok_state and the new encoding.
288n/a Return 1 on success, 0 on failure. */
289n/a
/* Look for a coding spec on LINE; if one is found, install the new
   encoding via SET_READLINE and remember it in tok->encoding.
   Ownership: the malloc'ed spec `cs` either becomes tok->encoding or
   is freed here.  Returns 1 on success, 0 on failure (error set). */
static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char *cs;
    int r = 1;

    if (tok->cont_line) {
        /* It's a continuation line, so it can't be a coding spec. */
        tok->read_coding_spec = 1;
        return 1;
    }
    if (!get_coding_spec(line, &cs, size, tok))
        return 0;
    if (!cs) {
        Py_ssize_t i;
        for (i = 0; i < size; i++) {
            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
                break;
            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
                /* Stop checking coding spec after a line containing
                 * anything except a comment. */
                tok->read_coding_spec = 1;
                break;
            }
        }
        return 1;
    }
    tok->read_coding_spec = 1;
    if (tok->encoding == NULL) {
        assert(tok->decoding_state == STATE_RAW);
        if (strcmp(cs, "utf-8") == 0) {
            /* UTF-8 is the default; no readline change needed.
               cs ownership transfers to tok->encoding. */
            tok->encoding = cs;
        } else {
            r = set_readline(tok, cs);
            if (r) {
                tok->encoding = cs;
                tok->decoding_state = STATE_NORMAL;
            }
            else {
                PyErr_Format(PyExc_SyntaxError,
                             "encoding problem: %s", cs);
                PyMem_FREE(cs);
            }
        }
    } else {                /* then, compare cs with BOM */
        /* A BOM already fixed the encoding; the spec must agree. */
        r = (strcmp(tok->encoding, cs) == 0);
        if (!r)
            PyErr_Format(PyExc_SyntaxError,
                         "encoding problem: %s with BOM", cs);
        PyMem_FREE(cs);
    }
    return r;
}
344n/a
345n/a/* See whether the file starts with a BOM. If it does,
346n/a invoke the set_readline function with the new encoding.
347n/a Return 1 on success, 0 on failure. */
348n/a
/* Detect a UTF-8 byte-order mark at the start of the input.  Reads up
   to three bytes with GET_CHAR and pushes back (in reverse order) any
   bytes that turn out not to be a BOM.  On a match, tok->encoding is
   set to "utf-8".  Returns 1 on success, 0 on failure. */
static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch1, ch2, ch3;
    ch1 = get_char(tok);
    tok->decoding_state = STATE_RAW;
    if (ch1 == EOF) {
        return 1;
    } else if (ch1 == 0xEF) {
        ch2 = get_char(tok);
        if (ch2 != 0xBB) {
            /* Not a BOM: restore both bytes, most recent first. */
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        ch3 = get_char(tok);
        if (ch3 != 0xBF) {
            unget_char(ch3, tok);
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
#if 0
    /* Disable support for UTF-16 BOMs until a decision
       is made whether this needs to be supported.  */
    } else if (ch1 == 0xFE) {
        ch2 = get_char(tok);
        if (ch2 != 0xFF) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-be"))
            return 0;
        tok->decoding_state = STATE_NORMAL;
    } else if (ch1 == 0xFF) {
        ch2 = get_char(tok);
        if (ch2 != 0xFE) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-le"))
            return 0;
        tok->decoding_state = STATE_NORMAL;
#endif
    } else {
        unget_char(ch1, tok);
        return 1;
    }
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string("utf-8", 5, tok);
    if (!tok->encoding)
        return 0;
    /* No need to set_readline: input is already utf-8 */
    return 1;
}
410n/a
411n/a/* Read a line of text from TOK into S, using the stream in TOK.
412n/a Return NULL on failure, else S.
413n/a
414n/a On entry, tok->decoding_buffer will be one of:
415n/a 1) NULL: need to call tok->decoding_readline to get a new line
416n/a 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
417n/a stored the result in tok->decoding_buffer
418n/a 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
419n/a (in the s buffer) to copy entire contents of the line read
420n/a by tok->decoding_readline. tok->decoding_buffer has the overflow.
421n/a In this case, fp_readl is called in a loop (with an expanded buffer)
422n/a until the buffer ends with a '\n' (or until the end of the file is
423n/a reached): see tok_nextc and its calls to decoding_fgets.
424n/a*/
425n/a
/* Read one decoded line into S (at most SIZE-1 bytes plus NUL) via
   tok->decoding_readline.  Overflow beyond SIZE-1 bytes is stashed in
   tok->decoding_buffer (a bytearray) for the next call.  Returns S,
   NULL at EOF, or error_ret(tok) on failure. */
static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
    PyObject* bufobj;
    const char *buf;
    Py_ssize_t buflen;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (tok->decoding_buffer) {
        /* Leftover data from a previous short read (or from
           decoding_feof's lookahead). */
        bufobj = tok->decoding_buffer;
        Py_INCREF(bufobj);
    }
    else
    {
        bufobj = _PyObject_CallNoArg(tok->decoding_readline);
        if (bufobj == NULL)
            goto error;
    }
    if (PyUnicode_CheckExact(bufobj))
    {
        buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
        if (buf == NULL) {
            goto error;
        }
    }
    else
    {
        /* Overflow case: decoding_buffer is a bytearray. */
        buf = PyByteArray_AsString(bufobj);
        if (buf == NULL) {
            goto error;
        }
        buflen = PyByteArray_GET_SIZE(bufobj);
    }

    Py_XDECREF(tok->decoding_buffer);
    if (buflen > size) {
        /* Too many chars, the rest goes into tok->decoding_buffer */
        tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
                                                         buflen-size);
        if (tok->decoding_buffer == NULL)
            goto error;
        buflen = size;
    }
    else
        tok->decoding_buffer = NULL;

    memcpy(s, buf, buflen);
    s[buflen] = '\0';
    if (buflen == 0) /* EOF */
        s = NULL;
    Py_DECREF(bufobj);
    return s;

error:
    Py_XDECREF(bufobj);
    return error_ret(tok);
}
486n/a
487n/a/* Set the readline function for TOK to a StreamReader's
488n/a readline function. The StreamReader is named ENC.
489n/a
490n/a This function is called from check_bom and check_coding_spec.
491n/a
492n/a ENC is usually identical to the future value of tok->encoding,
493n/a except for the (currently unsupported) case of UTF-16.
494n/a
495n/a Return 1 on success, 0 on failure. */
496n/a
/* Install a decoding readline for TOK: reopen tok->fp's descriptor
   through io.open with encoding ENC and store the stream's readline
   in tok->decoding_readline.  Returns 1 on success, 0 on failure. */
static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *readline, *io, *stream;
    _Py_IDENTIFIER(open);
    _Py_IDENTIFIER(readline);
    int fd;
    long pos;

    fd = fileno(tok->fp);
    /* Due to buffering the file offset for fd can be different from the file
     * position of tok->fp.  If tok->fp was opened in text mode on Windows,
     * its file position counts CRLF as one char and can't be directly mapped
     * to the file offset for fd.  Instead we step back one byte and read to
     * the end of line.*/
    pos = ftell(tok->fp);
    if (pos == -1 ||
        lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
        PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
        return 0;
    }

    io = PyImport_ImportModuleNoBlock("io");
    if (io == NULL)
        return 0;

    /* io.open(fd, "r", -1, enc, None, None, False): text mode,
       closefd=False so tok->fp retains ownership of the descriptor. */
    stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
                    fd, "r", -1, enc, Py_None, Py_None, Py_False);
    Py_DECREF(io);
    if (stream == NULL)
        return 0;

    readline = _PyObject_GetAttrId(stream, &PyId_readline);
    Py_DECREF(stream);
    if (readline == NULL)
        return 0;
    Py_XSETREF(tok->decoding_readline, readline);

    if (pos > 0) {
        /* Discard the partial line we stepped back into above. */
        PyObject *bufobj = _PyObject_CallNoArg(readline);
        if (bufobj == NULL)
            return 0;
        Py_DECREF(bufobj);
    }

    return 1;
}
544n/a
545n/a/* Fetch the next byte from TOK. */
546n/a
/* Fetch the next raw byte from the tokenizer's FILE stream. */
static int fp_getc(struct tok_state *tok) {
    return getc(tok->fp);
}
550n/a
551n/a/* Unfetch the last byte back into TOK. */
552n/a
/* Push byte C back onto the tokenizer's FILE stream. */
static void fp_ungetc(int c, struct tok_state *tok) {
    ungetc(c, tok->fp);
}
556n/a
557n/a/* Check whether the characters at s start a valid
558n/a UTF-8 sequence. Return the number of characters forming
559n/a the sequence if yes, 0 if not. */
/* Check whether the characters at s start a valid UTF-8 sequence.
   Return the number of bytes forming the sequence if yes, 0 if not.
   (Only the byte structure is checked; overlong forms and string
   termination inside a sequence are handled by the 0x80..0xBF test.) */
static int valid_utf8(const unsigned char* s)
{
    int follow;                 /* continuation bytes still to verify */
    int total;
    unsigned char lead = *s;

    if (lead < 0x80)
        return 1;               /* single-byte (ASCII) code */
    if (lead < 0xC0)
        return 0;               /* stray continuation byte */
    if (lead < 0xE0)
        follow = 1;
    else if (lead < 0xF0)
        follow = 2;
    else if (lead < 0xF8)
        follow = 3;
    else
        return 0;               /* 0xF8..0xFF never start a sequence */

    total = follow + 1;
    /* Each continuation byte must lie in [0x80, 0xC0). */
    while (follow > 0) {
        unsigned char b = s[follow];
        if (b < 0x80 || b >= 0xC0)
            return 0;
        follow--;
    }
    return total;
}
584n/a
585n/a/* Read a line of input from TOK. Determine encoding
586n/a if necessary. */
587n/a
/* Read one line into S, dispatching on the current decoding state:
   STATE_NORMAL uses the codec readline, STATE_RAW reads bytes
   directly, STATE_INIT first resolves the encoding via check_bom.
   Also triggers coding-spec detection on the first two lines and
   rejects non-UTF-8 bytes when no encoding was declared. */
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    char *line = NULL;
    int badchar = 0;
    for (;;) {
        if (tok->decoding_state == STATE_NORMAL) {
            /* We already have a codec associated with
               this input. */
            line = fp_readl(s, size, tok);
            break;
        } else if (tok->decoding_state == STATE_RAW) {
            /* We want a 'raw' read. */
            line = Py_UniversalNewlineFgets(s, size,
                                            tok->fp, NULL);
            break;
        } else {
            /* We have not yet determined the encoding.
               If an encoding is found, use the file-pointer
               reader functions from now on. */
            if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
                return error_ret(tok);
            assert(tok->decoding_state != STATE_INIT);
        }
    }
    if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
        /* Coding specs may only appear on the first two lines. */
        if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
            return error_ret(tok);
        }
    }
#ifndef PGEN
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
    if (line && !tok->encoding) {
        unsigned char *c;
        int length;
        for (c = (unsigned char *)line; *c; c += length)
            if (!(length = valid_utf8(c))) {
                badchar = *c;
                break;
            }
    }
    if (badchar) {
        /* Need to add 1 to the line number, since this line
           has not been counted, yet.  */
        PyErr_Format(PyExc_SyntaxError,
                     "Non-UTF-8 code starting with '\\x%.2x' "
                     "in file %U on line %i, "
                     "but no encoding declared; "
                     "see http://python.org/dev/peps/pep-0263/ for details",
                     badchar, tok->filename, tok->lineno + 1);
        return error_ret(tok);
    }
#endif
    return line;
}
644n/a
/* End-of-input test.  In codec mode this may have to read ahead one
   line (stored in tok->decoding_buffer for fp_readl to consume). */
static int
decoding_feof(struct tok_state *tok)
{
    if (tok->decoding_state != STATE_NORMAL) {
        return feof(tok->fp);
    } else {
        PyObject* buf = tok->decoding_buffer;
        if (buf == NULL) {
            buf = _PyObject_CallNoArg(tok->decoding_readline);
            if (buf == NULL) {
                error_ret(tok);
                return 1;
            } else {
                tok->decoding_buffer = buf;
            }
        }
        return PyObject_Length(buf) == 0;
    }
}
664n/a
665n/a/* Fetch a byte from TOK, using the string buffer. */
666n/a
/* Fetch the next byte from the in-memory string buffer. */
static int
buf_getc(struct tok_state *tok) {
    return Py_CHARMASK(*tok->str++);
}
671n/a
672n/a/* Unfetch a byte from TOK, using the string buffer. */
673n/a
/* Push back a byte by rewinding the string cursor; the buffer may be
   read-only, so the byte is verified rather than rewritten. */
static void
buf_ungetc(int c, struct tok_state *tok) {
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
}
679n/a
680n/a/* Set the readline function for TOK to ENC. For the string-based
681n/a tokenizer, this means to just record the encoding. */
682n/a
/* String-based tokenizer: installing an encoding just records it;
   decode_str performs the actual recoding afterwards. */
static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}
688n/a
689n/a/* Return a UTF-8 encoding Python string object from the
690n/a C byte string STR, which is encoded with ENC. */
691n/a
/* Decode C string STR (encoded as ENC) and re-encode it as a UTF-8
   bytes object.  Returns a new reference, or NULL with an error set. */
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
    PyObject *utf8;
    PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (buf == NULL)
        return NULL;
    utf8 = PyUnicode_AsUTF8String(buf);
    Py_DECREF(buf);
    return utf8;
}
702n/a
703n/a
704n/astatic char *
705n/atranslate_newlines(const char *s, int exec_input, struct tok_state *tok) {
706n/a int skip_next_lf = 0;
707n/a size_t needed_length = strlen(s) + 2, final_length;
708n/a char *buf, *current;
709n/a char c = '\0';
710n/a buf = PyMem_MALLOC(needed_length);
711n/a if (buf == NULL) {
712n/a tok->done = E_NOMEM;
713n/a return NULL;
714n/a }
715n/a for (current = buf; *s; s++, current++) {
716n/a c = *s;
717n/a if (skip_next_lf) {
718n/a skip_next_lf = 0;
719n/a if (c == '\n') {
720n/a c = *++s;
721n/a if (!c)
722n/a break;
723n/a }
724n/a }
725n/a if (c == '\r') {
726n/a skip_next_lf = 1;
727n/a c = '\n';
728n/a }
729n/a *current = c;
730n/a }
731n/a /* If this is exec input, add a newline to the end of the string if
732n/a there isn't one already. */
733n/a if (exec_input && c != '\n') {
734n/a *current = '\n';
735n/a current++;
736n/a }
737n/a *current = '\0';
738n/a final_length = current - buf + 1;
739n/a if (final_length < needed_length && final_length)
740n/a /* should never fail */
741n/a buf = PyMem_REALLOC(buf, final_length);
742n/a return buf;
743n/a}
744n/a
745n/a/* Decode a byte string STR for use as the buffer of TOK.
746n/a Look for encoding declarations inside STR, and record them
747n/a inside TOK. */
748n/a
/* Decode byte string INPUT for use as the tokenizer buffer: normalize
   newlines, strip a UTF-8 BOM, and honor a coding spec on either of
   the first two lines by recoding the whole string to UTF-8.  The
   returned pointer is either tok->input or memory owned by the utf8
   object stashed in tok->decoding_buffer. */
static const char *
decode_str(const char *input, int single, struct tok_state *tok)
{
    PyObject* utf8 = NULL;
    const char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->input = str = translate_newlines(input, single, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
    if (tok->enc != NULL) {
        /* BOM-declared encoding: recode up front. */
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
    /* Locate the ends of the first two lines. */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
            return error_ret(tok);
        if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return error_ret(tok);
        }
    }
    if (tok->enc != NULL) {
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}
804n/a
805n/a#endif /* PGEN */
806n/a
807n/a/* Set up tokenizer for string */
808n/a
/* Set up a tokenizer that reads from the C string STR, decoding it
   (BOM / coding spec) first.  Returns NULL on failure. */
struct tok_state *
PyTokenizer_FromString(const char *str, int exec_input)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    str = decode_str(str, exec_input, tok);
    if (str == NULL) {
        PyTokenizer_Free(tok);
        return NULL;
    }

    /* XXX: constify members. */
    tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
    return tok;
}
825n/a
/* Set up a tokenizer for a string already known to be UTF-8; only
   newline translation is performed, no coding-spec scan. */
struct tok_state *
PyTokenizer_FromUTF8(const char *str, int exec_input)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
#ifndef PGEN
    tok->input = str = translate_newlines(str, exec_input, tok);
#endif
    if (str == NULL) {
        PyTokenizer_Free(tok);
        return NULL;
    }
    tok->decoding_state = STATE_RAW;
    tok->read_coding_spec = 1;   /* skip coding-spec detection entirely */
    tok->enc = NULL;
    tok->str = str;
    tok->encoding = (char *)PyMem_MALLOC(6);  /* strlen("utf-8") + 1 */
    if (!tok->encoding) {
        PyTokenizer_Free(tok);
        return NULL;
    }
    strcpy(tok->encoding, "utf-8");

    /* XXX: constify members. */
    tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
    return tok;
}
854n/a
855n/a/* Set up tokenizer for file */
856n/a
/* Set up a tokenizer reading from FP.  ENC, if non-NULL, is a known
   encoding (e.g. from a previous compile); PS1/PS2 are the interactive
   prompts (NULL for non-interactive input). */
struct tok_state *
PyTokenizer_FromFile(FILE *fp, const char* enc,
                     const char *ps1, const char *ps2)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
        PyTokenizer_Free(tok);
        return NULL;
    }
    tok->cur = tok->inp = tok->buf;
    tok->end = tok->buf + BUFSIZ;
    tok->fp = fp;
    tok->prompt = ps1;
    tok->nextprompt = ps2;
    if (enc != NULL) {
        /* Must copy encoding declaration since it
         * gets copied into the parse tree. */
        tok->encoding = PyMem_MALLOC(strlen(enc)+1);
        if (!tok->encoding) {
            PyTokenizer_Free(tok);
            return NULL;
        }
        strcpy(tok->encoding, enc);
        tok->decoding_state = STATE_NORMAL;
    }
    return tok;
}
886n/a
887n/a
888n/a/* Free a tok_state structure */
889n/a
/* Release a tok_state and everything it owns (encoding string, line
   buffer for file input, decoded input copy, Python helper objects). */
void
PyTokenizer_Free(struct tok_state *tok)
{
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
#ifndef PGEN
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
    Py_XDECREF(tok->filename);
#endif
    /* tok->buf is owned here only for file input; for string input it
       aliases tok->input / decoding_buffer memory. */
    if (tok->fp != NULL && tok->buf != NULL)
        PyMem_FREE(tok->buf);
    if (tok->input)
        PyMem_FREE((char *)tok->input);
    PyMem_FREE(tok);
}
906n/a
907n/a/* Get next char, updating state; error code goes into tok->done */
908n/a
909n/astatic int
910n/atok_nextc(struct tok_state *tok)
911n/a{
912n/a for (;;) {
913n/a if (tok->cur != tok->inp) {
914n/a return Py_CHARMASK(*tok->cur++); /* Fast path */
915n/a }
916n/a if (tok->done != E_OK)
917n/a return EOF;
918n/a if (tok->fp == NULL) {
919n/a char *end = strchr(tok->inp, '\n');
920n/a if (end != NULL)
921n/a end++;
922n/a else {
923n/a end = strchr(tok->inp, '\0');
924n/a if (end == tok->inp) {
925n/a tok->done = E_EOF;
926n/a return EOF;
927n/a }
928n/a }
929n/a if (tok->start == NULL)
930n/a tok->buf = tok->cur;
931n/a tok->line_start = tok->cur;
932n/a tok->lineno++;
933n/a tok->inp = end;
934n/a return Py_CHARMASK(*tok->cur++);
935n/a }
936n/a if (tok->prompt != NULL) {
937n/a char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
938n/a#ifndef PGEN
939n/a if (newtok != NULL) {
940n/a char *translated = translate_newlines(newtok, 0, tok);
941n/a PyMem_FREE(newtok);
942n/a if (translated == NULL)
943n/a return EOF;
944n/a newtok = translated;
945n/a }
946n/a if (tok->encoding && newtok && *newtok) {
947n/a /* Recode to UTF-8 */
948n/a Py_ssize_t buflen;
949n/a const char* buf;
950n/a PyObject *u = translate_into_utf8(newtok, tok->encoding);
951n/a PyMem_FREE(newtok);
952n/a if (!u) {
953n/a tok->done = E_DECODE;
954n/a return EOF;
955n/a }
956n/a buflen = PyBytes_GET_SIZE(u);
957n/a buf = PyBytes_AS_STRING(u);
958n/a newtok = PyMem_MALLOC(buflen+1);
959n/a strcpy(newtok, buf);
960n/a Py_DECREF(u);
961n/a }
962n/a#endif
963n/a if (tok->nextprompt != NULL)
964n/a tok->prompt = tok->nextprompt;
965n/a if (newtok == NULL)
966n/a tok->done = E_INTR;
967n/a else if (*newtok == '\0') {
968n/a PyMem_FREE(newtok);
969n/a tok->done = E_EOF;
970n/a }
971n/a else if (tok->start != NULL) {
972n/a size_t start = tok->start - tok->buf;
973n/a size_t oldlen = tok->cur - tok->buf;
974n/a size_t newlen = oldlen + strlen(newtok);
975n/a char *buf = tok->buf;
976n/a buf = (char *)PyMem_REALLOC(buf, newlen+1);
977n/a tok->lineno++;
978n/a if (buf == NULL) {
979n/a PyMem_FREE(tok->buf);
980n/a tok->buf = NULL;
981n/a PyMem_FREE(newtok);
982n/a tok->done = E_NOMEM;
983n/a return EOF;
984n/a }
985n/a tok->buf = buf;
986n/a tok->cur = tok->buf + oldlen;
987n/a tok->line_start = tok->cur;
988n/a strcpy(tok->buf + oldlen, newtok);
989n/a PyMem_FREE(newtok);
990n/a tok->inp = tok->buf + newlen;
991n/a tok->end = tok->inp + 1;
992n/a tok->start = tok->buf + start;
993n/a }
994n/a else {
995n/a tok->lineno++;
996n/a if (tok->buf != NULL)
997n/a PyMem_FREE(tok->buf);
998n/a tok->buf = newtok;
999n/a tok->cur = tok->buf;
1000n/a tok->line_start = tok->buf;
1001n/a tok->inp = strchr(tok->buf, '\0');
1002n/a tok->end = tok->inp + 1;
1003n/a }
1004n/a }
1005n/a else {
1006n/a int done = 0;
1007n/a Py_ssize_t cur = 0;
1008n/a char *pt;
1009n/a if (tok->start == NULL) {
1010n/a if (tok->buf == NULL) {
1011n/a tok->buf = (char *)
1012n/a PyMem_MALLOC(BUFSIZ);
1013n/a if (tok->buf == NULL) {
1014n/a tok->done = E_NOMEM;
1015n/a return EOF;
1016n/a }
1017n/a tok->end = tok->buf + BUFSIZ;
1018n/a }
1019n/a if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
1020n/a tok) == NULL) {
1021n/a if (!tok->decoding_erred)
1022n/a tok->done = E_EOF;
1023n/a done = 1;
1024n/a }
1025n/a else {
1026n/a tok->done = E_OK;
1027n/a tok->inp = strchr(tok->buf, '\0');
1028n/a done = tok->inp == tok->buf || tok->inp[-1] == '\n';
1029n/a }
1030n/a }
1031n/a else {
1032n/a cur = tok->cur - tok->buf;
1033n/a if (decoding_feof(tok)) {
1034n/a tok->done = E_EOF;
1035n/a done = 1;
1036n/a }
1037n/a else
1038n/a tok->done = E_OK;
1039n/a }
1040n/a tok->lineno++;
1041n/a /* Read until '\n' or EOF */
1042n/a while (!done) {
1043n/a Py_ssize_t curstart = tok->start == NULL ? -1 :
1044n/a tok->start - tok->buf;
1045n/a Py_ssize_t curvalid = tok->inp - tok->buf;
1046n/a Py_ssize_t newsize = curvalid + BUFSIZ;
1047n/a char *newbuf = tok->buf;
1048n/a newbuf = (char *)PyMem_REALLOC(newbuf,
1049n/a newsize);
1050n/a if (newbuf == NULL) {
1051n/a tok->done = E_NOMEM;
1052n/a tok->cur = tok->inp;
1053n/a return EOF;
1054n/a }
1055n/a tok->buf = newbuf;
1056n/a tok->cur = tok->buf + cur;
1057n/a tok->line_start = tok->cur;
1058n/a tok->inp = tok->buf + curvalid;
1059n/a tok->end = tok->buf + newsize;
1060n/a tok->start = curstart < 0 ? NULL :
1061n/a tok->buf + curstart;
1062n/a if (decoding_fgets(tok->inp,
1063n/a (int)(tok->end - tok->inp),
1064n/a tok) == NULL) {
1065n/a /* Break out early on decoding
1066n/a errors, as tok->buf will be NULL
1067n/a */
1068n/a if (tok->decoding_erred)
1069n/a return EOF;
1070n/a /* Last line does not end in \n,
1071n/a fake one */
1072n/a strcpy(tok->inp, "\n");
1073n/a }
1074n/a tok->inp = strchr(tok->inp, '\0');
1075n/a done = tok->inp[-1] == '\n';
1076n/a }
1077n/a if (tok->buf != NULL) {
1078n/a tok->cur = tok->buf + cur;
1079n/a tok->line_start = tok->cur;
1080n/a /* replace "\r\n" with "\n" */
1081n/a /* For Mac leave the \r, giving a syntax error */
1082n/a pt = tok->inp - 2;
1083n/a if (pt >= tok->buf && *pt == '\r') {
1084n/a *pt++ = '\n';
1085n/a *pt = '\0';
1086n/a tok->inp = pt;
1087n/a }
1088n/a }
1089n/a }
1090n/a if (tok->done != E_OK) {
1091n/a if (tok->prompt != NULL)
1092n/a PySys_WriteStderr("\n");
1093n/a tok->cur = tok->inp;
1094n/a return EOF;
1095n/a }
1096n/a }
1097n/a /*NOTREACHED*/
1098n/a}
1099n/a
1100n/a
1101n/a/* Back-up one character */
1102n/a
/* Back-up one character (undo the last tok_nextc).  EOF is never
   stored.  The write is conditional because the buffer may live in a
   read-only segment for string input; see buf_ungetc. */
static void
tok_backup(struct tok_state *tok, int c)
{
    if (c != EOF) {
        if (--tok->cur < tok->buf)
            Py_FatalError("tok_backup: beginning of buffer");
        if (*tok->cur != c)
            *tok->cur = c;
    }
}
1113n/a
1114n/a
1115n/a/* Return the token corresponding to a single character */
1116n/a
1117n/aint
1118n/aPyToken_OneChar(int c)
1119n/a{
1120n/a switch (c) {
1121n/a case '(': return LPAR;
1122n/a case ')': return RPAR;
1123n/a case '[': return LSQB;
1124n/a case ']': return RSQB;
1125n/a case ':': return COLON;
1126n/a case ',': return COMMA;
1127n/a case ';': return SEMI;
1128n/a case '+': return PLUS;
1129n/a case '-': return MINUS;
1130n/a case '*': return STAR;
1131n/a case '/': return SLASH;
1132n/a case '|': return VBAR;
1133n/a case '&': return AMPER;
1134n/a case '<': return LESS;
1135n/a case '>': return GREATER;
1136n/a case '=': return EQUAL;
1137n/a case '.': return DOT;
1138n/a case '%': return PERCENT;
1139n/a case '{': return LBRACE;
1140n/a case '}': return RBRACE;
1141n/a case '^': return CIRCUMFLEX;
1142n/a case '~': return TILDE;
1143n/a case '@': return AT;
1144n/a default: return OP;
1145n/a }
1146n/a}
1147n/a
1148n/a
1149n/aint
1150n/aPyToken_TwoChars(int c1, int c2)
1151n/a{
1152n/a switch (c1) {
1153n/a case '=':
1154n/a switch (c2) {
1155n/a case '=': return EQEQUAL;
1156n/a }
1157n/a break;
1158n/a case '!':
1159n/a switch (c2) {
1160n/a case '=': return NOTEQUAL;
1161n/a }
1162n/a break;
1163n/a case '<':
1164n/a switch (c2) {
1165n/a case '>': return NOTEQUAL;
1166n/a case '=': return LESSEQUAL;
1167n/a case '<': return LEFTSHIFT;
1168n/a }
1169n/a break;
1170n/a case '>':
1171n/a switch (c2) {
1172n/a case '=': return GREATEREQUAL;
1173n/a case '>': return RIGHTSHIFT;
1174n/a }
1175n/a break;
1176n/a case '+':
1177n/a switch (c2) {
1178n/a case '=': return PLUSEQUAL;
1179n/a }
1180n/a break;
1181n/a case '-':
1182n/a switch (c2) {
1183n/a case '=': return MINEQUAL;
1184n/a case '>': return RARROW;
1185n/a }
1186n/a break;
1187n/a case '*':
1188n/a switch (c2) {
1189n/a case '*': return DOUBLESTAR;
1190n/a case '=': return STAREQUAL;
1191n/a }
1192n/a break;
1193n/a case '/':
1194n/a switch (c2) {
1195n/a case '/': return DOUBLESLASH;
1196n/a case '=': return SLASHEQUAL;
1197n/a }
1198n/a break;
1199n/a case '|':
1200n/a switch (c2) {
1201n/a case '=': return VBAREQUAL;
1202n/a }
1203n/a break;
1204n/a case '%':
1205n/a switch (c2) {
1206n/a case '=': return PERCENTEQUAL;
1207n/a }
1208n/a break;
1209n/a case '&':
1210n/a switch (c2) {
1211n/a case '=': return AMPEREQUAL;
1212n/a }
1213n/a break;
1214n/a case '^':
1215n/a switch (c2) {
1216n/a case '=': return CIRCUMFLEXEQUAL;
1217n/a }
1218n/a break;
1219n/a case '@':
1220n/a switch (c2) {
1221n/a case '=': return ATEQUAL;
1222n/a }
1223n/a break;
1224n/a }
1225n/a return OP;
1226n/a}
1227n/a
1228n/aint
1229n/aPyToken_ThreeChars(int c1, int c2, int c3)
1230n/a{
1231n/a switch (c1) {
1232n/a case '<':
1233n/a switch (c2) {
1234n/a case '<':
1235n/a switch (c3) {
1236n/a case '=':
1237n/a return LEFTSHIFTEQUAL;
1238n/a }
1239n/a break;
1240n/a }
1241n/a break;
1242n/a case '>':
1243n/a switch (c2) {
1244n/a case '>':
1245n/a switch (c3) {
1246n/a case '=':
1247n/a return RIGHTSHIFTEQUAL;
1248n/a }
1249n/a break;
1250n/a }
1251n/a break;
1252n/a case '*':
1253n/a switch (c2) {
1254n/a case '*':
1255n/a switch (c3) {
1256n/a case '=':
1257n/a return DOUBLESTAREQUAL;
1258n/a }
1259n/a break;
1260n/a }
1261n/a break;
1262n/a case '/':
1263n/a switch (c2) {
1264n/a case '/':
1265n/a switch (c3) {
1266n/a case '=':
1267n/a return DOUBLESLASHEQUAL;
1268n/a }
1269n/a break;
1270n/a }
1271n/a break;
1272n/a case '.':
1273n/a switch (c2) {
1274n/a case '.':
1275n/a switch (c3) {
1276n/a case '.':
1277n/a return ELLIPSIS;
1278n/a }
1279n/a break;
1280n/a }
1281n/a break;
1282n/a }
1283n/a return OP;
1284n/a}
1285n/a
1286n/astatic int
1287n/aindenterror(struct tok_state *tok)
1288n/a{
1289n/a if (tok->alterror) {
1290n/a tok->done = E_TABSPACE;
1291n/a tok->cur = tok->inp;
1292n/a return 1;
1293n/a }
1294n/a if (tok->altwarning) {
1295n/a#ifdef PGEN
1296n/a PySys_WriteStderr("inconsistent use of tabs and spaces "
1297n/a "in indentation\n");
1298n/a#else
1299n/a PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
1300n/a "in indentation\n", tok->filename);
1301n/a#endif
1302n/a tok->altwarning = 0;
1303n/a }
1304n/a return 0;
1305n/a}
1306n/a
#ifdef PGEN
/* pgen has no unicode machinery available; accept every identifier. */
#define verify_identifier(tok) 1
#else
/* Verify that the identifier follows PEP 3131.
   All identifier strings are guaranteed to be "ready" unicode objects.

   The candidate identifier is the byte range tok->start..tok->cur.
   Returns 1 if it decodes as UTF-8 and is a valid Python identifier,
   0 otherwise.  On failure, tok->done is set to E_IDENTIFIER (bad
   UTF-8 or invalid identifier characters) or E_ERROR (any other
   Python-level error raised while decoding). */
static int
verify_identifier(struct tok_state *tok)
{
    PyObject *s;
    int result;
    /* A prior decoding error already poisons this stream; don't pile
       another error on top of it. */
    if (tok->decoding_erred)
        return 0;
    s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
    if (s == NULL || PyUnicode_READY(s) == -1) {
        if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
            /* Malformed UTF-8 in the source text is reported as a
               tokenizer-level identifier error, not as an exception. */
            PyErr_Clear();
            tok->done = E_IDENTIFIER;
        } else {
            tok->done = E_ERROR;
        }
        return 0;
    }
    result = PyUnicode_IsIdentifier(s);
    Py_DECREF(s);
    if (result == 0)
        tok->done = E_IDENTIFIER;
    return result;
}
#endif
1337n/a
1338n/astatic int
1339n/atok_decimal_tail(struct tok_state *tok)
1340n/a{
1341n/a int c;
1342n/a
1343n/a while (1) {
1344n/a do {
1345n/a c = tok_nextc(tok);
1346n/a } while (isdigit(c));
1347n/a if (c != '_') {
1348n/a break;
1349n/a }
1350n/a c = tok_nextc(tok);
1351n/a if (!isdigit(c)) {
1352n/a tok->done = E_TOKEN;
1353n/a tok_backup(tok, c);
1354n/a return 0;
1355n/a }
1356n/a }
1357n/a return c;
1358n/a}
1359n/a
1360n/a/* Get next token, after space stripping etc. */
1361n/a
/* Core tokenizer state machine: produce the next token from 'tok'.

   On return, *p_start/*p_end delimit the token's text inside the
   tokenizer's buffer (both NULL for tokens that carry no text, such as
   INDENT/DEDENT).  Returns the token type, or ERRORTOKEN with
   tok->done set to the specific error code.

   Responsibilities, in order: compute the indentation of a fresh line
   and emit pending INDENT/DEDENT tokens; skip whitespace and comments;
   then classify identifiers (including string prefixes like rb"" and
   the async/await lookahead), newlines, numbers, strings, line
   continuations, and finally multi- and single-character operators. */
static int
tok_get(struct tok_state *tok, char **p_start, char **p_end)
{
    int c;
    int blankline, nonascii;

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        int col = 0;        /* column with tabs expanded to tabsize */
        int altcol = 0;     /* column with the alternate tab size, used
                               to detect ambiguous tab/space mixes */
        tok->atbol = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ') {
                col++, altcol++;
            }
            else if (c == '\t') {
                col = (col/tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol/tok->alttabsize + 1)
                    * tok->alttabsize;
            }
            else if (c == '\014') {/* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            }
            else {
                break;
            }
        }
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL) {
                blankline = 0; /* Let it through */
            }
            else {
                blankline = 1; /* Ignore completely */
            }
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        /* Indentation is only significant outside brackets (level 0). */
        if (!blankline && tok->level == 0) {
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok)) {
                        return ERRORTOKEN;
                    }
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    if (indenterror(tok)) {
                        return ERRORTOKEN;
                    }
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                    col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok)) {
                        return ERRORTOKEN;
                    }
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents: tok->pendin counts how many
       INDENT (>0) or DEDENT (<0) tokens are still owed to the parser;
       emit them one per call. */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

    if (tok->async_def
        && !blankline
        && tok->level == 0
        /* There was a NEWLINE after ASYNC DEF,
           so we're past the signature. */
        && tok->async_def_nl
        /* Current indentation level is less than where
           the async function was defined */
        && tok->async_def_indent >= tok->indent)
    {
        /* We've left the body of the 'async def'; stop treating
           async/await as keywords. */
        tok->async_def = 0;
        tok->async_def_indent = 0;
        tok->async_def_nl = 0;
    }

 again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur - 1;

    /* Skip comment */
    if (c == '#') {
        while (c != EOF && c != '\n') {
            c = tok_nextc(tok);
        }
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    nonascii = 0;
    if (is_potential_identifier_start(c)) {
        /* Process the various legal combinations of b"", r"", u"", and f"". */
        int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
        while (1) {
            if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
                saw_b = 1;
            /* Since this is a backwards compatibility support literal we don't
               want to support it in arbitrary order like byte literals. */
            else if (!(saw_b || saw_u || saw_r || saw_f)
                     && (c == 'u'|| c == 'U')) {
                saw_u = 1;
            }
            /* ur"" and ru"" are not supported */
            else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
                saw_r = 1;
            }
            else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
                saw_f = 1;
            }
            else {
                break;
            }
            c = tok_nextc(tok);
            if (c == '"' || c == '\'') {
                /* Prefix characters followed by a quote: tokenize as a
                   prefixed string literal instead of an identifier. */
                goto letter_quote;
            }
        }
        while (is_potential_identifier_char(c)) {
            if (c >= 128) {
                nonascii = 1;
            }
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        /* Non-ASCII identifiers must pass the PEP 3131 check. */
        if (nonascii && !verify_identifier(tok)) {
            return ERRORTOKEN;
        }
        *p_start = tok->start;
        *p_end = tok->cur;

        /* async/await parsing block. */
        if (tok->cur - tok->start == 5) {
            /* Current token length is 5. */
            if (tok->async_def) {
                /* We're inside an 'async def' function. */
                if (memcmp(tok->start, "async", 5) == 0) {
                    return ASYNC;
                }
                if (memcmp(tok->start, "await", 5) == 0) {
                    return AWAIT;
                }
            }
            else if (memcmp(tok->start, "async", 5) == 0) {
                /* The current token is 'async'.
                   Look ahead one token.*/

                struct tok_state ahead_tok;
                char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
                int ahead_tok_kind;

                /* Lookahead works on a shallow copy of the tokenizer
                   state, so the real position is not disturbed. */
                memcpy(&ahead_tok, tok, sizeof(ahead_tok));
                ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
                                         &ahead_tok_end);

                if (ahead_tok_kind == NAME
                    && ahead_tok.cur - ahead_tok.start == 3
                    && memcmp(ahead_tok.start, "def", 3) == 0)
                {
                    /* The next token is going to be 'def', so instead of
                       returning 'async' NAME token, we return ASYNC. */
                    tok->async_def_indent = tok->indent;
                    tok->async_def = 1;
                    return ASYNC;
                }
            }
        }

        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        if (blankline || tok->level > 0) {
            /* Blank lines and newlines inside brackets are invisible
               to the parser. */
            goto nextline;
        }
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        if (tok->async_def) {
            /* We're somewhere inside an 'async def' function, and
               we've encountered a NEWLINE after its signature. */
            tok->async_def_nl = 1;
        }
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        } else if (c == '.') {
            c = tok_nextc(tok);
            if (c == '.') {
                *p_start = tok->start;
                *p_end = tok->cur;
                return ELLIPSIS;
            }
            else {
                tok_backup(tok, c);
            }
            /* Two dots but not three: push the second '.' back and
               fall through to returning a single DOT. */
            tok_backup(tok, '.');
        }
        else {
            tok_backup(tok, c);
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return DOT;
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == 'x' || c == 'X') {
                /* Hex */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (!isxdigit(c)) {
                        tok->done = E_TOKEN;
                        tok_backup(tok, c);
                        return ERRORTOKEN;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isxdigit(c));
                } while (c == '_');
            }
            else if (c == 'o' || c == 'O') {
                /* Octal */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (c < '0' || c >= '8') {
                        tok->done = E_TOKEN;
                        tok_backup(tok, c);
                        return ERRORTOKEN;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while ('0' <= c && c < '8');
                } while (c == '_');
            }
            else if (c == 'b' || c == 'B') {
                /* Binary */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (c != '0' && c != '1') {
                        tok->done = E_TOKEN;
                        tok_backup(tok, c);
                        return ERRORTOKEN;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (c == '0' || c == '1');
                } while (c == '_');
            }
            else {
                int nonzero = 0;
                /* maybe old-style octal; c is first char of it */
                /* in any case, allow '0' as a literal */
                while (1) {
                    if (c == '_') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok->done = E_TOKEN;
                            tok_backup(tok, c);
                            return ERRORTOKEN;
                        }
                    }
                    if (c != '0') {
                        break;
                    }
                    c = tok_nextc(tok);
                }
                if (isdigit(c)) {
                    nonzero = 1;
                    c = tok_decimal_tail(tok);
                    if (c == 0) {
                        return ERRORTOKEN;
                    }
                }
                if (c == '.') {
                    c = tok_nextc(tok);
                    goto fraction;
                }
                else if (c == 'e' || c == 'E') {
                    goto exponent;
                }
                else if (c == 'j' || c == 'J') {
                    goto imaginary;
                }
                else if (nonzero) {
                    /* Old-style octal: now disallowed. */
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
            }
        }
        else {
            /* Decimal */
            c = tok_decimal_tail(tok);
            if (c == 0) {
                return ERRORTOKEN;
            }
            {
                /* Accept floating point numbers. */
                if (c == '.') {
                    c = tok_nextc(tok);
        fraction:
                    /* Fraction */
                    if (isdigit(c)) {
                        c = tok_decimal_tail(tok);
                        if (c == 0) {
                            return ERRORTOKEN;
                        }
                    }
                }
                if (c == 'e' || c == 'E') {
                    int e;
                  exponent:
                    e = c;
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok->done = E_TOKEN;
                            tok_backup(tok, c);
                            return ERRORTOKEN;
                        }
                    } else if (!isdigit(c)) {
                        /* Not an exponent after all (e.g. "1.e"):
                           push back both characters and end the
                           NUMBER before the 'e'. */
                        tok_backup(tok, c);
                        tok_backup(tok, e);
                        *p_start = tok->start;
                        *p_end = tok->cur;
                        return NUMBER;
                    }
                    c = tok_decimal_tail(tok);
                    if (c == 0) {
                        return ERRORTOKEN;
                    }
                }
                if (c == 'j' || c == 'J') {
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
                }
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        int quote = c;
        int quote_size = 1;             /* 1 or 3 */
        int end_quote_size = 0;

        /* Find the quote size and start of string */
        c = tok_nextc(tok);
        if (c == quote) {
            c = tok_nextc(tok);
            if (c == quote) {
                quote_size = 3;
            }
            else {
                end_quote_size = 1;     /* empty string found */
            }
        }
        if (c != quote) {
            tok_backup(tok, c);
        }

        /* Get rest of string */
        while (end_quote_size != quote_size) {
            c = tok_nextc(tok);
            if (c == EOF) {
                if (quote_size == 3) {
                    tok->done = E_EOFS;
                }
                else {
                    tok->done = E_EOLS;
                }
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            if (quote_size == 1 && c == '\n') {
                /* Single-quoted strings may not span lines. */
                tok->done = E_EOLS;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            if (c == quote) {
                end_quote_size += 1;
            }
            else {
                end_quote_size = 0;
                if (c == '\\') {
                    tok_nextc(tok);  /* skip escaped char */
                }
            }
        }

        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        c = tok_nextc(tok);
        if (c != '\n') {
            tok->done = E_LINECONT;
            tok->cur = tok->inp;
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
        if (token != OP) {
            /* A two-character token might extend to three. */
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            }
            else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        tok->level--;
        break;
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return PyToken_OneChar(c);
}
1895n/a
1896n/aint
1897n/aPyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1898n/a{
1899n/a int result = tok_get(tok, p_start, p_end);
1900n/a if (tok->decoding_erred) {
1901n/a result = ERRORTOKEN;
1902n/a tok->done = E_DECODE;
1903n/a }
1904n/a return result;
1905n/a}
1906n/a
1907n/a/* Get the encoding of a Python file. Check for the coding cookie and check if
1908n/a the file starts with a BOM.
1909n/a
1910n/a PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1911n/a encoding in the first or second line of the file (in which case the encoding
1912n/a should be assumed to be UTF-8).
1913n/a
1914n/a The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1915n/a by the caller. */
1916n/a
1917n/achar *
1918n/aPyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
1919n/a{
1920n/a struct tok_state *tok;
1921n/a FILE *fp;
1922n/a char *p_start =NULL , *p_end =NULL , *encoding = NULL;
1923n/a
1924n/a#ifndef PGEN
1925n/a fd = _Py_dup(fd);
1926n/a#else
1927n/a fd = dup(fd);
1928n/a#endif
1929n/a if (fd < 0) {
1930n/a return NULL;
1931n/a }
1932n/a
1933n/a fp = fdopen(fd, "r");
1934n/a if (fp == NULL) {
1935n/a return NULL;
1936n/a }
1937n/a tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1938n/a if (tok == NULL) {
1939n/a fclose(fp);
1940n/a return NULL;
1941n/a }
1942n/a#ifndef PGEN
1943n/a if (filename != NULL) {
1944n/a Py_INCREF(filename);
1945n/a tok->filename = filename;
1946n/a }
1947n/a else {
1948n/a tok->filename = PyUnicode_FromString("<string>");
1949n/a if (tok->filename == NULL) {
1950n/a fclose(fp);
1951n/a PyTokenizer_Free(tok);
1952n/a return encoding;
1953n/a }
1954n/a }
1955n/a#endif
1956n/a while (tok->lineno < 2 && tok->done == E_OK) {
1957n/a PyTokenizer_Get(tok, &p_start, &p_end);
1958n/a }
1959n/a fclose(fp);
1960n/a if (tok->encoding) {
1961n/a encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1962n/a if (encoding)
1963n/a strcpy(encoding, tok->encoding);
1964n/a }
1965n/a PyTokenizer_Free(tok);
1966n/a return encoding;
1967n/a}
1968n/a
/* Backwards-compatible variant of PyTokenizer_FindEncodingFilename()
   with no filename for error reporting.  Same contract: the returned
   string is PyMem_MALLOC'ed and must be freed by the caller; NULL
   means "no explicit encoding found" (assume UTF-8) or error. */
char *
PyTokenizer_FindEncoding(int fd)
{
    return PyTokenizer_FindEncodingFilename(fd, NULL);
}
1974n/a
1975n/a#ifdef Py_DEBUG
1976n/a
1977n/avoid
1978n/atok_dump(int type, char *start, char *end)
1979n/a{
1980n/a printf("%s", _PyParser_TokenNames[type]);
1981n/a if (type == NAME || type == NUMBER || type == STRING || type == OP)
1982n/a printf("(%.*s)", (int)(end - start), start);
1983n/a}
1984n/a
1985n/a#endif