Python code coverage for Parser/tokenizer.c

#	count	content
1	n/a
2	n/a	/* Tokenizer implementation */
3	n/a
4	n/a	#include "Python.h"
5	n/a	#include "pgenheaders.h"
6	n/a
7	n/a	#include <ctype.h>
8	n/a	#include <assert.h>
9	n/a
10	n/a	#include "tokenizer.h"
11	n/a	#include "errcode.h"
12	n/a
13	n/a	#ifndef PGEN
14	n/a	#include "unicodeobject.h"
15	n/a	#include "bytesobject.h"
16	n/a	#include "fileobject.h"
17	n/a	#include "codecs.h"
18	n/a	#include "abstract.h"
19	n/a	#endif /* PGEN */
20	n/a
/* Non-zero if C may start an identifier: an ASCII letter, '_', or any
   value >= 128 (non-ASCII).  C is expanded several times, so pass an
   expression without side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
26	n/a
/* Non-zero if C may continue an identifier: an ASCII letter or digit, '_',
   or any value >= 128 (non-ASCII).  C is expanded several times, so pass
   an expression without side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
33	n/a
extern char *PyOS_Readline(FILE *, FILE *, const char *);
/* Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */
38	n/a
/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward declarations of file-local helpers defined later in this file */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
46	n/a
47	n/a
/* Token names, indexed by token number. */

const char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    "AT",
    "ATEQUAL",
    "RARROW",
    "ELLIPSIS",
    /* This table must match the #defines in token.h! */
    "OP",
    "AWAIT",
    "ASYNC",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
111	n/a
112	n/a
113	n/a	/* Create and initialize a new tok_state structure */
114	n/a
115	n/a	static struct tok_state *
116	n/a	tok_new(void)
117	n/a	{
118	n/a	struct tok_state tok = (struct tok_state )PyMem_MALLOC(
119	n/a	sizeof(struct tok_state));
120	n/a	if (tok == NULL)
121	n/a	return NULL;
122	n/a	tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
123	n/a	tok->done = E_OK;
124	n/a	tok->fp = NULL;
125	n/a	tok->input = NULL;
126	n/a	tok->tabsize = TABSIZE;
127	n/a	tok->indent = 0;
128	n/a	tok->indstack[0] = 0;
129	n/a
130	n/a	tok->atbol = 1;
131	n/a	tok->pendin = 0;
132	n/a	tok->prompt = tok->nextprompt = NULL;
133	n/a	tok->lineno = 0;
134	n/a	tok->level = 0;
135	n/a	tok->altwarning = 1;
136	n/a	tok->alterror = 1;
137	n/a	tok->alttabsize = 1;
138	n/a	tok->altindstack[0] = 0;
139	n/a	tok->decoding_state = STATE_INIT;
140	n/a	tok->decoding_erred = 0;
141	n/a	tok->read_coding_spec = 0;
142	n/a	tok->enc = NULL;
143	n/a	tok->encoding = NULL;
144	n/a	tok->cont_line = 0;
145	n/a	#ifndef PGEN
146	n/a	tok->filename = NULL;
147	n/a	tok->decoding_readline = NULL;
148	n/a	tok->decoding_buffer = NULL;
149	n/a	#endif
150	n/a
151	n/a	tok->async_def = 0;
152	n/a	tok->async_def_indent = 0;
153	n/a	tok->async_def_nl = 0;
154	n/a
155	n/a	return tok;
156	n/a	}
157	n/a
158	n/a	static char *
159	n/a	new_string(const char s, Py_ssize_t len, struct tok_state tok)
160	n/a	{
161	n/a	char* result = (char *)PyMem_MALLOC(len + 1);
162	n/a	if (!result) {
163	n/a	tok->done = E_NOMEM;
164	n/a	return NULL;
165	n/a	}
166	n/a	memcpy(result, s, len);
167	n/a	result[len] = '\0';
168	n/a	return result;
169	n/a	}
170	n/a
171	n/a	#ifdef PGEN
172	n/a
173	n/a	static char *
174	n/a	decoding_fgets(char s, int size, struct tok_state tok)
175	n/a	{
176	n/a	return fgets(s, size, tok->fp);
177	n/a	}
178	n/a
179	n/a	static int
180	n/a	decoding_feof(struct tok_state *tok)
181	n/a	{
182	n/a	return feof(tok->fp);
183	n/a	}
184	n/a
/* PGEN build: no decoding is performed; return a copy of STR made with
   new_string() (NULL on allocation failure).  EXEC_INPUT is unused. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    return new_string(str, strlen(str), tok);
}
190	n/a
191	n/a	#else /* PGEN */
192	n/a
193	n/a	static char *
194	n/a	error_ret(struct tok_state tok) / XXX */
195	n/a	{
196	n/a	tok->decoding_erred = 1;
197	n/a	if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
198	n/a	PyMem_FREE(tok->buf);
199	n/a	tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
200	n/a	tok->done = E_DECODE;
201	n/a	return NULL; /* as if it were EOF */
202	n/a	}
203	n/a
204	n/a
/* Map common spellings of the utf-8 and latin-1 encoding names to a
   canonical form.  Only the first 12 characters of S are examined; they are
   lower-cased and '_' is treated as '-'.  Returns the static string "utf-8"
   or "iso-8859-1" for a recognized name, otherwise S itself. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            /* tolower() requires a value representable as unsigned char;
               plain char may be negative. */
            buf[i] = (char)tolower((unsigned char)c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
233	n/a
234	n/a	/* Return the coding spec in S, or NULL if none is found. */
235	n/a
236	n/a	static int
237	n/a	get_coding_spec(const char s, char spec, Py_ssize_t size, struct tok_state tok)
238	n/a	{
239	n/a	Py_ssize_t i;
240	n/a	*spec = NULL;
241	n/a	/* Coding spec must be in a comment, and that comment must be
242	n/a	* the only statement on the source code line. */
243	n/a	for (i = 0; i < size - 6; i++) {
244	n/a	if (s[i] == '#')
245	n/a	break;
246	n/a	if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
247	n/a	return 1;
248	n/a	}
249	n/a	for (; i < size - 6; i++) { /* XXX inefficient search */
250	n/a	const char* t = s + i;
251	n/a	if (strncmp(t, "coding", 6) == 0) {
252	n/a	const char* begin = NULL;
253	n/a	t += 6;
254	n/a	if (t[0] != ':' && t[0] != '=')
255	n/a	continue;
256	n/a	do {
257	n/a	t++;
258	n/a	} while (t[0] == '\x20' \|\| t[0] == '\t');
259	n/a
260	n/a	begin = t;
261	n/a	while (Py_ISALNUM(t[0]) \|\|
262	n/a	t[0] == '-' \|\| t[0] == '_' \|\| t[0] == '.')
263	n/a	t++;
264	n/a
265	n/a	if (begin < t) {
266	n/a	char* r = new_string(begin, t - begin, tok);
267	n/a	const char* q;
268	n/a	if (!r)
269	n/a	return 0;
270	n/a	q = get_normal_name(r);
271	n/a	if (r != q) {
272	n/a	PyMem_FREE(r);
273	n/a	r = new_string(q, strlen(q), tok);
274	n/a	if (!r)
275	n/a	return 0;
276	n/a	}
277	n/a	*spec = r;
278	n/a	break;
279	n/a	}
280	n/a	}
281	n/a	}
282	n/a	return 1;
283	n/a	}
284	n/a
285	n/a	/* Check whether the line contains a coding spec. If it does,
286	n/a	invoke the set_readline function for the new encoding.
287	n/a	This function receives the tok_state and the new encoding.
288	n/a	Return 1 on success, 0 on failure. */
289	n/a
290	n/a	static int
291	n/a	check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
292	n/a	int set_readline(struct tok_state , const char ))
293	n/a	{
294	n/a	char *cs;
295	n/a	int r = 1;
296	n/a
297	n/a	if (tok->cont_line) {
298	n/a	/* It's a continuation line, so it can't be a coding spec. */
299	n/a	tok->read_coding_spec = 1;
300	n/a	return 1;
301	n/a	}
302	n/a	if (!get_coding_spec(line, &cs, size, tok))
303	n/a	return 0;
304	n/a	if (!cs) {
305	n/a	Py_ssize_t i;
306	n/a	for (i = 0; i < size; i++) {
307	n/a	if (line[i] == '#' \|\| line[i] == '\n' \|\| line[i] == '\r')
308	n/a	break;
309	n/a	if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
310	n/a	/* Stop checking coding spec after a line containing
311	n/a	* anything except a comment. */
312	n/a	tok->read_coding_spec = 1;
313	n/a	break;
314	n/a	}
315	n/a	}
316	n/a	return 1;
317	n/a	}
318	n/a	tok->read_coding_spec = 1;
319	n/a	if (tok->encoding == NULL) {
320	n/a	assert(tok->decoding_state == STATE_RAW);
321	n/a	if (strcmp(cs, "utf-8") == 0) {
322	n/a	tok->encoding = cs;
323	n/a	} else {
324	n/a	r = set_readline(tok, cs);
325	n/a	if (r) {
326	n/a	tok->encoding = cs;
327	n/a	tok->decoding_state = STATE_NORMAL;
328	n/a	}
329	n/a	else {
330	n/a	PyErr_Format(PyExc_SyntaxError,
331	n/a	"encoding problem: %s", cs);
332	n/a	PyMem_FREE(cs);
333	n/a	}
334	n/a	}
335	n/a	} else { /* then, compare cs with BOM */
336	n/a	r = (strcmp(tok->encoding, cs) == 0);
337	n/a	if (!r)
338	n/a	PyErr_Format(PyExc_SyntaxError,
339	n/a	"encoding problem: %s with BOM", cs);
340	n/a	PyMem_FREE(cs);
341	n/a	}
342	n/a	return r;
343	n/a	}
344	n/a
345	n/a	/* See whether the file starts with a BOM. If it does,
346	n/a	invoke the set_readline function with the new encoding.
347	n/a	Return 1 on success, 0 on failure. */
348	n/a
349	n/a	static int
350	n/a	check_bom(int get_char(struct tok_state *),
351	n/a	void unget_char(int, struct tok_state *),
352	n/a	int set_readline(struct tok_state , const char ),
353	n/a	struct tok_state *tok)
354	n/a	{
355	n/a	int ch1, ch2, ch3;
356	n/a	ch1 = get_char(tok);
357	n/a	tok->decoding_state = STATE_RAW;
358	n/a	if (ch1 == EOF) {
359	n/a	return 1;
360	n/a	} else if (ch1 == 0xEF) {
361	n/a	ch2 = get_char(tok);
362	n/a	if (ch2 != 0xBB) {
363	n/a	unget_char(ch2, tok);
364	n/a	unget_char(ch1, tok);
365	n/a	return 1;
366	n/a	}
367	n/a	ch3 = get_char(tok);
368	n/a	if (ch3 != 0xBF) {
369	n/a	unget_char(ch3, tok);
370	n/a	unget_char(ch2, tok);
371	n/a	unget_char(ch1, tok);
372	n/a	return 1;
373	n/a	}
374	n/a	#if 0
375	n/a	/* Disable support for UTF-16 BOMs until a decision
376	n/a	is made whether this needs to be supported. */
377	n/a	} else if (ch1 == 0xFE) {
378	n/a	ch2 = get_char(tok);
379	n/a	if (ch2 != 0xFF) {
380	n/a	unget_char(ch2, tok);
381	n/a	unget_char(ch1, tok);
382	n/a	return 1;
383	n/a	}
384	n/a	if (!set_readline(tok, "utf-16-be"))
385	n/a	return 0;
386	n/a	tok->decoding_state = STATE_NORMAL;
387	n/a	} else if (ch1 == 0xFF) {
388	n/a	ch2 = get_char(tok);
389	n/a	if (ch2 != 0xFE) {
390	n/a	unget_char(ch2, tok);
391	n/a	unget_char(ch1, tok);
392	n/a	return 1;
393	n/a	}
394	n/a	if (!set_readline(tok, "utf-16-le"))
395	n/a	return 0;
396	n/a	tok->decoding_state = STATE_NORMAL;
397	n/a	#endif
398	n/a	} else {
399	n/a	unget_char(ch1, tok);
400	n/a	return 1;
401	n/a	}
402	n/a	if (tok->encoding != NULL)
403	n/a	PyMem_FREE(tok->encoding);
404	n/a	tok->encoding = new_string("utf-8", 5, tok);
405	n/a	if (!tok->encoding)
406	n/a	return 0;
407	n/a	/* No need to set_readline: input is already utf-8 */
408	n/a	return 1;
409	n/a	}
410	n/a
411	n/a	/* Read a line of text from TOK into S, using the stream in TOK.
412	n/a	Return NULL on failure, else S.
413	n/a
414	n/a	On entry, tok->decoding_buffer will be one of:
415	n/a	1) NULL: need to call tok->decoding_readline to get a new line
416	n/a	2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
417	n/a	stored the result in tok->decoding_buffer
418	n/a	3) PyByteArrayObject *: previous call to fp_readl did not have enough room
419	n/a	(in the s buffer) to copy entire contents of the line read
420	n/a	by tok->decoding_readline. tok->decoding_buffer has the overflow.
421	n/a	In this case, fp_readl is called in a loop (with an expanded buffer)
422	n/a	until the buffer ends with a '\n' (or until the end of the file is
423	n/a	reached): see tok_nextc and its calls to decoding_fgets.
424	n/a	*/
425	n/a
426	n/a	static char *
427	n/a	fp_readl(char s, int size, struct tok_state tok)
428	n/a	{
429	n/a	PyObject* bufobj;
430	n/a	const char *buf;
431	n/a	Py_ssize_t buflen;
432	n/a
433	n/a	/* Ask for one less byte so we can terminate it */
434	n/a	assert(size > 0);
435	n/a	size--;
436	n/a
437	n/a	if (tok->decoding_buffer) {
438	n/a	bufobj = tok->decoding_buffer;
439	n/a	Py_INCREF(bufobj);
440	n/a	}
441	n/a	else
442	n/a	{
443	n/a	bufobj = _PyObject_CallNoArg(tok->decoding_readline);
444	n/a	if (bufobj == NULL)
445	n/a	goto error;
446	n/a	}
447	n/a	if (PyUnicode_CheckExact(bufobj))
448	n/a	{
449	n/a	buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
450	n/a	if (buf == NULL) {
451	n/a	goto error;
452	n/a	}
453	n/a	}
454	n/a	else
455	n/a	{
456	n/a	buf = PyByteArray_AsString(bufobj);
457	n/a	if (buf == NULL) {
458	n/a	goto error;
459	n/a	}
460	n/a	buflen = PyByteArray_GET_SIZE(bufobj);
461	n/a	}
462	n/a
463	n/a	Py_XDECREF(tok->decoding_buffer);
464	n/a	if (buflen > size) {
465	n/a	/* Too many chars, the rest goes into tok->decoding_buffer */
466	n/a	tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
467	n/a	buflen-size);
468	n/a	if (tok->decoding_buffer == NULL)
469	n/a	goto error;
470	n/a	buflen = size;
471	n/a	}
472	n/a	else
473	n/a	tok->decoding_buffer = NULL;
474	n/a
475	n/a	memcpy(s, buf, buflen);
476	n/a	s[buflen] = '\0';
477	n/a	if (buflen == 0) /* EOF */
478	n/a	s = NULL;
479	n/a	Py_DECREF(bufobj);
480	n/a	return s;
481	n/a
482	n/a	error:
483	n/a	Py_XDECREF(bufobj);
484	n/a	return error_ret(tok);
485	n/a	}
486	n/a
487	n/a	/* Set the readline function for TOK to a StreamReader's
488	n/a	readline function. The StreamReader is named ENC.
489	n/a
490	n/a	This function is called from check_bom and check_coding_spec.
491	n/a
492	n/a	ENC is usually identical to the future value of tok->encoding,
493	n/a	except for the (currently unsupported) case of UTF-16.
494	n/a
495	n/a	Return 1 on success, 0 on failure. */
496	n/a
497	n/a	static int
498	n/a	fp_setreadl(struct tok_state tok, const char enc)
499	n/a	{
500	n/a	PyObject readline, io, *stream;
501	n/a	_Py_IDENTIFIER(open);
502	n/a	_Py_IDENTIFIER(readline);
503	n/a	int fd;
504	n/a	long pos;
505	n/a
506	n/a	fd = fileno(tok->fp);
507	n/a	/* Due to buffering the file offset for fd can be different from the file
508	n/a	* position of tok->fp. If tok->fp was opened in text mode on Windows,
509	n/a	* its file position counts CRLF as one char and can't be directly mapped
510	n/a	* to the file offset for fd. Instead we step back one byte and read to
511	n/a	* the end of line.*/
512	n/a	pos = ftell(tok->fp);
513	n/a	if (pos == -1 \|\|
514	n/a	lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
515	n/a	PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
516	n/a	return 0;
517	n/a	}
518	n/a
519	n/a	io = PyImport_ImportModuleNoBlock("io");
520	n/a	if (io == NULL)
521	n/a	return 0;
522	n/a
523	n/a	stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
524	n/a	fd, "r", -1, enc, Py_None, Py_None, Py_False);
525	n/a	Py_DECREF(io);
526	n/a	if (stream == NULL)
527	n/a	return 0;
528	n/a
529	n/a	readline = _PyObject_GetAttrId(stream, &PyId_readline);
530	n/a	Py_DECREF(stream);
531	n/a	if (readline == NULL)
532	n/a	return 0;
533	n/a	Py_XSETREF(tok->decoding_readline, readline);
534	n/a
535	n/a	if (pos > 0) {
536	n/a	PyObject *bufobj = _PyObject_CallNoArg(readline);
537	n/a	if (bufobj == NULL)
538	n/a	return 0;
539	n/a	Py_DECREF(bufobj);
540	n/a	}
541	n/a
542	n/a	return 1;
543	n/a	}
544	n/a
545	n/a	/* Fetch the next byte from TOK. */
546	n/a
547	n/a	static int fp_getc(struct tok_state *tok) {
548	n/a	return getc(tok->fp);
549	n/a	}
550	n/a
551	n/a	/* Unfetch the last byte back into TOK. */
552	n/a
553	n/a	static void fp_ungetc(int c, struct tok_state *tok) {
554	n/a	ungetc(c, tok->fp);
555	n/a	}
556	n/a
/* Check whether the characters at s start a valid
   UTF-8 sequence. Return the number of characters forming
   the sequence if yes, 0 if not.

   Only the lead byte and the presence of the expected number of
   continuation bytes (0x80..0xBF) are checked.  S must be NUL-terminated:
   continuation bytes are examined front to back, so a sequence truncated
   by the terminator is rejected without reading past it. */
static int valid_utf8(const unsigned char* s)
{
    int expected = 0;
    int length;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    length = expected + 1;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return length;
}
584	n/a
585	n/a	/* Read a line of input from TOK. Determine encoding
586	n/a	if necessary. */
587	n/a
588	n/a	static char *
589	n/a	decoding_fgets(char s, int size, struct tok_state tok)
590	n/a	{
591	n/a	char *line = NULL;
592	n/a	int badchar = 0;
593	n/a	for (;;) {
594	n/a	if (tok->decoding_state == STATE_NORMAL) {
595	n/a	/* We already have a codec associated with
596	n/a	this input. */
597	n/a	line = fp_readl(s, size, tok);
598	n/a	break;
599	n/a	} else if (tok->decoding_state == STATE_RAW) {
600	n/a	/* We want a 'raw' read. */
601	n/a	line = Py_UniversalNewlineFgets(s, size,
602	n/a	tok->fp, NULL);
603	n/a	break;
604	n/a	} else {
605	n/a	/* We have not yet determined the encoding.
606	n/a	If an encoding is found, use the file-pointer
607	n/a	reader functions from now on. */
608	n/a	if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
609	n/a	return error_ret(tok);
610	n/a	assert(tok->decoding_state != STATE_INIT);
611	n/a	}
612	n/a	}
613	n/a	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
614	n/a	if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
615	n/a	return error_ret(tok);
616	n/a	}
617	n/a	}
618	n/a	#ifndef PGEN
619	n/a	/* The default encoding is UTF-8, so make sure we don't have any
620	n/a	non-UTF-8 sequences in it. */
621	n/a	if (line && !tok->encoding) {
622	n/a	unsigned char *c;
623	n/a	int length;
624	n/a	for (c = (unsigned char )line; c; c += length)
625	n/a	if (!(length = valid_utf8(c))) {
626	n/a	badchar = *c;
627	n/a	break;
628	n/a	}
629	n/a	}
630	n/a	if (badchar) {
631	n/a	/* Need to add 1 to the line number, since this line
632	n/a	has not been counted, yet. */
633	n/a	PyErr_Format(PyExc_SyntaxError,
634	n/a	"Non-UTF-8 code starting with '\\x%.2x' "
635	n/a	"in file %U on line %i, "
636	n/a	"but no encoding declared; "
637	n/a	"see http://python.org/dev/peps/pep-0263/ for details",
638	n/a	badchar, tok->filename, tok->lineno + 1);
639	n/a	return error_ret(tok);
640	n/a	}
641	n/a	#endif
642	n/a	return line;
643	n/a	}
644	n/a
645	n/a	static int
646	n/a	decoding_feof(struct tok_state *tok)
647	n/a	{
648	n/a	if (tok->decoding_state != STATE_NORMAL) {
649	n/a	return feof(tok->fp);
650	n/a	} else {
651	n/a	PyObject* buf = tok->decoding_buffer;
652	n/a	if (buf == NULL) {
653	n/a	buf = _PyObject_CallNoArg(tok->decoding_readline);
654	n/a	if (buf == NULL) {
655	n/a	error_ret(tok);
656	n/a	return 1;
657	n/a	} else {
658	n/a	tok->decoding_buffer = buf;
659	n/a	}
660	n/a	}
661	n/a	return PyObject_Length(buf) == 0;
662	n/a	}
663	n/a	}
664	n/a
665	n/a	/* Fetch a byte from TOK, using the string buffer. */
666	n/a
667	n/a	static int
668	n/a	buf_getc(struct tok_state *tok) {
669	n/a	return Py_CHARMASK(*tok->str++);
670	n/a	}
671	n/a
672	n/a	/* Unfetch a byte from TOK, using the string buffer. */
673	n/a
674	n/a	static void
675	n/a	buf_ungetc(int c, struct tok_state *tok) {
676	n/a	tok->str--;
677	n/a	assert(Py_CHARMASK(tok->str) == c); / tok->cur may point to read-only segment */
678	n/a	}
679	n/a
680	n/a	/* Set the readline function for TOK to ENC. For the string-based
681	n/a	tokenizer, this means to just record the encoding. */
682	n/a
683	n/a	static int
684	n/a	buf_setreadl(struct tok_state tok, const char enc) {
685	n/a	tok->enc = enc;
686	n/a	return 1;
687	n/a	}
688	n/a
689	n/a	/* Return a UTF-8 encoding Python string object from the
690	n/a	C byte string STR, which is encoded with ENC. */
691	n/a
692	n/a	static PyObject *
693	n/a	translate_into_utf8(const char* str, const char* enc) {
694	n/a	PyObject *utf8;
695	n/a	PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
696	n/a	if (buf == NULL)
697	n/a	return NULL;
698	n/a	utf8 = PyUnicode_AsUTF8String(buf);
699	n/a	Py_DECREF(buf);
700	n/a	return utf8;
701	n/a	}
702	n/a
703	n/a
704	n/a	static char *
705	n/a	translate_newlines(const char s, int exec_input, struct tok_state tok) {
706	n/a	int skip_next_lf = 0;
707	n/a	size_t needed_length = strlen(s) + 2, final_length;
708	n/a	char buf, current;
709	n/a	char c = '\0';
710	n/a	buf = PyMem_MALLOC(needed_length);
711	n/a	if (buf == NULL) {
712	n/a	tok->done = E_NOMEM;
713	n/a	return NULL;
714	n/a	}
715	n/a	for (current = buf; *s; s++, current++) {
716	n/a	c = *s;
717	n/a	if (skip_next_lf) {
718	n/a	skip_next_lf = 0;
719	n/a	if (c == '\n') {
720	n/a	c = *++s;
721	n/a	if (!c)
722	n/a	break;
723	n/a	}
724	n/a	}
725	n/a	if (c == '\r') {
726	n/a	skip_next_lf = 1;
727	n/a	c = '\n';
728	n/a	}
729	n/a	*current = c;
730	n/a	}
731	n/a	/* If this is exec input, add a newline to the end of the string if
732	n/a	there isn't one already. */
733	n/a	if (exec_input && c != '\n') {
734	n/a	*current = '\n';
735	n/a	current++;
736	n/a	}
737	n/a	*current = '\0';
738	n/a	final_length = current - buf + 1;
739	n/a	if (final_length < needed_length && final_length)
740	n/a	/* should never fail */
741	n/a	buf = PyMem_REALLOC(buf, final_length);
742	n/a	return buf;
743	n/a	}
744	n/a
745	n/a	/* Decode a byte string STR for use as the buffer of TOK.
746	n/a	Look for encoding declarations inside STR, and record them
747	n/a	inside TOK. */
748	n/a
749	n/a	static const char *
750	n/a	decode_str(const char input, int single, struct tok_state tok)
751	n/a	{
752	n/a	PyObject* utf8 = NULL;
753	n/a	const char *str;
754	n/a	const char *s;
755	n/a	const char *newl[2] = {NULL, NULL};
756	n/a	int lineno = 0;
757	n/a	tok->input = str = translate_newlines(input, single, tok);
758	n/a	if (str == NULL)
759	n/a	return NULL;
760	n/a	tok->enc = NULL;
761	n/a	tok->str = str;
762	n/a	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
763	n/a	return error_ret(tok);
764	n/a	str = tok->str; /* string after BOM if any */
765	n/a	assert(str);
766	n/a	if (tok->enc != NULL) {
767	n/a	utf8 = translate_into_utf8(str, tok->enc);
768	n/a	if (utf8 == NULL)
769	n/a	return error_ret(tok);
770	n/a	str = PyBytes_AsString(utf8);
771	n/a	}
772	n/a	for (s = str;; s++) {
773	n/a	if (*s == '\0') break;
774	n/a	else if (*s == '\n') {
775	n/a	assert(lineno < 2);
776	n/a	newl[lineno] = s;
777	n/a	lineno++;
778	n/a	if (lineno == 2) break;
779	n/a	}
780	n/a	}
781	n/a	tok->enc = NULL;
782	n/a	/* need to check line 1 and 2 separately since check_coding_spec
783	n/a	assumes a single line as input */
784	n/a	if (newl[0]) {
785	n/a	if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
786	n/a	return error_ret(tok);
787	n/a	if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
788	n/a	if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
789	n/a	tok, buf_setreadl))
790	n/a	return error_ret(tok);
791	n/a	}
792	n/a	}
793	n/a	if (tok->enc != NULL) {
794	n/a	assert(utf8 == NULL);
795	n/a	utf8 = translate_into_utf8(str, tok->enc);
796	n/a	if (utf8 == NULL)
797	n/a	return error_ret(tok);
798	n/a	str = PyBytes_AS_STRING(utf8);
799	n/a	}
800	n/a	assert(tok->decoding_buffer == NULL);
801	n/a	tok->decoding_buffer = utf8; /* CAUTION */
802	n/a	return str;
803	n/a	}
804	n/a
805	n/a	#endif /* PGEN */
806	n/a
807	n/a	/* Set up tokenizer for string */
808	n/a
809	n/a	struct tok_state *
810	n/a	PyTokenizer_FromString(const char *str, int exec_input)
811	n/a	{
812	n/a	struct tok_state *tok = tok_new();
813	n/a	if (tok == NULL)
814	n/a	return NULL;
815	n/a	str = decode_str(str, exec_input, tok);
816	n/a	if (str == NULL) {
817	n/a	PyTokenizer_Free(tok);
818	n/a	return NULL;
819	n/a	}
820	n/a
821	n/a	/* XXX: constify members. */
822	n/a	tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
823	n/a	return tok;
824	n/a	}
825	n/a
826	n/a	struct tok_state *
827	n/a	PyTokenizer_FromUTF8(const char *str, int exec_input)
828	n/a	{
829	n/a	struct tok_state *tok = tok_new();
830	n/a	if (tok == NULL)
831	n/a	return NULL;
832	n/a	#ifndef PGEN
833	n/a	tok->input = str = translate_newlines(str, exec_input, tok);
834	n/a	#endif
835	n/a	if (str == NULL) {
836	n/a	PyTokenizer_Free(tok);
837	n/a	return NULL;
838	n/a	}
839	n/a	tok->decoding_state = STATE_RAW;
840	n/a	tok->read_coding_spec = 1;
841	n/a	tok->enc = NULL;
842	n/a	tok->str = str;
843	n/a	tok->encoding = (char *)PyMem_MALLOC(6);
844	n/a	if (!tok->encoding) {
845	n/a	PyTokenizer_Free(tok);
846	n/a	return NULL;
847	n/a	}
848	n/a	strcpy(tok->encoding, "utf-8");
849	n/a
850	n/a	/* XXX: constify members. */
851	n/a	tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
852	n/a	return tok;
853	n/a	}
854	n/a
855	n/a	/* Set up tokenizer for file */
856	n/a
857	n/a	struct tok_state *
858	n/a	PyTokenizer_FromFile(FILE fp, const char enc,
859	n/a	const char ps1, const char ps2)
860	n/a	{
861	n/a	struct tok_state *tok = tok_new();
862	n/a	if (tok == NULL)
863	n/a	return NULL;
864	n/a	if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
865	n/a	PyTokenizer_Free(tok);
866	n/a	return NULL;
867	n/a	}
868	n/a	tok->cur = tok->inp = tok->buf;
869	n/a	tok->end = tok->buf + BUFSIZ;
870	n/a	tok->fp = fp;
871	n/a	tok->prompt = ps1;
872	n/a	tok->nextprompt = ps2;
873	n/a	if (enc != NULL) {
874	n/a	/* Must copy encoding declaration since it
875	n/a	gets copied into the parse tree. */
876	n/a	tok->encoding = PyMem_MALLOC(strlen(enc)+1);
877	n/a	if (!tok->encoding) {
878	n/a	PyTokenizer_Free(tok);
879	n/a	return NULL;
880	n/a	}
881	n/a	strcpy(tok->encoding, enc);
882	n/a	tok->decoding_state = STATE_NORMAL;
883	n/a	}
884	n/a	return tok;
885	n/a	}
886	n/a
887	n/a
888	n/a	/* Free a tok_state structure */
889	n/a
890	n/a	void
891	n/a	PyTokenizer_Free(struct tok_state *tok)
892	n/a	{
893	n/a	if (tok->encoding != NULL)
894	n/a	PyMem_FREE(tok->encoding);
895	n/a	#ifndef PGEN
896	n/a	Py_XDECREF(tok->decoding_readline);
897	n/a	Py_XDECREF(tok->decoding_buffer);
898	n/a	Py_XDECREF(tok->filename);
899	n/a	#endif
900	n/a	if (tok->fp != NULL && tok->buf != NULL)
901	n/a	PyMem_FREE(tok->buf);
902	n/a	if (tok->input)
903	n/a	PyMem_FREE((char *)tok->input);
904	n/a	PyMem_FREE(tok);
905	n/a	}
906	n/a
907	n/a	/* Get next char, updating state; error code goes into tok->done */
908	n/a
909	n/a	static int
910	n/a	tok_nextc(struct tok_state *tok)
911	n/a	{
912	n/a	for (;;) {
913	n/a	if (tok->cur != tok->inp) {
914	n/a	return Py_CHARMASK(tok->cur++); / Fast path */
915	n/a	}
916	n/a	if (tok->done != E_OK)
917	n/a	return EOF;
918	n/a	if (tok->fp == NULL) {
919	n/a	char *end = strchr(tok->inp, '\n');
920	n/a	if (end != NULL)
921	n/a	end++;
922	n/a	else {
923	n/a	end = strchr(tok->inp, '\0');
924	n/a	if (end == tok->inp) {
925	n/a	tok->done = E_EOF;
926	n/a	return EOF;
927	n/a	}
928	n/a	}
929	n/a	if (tok->start == NULL)
930	n/a	tok->buf = tok->cur;
931	n/a	tok->line_start = tok->cur;
932	n/a	tok->lineno++;
933	n/a	tok->inp = end;
934	n/a	return Py_CHARMASK(*tok->cur++);
935	n/a	}
936	n/a	if (tok->prompt != NULL) {
937	n/a	char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
938	n/a	#ifndef PGEN
939	n/a	if (newtok != NULL) {
940	n/a	char *translated = translate_newlines(newtok, 0, tok);
941	n/a	PyMem_FREE(newtok);
942	n/a	if (translated == NULL)
943	n/a	return EOF;
944	n/a	newtok = translated;
945	n/a	}
946	n/a	if (tok->encoding && newtok && *newtok) {
947	n/a	/* Recode to UTF-8 */
948	n/a	Py_ssize_t buflen;
949	n/a	const char* buf;
950	n/a	PyObject *u = translate_into_utf8(newtok, tok->encoding);
951	n/a	PyMem_FREE(newtok);
952	n/a	if (!u) {
953	n/a	tok->done = E_DECODE;
954	n/a	return EOF;
955	n/a	}
956	n/a	buflen = PyBytes_GET_SIZE(u);
957	n/a	buf = PyBytes_AS_STRING(u);
958	n/a	newtok = PyMem_MALLOC(buflen+1);
959	n/a	strcpy(newtok, buf);
960	n/a	Py_DECREF(u);
961	n/a	}
962	n/a	#endif
963	n/a	if (tok->nextprompt != NULL)
964	n/a	tok->prompt = tok->nextprompt;
965	n/a	if (newtok == NULL)
966	n/a	tok->done = E_INTR;
967	n/a	else if (*newtok == '\0') {
968	n/a	PyMem_FREE(newtok);
969	n/a	tok->done = E_EOF;
970	n/a	}
971	n/a	else if (tok->start != NULL) {
972	n/a	size_t start = tok->start - tok->buf;
973	n/a	size_t oldlen = tok->cur - tok->buf;
974	n/a	size_t newlen = oldlen + strlen(newtok);
975	n/a	char *buf = tok->buf;
976	n/a	buf = (char *)PyMem_REALLOC(buf, newlen+1);
977	n/a	tok->lineno++;
978	n/a	if (buf == NULL) {
979	n/a	PyMem_FREE(tok->buf);
980	n/a	tok->buf = NULL;
981	n/a	PyMem_FREE(newtok);
982	n/a	tok->done = E_NOMEM;
983	n/a	return EOF;
984	n/a	}
985	n/a	tok->buf = buf;
986	n/a	tok->cur = tok->buf + oldlen;
987	n/a	tok->line_start = tok->cur;
988	n/a	strcpy(tok->buf + oldlen, newtok);
989	n/a	PyMem_FREE(newtok);
990	n/a	tok->inp = tok->buf + newlen;
991	n/a	tok->end = tok->inp + 1;
992	n/a	tok->start = tok->buf + start;
993	n/a	}
994	n/a	else {
995	n/a	tok->lineno++;
996	n/a	if (tok->buf != NULL)
997	n/a	PyMem_FREE(tok->buf);
998	n/a	tok->buf = newtok;
999	n/a	tok->cur = tok->buf;
1000	n/a	tok->line_start = tok->buf;
1001	n/a	tok->inp = strchr(tok->buf, '\0');
1002	n/a	tok->end = tok->inp + 1;
1003	n/a	}
1004	n/a	}
1005	n/a	else {
1006	n/a	int done = 0;
1007	n/a	Py_ssize_t cur = 0;
1008	n/a	char *pt;
1009	n/a	if (tok->start == NULL) {
1010	n/a	if (tok->buf == NULL) {
1011	n/a	tok->buf = (char *)
1012	n/a	PyMem_MALLOC(BUFSIZ);
1013	n/a	if (tok->buf == NULL) {
1014	n/a	tok->done = E_NOMEM;
1015	n/a	return EOF;
1016	n/a	}
1017	n/a	tok->end = tok->buf + BUFSIZ;
1018	n/a	}
1019	n/a	if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
1020	n/a	tok) == NULL) {
1021	n/a	if (!tok->decoding_erred)
1022	n/a	tok->done = E_EOF;
1023	n/a	done = 1;
1024	n/a	}
1025	n/a	else {
1026	n/a	tok->done = E_OK;
1027	n/a	tok->inp = strchr(tok->buf, '\0');
1028	n/a	done = tok->inp == tok->buf \|\| tok->inp[-1] == '\n';
1029	n/a	}
1030	n/a	}
1031	n/a	else {
1032	n/a	cur = tok->cur - tok->buf;
1033	n/a	if (decoding_feof(tok)) {
1034	n/a	tok->done = E_EOF;
1035	n/a	done = 1;
1036	n/a	}
1037	n/a	else
1038	n/a	tok->done = E_OK;
1039	n/a	}
1040	n/a	tok->lineno++;
1041	n/a	/* Read until '\n' or EOF */
1042	n/a	while (!done) {
1043	n/a	Py_ssize_t curstart = tok->start == NULL ? -1 :
1044	n/a	tok->start - tok->buf;
1045	n/a	Py_ssize_t curvalid = tok->inp - tok->buf;
1046	n/a	Py_ssize_t newsize = curvalid + BUFSIZ;
1047	n/a	char *newbuf = tok->buf;
1048	n/a	newbuf = (char *)PyMem_REALLOC(newbuf,
1049	n/a	newsize);
1050	n/a	if (newbuf == NULL) {
1051	n/a	tok->done = E_NOMEM;
1052	n/a	tok->cur = tok->inp;
1053	n/a	return EOF;
1054	n/a	}
1055	n/a	tok->buf = newbuf;
1056	n/a	tok->cur = tok->buf + cur;
1057	n/a	tok->line_start = tok->cur;
1058	n/a	tok->inp = tok->buf + curvalid;
1059	n/a	tok->end = tok->buf + newsize;
1060	n/a	tok->start = curstart < 0 ? NULL :
1061	n/a	tok->buf + curstart;
1062	n/a	if (decoding_fgets(tok->inp,
1063	n/a	(int)(tok->end - tok->inp),
1064	n/a	tok) == NULL) {
1065	n/a	/* Break out early on decoding
1066	n/a	errors, as tok->buf will be NULL
1067	n/a	*/
1068	n/a	if (tok->decoding_erred)
1069	n/a	return EOF;
1070	n/a	/* Last line does not end in \n,
1071	n/a	fake one */
1072	n/a	strcpy(tok->inp, "\n");
1073	n/a	}
1074	n/a	tok->inp = strchr(tok->inp, '\0');
1075	n/a	done = tok->inp[-1] == '\n';
1076	n/a	}
1077	n/a	if (tok->buf != NULL) {
1078	n/a	tok->cur = tok->buf + cur;
1079	n/a	tok->line_start = tok->cur;
1080	n/a	/* replace "\r\n" with "\n" */
1081	n/a	/* For Mac leave the \r, giving a syntax error */
1082	n/a	pt = tok->inp - 2;
1083	n/a	if (pt >= tok->buf && *pt == '\r') {
1084	n/a	*pt++ = '\n';
1085	n/a	*pt = '\0';
1086	n/a	tok->inp = pt;
1087	n/a	}
1088	n/a	}
1089	n/a	}
1090	n/a	if (tok->done != E_OK) {
1091	n/a	if (tok->prompt != NULL)
1092	n/a	PySys_WriteStderr("\n");
1093	n/a	tok->cur = tok->inp;
1094	n/a	return EOF;
1095	n/a	}
1096	n/a	}
1097	n/a	/NOTREACHED/
1098	n/a	}
1099	n/a
1100	n/a
1101	n/a	/* Back-up one character */
1102	n/a
1103	n/a	static void
1104	n/a	tok_backup(struct tok_state *tok, int c)
1105	n/a	{
1106	n/a	if (c != EOF) {
1107	n/a	if (--tok->cur < tok->buf)
1108	n/a	Py_FatalError("tok_backup: beginning of buffer");
1109	n/a	if (*tok->cur != c)
1110	n/a	*tok->cur = c;
1111	n/a	}
1112	n/a	}
1113	n/a
1114	n/a
1115	n/a	/* Return the token corresponding to a single character */
1116	n/a
1117	n/a	int
1118	n/a	PyToken_OneChar(int c)
1119	n/a	{
1120	n/a	switch (c) {
1121	n/a	case '(': return LPAR;
1122	n/a	case ')': return RPAR;
1123	n/a	case '[': return LSQB;
1124	n/a	case ']': return RSQB;
1125	n/a	case ':': return COLON;
1126	n/a	case ',': return COMMA;
1127	n/a	case ';': return SEMI;
1128	n/a	case '+': return PLUS;
1129	n/a	case '-': return MINUS;
1130	n/a	case '*': return STAR;
1131	n/a	case '/': return SLASH;
1132	n/a	case '\|': return VBAR;
1133	n/a	case '&': return AMPER;
1134	n/a	case '<': return LESS;
1135	n/a	case '>': return GREATER;
1136	n/a	case '=': return EQUAL;
1137	n/a	case '.': return DOT;
1138	n/a	case '%': return PERCENT;
1139	n/a	case '{': return LBRACE;
1140	n/a	case '}': return RBRACE;
1141	n/a	case '^': return CIRCUMFLEX;
1142	n/a	case '~': return TILDE;
1143	n/a	case '@': return AT;
1144	n/a	default: return OP;
1145	n/a	}
1146	n/a	}
1147	n/a
1148	n/a
1149	n/a	int
1150	n/a	PyToken_TwoChars(int c1, int c2)
1151	n/a	{
1152	n/a	switch (c1) {
1153	n/a	case '=':
1154	n/a	switch (c2) {
1155	n/a	case '=': return EQEQUAL;
1156	n/a	}
1157	n/a	break;
1158	n/a	case '!':
1159	n/a	switch (c2) {
1160	n/a	case '=': return NOTEQUAL;
1161	n/a	}
1162	n/a	break;
1163	n/a	case '<':
1164	n/a	switch (c2) {
1165	n/a	case '>': return NOTEQUAL;
1166	n/a	case '=': return LESSEQUAL;
1167	n/a	case '<': return LEFTSHIFT;
1168	n/a	}
1169	n/a	break;
1170	n/a	case '>':
1171	n/a	switch (c2) {
1172	n/a	case '=': return GREATEREQUAL;
1173	n/a	case '>': return RIGHTSHIFT;
1174	n/a	}
1175	n/a	break;
1176	n/a	case '+':
1177	n/a	switch (c2) {
1178	n/a	case '=': return PLUSEQUAL;
1179	n/a	}
1180	n/a	break;
1181	n/a	case '-':
1182	n/a	switch (c2) {
1183	n/a	case '=': return MINEQUAL;
1184	n/a	case '>': return RARROW;
1185	n/a	}
1186	n/a	break;
1187	n/a	case '*':
1188	n/a	switch (c2) {
1189	n/a	case '*': return DOUBLESTAR;
1190	n/a	case '=': return STAREQUAL;
1191	n/a	}
1192	n/a	break;
1193	n/a	case '/':
1194	n/a	switch (c2) {
1195	n/a	case '/': return DOUBLESLASH;
1196	n/a	case '=': return SLASHEQUAL;
1197	n/a	}
1198	n/a	break;
1199	n/a	case '\|':
1200	n/a	switch (c2) {
1201	n/a	case '=': return VBAREQUAL;
1202	n/a	}
1203	n/a	break;
1204	n/a	case '%':
1205	n/a	switch (c2) {
1206	n/a	case '=': return PERCENTEQUAL;
1207	n/a	}
1208	n/a	break;
1209	n/a	case '&':
1210	n/a	switch (c2) {
1211	n/a	case '=': return AMPEREQUAL;
1212	n/a	}
1213	n/a	break;
1214	n/a	case '^':
1215	n/a	switch (c2) {
1216	n/a	case '=': return CIRCUMFLEXEQUAL;
1217	n/a	}
1218	n/a	break;
1219	n/a	case '@':
1220	n/a	switch (c2) {
1221	n/a	case '=': return ATEQUAL;
1222	n/a	}
1223	n/a	break;
1224	n/a	}
1225	n/a	return OP;
1226	n/a	}
1227	n/a
1228	n/a	int
1229	n/a	PyToken_ThreeChars(int c1, int c2, int c3)
1230	n/a	{
1231	n/a	switch (c1) {
1232	n/a	case '<':
1233	n/a	switch (c2) {
1234	n/a	case '<':
1235	n/a	switch (c3) {
1236	n/a	case '=':
1237	n/a	return LEFTSHIFTEQUAL;
1238	n/a	}
1239	n/a	break;
1240	n/a	}
1241	n/a	break;
1242	n/a	case '>':
1243	n/a	switch (c2) {
1244	n/a	case '>':
1245	n/a	switch (c3) {
1246	n/a	case '=':
1247	n/a	return RIGHTSHIFTEQUAL;
1248	n/a	}
1249	n/a	break;
1250	n/a	}
1251	n/a	break;
1252	n/a	case '*':
1253	n/a	switch (c2) {
1254	n/a	case '*':
1255	n/a	switch (c3) {
1256	n/a	case '=':
1257	n/a	return DOUBLESTAREQUAL;
1258	n/a	}
1259	n/a	break;
1260	n/a	}
1261	n/a	break;
1262	n/a	case '/':
1263	n/a	switch (c2) {
1264	n/a	case '/':
1265	n/a	switch (c3) {
1266	n/a	case '=':
1267	n/a	return DOUBLESLASHEQUAL;
1268	n/a	}
1269	n/a	break;
1270	n/a	}
1271	n/a	break;
1272	n/a	case '.':
1273	n/a	switch (c2) {
1274	n/a	case '.':
1275	n/a	switch (c3) {
1276	n/a	case '.':
1277	n/a	return ELLIPSIS;
1278	n/a	}
1279	n/a	break;
1280	n/a	}
1281	n/a	break;
1282	n/a	}
1283	n/a	return OP;
1284	n/a	}
1285	n/a
1286	n/a	static int
1287	n/a	indenterror(struct tok_state *tok)
1288	n/a	{
1289	n/a	if (tok->alterror) {
1290	n/a	tok->done = E_TABSPACE;
1291	n/a	tok->cur = tok->inp;
1292	n/a	return 1;
1293	n/a	}
1294	n/a	if (tok->altwarning) {
1295	n/a	#ifdef PGEN
1296	n/a	PySys_WriteStderr("inconsistent use of tabs and spaces "
1297	n/a	"in indentation\n");
1298	n/a	#else
1299	n/a	PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
1300	n/a	"in indentation\n", tok->filename);
1301	n/a	#endif
1302	n/a	tok->altwarning = 0;
1303	n/a	}
1304	n/a	return 0;
1305	n/a	}
1306	n/a
1307	n/a	#ifdef PGEN
1308	n/a	#define verify_identifier(tok) 1
1309	n/a	#else
1310	n/a	/* Verify that the identifier follows PEP 3131.
1311	n/a	All identifier strings are guaranteed to be "ready" unicode objects.
1312	n/a	*/
1313	n/a	static int
1314	n/a	verify_identifier(struct tok_state *tok)
1315	n/a	{
1316	n/a	PyObject *s;
1317	n/a	int result;
1318	n/a	if (tok->decoding_erred)
1319	n/a	return 0;
1320	n/a	s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1321	n/a	if (s == NULL \|\| PyUnicode_READY(s) == -1) {
1322	n/a	if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1323	n/a	PyErr_Clear();
1324	n/a	tok->done = E_IDENTIFIER;
1325	n/a	} else {
1326	n/a	tok->done = E_ERROR;
1327	n/a	}
1328	n/a	return 0;
1329	n/a	}
1330	n/a	result = PyUnicode_IsIdentifier(s);
1331	n/a	Py_DECREF(s);
1332	n/a	if (result == 0)
1333	n/a	tok->done = E_IDENTIFIER;
1334	n/a	return result;
1335	n/a	}
1336	n/a	#endif
1337	n/a
1338	n/a	static int
1339	n/a	tok_decimal_tail(struct tok_state *tok)
1340	n/a	{
1341	n/a	int c;
1342	n/a
1343	n/a	while (1) {
1344	n/a	do {
1345	n/a	c = tok_nextc(tok);
1346	n/a	} while (isdigit(c));
1347	n/a	if (c != '_') {
1348	n/a	break;
1349	n/a	}
1350	n/a	c = tok_nextc(tok);
1351	n/a	if (!isdigit(c)) {
1352	n/a	tok->done = E_TOKEN;
1353	n/a	tok_backup(tok, c);
1354	n/a	return 0;
1355	n/a	}
1356	n/a	}
1357	n/a	return c;
1358	n/a	}
1359	n/a
1360	n/a	/* Get next token, after space stripping etc. */
1361	n/a
1362	n/a	static int
1363	n/a	tok_get(struct tok_state tok, char p_start, char *p_end)
1364	n/a	{
1365	n/a	int c;
1366	n/a	int blankline, nonascii;
1367	n/a
1368	n/a	p_start = p_end = NULL;
1369	n/a	nextline:
1370	n/a	tok->start = NULL;
1371	n/a	blankline = 0;
1372	n/a
1373	n/a	/* Get indentation level */
1374	n/a	if (tok->atbol) {
1375	n/a	int col = 0;
1376	n/a	int altcol = 0;
1377	n/a	tok->atbol = 0;
1378	n/a	for (;;) {
1379	n/a	c = tok_nextc(tok);
1380	n/a	if (c == ' ') {
1381	n/a	col++, altcol++;
1382	n/a	}
1383	n/a	else if (c == '\t') {
1384	n/a	col = (col/tok->tabsize + 1) * tok->tabsize;
1385	n/a	altcol = (altcol/tok->alttabsize + 1)
1386	n/a	* tok->alttabsize;
1387	n/a	}
1388	n/a	else if (c == '\014') {/* Control-L (formfeed) */
1389	n/a	col = altcol = 0; /* For Emacs users */
1390	n/a	}
1391	n/a	else {
1392	n/a	break;
1393	n/a	}
1394	n/a	}
1395	n/a	tok_backup(tok, c);
1396	n/a	if (c == '#' \|\| c == '\n') {
1397	n/a	/* Lines with only whitespace and/or comments
1398	n/a	shouldn't affect the indentation and are
1399	n/a	not passed to the parser as NEWLINE tokens,
1400	n/a	except totally empty lines in interactive
1401	n/a	mode, which signal the end of a command group. */
1402	n/a	if (col == 0 && c == '\n' && tok->prompt != NULL) {
1403	n/a	blankline = 0; /* Let it through */
1404	n/a	}
1405	n/a	else {
1406	n/a	blankline = 1; /* Ignore completely */
1407	n/a	}
1408	n/a	/* We can't jump back right here since we still
1409	n/a	may need to skip to the end of a comment */
1410	n/a	}
1411	n/a	if (!blankline && tok->level == 0) {
1412	n/a	if (col == tok->indstack[tok->indent]) {
1413	n/a	/* No change */
1414	n/a	if (altcol != tok->altindstack[tok->indent]) {
1415	n/a	if (indenterror(tok)) {
1416	n/a	return ERRORTOKEN;
1417	n/a	}
1418	n/a	}
1419	n/a	}
1420	n/a	else if (col > tok->indstack[tok->indent]) {
1421	n/a	/* Indent -- always one */
1422	n/a	if (tok->indent+1 >= MAXINDENT) {
1423	n/a	tok->done = E_TOODEEP;
1424	n/a	tok->cur = tok->inp;
1425	n/a	return ERRORTOKEN;
1426	n/a	}
1427	n/a	if (altcol <= tok->altindstack[tok->indent]) {
1428	n/a	if (indenterror(tok)) {
1429	n/a	return ERRORTOKEN;
1430	n/a	}
1431	n/a	}
1432	n/a	tok->pendin++;
1433	n/a	tok->indstack[++tok->indent] = col;
1434	n/a	tok->altindstack[tok->indent] = altcol;
1435	n/a	}
1436	n/a	else /* col < tok->indstack[tok->indent] */ {
1437	n/a	/* Dedent -- any number, must be consistent */
1438	n/a	while (tok->indent > 0 &&
1439	n/a	col < tok->indstack[tok->indent]) {
1440	n/a	tok->pendin--;
1441	n/a	tok->indent--;
1442	n/a	}
1443	n/a	if (col != tok->indstack[tok->indent]) {
1444	n/a	tok->done = E_DEDENT;
1445	n/a	tok->cur = tok->inp;
1446	n/a	return ERRORTOKEN;
1447	n/a	}
1448	n/a	if (altcol != tok->altindstack[tok->indent]) {
1449	n/a	if (indenterror(tok)) {
1450	n/a	return ERRORTOKEN;
1451	n/a	}
1452	n/a	}
1453	n/a	}
1454	n/a	}
1455	n/a	}
1456	n/a
1457	n/a	tok->start = tok->cur;
1458	n/a
1459	n/a	/* Return pending indents/dedents */
1460	n/a	if (tok->pendin != 0) {
1461	n/a	if (tok->pendin < 0) {
1462	n/a	tok->pendin++;
1463	n/a	return DEDENT;
1464	n/a	}
1465	n/a	else {
1466	n/a	tok->pendin--;
1467	n/a	return INDENT;
1468	n/a	}
1469	n/a	}
1470	n/a
1471	n/a	if (tok->async_def
1472	n/a	&& !blankline
1473	n/a	&& tok->level == 0
1474	n/a	/* There was a NEWLINE after ASYNC DEF,
1475	n/a	so we're past the signature. */
1476	n/a	&& tok->async_def_nl
1477	n/a	/* Current indentation level is less than where
1478	n/a	the async function was defined */
1479	n/a	&& tok->async_def_indent >= tok->indent)
1480	n/a	{
1481	n/a	tok->async_def = 0;
1482	n/a	tok->async_def_indent = 0;
1483	n/a	tok->async_def_nl = 0;
1484	n/a	}
1485	n/a
1486	n/a	again:
1487	n/a	tok->start = NULL;
1488	n/a	/* Skip spaces */
1489	n/a	do {
1490	n/a	c = tok_nextc(tok);
1491	n/a	} while (c == ' ' \|\| c == '\t' \|\| c == '\014');
1492	n/a
1493	n/a	/* Set start of current token */
1494	n/a	tok->start = tok->cur - 1;
1495	n/a
1496	n/a	/* Skip comment */
1497	n/a	if (c == '#') {
1498	n/a	while (c != EOF && c != '\n') {
1499	n/a	c = tok_nextc(tok);
1500	n/a	}
1501	n/a	}
1502	n/a
1503	n/a	/* Check for EOF and errors now */
1504	n/a	if (c == EOF) {
1505	n/a	return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1506	n/a	}
1507	n/a
1508	n/a	/* Identifier (most frequent token!) */
1509	n/a	nonascii = 0;
1510	n/a	if (is_potential_identifier_start(c)) {
1511	n/a	/* Process the various legal combinations of b"", r"", u"", and f"". */
1512	n/a	int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
1513	n/a	while (1) {
1514	n/a	if (!(saw_b \|\| saw_u \|\| saw_f) && (c == 'b' \|\| c == 'B'))
1515	n/a	saw_b = 1;
1516	n/a	/* Since this is a backwards compatibility support literal we don't
1517	n/a	want to support it in arbitrary order like byte literals. */
1518	n/a	else if (!(saw_b \|\| saw_u \|\| saw_r \|\| saw_f)
1519	n/a	&& (c == 'u'\|\| c == 'U')) {
1520	n/a	saw_u = 1;
1521	n/a	}
1522	n/a	/* ur"" and ru"" are not supported */
1523	n/a	else if (!(saw_r \|\| saw_u) && (c == 'r' \|\| c == 'R')) {
1524	n/a	saw_r = 1;
1525	n/a	}
1526	n/a	else if (!(saw_f \|\| saw_b \|\| saw_u) && (c == 'f' \|\| c == 'F')) {
1527	n/a	saw_f = 1;
1528	n/a	}
1529	n/a	else {
1530	n/a	break;
1531	n/a	}
1532	n/a	c = tok_nextc(tok);
1533	n/a	if (c == '"' \|\| c == '\'') {
1534	n/a	goto letter_quote;
1535	n/a	}
1536	n/a	}
1537	n/a	while (is_potential_identifier_char(c)) {
1538	n/a	if (c >= 128) {
1539	n/a	nonascii = 1;
1540	n/a	}
1541	n/a	c = tok_nextc(tok);
1542	n/a	}
1543	n/a	tok_backup(tok, c);
1544	n/a	if (nonascii && !verify_identifier(tok)) {
1545	n/a	return ERRORTOKEN;
1546	n/a	}
1547	n/a	*p_start = tok->start;
1548	n/a	*p_end = tok->cur;
1549	n/a
1550	n/a	/* async/await parsing block. */
1551	n/a	if (tok->cur - tok->start == 5) {
1552	n/a	/* Current token length is 5. */
1553	n/a	if (tok->async_def) {
1554	n/a	/* We're inside an 'async def' function. */
1555	n/a	if (memcmp(tok->start, "async", 5) == 0) {
1556	n/a	return ASYNC;
1557	n/a	}
1558	n/a	if (memcmp(tok->start, "await", 5) == 0) {
1559	n/a	return AWAIT;
1560	n/a	}
1561	n/a	}
1562	n/a	else if (memcmp(tok->start, "async", 5) == 0) {
1563	n/a	/* The current token is 'async'.
1564	n/a	Look ahead one token.*/
1565	n/a
1566	n/a	struct tok_state ahead_tok;
1567	n/a	char ahead_tok_start = NULL, ahead_tok_end = NULL;
1568	n/a	int ahead_tok_kind;
1569	n/a
1570	n/a	memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1571	n/a	ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1572	n/a	&ahead_tok_end);
1573	n/a
1574	n/a	if (ahead_tok_kind == NAME
1575	n/a	&& ahead_tok.cur - ahead_tok.start == 3
1576	n/a	&& memcmp(ahead_tok.start, "def", 3) == 0)
1577	n/a	{
1578	n/a	/* The next token is going to be 'def', so instead of
1579	n/a	returning 'async' NAME token, we return ASYNC. */
1580	n/a	tok->async_def_indent = tok->indent;
1581	n/a	tok->async_def = 1;
1582	n/a	return ASYNC;
1583	n/a	}
1584	n/a	}
1585	n/a	}
1586	n/a
1587	n/a	return NAME;
1588	n/a	}
1589	n/a
1590	n/a	/* Newline */
1591	n/a	if (c == '\n') {
1592	n/a	tok->atbol = 1;
1593	n/a	if (blankline \|\| tok->level > 0) {
1594	n/a	goto nextline;
1595	n/a	}
1596	n/a	*p_start = tok->start;
1597	n/a	p_end = tok->cur - 1; / Leave '\n' out of the string */
1598	n/a	tok->cont_line = 0;
1599	n/a	if (tok->async_def) {
1600	n/a	/* We're somewhere inside an 'async def' function, and
1601	n/a	we've encountered a NEWLINE after its signature. */
1602	n/a	tok->async_def_nl = 1;
1603	n/a	}
1604	n/a	return NEWLINE;
1605	n/a	}
1606	n/a
1607	n/a	/* Period or number starting with period? */
1608	n/a	if (c == '.') {
1609	n/a	c = tok_nextc(tok);
1610	n/a	if (isdigit(c)) {
1611	n/a	goto fraction;
1612	n/a	} else if (c == '.') {
1613	n/a	c = tok_nextc(tok);
1614	n/a	if (c == '.') {
1615	n/a	*p_start = tok->start;
1616	n/a	*p_end = tok->cur;
1617	n/a	return ELLIPSIS;
1618	n/a	}
1619	n/a	else {
1620	n/a	tok_backup(tok, c);
1621	n/a	}
1622	n/a	tok_backup(tok, '.');
1623	n/a	}
1624	n/a	else {
1625	n/a	tok_backup(tok, c);
1626	n/a	}
1627	n/a	*p_start = tok->start;
1628	n/a	*p_end = tok->cur;
1629	n/a	return DOT;
1630	n/a	}
1631	n/a
1632	n/a	/* Number */
1633	n/a	if (isdigit(c)) {
1634	n/a	if (c == '0') {
1635	n/a	/* Hex, octal or binary -- maybe. */
1636	n/a	c = tok_nextc(tok);
1637	n/a	if (c == 'x' \|\| c == 'X') {
1638	n/a	/* Hex */
1639	n/a	c = tok_nextc(tok);
1640	n/a	do {
1641	n/a	if (c == '_') {
1642	n/a	c = tok_nextc(tok);
1643	n/a	}
1644	n/a	if (!isxdigit(c)) {
1645	n/a	tok->done = E_TOKEN;
1646	n/a	tok_backup(tok, c);
1647	n/a	return ERRORTOKEN;
1648	n/a	}
1649	n/a	do {
1650	n/a	c = tok_nextc(tok);
1651	n/a	} while (isxdigit(c));
1652	n/a	} while (c == '_');
1653	n/a	}
1654	n/a	else if (c == 'o' \|\| c == 'O') {
1655	n/a	/* Octal */
1656	n/a	c = tok_nextc(tok);
1657	n/a	do {
1658	n/a	if (c == '_') {
1659	n/a	c = tok_nextc(tok);
1660	n/a	}
1661	n/a	if (c < '0' \|\| c >= '8') {
1662	n/a	tok->done = E_TOKEN;
1663	n/a	tok_backup(tok, c);
1664	n/a	return ERRORTOKEN;
1665	n/a	}
1666	n/a	do {
1667	n/a	c = tok_nextc(tok);
1668	n/a	} while ('0' <= c && c < '8');
1669	n/a	} while (c == '_');
1670	n/a	}
1671	n/a	else if (c == 'b' \|\| c == 'B') {
1672	n/a	/* Binary */
1673	n/a	c = tok_nextc(tok);
1674	n/a	do {
1675	n/a	if (c == '_') {
1676	n/a	c = tok_nextc(tok);
1677	n/a	}
1678	n/a	if (c != '0' && c != '1') {
1679	n/a	tok->done = E_TOKEN;
1680	n/a	tok_backup(tok, c);
1681	n/a	return ERRORTOKEN;
1682	n/a	}
1683	n/a	do {
1684	n/a	c = tok_nextc(tok);
1685	n/a	} while (c == '0' \|\| c == '1');
1686	n/a	} while (c == '_');
1687	n/a	}
1688	n/a	else {
1689	n/a	int nonzero = 0;
1690	n/a	/* maybe old-style octal; c is first char of it */
1691	n/a	/* in any case, allow '0' as a literal */
1692	n/a	while (1) {
1693	n/a	if (c == '_') {
1694	n/a	c = tok_nextc(tok);
1695	n/a	if (!isdigit(c)) {
1696	n/a	tok->done = E_TOKEN;
1697	n/a	tok_backup(tok, c);
1698	n/a	return ERRORTOKEN;
1699	n/a	}
1700	n/a	}
1701	n/a	if (c != '0') {
1702	n/a	break;
1703	n/a	}
1704	n/a	c = tok_nextc(tok);
1705	n/a	}
1706	n/a	if (isdigit(c)) {
1707	n/a	nonzero = 1;
1708	n/a	c = tok_decimal_tail(tok);
1709	n/a	if (c == 0) {
1710	n/a	return ERRORTOKEN;
1711	n/a	}
1712	n/a	}
1713	n/a	if (c == '.') {
1714	n/a	c = tok_nextc(tok);
1715	n/a	goto fraction;
1716	n/a	}
1717	n/a	else if (c == 'e' \|\| c == 'E') {
1718	n/a	goto exponent;
1719	n/a	}
1720	n/a	else if (c == 'j' \|\| c == 'J') {
1721	n/a	goto imaginary;
1722	n/a	}
1723	n/a	else if (nonzero) {
1724	n/a	/* Old-style octal: now disallowed. */
1725	n/a	tok->done = E_TOKEN;
1726	n/a	tok_backup(tok, c);
1727	n/a	return ERRORTOKEN;
1728	n/a	}
1729	n/a	}
1730	n/a	}
1731	n/a	else {
1732	n/a	/* Decimal */
1733	n/a	c = tok_decimal_tail(tok);
1734	n/a	if (c == 0) {
1735	n/a	return ERRORTOKEN;
1736	n/a	}
1737	n/a	{
1738	n/a	/* Accept floating point numbers. */
1739	n/a	if (c == '.') {
1740	n/a	c = tok_nextc(tok);
1741	n/a	fraction:
1742	n/a	/* Fraction */
1743	n/a	if (isdigit(c)) {
1744	n/a	c = tok_decimal_tail(tok);
1745	n/a	if (c == 0) {
1746	n/a	return ERRORTOKEN;
1747	n/a	}
1748	n/a	}
1749	n/a	}
1750	n/a	if (c == 'e' \|\| c == 'E') {
1751	n/a	int e;
1752	n/a	exponent:
1753	n/a	e = c;
1754	n/a	/* Exponent part */
1755	n/a	c = tok_nextc(tok);
1756	n/a	if (c == '+' \|\| c == '-') {
1757	n/a	c = tok_nextc(tok);
1758	n/a	if (!isdigit(c)) {
1759	n/a	tok->done = E_TOKEN;
1760	n/a	tok_backup(tok, c);
1761	n/a	return ERRORTOKEN;
1762	n/a	}
1763	n/a	} else if (!isdigit(c)) {
1764	n/a	tok_backup(tok, c);
1765	n/a	tok_backup(tok, e);
1766	n/a	*p_start = tok->start;
1767	n/a	*p_end = tok->cur;
1768	n/a	return NUMBER;
1769	n/a	}
1770	n/a	c = tok_decimal_tail(tok);
1771	n/a	if (c == 0) {
1772	n/a	return ERRORTOKEN;
1773	n/a	}
1774	n/a	}
1775	n/a	if (c == 'j' \|\| c == 'J') {
1776	n/a	/* Imaginary part */
1777	n/a	imaginary:
1778	n/a	c = tok_nextc(tok);
1779	n/a	}
1780	n/a	}
1781	n/a	}
1782	n/a	tok_backup(tok, c);
1783	n/a	*p_start = tok->start;
1784	n/a	*p_end = tok->cur;
1785	n/a	return NUMBER;
1786	n/a	}
1787	n/a
1788	n/a	letter_quote:
1789	n/a	/* String */
1790	n/a	if (c == '\'' \|\| c == '"') {
1791	n/a	int quote = c;
1792	n/a	int quote_size = 1; /* 1 or 3 */
1793	n/a	int end_quote_size = 0;
1794	n/a
1795	n/a	/* Find the quote size and start of string */
1796	n/a	c = tok_nextc(tok);
1797	n/a	if (c == quote) {
1798	n/a	c = tok_nextc(tok);
1799	n/a	if (c == quote) {
1800	n/a	quote_size = 3;
1801	n/a	}
1802	n/a	else {
1803	n/a	end_quote_size = 1; /* empty string found */
1804	n/a	}
1805	n/a	}
1806	n/a	if (c != quote) {
1807	n/a	tok_backup(tok, c);
1808	n/a	}
1809	n/a
1810	n/a	/* Get rest of string */
1811	n/a	while (end_quote_size != quote_size) {
1812	n/a	c = tok_nextc(tok);
1813	n/a	if (c == EOF) {
1814	n/a	if (quote_size == 3) {
1815	n/a	tok->done = E_EOFS;
1816	n/a	}
1817	n/a	else {
1818	n/a	tok->done = E_EOLS;
1819	n/a	}
1820	n/a	tok->cur = tok->inp;
1821	n/a	return ERRORTOKEN;
1822	n/a	}
1823	n/a	if (quote_size == 1 && c == '\n') {
1824	n/a	tok->done = E_EOLS;
1825	n/a	tok->cur = tok->inp;
1826	n/a	return ERRORTOKEN;
1827	n/a	}
1828	n/a	if (c == quote) {
1829	n/a	end_quote_size += 1;
1830	n/a	}
1831	n/a	else {
1832	n/a	end_quote_size = 0;
1833	n/a	if (c == '\\') {
1834	n/a	tok_nextc(tok); /* skip escaped char */
1835	n/a	}
1836	n/a	}
1837	n/a	}
1838	n/a
1839	n/a	*p_start = tok->start;
1840	n/a	*p_end = tok->cur;
1841	n/a	return STRING;
1842	n/a	}
1843	n/a
1844	n/a	/* Line continuation */
1845	n/a	if (c == '\\') {
1846	n/a	c = tok_nextc(tok);
1847	n/a	if (c != '\n') {
1848	n/a	tok->done = E_LINECONT;
1849	n/a	tok->cur = tok->inp;
1850	n/a	return ERRORTOKEN;
1851	n/a	}
1852	n/a	tok->cont_line = 1;
1853	n/a	goto again; /* Read next line */
1854	n/a	}
1855	n/a
1856	n/a	/* Check for two-character token */
1857	n/a	{
1858	n/a	int c2 = tok_nextc(tok);
1859	n/a	int token = PyToken_TwoChars(c, c2);
1860	n/a	if (token != OP) {
1861	n/a	int c3 = tok_nextc(tok);
1862	n/a	int token3 = PyToken_ThreeChars(c, c2, c3);
1863	n/a	if (token3 != OP) {
1864	n/a	token = token3;
1865	n/a	}
1866	n/a	else {
1867	n/a	tok_backup(tok, c3);
1868	n/a	}
1869	n/a	*p_start = tok->start;
1870	n/a	*p_end = tok->cur;
1871	n/a	return token;
1872	n/a	}
1873	n/a	tok_backup(tok, c2);
1874	n/a	}
1875	n/a
1876	n/a	/* Keep track of parentheses nesting level */
1877	n/a	switch (c) {
1878	n/a	case '(':
1879	n/a	case '[':
1880	n/a	case '{':
1881	n/a	tok->level++;
1882	n/a	break;
1883	n/a	case ')':
1884	n/a	case ']':
1885	n/a	case '}':
1886	n/a	tok->level--;
1887	n/a	break;
1888	n/a	}
1889	n/a
1890	n/a	/* Punctuation character */
1891	n/a	*p_start = tok->start;
1892	n/a	*p_end = tok->cur;
1893	n/a	return PyToken_OneChar(c);
1894	n/a	}
1895	n/a
1896	n/a	int
1897	n/a	PyTokenizer_Get(struct tok_state tok, char p_start, char *p_end)
1898	n/a	{
1899	n/a	int result = tok_get(tok, p_start, p_end);
1900	n/a	if (tok->decoding_erred) {
1901	n/a	result = ERRORTOKEN;
1902	n/a	tok->done = E_DECODE;
1903	n/a	}
1904	n/a	return result;
1905	n/a	}
1906	n/a
1907	n/a	/* Get the encoding of a Python file. Check for the coding cookie and check if
1908	n/a	the file starts with a BOM.
1909	n/a
1910	n/a	PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1911	n/a	encoding in the first or second line of the file (in which case the encoding
1912	n/a	should be assumed to be UTF-8).
1913	n/a
1914	n/a	The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1915	n/a	by the caller. */
1916	n/a
1917	n/a	char *
1918	n/a	PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
1919	n/a	{
1920	n/a	struct tok_state *tok;
1921	n/a	FILE *fp;
1922	n/a	char p_start =NULL , p_end =NULL , *encoding = NULL;
1923	n/a
1924	n/a	#ifndef PGEN
1925	n/a	fd = _Py_dup(fd);
1926	n/a	#else
1927	n/a	fd = dup(fd);
1928	n/a	#endif
1929	n/a	if (fd < 0) {
1930	n/a	return NULL;
1931	n/a	}
1932	n/a
1933	n/a	fp = fdopen(fd, "r");
1934	n/a	if (fp == NULL) {
1935	n/a	return NULL;
1936	n/a	}
1937	n/a	tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1938	n/a	if (tok == NULL) {
1939	n/a	fclose(fp);
1940	n/a	return NULL;
1941	n/a	}
1942	n/a	#ifndef PGEN
1943	n/a	if (filename != NULL) {
1944	n/a	Py_INCREF(filename);
1945	n/a	tok->filename = filename;
1946	n/a	}
1947	n/a	else {
1948	n/a	tok->filename = PyUnicode_FromString("<string>");
1949	n/a	if (tok->filename == NULL) {
1950	n/a	fclose(fp);
1951	n/a	PyTokenizer_Free(tok);
1952	n/a	return encoding;
1953	n/a	}
1954	n/a	}
1955	n/a	#endif
1956	n/a	while (tok->lineno < 2 && tok->done == E_OK) {
1957	n/a	PyTokenizer_Get(tok, &p_start, &p_end);
1958	n/a	}
1959	n/a	fclose(fp);
1960	n/a	if (tok->encoding) {
1961	n/a	encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1962	n/a	if (encoding)
1963	n/a	strcpy(encoding, tok->encoding);
1964	n/a	}
1965	n/a	PyTokenizer_Free(tok);
1966	n/a	return encoding;
1967	n/a	}
1968	n/a
1969	n/a	char *
1970	n/a	PyTokenizer_FindEncoding(int fd)
1971	n/a	{
1972	n/a	return PyTokenizer_FindEncodingFilename(fd, NULL);
1973	n/a	}
1974	n/a
#ifdef Py_DEBUG

/* Debugging aid: print a token to stdout.

   Prints the token's name from _PyParser_TokenNames.  For NAME, NUMBER,
   STRING and OP tokens the token text, i.e. the bytes in [start, end),
   is appended in parentheses. */

void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    if (type == NAME || type == NUMBER || type == STRING || type == OP)
        printf("(%.*s)", (int)(end - start), start);
}

#endif