1 | n/a | """Tokenization help for Python programs. |
---|
2 | n/a | |
---|
3 | n/a | tokenize(readline) is a generator that breaks a stream of bytes into |
---|
4 | n/a | Python tokens. It decodes the bytes according to PEP-0263 for |
---|
5 | n/a | determining source file encoding. |
---|
6 | n/a | |
---|
7 | n/a | It accepts a readline-like method which is called repeatedly to get the |
---|
8 | n/a | next line of input (or b"" for EOF). It generates 5-tuples with these |
---|
9 | n/a | members: |
---|
10 | n/a | |
---|
11 | n/a | the token type (see token.py) |
---|
12 | n/a | the token (a string) |
---|
13 | n/a | the starting (row, column) indices of the token (a 2-tuple of ints) |
---|
14 | n/a | the ending (row, column) indices of the token (a 2-tuple of ints) |
---|
15 | n/a | the original line (string) |
---|
16 | n/a | |
---|
17 | n/a | It is designed to match the working of the Python tokenizer exactly, except |
---|
18 | n/a | that it produces COMMENT tokens for comments and gives type OP for all |
---|
19 | n/a | operators. Additionally, all token lists start with an ENCODING token |
---|
20 | n/a | which tells you which encoding was used to decode the bytes stream. |
---|
21 | n/a | """ |
---|
22 | n/a | |
---|
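# Illustrative usage only (not part of the module API): a minimal sketch of
# driving tokenize() with an in-memory bytes buffer.  'source' below is a
# made-up example input.
#
#     from io import BytesIO
#     from tokenize import tokenize
#
#     source = b"x = 1 + 2\n"
#     for tok in tokenize(BytesIO(source).readline):
#         print(tok.type, repr(tok.string), tok.start, tok.end)
#
# The first tuple produced is the ENCODING token described above.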
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
from builtins import open as _builtin_open
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
from itertools import chain
import itertools as _itertools
import re
import sys
from token import *

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

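# For reference, cookie_re is meant to match PEP 263 encoding declarations.
# A minimal, illustrative check (the declaration string is a made-up example):
#
#     >>> cookie_re.match('# -*- coding: latin-1 -*-').group(1)
#     'latin-1'
#
# blank_re matches a (bytes) line that is blank or comment-only, which is the
# only kind of first line after which a cookie on line two is still honored.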
import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3
EXACT_TOKEN_TYPES = {
    '(': LPAR,
    ')': RPAR,
    '[': LSQB,
    ']': RSQB,
    ':': COLON,
    ',': COMMA,
    ';': SEMI,
    '+': PLUS,
    '-': MINUS,
    '*': STAR,
    '/': SLASH,
    '|': VBAR,
    '&': AMPER,
    '<': LESS,
    '>': GREATER,
    '=': EQUAL,
    '.': DOT,
    '%': PERCENT,
    '{': LBRACE,
    '}': RBRACE,
    '==': EQEQUAL,
    '!=': NOTEQUAL,
    '<=': LESSEQUAL,
    '>=': GREATEREQUAL,
    '~': TILDE,
    '^': CIRCUMFLEX,
    '<<': LEFTSHIFT,
    '>>': RIGHTSHIFT,
    '**': DOUBLESTAR,
    '+=': PLUSEQUAL,
    '-=': MINEQUAL,
    '*=': STAREQUAL,
    '/=': SLASHEQUAL,
    '%=': PERCENTEQUAL,
    '&=': AMPEREQUAL,
    '|=': VBAREQUAL,
    '^=': CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL,
    '>>=': RIGHTSHIFTEQUAL,
    '**=': DOUBLESTAREQUAL,
    '//': DOUBLESLASH,
    '//=': DOUBLESLASHEQUAL,
    '@': AT,
    '@=': ATEQUAL,
}

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type

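# A minimal sketch of how exact_type refines a generic OP token (illustrative;
# the source text below is a made-up example):
#
#     from io import BytesIO
#     from tokenize import tokenize, OP, tok_name
#
#     for tok in tokenize(BytesIO(b"a += 1\n").readline):
#         if tok.type == OP:
#             print(tok.string, tok_name[tok.exact_type])   # '+=' PLUSEQUAL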
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

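# For example (illustrative): group('a', 'b') yields '(a|b)', any('a', 'b')
# yields '(a|b)*', and maybe('a', 'b') yields '(a|b)?'.  The pattern fragments
# below are assembled from such pieces by plain string concatenation.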
# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    # The valid string prefixes.  Only contains the lower case versions,
    # and doesn't contain any permutations (includes 'fr', but not
    # 'rf').  The various permutations will be generated.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = set([''])
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            #  character
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result

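# For intuition (illustrative): the result contains the empty string plus
# every casing of every ordering of each listed prefix, e.g. '', 'r', 'R',
# 'br', 'bR', 'Rb', 'rB', 'f', 'Rf', and so on, so a prefix group built from
# it matches any legal (possibly empty) string prefix.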
def _compile(expr):
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
# StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because the regex alternation tries choices from left to right (the first
# alternative that matches wins, not the longest), be sure to put the longest
# operators first (e.g., if = came before ==, == would get recognized as two
# instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
# to match the remainder of that string. _prefix can be empty, for
# a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3

# Sets of all of the single and triple quoted string prefixes,
# including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only the first two elements are given for each token, exact positions
    are unavailable and the spacing of the resulting output is only
    approximate.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out


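# A minimal sketch of a token-stream rewrite built on tokenize()/untokenize()
# (illustrative; rename(), 'old' and 'new' are made-up names, not part of this
# module).  Passing 2-tuples takes the compat() path, so spacing in the output
# is approximate but the result still tokenizes back to the same stream:
#
#     from io import BytesIO
#     from tokenize import tokenize, untokenize, NAME
#
#     def rename(source_bytes, old, new):
#         result = []
#         for tok in tokenize(BytesIO(source_bytes).readline):
#             if tok.type == NAME and tok.string == old:
#                 result.append((NAME, new))
#             else:
#                 result.append(tok[:2])
#         return untokenize(result)   # bytes, encoded per the ENCODING token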
343 | n/a | |
---|
344 | n/a | def _get_normal_name(orig_enc): |
---|
345 | n/a | """Imitates get_normal_name in tokenizer.c.""" |
---|
346 | n/a | # Only care about the first 12 characters. |
---|
347 | n/a | enc = orig_enc[:12].lower().replace("_", "-") |
---|
348 | n/a | if enc == "utf-8" or enc.startswith("utf-8-"): |
---|
349 | n/a | return "utf-8" |
---|
350 | n/a | if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \ |
---|
351 | n/a | enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")): |
---|
352 | n/a | return "iso-8859-1" |
---|
353 | n/a | return orig_enc |
---|
354 | n/a | |
---|
def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP-0263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

384 | n/a | |
---|
385 | n/a | def find_cookie(line): |
---|
386 | n/a | try: |
---|
387 | n/a | # Decode as UTF-8. Either the line is an encoding declaration, |
---|
388 | n/a | # in which case it should be pure ASCII, or it must be UTF-8 |
---|
389 | n/a | # per default encoding. |
---|
390 | n/a | line_string = line.decode('utf-8') |
---|
391 | n/a | except UnicodeDecodeError: |
---|
392 | n/a | msg = "invalid or missing encoding declaration" |
---|
393 | n/a | if filename is not None: |
---|
394 | n/a | msg = '{} for {!r}'.format(msg, filename) |
---|
395 | n/a | raise SyntaxError(msg) |
---|
396 | n/a | |
---|
397 | n/a | match = cookie_re.match(line_string) |
---|
398 | n/a | if not match: |
---|
399 | n/a | return None |
---|
400 | n/a | encoding = _get_normal_name(match.group(1)) |
---|
401 | n/a | try: |
---|
402 | n/a | codec = lookup(encoding) |
---|
403 | n/a | except LookupError: |
---|
404 | n/a | # This behaviour mimics the Python interpreter |
---|
405 | n/a | if filename is None: |
---|
406 | n/a | msg = "unknown encoding: " + encoding |
---|
407 | n/a | else: |
---|
408 | n/a | msg = "unknown encoding for {!r}: {}".format(filename, |
---|
409 | n/a | encoding) |
---|
410 | n/a | raise SyntaxError(msg) |
---|
411 | n/a | |
---|
412 | n/a | if bom_found: |
---|
413 | n/a | if encoding != 'utf-8': |
---|
414 | n/a | # This behaviour mimics the Python interpreter |
---|
415 | n/a | if filename is None: |
---|
416 | n/a | msg = 'encoding problem: utf-8' |
---|
417 | n/a | else: |
---|
418 | n/a | msg = 'encoding problem for {!r}: utf-8'.format(filename) |
---|
419 | n/a | raise SyntaxError(msg) |
---|
420 | n/a | encoding += '-sig' |
---|
421 | n/a | return encoding |
---|
422 | n/a | |
---|
    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

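# A minimal sketch of calling detect_encoding() directly (illustrative; the
# source bytes are a made-up example):
#
#     from io import BytesIO
#     from tokenize import detect_encoding
#
#     buf = BytesIO(b'# -*- coding: latin-1 -*-\nx = 1\n')
#     enc, lines = detect_encoding(buf.readline)
#     # enc == 'iso-8859-1' (normalized from 'latin-1'); lines holds the raw
#     # first line that was consumed while looking for the cookie.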

def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except:
        buffer.close()
        raise

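# Illustrative usage: tokenize.open() reads a source file as text, but picks
# the encoding the way the interpreter would ('some_module.py' is a
# hypothetical path):
#
#     import tokenize
#
#     with tokenize.open('some_module.py') as f:
#         text = f.read()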

def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object that provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                                    (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
                                    (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                 # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    if stashed:
                        yield stashed
                        stashed = None
                    if parenlev > 0:
                        yield TokenInfo(NL, token, spos, epos, line)
                    else:
                        yield TokenInfo(NEWLINE, token, spos, epos, line)
                        if async_def:
                            async_def_nl = True

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield TokenInfo(COMMENT, token, spos, epos, line)

                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break

                # Check up to the first 3 chars of the token to see if
                #  they're in the single_quoted set. If so, they start
                #  a string.
                # We're using the first 3, because we're looking for
                #  "rb'" (for example) at the start of the token. If
                #  we switch to longer prefixes, this needs to be
                #  adjusted.
                # Note that initial == token[:1].
                # Also note that single quote checking must come after
                #  triple quote checking (above).
                elif (initial in single_quoted or
                      token[:2] in single_quoted or
                      token[:3] in single_quoted):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Again, using the first 3 chars of the
                        #  token. This is looking for the matching end
                        #  regex for the correct type of quote
                        #  character. So it's really looking for
                        #  endpats["'"] or endpats['"'], by trying to
                        #  skip string prefix characters, if any.
                        endprog = _compile(endpats.get(initial) or
                                           endpats.get(token[1]) or
                                           endpats.get(token[2]))
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)

                elif initial.isidentifier():               # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield TokenInfo(
                                ASYNC if token == 'async' else AWAIT,
                                token, spos, epos, line)
                            continue

                    tok = TokenInfo(NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token == 'def':
                        if (stashed
                                and stashed.type == NAME
                                and stashed.string == 'async'):

                            async_def = True
                            async_def_indent = indents[-1]

                            yield TokenInfo(ASYNC, stashed.string,
                                            stashed.start, stashed.end,
                                            stashed.line)
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                                (lnum, pos), (lnum, pos+1), line)
                pos += 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards-compatible API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)

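# A minimal sketch of the string-based path (illustrative; generate_tokens()
# is undocumented, so prefer tokenize() with bytes where possible):
#
#     import io
#
#     for tok in generate_tokens(io.StringIO("x = 1\n").readline):
#         print(tok)
#
# No ENCODING token is emitted on this path because the input is already text.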
def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with _builtin_open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()
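# Command-line usage sketch (illustrative; hello.py is a hypothetical file):
#
#     $ python -m tokenize hello.py
#     $ python -m tokenize -e hello.py    # report exact OP token names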