
Python code coverage for Lib/tokenize.py

1n/a"""Tokenization help for Python programs.
2n/a
3n/atokenize(readline) is a generator that breaks a stream of bytes into
4n/aPython tokens. It decodes the bytes according to PEP-0263 for
5n/adetermining source file encoding.
6n/a
7n/aIt accepts a readline-like method which is called repeatedly to get the
8n/anext line of input (or b"" for EOF). It generates 5-tuples with these
9n/amembers:
10n/a
11n/a the token type (see token.py)
12n/a the token (a string)
13n/a the starting (row, column) indices of the token (a 2-tuple of ints)
14n/a the ending (row, column) indices of the token (a 2-tuple of ints)
15n/a the original line (string)
16n/a
17n/aIt is designed to match the working of the Python tokenizer exactly, except
18n/athat it produces COMMENT tokens for comments and gives type OP for all
19n/aoperators. Additionally, all token lists start with an ENCODING token
20n/awhich tells you which encoding was used to decode the bytes stream.
21n/a"""
22n/a
23n/a__author__ = 'Ka-Ping Yee <ping@lfw.org>'
24n/a__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
25n/a 'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
26n/a 'Michael Foord')
27n/afrom builtins import open as _builtin_open
28n/afrom codecs import lookup, BOM_UTF8
29n/aimport collections
30n/afrom io import TextIOWrapper
31n/afrom itertools import chain
32n/aimport itertools as _itertools
33n/aimport re
34n/aimport sys
35n/afrom token import *
36n/a
37n/acookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
38n/ablank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
39n/a
40n/aimport token
41n/a__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
42n/a "NL", "untokenize", "ENCODING", "TokenInfo"]
43n/adel token
44n/a
45n/aCOMMENT = N_TOKENS
46n/atok_name[COMMENT] = 'COMMENT'
47n/aNL = N_TOKENS + 1
48n/atok_name[NL] = 'NL'
49n/aENCODING = N_TOKENS + 2
50n/atok_name[ENCODING] = 'ENCODING'
51n/aN_TOKENS += 3
52n/aEXACT_TOKEN_TYPES = {
53n/a '(': LPAR,
54n/a ')': RPAR,
55n/a '[': LSQB,
56n/a ']': RSQB,
57n/a ':': COLON,
58n/a ',': COMMA,
59n/a ';': SEMI,
60n/a '+': PLUS,
61n/a '-': MINUS,
62n/a '*': STAR,
63n/a '/': SLASH,
64n/a '|': VBAR,
65n/a '&': AMPER,
66n/a '<': LESS,
67n/a '>': GREATER,
68n/a '=': EQUAL,
69n/a '.': DOT,
70n/a '%': PERCENT,
71n/a '{': LBRACE,
72n/a '}': RBRACE,
73n/a '==': EQEQUAL,
74n/a '!=': NOTEQUAL,
75n/a '<=': LESSEQUAL,
76n/a '>=': GREATEREQUAL,
77n/a '~': TILDE,
78n/a '^': CIRCUMFLEX,
79n/a '<<': LEFTSHIFT,
80n/a '>>': RIGHTSHIFT,
81n/a '**': DOUBLESTAR,
82n/a '+=': PLUSEQUAL,
83n/a '-=': MINEQUAL,
84n/a '*=': STAREQUAL,
85n/a '/=': SLASHEQUAL,
86n/a '%=': PERCENTEQUAL,
87n/a '&=': AMPEREQUAL,
88n/a '|=': VBAREQUAL,
89n/a '^=': CIRCUMFLEXEQUAL,
90n/a '<<=': LEFTSHIFTEQUAL,
91n/a '>>=': RIGHTSHIFTEQUAL,
92n/a '**=': DOUBLESTAREQUAL,
93n/a '//': DOUBLESLASH,
94n/a '//=': DOUBLESLASHEQUAL,
95n/a '@': AT,
96n/a '@=': ATEQUAL,
97n/a}
98n/a
99n/aclass TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
100n/a def __repr__(self):
101n/a annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
102n/a return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
103n/a self._replace(type=annotated_type))
104n/a
105n/a @property
106n/a def exact_type(self):
107n/a if self.type == OP and self.string in EXACT_TOKEN_TYPES:
108n/a return EXACT_TOKEN_TYPES[self.string]
109n/a else:
110n/a return self.type
111n/a
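# Illustrative sketch (not part of the stdlib module): TokenInfo.exact_type
# refines the generic OP type into the specific operator constant from
# EXACT_TOKEN_TYPES, while non-OP tokens pass through unchanged. The
# _demo_exact_type helper below is hypothetical, for illustration only.
def _demo_exact_type():
    import io
    for tok in tokenize(io.BytesIO(b"x += 1\n").readline):
        if tok.type == OP:
            # tok.type reports OP, but exact_type resolves '+=' to PLUSEQUAL.
            print(tok.string, tok_name[tok.exact_type])   # prints: += PLUSEQUAL
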
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    # The valid string prefixes. Only contain the lower case versions,
    # and don't contain any permutations (include 'fr', but not
    # 'rf'). The various permutations will be generated.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = set([''])
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            # character
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result

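# Illustrative sketch (not part of the stdlib module): the expansion above
# generates every casing and ordering of each prefix, so listing 'fr' is
# enough to also cover 'rf', 'fR', 'RF', and so on. The _demo_string_prefixes
# helper below is hypothetical, for illustration only.
def _demo_string_prefixes():
    prefixes = _all_string_prefixes()
    assert '' in prefixes                          # the empty prefix is included
    assert {'fr', 'rf', 'fR', 'RF'} <= prefixes    # orderings and casings of 'fr'
    assert 'rb' in prefixes and 'BR' in prefixes   # likewise for 'br'
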
def _compile(expr):
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
# StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
# to match the remainder of that string. _prefix can be empty, for
# a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3

# A set of all of the single and triple quoted string prefixes,
# including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out

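# Illustrative sketch (not part of the stdlib module): the limited round-trip
# invariant from the docstring above, written as a self-contained check. The
# _demo_untokenize_roundtrip helper is hypothetical, for illustration only.
def _demo_untokenize_roundtrip():
    import io
    source = b"def f(a, b):\n    return a + b\n"
    t1 = [tok[:2] for tok in tokenize(io.BytesIO(source).readline)]
    newcode = untokenize(t1)          # bytes, encoded per the ENCODING token
    t2 = [tok[:2] for tok in tokenize(io.BytesIO(newcode).readline)]
    assert t1 == t2                   # (type, string) pairs survive the trip
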

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

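# Illustrative sketch (not part of the stdlib module): detect_encoding() reads
# at most two lines and returns both the encoding and the raw lines it
# consumed, so a caller can replay them before reading further. The
# _demo_detect_encoding helper is hypothetical, for illustration only.
def _demo_detect_encoding():
    import io
    src = b"# -*- coding: latin-1 -*-\nprint('hi')\n"
    encoding, consumed = detect_encoding(io.BytesIO(src).readline)
    print(encoding)   # iso-8859-1  (the 'latin-1' cookie, normalized)
    print(consumed)   # [b'# -*- coding: latin-1 -*-\n']
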

def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except:
        buffer.close()
        raise


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as bytes. Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)

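# Illustrative sketch (not part of the stdlib module): driving tokenize() from
# an in-memory bytes buffer. The first token is always ENCODING, and positions
# are (row, column) pairs with rows starting at 1. The _demo_tokenize helper
# is hypothetical, for illustration only.
def _demo_tokenize():
    import io
    for tok in tokenize(io.BytesIO(b"x = 1  # comment\n").readline):
        print(tok_name[tok.type], repr(tok.string), tok.start, tok.end)
    # ENCODING 'utf-8' (0, 0) (0, 0)
    # NAME 'x' (1, 0) (1, 1)
    # OP '=' (1, 2) (1, 3)
    # NUMBER '1' (1, 4) (1, 5)
    # COMMENT '# comment' (1, 7) (1, 16)
    # NEWLINE '\n' (1, 16) (1, 17)
    # ENDMARKER '' (2, 0) (2, 0)
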

def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                                    (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
                                    (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                    # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or     # ordinary number
                        (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    if stashed:
                        yield stashed
                        stashed = None
                    if parenlev > 0:
                        yield TokenInfo(NL, token, spos, epos, line)
                    else:
                        yield TokenInfo(NEWLINE, token, spos, epos, line)
                        if async_def:
                            async_def_nl = True

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield TokenInfo(COMMENT, token, spos, epos, line)

                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                       # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)       # multiple lines
                        contstr = line[start:]
                        contline = line
                        break

                # Check up to the first 3 chars of the token to see if
                # they're in the single_quoted set. If so, they start
                # a string.
                # We're using the first 3, because we're looking for
                # "rb'" (for example) at the start of the token. If
                # we switch to longer prefixes, this needs to be
                # adjusted.
                # Note that initial == token[:1].
                # Also note that single quote checking must come after
                # triple quote checking (above).
                elif (initial in single_quoted or
                      token[:2] in single_quoted or
                      token[:3] in single_quoted):
                    if token[-1] == '\n':              # continued string
                        strstart = (lnum, start)
                        # Again, using the first 3 chars of the
                        # token. This is looking for the matching end
                        # regex for the correct type of quote
                        # character. So it's really looking for
                        # endpats["'"] or endpats['"'], by trying to
                        # skip string prefix characters, if any.
                        endprog = _compile(endpats.get(initial) or
                                           endpats.get(token[1]) or
                                           endpats.get(token[2]))
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                              # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)

                elif initial.isidentifier():           # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield TokenInfo(
                                ASYNC if token == 'async' else AWAIT,
                                token, spos, epos, line)
                            continue

                    tok = TokenInfo(NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token == 'def':
                        if (stashed
                                and stashed.type == NAME
                                and stashed.string == 'async'):

                            async_def = True
                            async_def_indent = indents[-1]

                            yield TokenInfo(ASYNC, stashed.string,
                                            stashed.start, stashed.end,
                                            stashed.line)
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                  # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                                (lnum, pos), (lnum, pos+1), line)
                pos += 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards compatible, API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)

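# Illustrative sketch (not part of the stdlib module): generate_tokens() is the
# str-based counterpart used inside the standard library; its readline returns
# strings, no encoding detection is performed, and no ENCODING token is
# emitted. The _demo_generate_tokens helper is hypothetical, for illustration
# only.
def _demo_generate_tokens():
    import io
    for tok in generate_tokens(io.StringIO("a = 'b'\n").readline):
        print(tok_name[tok.exact_type], repr(tok.string))
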
def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with _builtin_open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()