
Python code coverage for Lib/lib2to3/pgen2/tokenize.py

#    count    content
1n/a# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
2n/a# All rights reserved.
3n/a
4n/a"""Tokenization help for Python programs.
5n/a
6n/agenerate_tokens(readline) is a generator that breaks a stream of
7n/atext into Python tokens. It accepts a readline-like method which is called
8n/arepeatedly to get the next line of input (or "" for EOF). It generates
9n/a5-tuples with these members:
10n/a
11n/a the token type (see token.py)
12n/a the token (a string)
13n/a the starting (row, column) indices of the token (a 2-tuple of ints)
14n/a the ending (row, column) indices of the token (a 2-tuple of ints)
15n/a the original line (string)
16n/a
17n/aIt is designed to match the working of the Python tokenizer exactly, except
18n/athat it produces COMMENT tokens for comments and gives type OP for all
19n/aoperators.
20n/a
21n/aOlder entry points
22n/a tokenize_loop(readline, tokeneater)
23n/a tokenize(readline, tokeneater=printtoken)
24n/aare the same, except instead of generating tokens, tokeneater is a callback
25n/afunction to which the 5 fields described above are passed as 5 arguments,
26n/aeach time a new token is found."""
27n/a
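As a quick illustration of the 5-tuples described above, a minimal sketch that drives generate_tokens() with an in-memory string (io.StringIO and the sample source are assumptions, not part of this module):

    import io
    from lib2to3.pgen2 import tokenize as pgen2_tokenize

    source = "x = 1  # a comment\n"
    for tok_type, tok_str, start, end, line in pgen2_tokenize.generate_tokens(
            io.StringIO(source).readline):
        # e.g. the first token is (NAME, 'x', (1, 0), (1, 1), source)
        print(tok_type, repr(tok_str), start, end)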
28n/a__author__ = 'Ka-Ping Yee <ping@lfw.org>'
29n/a__credits__ = \
30n/a 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
31n/a
32n/aimport string, re
33n/afrom codecs import BOM_UTF8, lookup
34n/afrom lib2to3.pgen2.token import *
35n/a
36n/afrom . import token
37n/a__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
38n/a "generate_tokens", "untokenize"]
39n/adel token
40n/a
41n/atry:
42n/a bytes
43n/aexcept NameError:
44n/a # Support bytes type in Python <= 2.5, so 2to3 turns itself into
45n/a # valid Python 3 code.
46n/a bytes = str
47n/a
48n/adef group(*choices): return '(' + '|'.join(choices) + ')'
49n/adef any(*choices): return group(*choices) + '*'
50n/adef maybe(*choices): return group(*choices) + '?'
51n/a
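To make the three regex-building helpers concrete, a couple of quick checks (assuming the definitions above are in scope):

    assert group('a', 'b') == '(a|b)'
    assert any('a', 'b') == '(a|b)*'     # note: this module's any() shadows the builtin
    assert maybe('a', 'b') == '(a|b)?'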
52n/aWhitespace = r'[ \f\t]*'
53n/aComment = r'#[^\r\n]*'
54n/aIgnore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
55n/aName = r'[a-zA-Z_]\w*'
56n/a
57n/aBinnumber = r'0[bB][01]*'
58n/aHexnumber = r'0[xX][\da-fA-F]*[lL]?'
59n/aOctnumber = r'0[oO]?[0-7]*[lL]?'
60n/aDecnumber = r'[1-9]\d*[lL]?'
61n/aIntnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
62n/aExponent = r'[eE][-+]?\d+'
63n/aPointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
64n/aExpfloat = r'\d+' + Exponent
65n/aFloatnumber = group(Pointfloat, Expfloat)
66n/aImagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
67n/aNumber = group(Imagnumber, Floatnumber, Intnumber)
68n/a
69n/a# Tail end of ' string.
70n/aSingle = r"[^'\\]*(?:\\.[^'\\]*)*'"
71n/a# Tail end of " string.
72n/aDouble = r'[^"\\]*(?:\\.[^"\\]*)*"'
73n/a# Tail end of ''' string.
74n/aSingle3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
75n/a# Tail end of """ string.
76n/aDouble3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
77n/aTriple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
78n/a# Single-line ' or " string.
79n/aString = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
80n/a r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
81n/a
82n/a# Because of leftmost-then-longest match semantics, be sure to put the
83n/a# longest operators first (e.g., if = came before ==, == would get
84n/a# recognized as two instances of =).
85n/aOperator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
86n/a r"//=?", r"->",
87n/a r"[+\-*/%&@|^=<>]=?",
88n/a r"~")
89n/a
90n/aBracket = '[][(){}]'
91n/aSpecial = group(r'\r?\n', r'[:;.,`@]')
92n/aFunny = group(Operator, Bracket, Special)
93n/a
94n/aPlainToken = group(Number, Funny, String, Name)
95n/aToken = Ignore + PlainToken
96n/a
97n/a# First (or only) line of ' or " string.
98n/aContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
99n/a group("'", r'\\\r?\n'),
100n/a r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
101n/a group('"', r'\\\r?\n'))
102n/aPseudoExtras = group(r'\\\r?\n', Comment, Triple)
103n/aPseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
104n/a
105n/atokenprog, pseudoprog, single3prog, double3prog = list(map(
106n/a re.compile, (Token, PseudoToken, Single3, Double3)))
107n/aendprogs = {"'": re.compile(Single), '"': re.compile(Double),
108n/a "'''": single3prog, '"""': double3prog,
109n/a "r'''": single3prog, 'r"""': double3prog,
110n/a "u'''": single3prog, 'u"""': double3prog,
111n/a "b'''": single3prog, 'b"""': double3prog,
112n/a "ur'''": single3prog, 'ur"""': double3prog,
113n/a "br'''": single3prog, 'br"""': double3prog,
114n/a "R'''": single3prog, 'R"""': double3prog,
115n/a "U'''": single3prog, 'U"""': double3prog,
116n/a "B'''": single3prog, 'B"""': double3prog,
117n/a "uR'''": single3prog, 'uR"""': double3prog,
118n/a "Ur'''": single3prog, 'Ur"""': double3prog,
119n/a "UR'''": single3prog, 'UR"""': double3prog,
120n/a "bR'''": single3prog, 'bR"""': double3prog,
121n/a "Br'''": single3prog, 'Br"""': double3prog,
122n/a "BR'''": single3prog, 'BR"""': double3prog,
123n/a 'r': None, 'R': None,
124n/a 'u': None, 'U': None,
125n/a 'b': None, 'B': None}
126n/a
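A small check of how the scanner below uses pseudoprog: group 1 skips the leading whitespace and captures the text of the next token (assuming the compiled patterns above are in scope):

    m = pseudoprog.match("   x = 1\n", 0)
    assert m.group(1) == "x" and m.span(1) == (3, 4)   # whitespace skipped, NAME captured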
127n/atriple_quoted = {}
128n/afor t in ("'''", '"""',
129n/a "r'''", 'r"""', "R'''", 'R"""',
130n/a "u'''", 'u"""', "U'''", 'U"""',
131n/a "b'''", 'b"""', "B'''", 'B"""',
132n/a "ur'''", 'ur"""', "Ur'''", 'Ur"""',
133n/a "uR'''", 'uR"""', "UR'''", 'UR"""',
134n/a "br'''", 'br"""', "Br'''", 'Br"""',
135n/a "bR'''", 'bR"""', "BR'''", 'BR"""',):
136n/a triple_quoted[t] = t
137n/asingle_quoted = {}
138n/afor t in ("'", '"',
139n/a "r'", 'r"', "R'", 'R"',
140n/a "u'", 'u"', "U'", 'U"',
141n/a "b'", 'b"', "B'", 'B"',
142n/a "ur'", 'ur"', "Ur'", 'Ur"',
143n/a "uR'", 'uR"', "UR'", 'UR"',
144n/a "br'", 'br"', "Br'", 'Br"',
145n/a "bR'", 'bR"', "BR'", 'BR"', ):
146n/a single_quoted[t] = t
147n/a
148n/atabsize = 8
149n/a
150n/aclass TokenError(Exception): pass
151n/a
152n/aclass StopTokenizing(Exception): pass
153n/a
154n/adef printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
155n/a (srow, scol) = xxx_todo_changeme
156n/a (erow, ecol) = xxx_todo_changeme1
157n/a print("%d,%d-%d,%d:\t%s\t%s" % \
158n/a (srow, scol, erow, ecol, tok_name[type], repr(token)))
159n/a
160n/adef tokenize(readline, tokeneater=printtoken):
161n/a """
162n/a The tokenize() function accepts two parameters: one representing the
163n/a input stream, and one providing an output mechanism for tokenize().
164n/a
165n/a The first parameter, readline, must be a callable object which provides
166n/a the same interface as the readline() method of built-in file objects.
167n/a Each call to the function should return one line of input as a string.
168n/a
169n/a The second parameter, tokeneater, must also be a callable object. It is
170n/a called once for each token, with five arguments, corresponding to the
171n/a tuples generated by generate_tokens().
172n/a """
173n/a try:
174n/a tokenize_loop(readline, tokeneater)
175n/a except StopTokenizing:
176n/a pass
177n/a
178n/a# backwards compatible interface
179n/adef tokenize_loop(readline, tokeneater):
180n/a for token_info in generate_tokens(readline):
181n/a tokeneater(*token_info)
182n/a
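For example, feeding tokenize() an in-memory stream and letting the default printtoken callback report each token (io.StringIO and the sample line are assumptions for illustration):

    import io
    from lib2to3.pgen2 import tokenize as pgen2_tokenize

    # Prints one "srow,scol-erow,ecol:  TYPE  'text'" line per token.
    pgen2_tokenize.tokenize(io.StringIO("print('hi')\n").readline)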
183n/aclass Untokenizer:
184n/a
185n/a def __init__(self):
186n/a self.tokens = []
187n/a self.prev_row = 1
188n/a self.prev_col = 0
189n/a
190n/a def add_whitespace(self, start):
191n/a row, col = start
192n/a assert row <= self.prev_row
193n/a col_offset = col - self.prev_col
194n/a if col_offset:
195n/a self.tokens.append(" " * col_offset)
196n/a
197n/a def untokenize(self, iterable):
198n/a for t in iterable:
199n/a if len(t) == 2:
200n/a self.compat(t, iterable)
201n/a break
202n/a tok_type, token, start, end, line = t
203n/a self.add_whitespace(start)
204n/a self.tokens.append(token)
205n/a self.prev_row, self.prev_col = end
206n/a if tok_type in (NEWLINE, NL):
207n/a self.prev_row += 1
208n/a self.prev_col = 0
209n/a return "".join(self.tokens)
210n/a
211n/a def compat(self, token, iterable):
212n/a startline = False
213n/a indents = []
214n/a toks_append = self.tokens.append
215n/a toknum, tokval = token
216n/a if toknum in (NAME, NUMBER):
217n/a tokval += ' '
218n/a if toknum in (NEWLINE, NL):
219n/a startline = True
220n/a for tok in iterable:
221n/a toknum, tokval = tok[:2]
222n/a
223n/a if toknum in (NAME, NUMBER, ASYNC, AWAIT):
224n/a tokval += ' '
225n/a
226n/a if toknum == INDENT:
227n/a indents.append(tokval)
228n/a continue
229n/a elif toknum == DEDENT:
230n/a indents.pop()
231n/a continue
232n/a elif toknum in (NEWLINE, NL):
233n/a startline = True
234n/a elif startline and indents:
235n/a toks_append(indents[-1])
236n/a startline = False
237n/a toks_append(tokval)
238n/a
239n/acookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
240n/ablank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
241n/a
242n/adef _get_normal_name(orig_enc):
243n/a """Imitates get_normal_name in tokenizer.c."""
244n/a # Only care about the first 12 characters.
245n/a enc = orig_enc[:12].lower().replace("_", "-")
246n/a if enc == "utf-8" or enc.startswith("utf-8-"):
247n/a return "utf-8"
248n/a if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
249n/a enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
250n/a return "iso-8859-1"
251n/a return orig_enc
252n/a
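For instance, a few quick checks of the normalization (assuming _get_normal_name as defined above):

    assert _get_normal_name("UTF_8") == "utf-8"
    assert _get_normal_name("Latin_1") == "iso-8859-1"
    assert _get_normal_name("euc-jp") == "euc-jp"   # unrecognized names pass through unchanged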
253n/adef detect_encoding(readline):
254n/a """
255n/a The detect_encoding() function is used to detect the encoding that should
256n/a be used to decode a Python source file. It requires one argument, readline,
257n/a in the same way as the tokenize() generator.
258n/a
259n/a It will call readline a maximum of twice, and return the encoding used
260n/a (as a string) and a list of any lines (left as bytes) it has read
261n/a in.
262n/a
263n/a It detects the encoding from the presence of a utf-8 bom or an encoding
264n/a cookie as specified in pep-0263. If both a bom and a cookie are present, but
265n/a disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
266n/a charset, raise a SyntaxError. Note that if a utf-8 bom is found,
267n/a 'utf-8-sig' is returned.
268n/a
269n/a If no encoding is specified, then the default of 'utf-8' will be returned.
270n/a """
271n/a bom_found = False
272n/a encoding = None
273n/a default = 'utf-8'
274n/a def read_or_stop():
275n/a try:
276n/a return readline()
277n/a except StopIteration:
278n/a return bytes()
279n/a
280n/a def find_cookie(line):
281n/a try:
282n/a line_string = line.decode('ascii')
283n/a except UnicodeDecodeError:
284n/a return None
285n/a match = cookie_re.match(line_string)
286n/a if not match:
287n/a return None
288n/a encoding = _get_normal_name(match.group(1))
289n/a try:
290n/a codec = lookup(encoding)
291n/a except LookupError:
292n/a # This behaviour mimics the Python interpreter
293n/a raise SyntaxError("unknown encoding: " + encoding)
294n/a
295n/a if bom_found:
296n/a if codec.name != 'utf-8':
297n/a # This behaviour mimics the Python interpreter
298n/a raise SyntaxError('encoding problem: utf-8')
299n/a encoding += '-sig'
300n/a return encoding
301n/a
302n/a first = read_or_stop()
303n/a if first.startswith(BOM_UTF8):
304n/a bom_found = True
305n/a first = first[3:]
306n/a default = 'utf-8-sig'
307n/a if not first:
308n/a return default, []
309n/a
310n/a encoding = find_cookie(first)
311n/a if encoding:
312n/a return encoding, [first]
313n/a if not blank_re.match(first):
314n/a return default, [first]
315n/a
316n/a second = read_or_stop()
317n/a if not second:
318n/a return default, [first]
319n/a
320n/a encoding = find_cookie(second)
321n/a if encoding:
322n/a return encoding, [first, second]
323n/a
324n/a return default, [first, second]
325n/a
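A minimal usage sketch of detect_encoding() with an in-memory byte stream (io.BytesIO and the sample coding cookie are assumptions for illustration):

    import io
    from lib2to3.pgen2 import tokenize as pgen2_tokenize

    data = b"# -*- coding: latin-1 -*-\nx = 1\n"
    encoding, lines = pgen2_tokenize.detect_encoding(io.BytesIO(data).readline)
    assert encoding == "iso-8859-1"
    assert lines == [b"# -*- coding: latin-1 -*-\n"]   # only the lines actually consumed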
326n/adef untokenize(iterable):
327n/a """Transform tokens back into Python source code.
328n/a
329n/a Each element returned by the iterable must be a token sequence
330n/a with at least two elements, a token number and token value. If
331n/a only two tokens are passed, the resulting output is poor.
332n/a
333n/a Round-trip invariant for full input:
334n/a Untokenized source will match input source exactly
335n/a
336n/a Round-trip invariant for limited input:
337n/a # Output text will tokenize back to the input
338n/a t1 = [tok[:2] for tok in generate_tokens(f.readline)]
339n/a newcode = untokenize(t1)
340n/a readline = iter(newcode.splitlines(1)).next
341n/a t2 = [tok[:2] for tok in generate_tokens(readline)]
342n/a assert t1 == t2
343n/a """
344n/a ut = Untokenizer()
345n/a return ut.untokenize(iterable)
346n/a
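The limited-input round trip from the docstring, written out as a runnable check (a sketch; the sample source and io.StringIO are assumptions, and __next__ replaces the Python 2 .next shown above):

    import io
    from lib2to3.pgen2 import tokenize as pgen2_tokenize

    source = "def f(a, b):\n    return a + b\n"
    t1 = [tok[:2] for tok in pgen2_tokenize.generate_tokens(io.StringIO(source).readline)]
    newcode = pgen2_tokenize.untokenize(t1)    # spacing may differ from the original source
    readline = iter(newcode.splitlines(True)).__next__
    t2 = [tok[:2] for tok in pgen2_tokenize.generate_tokens(readline)]
    assert t1 == t2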
347n/adef generate_tokens(readline):
348n/a """
349n/a The generate_tokens() generator requires one argument, readline, which
350n/a must be a callable object which provides the same interface as the
351n/a readline() method of built-in file objects. Each call to the function
352n/a should return one line of input as a string. Alternately, readline
353n/a can be a callable function terminating with StopIteration:
354n/a readline = open(myfile).next # Example of alternate readline
355n/a
356n/a The generator produces 5-tuples with these members: the token type; the
357n/a token string; a 2-tuple (srow, scol) of ints specifying the row and
358n/a column where the token begins in the source; a 2-tuple (erow, ecol) of
359n/a ints specifying the row and column where the token ends in the source;
360n/a and the line on which the token was found. The line passed is the
361n/a logical line; continuation lines are included.
362n/a """
363n/a lnum = parenlev = continued = 0
364n/a namechars, numchars = string.ascii_letters + '_', '0123456789'
365n/a contstr, needcont = '', 0
366n/a contline = None
367n/a indents = [0]
368n/a
369n/a # 'stashed' and 'async_*' are used for async/await parsing
370n/a stashed = None
371n/a async_def = False
372n/a async_def_indent = 0
373n/a async_def_nl = False
374n/a
375n/a while 1: # loop over lines in stream
376n/a try:
377n/a line = readline()
378n/a except StopIteration:
379n/a line = ''
380n/a lnum = lnum + 1
381n/a pos, max = 0, len(line)
382n/a
383n/a if contstr: # continued string
384n/a if not line:
385n/a raise TokenError("EOF in multi-line string", strstart)
386n/a endmatch = endprog.match(line)
387n/a if endmatch:
388n/a pos = end = endmatch.end(0)
389n/a yield (STRING, contstr + line[:end],
390n/a strstart, (lnum, end), contline + line)
391n/a contstr, needcont = '', 0
392n/a contline = None
393n/a elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
394n/a yield (ERRORTOKEN, contstr + line,
395n/a strstart, (lnum, len(line)), contline)
396n/a contstr = ''
397n/a contline = None
398n/a continue
399n/a else:
400n/a contstr = contstr + line
401n/a contline = contline + line
402n/a continue
403n/a
404n/a elif parenlev == 0 and not continued: # new statement
405n/a if not line: break
406n/a column = 0
407n/a while pos < max: # measure leading whitespace
408n/a if line[pos] == ' ': column = column + 1
409n/a elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
410n/a elif line[pos] == '\f': column = 0
411n/a else: break
412n/a pos = pos + 1
413n/a if pos == max: break
414n/a
415n/a if stashed:
416n/a yield stashed
417n/a stashed = None
418n/a
419n/a if line[pos] in '#\r\n': # skip comments or blank lines
420n/a if line[pos] == '#':
421n/a comment_token = line[pos:].rstrip('\r\n')
422n/a nl_pos = pos + len(comment_token)
423n/a yield (COMMENT, comment_token,
424n/a (lnum, pos), (lnum, pos + len(comment_token)), line)
425n/a yield (NL, line[nl_pos:],
426n/a (lnum, nl_pos), (lnum, len(line)), line)
427n/a else:
428n/a yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
429n/a (lnum, pos), (lnum, len(line)), line)
430n/a continue
431n/a
432n/a if column > indents[-1]: # count indents or dedents
433n/a indents.append(column)
434n/a yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
435n/a while column < indents[-1]:
436n/a if column not in indents:
437n/a raise IndentationError(
438n/a "unindent does not match any outer indentation level",
439n/a ("<tokenize>", lnum, pos, line))
440n/a indents = indents[:-1]
441n/a
442n/a if async_def and async_def_indent >= indents[-1]:
443n/a async_def = False
444n/a async_def_nl = False
445n/a async_def_indent = 0
446n/a
447n/a yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
448n/a
449n/a if async_def and async_def_nl and async_def_indent >= indents[-1]:
450n/a async_def = False
451n/a async_def_nl = False
452n/a async_def_indent = 0
453n/a
454n/a else: # continued statement
455n/a if not line:
456n/a raise TokenError("EOF in multi-line statement", (lnum, 0))
457n/a continued = 0
458n/a
459n/a while pos < max:
460n/a pseudomatch = pseudoprog.match(line, pos)
461n/a if pseudomatch: # scan for tokens
462n/a start, end = pseudomatch.span(1)
463n/a spos, epos, pos = (lnum, start), (lnum, end), end
464n/a token, initial = line[start:end], line[start]
465n/a
466n/a if initial in numchars or \
467n/a (initial == '.' and token != '.'): # ordinary number
468n/a yield (NUMBER, token, spos, epos, line)
469n/a elif initial in '\r\n':
470n/a newline = NEWLINE
471n/a if parenlev > 0:
472n/a newline = NL
473n/a elif async_def:
474n/a async_def_nl = True
475n/a if stashed:
476n/a yield stashed
477n/a stashed = None
478n/a yield (newline, token, spos, epos, line)
479n/a
480n/a elif initial == '#':
481n/a assert not token.endswith("\n")
482n/a if stashed:
483n/a yield stashed
484n/a stashed = None
485n/a yield (COMMENT, token, spos, epos, line)
486n/a elif token in triple_quoted:
487n/a endprog = endprogs[token]
488n/a endmatch = endprog.match(line, pos)
489n/a if endmatch: # all on one line
490n/a pos = endmatch.end(0)
491n/a token = line[start:pos]
492n/a if stashed:
493n/a yield stashed
494n/a stashed = None
495n/a yield (STRING, token, spos, (lnum, pos), line)
496n/a else:
497n/a strstart = (lnum, start) # multiple lines
498n/a contstr = line[start:]
499n/a contline = line
500n/a break
501n/a elif initial in single_quoted or \
502n/a token[:2] in single_quoted or \
503n/a token[:3] in single_quoted:
504n/a if token[-1] == '\n': # continued string
505n/a strstart = (lnum, start)
506n/a endprog = (endprogs[initial] or endprogs[token[1]] or
507n/a endprogs[token[2]])
508n/a contstr, needcont = line[start:], 1
509n/a contline = line
510n/a break
511n/a else: # ordinary string
512n/a if stashed:
513n/a yield stashed
514n/a stashed = None
515n/a yield (STRING, token, spos, epos, line)
516n/a elif initial in namechars: # ordinary name
517n/a if token in ('async', 'await'):
518n/a if async_def:
519n/a yield (ASYNC if token == 'async' else AWAIT,
520n/a token, spos, epos, line)
521n/a continue
522n/a
523n/a tok = (NAME, token, spos, epos, line)
524n/a if token == 'async' and not stashed:
525n/a stashed = tok
526n/a continue
527n/a
528n/a if token == 'def':
529n/a if (stashed
530n/a and stashed[0] == NAME
531n/a and stashed[1] == 'async'):
532n/a
533n/a async_def = True
534n/a async_def_indent = indents[-1]
535n/a
536n/a yield (ASYNC, stashed[1],
537n/a stashed[2], stashed[3],
538n/a stashed[4])
539n/a stashed = None
540n/a
541n/a if stashed:
542n/a yield stashed
543n/a stashed = None
544n/a
545n/a yield tok
546n/a elif initial == '\\': # continued stmt
547n/a # This yield is new; needed for better idempotency:
548n/a if stashed:
549n/a yield stashed
550n/a stashed = None
551n/a yield (NL, token, spos, (lnum, pos), line)
552n/a continued = 1
553n/a else:
554n/a if initial in '([{': parenlev = parenlev + 1
555n/a elif initial in ')]}': parenlev = parenlev - 1
556n/a if stashed:
557n/a yield stashed
558n/a stashed = None
559n/a yield (OP, token, spos, epos, line)
560n/a else:
561n/a yield (ERRORTOKEN, line[pos],
562n/a (lnum, pos), (lnum, pos+1), line)
563n/a pos = pos + 1
564n/a
565n/a if stashed:
566n/a yield stashed
567n/a stashed = None
568n/a
569n/a for indent in indents[1:]: # pop remaining indent levels
570n/a yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
571n/a yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
572n/a
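A short sketch of iterating generate_tokens() and mapping the numeric token types to names (tok_name comes from lib2to3.pgen2.token; the sample line is an assumption for illustration):

    import io
    from lib2to3.pgen2 import tokenize as pgen2_tokenize
    from lib2to3.pgen2.token import tok_name

    for tok in pgen2_tokenize.generate_tokens(io.StringIO("a = b  # hi\n").readline):
        print(tok_name[tok[0]], repr(tok[1]))
    # Expected kinds, in order: NAME, OP, NAME, COMMENT, NEWLINE, ENDMARKER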
573n/aif __name__ == '__main__': # testing
574n/a import sys
575n/a if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
576n/a else: tokenize(sys.stdin.readline)