» Core Development > Code coverage > Lib/sre_parse.py

Python code coverage for Lib/sre_parse.py

#countcontent
1n/a#
2n/a# Secret Labs' Regular Expression Engine
3n/a#
4n/a# convert re-style regular expression to sre pattern
5n/a#
6n/a# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
7n/a#
8n/a# See the sre.py file for information on usage and redistribution.
9n/a#
10n/a
11n/a"""Internal support module for sre"""
12n/a
13n/a# XXX: show string offset and offending character for all errors
14n/a
15n/afrom sre_constants import *
16n/a
# Characters that are not taken as plain literals by _parse().
SPECIAL_CHARS = ".\\[{()*+?^$|"
# Characters that start a repetition of the preceding item.
REPEAT_CHARS = "*+?{"

# Character classes used by the tokenizer and the escape handlers.
DIGITS = frozenset("0123456789")

OCTDIGITS = frozenset("01234567")
HEXDIGITS = frozenset("0123456789abcdefABCDEF")
ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")

# Characters skipped by _parse() when parsing in verbose mode.
WHITESPACE = frozenset(" \t\n\r\v\f")

# Opcodes that SubPattern.getwidth() treats as repetitions / as exactly
# one character wide.
_REPEATCODES = frozenset((MIN_REPEAT, MAX_REPEAT))
_UNITCODES = frozenset((ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY))

# Backslash escapes that stand for a single literal character.
ESCAPES = {
    r"\a": (LITERAL, ord("\a")),
    r"\b": (LITERAL, ord("\b")),
    r"\f": (LITERAL, ord("\f")),
    r"\n": (LITERAL, ord("\n")),
    r"\r": (LITERAL, ord("\r")),
    r"\t": (LITERAL, ord("\t")),
    r"\v": (LITERAL, ord("\v")),
    r"\\": (LITERAL, ord("\\")),
}

# Backslash escapes that stand for a position assertion (AT) or a
# character category (IN).
CATEGORIES = {
    r"\A": (AT, AT_BEGINNING_STRING),  # start of string
    r"\b": (AT, AT_BOUNDARY),
    r"\B": (AT, AT_NON_BOUNDARY),
    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
    r"\Z": (AT, AT_END_STRING),  # end of string
}

# Inline flag letters accepted inside "(?...)".
FLAGS = {
    # standard flags
    "i": SRE_FLAG_IGNORECASE,
    "L": SRE_FLAG_LOCALE,
    "m": SRE_FLAG_MULTILINE,
    "s": SRE_FLAG_DOTALL,
    "x": SRE_FLAG_VERBOSE,
    # extensions
    "a": SRE_FLAG_ASCII,
    "t": SRE_FLAG_TEMPLATE,
    "u": SRE_FLAG_UNICODE,
}

# Flags that _parse_flags() refuses to switch on or off for a scoped
# "(?flags:...)" group.
GLOBAL_FLAGS = (
    SRE_FLAG_ASCII
    | SRE_FLAG_LOCALE
    | SRE_FLAG_UNICODE
    | SRE_FLAG_DEBUG
    | SRE_FLAG_TEMPLATE
)
70n/a
class Verbose(Exception):
    """Raised by _parse_flags() when an inline flag group switches on
    VERBOSE while the parser state does not have it set; parse() catches
    it and parses the pattern again in verbose mode."""
73n/a
class Pattern:
    """Master pattern object; keeps track of attributes shared by all
    subpatterns of one expression (flags, group names, group widths)."""

    def __init__(self):
        self.flags = 0
        # Maps group name -> group number.
        self.groupdict = {}
        # One entry per group; an entry stays None until closegroup()
        # stores the group's width.  Entry 0 stands for group 0.
        self.groupwidths = [None]
        # Set by _parse() while a lookbehind assertion is being parsed:
        # the number of groups that existed when it was entered.
        self.lookbehindgroups = None

    @property
    def groups(self):
        """Number of groups allocated so far (group 0 included)."""
        return len(self.groupwidths)

    def opengroup(self, name=None):
        """Allocate the next group number, optionally binding *name* to it.

        Raises error when the group count exceeds MAXGROUPS or when
        *name* is already bound to another group.
        """
        new_gid = self.groups
        self.groupwidths.append(None)
        if self.groups > MAXGROUPS:
            raise error("too many groups")
        if name is None:
            return new_gid
        previous_gid = self.groupdict.get(name, None)
        if previous_gid is not None:
            raise error("redefinition of group name %r as group %d; "
                        "was group %d" % (name, new_gid, previous_gid))
        self.groupdict[name] = new_gid
        return new_gid

    def closegroup(self, gid, p):
        """Record the width of subpattern *p* as the width of group *gid*."""
        self.groupwidths[gid] = p.getwidth()

    def checkgroup(self, gid):
        """Return True if group *gid* exists and has been closed."""
        if gid >= self.groups:
            return False
        return self.groupwidths[gid] is not None

    def checklookbehindgroup(self, gid, source):
        """Reject a reference to group *gid* made from inside a lookbehind.

        Does nothing when no lookbehind is being parsed.  Otherwise raises
        source.error() if the group is still open or was allocated after
        the lookbehind was entered.
        """
        if self.lookbehindgroups is None:
            return
        if not self.checkgroup(gid):
            raise source.error('cannot refer to an open group')
        if gid >= self.lookbehindgroups:
            raise source.error('cannot refer to group defined in the same '
                               'lookbehind subpattern')
108n/a
class SubPattern:
    """A subpattern in intermediate form: a list of (opcode, argument)
    items plus a reference to the owning pattern state."""

    def __init__(self, pattern, data=None):
        self.pattern = pattern
        self.data = [] if data is None else data
        # Cache for getwidth(); None until first computed.
        self.width = None

    def dump(self, level=0):
        """Print an indented listing of the items to standard output."""
        pad = level * " "
        inner_pad = (level + 1) * " "
        seqtypes = (tuple, list)
        for op, av in self.data:
            print(pad + str(op), end='')
            if op is IN:
                # member sublanguage
                print()
                for member_op, member_arg in av:
                    print(inner_pad + str(member_op), member_arg)
            elif op is BRANCH:
                print()
                for position, alternative in enumerate(av[1]):
                    if position:
                        print(pad + "OR")
                    alternative.dump(level + 1)
            elif op is GROUPREF_EXISTS:
                condgroup, item_yes, item_no = av
                print('', condgroup)
                item_yes.dump(level + 1)
                if item_no:
                    print(pad + "ELSE")
                    item_no.dump(level + 1)
            elif isinstance(av, seqtypes):
                # Tracks whether the output cursor sits at a line start.
                at_line_start = False
                for element in av:
                    if isinstance(element, SubPattern):
                        if not at_line_start:
                            print()
                        element.dump(level + 1)
                        at_line_start = True
                    else:
                        if not at_line_start:
                            print(' ', end='')
                        print(element, end='')
                        at_line_start = False
                if not at_line_start:
                    print()
            else:
                print('', av)

    def __repr__(self):
        return repr(self.data)

    def __len__(self):
        return len(self.data)

    def __delitem__(self, index):
        del self.data[index]

    def __getitem__(self, index):
        # A slice yields a new SubPattern sharing the same pattern state.
        if not isinstance(index, slice):
            return self.data[index]
        return SubPattern(self.pattern, self.data[index])

    def __setitem__(self, index, code):
        self.data[index] = code

    def insert(self, index, code):
        self.data.insert(index, code)

    def append(self, code):
        self.data.append(code)

    def getwidth(self):
        """Return the (min, max) width of this subpattern.

        The result is cached in self.width; min is capped at
        MAXREPEAT - 1 and max at MAXREPEAT.
        """
        if self.width is not None:
            return self.width
        lo = hi = 0
        for op, av in self.data:
            if op is BRANCH:
                # Narrowest minimum and widest maximum over all branches.
                branch_lo = MAXREPEAT - 1
                branch_hi = 0
                for alternative in av[1]:
                    alt_lo, alt_hi = alternative.getwidth()
                    branch_lo = min(branch_lo, alt_lo)
                    branch_hi = max(branch_hi, alt_hi)
                lo += branch_lo
                hi += branch_hi
            elif op is CALL:
                sub_lo, sub_hi = av.getwidth()
                lo += sub_lo
                hi += sub_hi
            elif op is SUBPATTERN:
                sub_lo, sub_hi = av[-1].getwidth()
                lo += sub_lo
                hi += sub_hi
            elif op in _REPEATCODES:
                # av is (min count, max count, repeated item).
                sub_lo, sub_hi = av[2].getwidth()
                lo += sub_lo * av[0]
                hi += sub_hi * av[1]
            elif op in _UNITCODES:
                lo += 1
                hi += 1
            elif op is GROUPREF:
                ref_lo, ref_hi = self.pattern.groupwidths[av]
                lo += ref_lo
                hi += ref_hi
            elif op is GROUPREF_EXISTS:
                cond_lo, cond_hi = av[1].getwidth()
                if av[2] is None:
                    # No "else" branch: the construct may match nothing.
                    cond_lo = 0
                else:
                    else_lo, else_hi = av[2].getwidth()
                    cond_lo = min(cond_lo, else_lo)
                    cond_hi = max(cond_hi, else_hi)
                lo += cond_lo
                hi += cond_hi
            elif op is SUCCESS:
                break
        self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
        return self.width
221n/a
class Tokenizer:
    """Splits a pattern into tokens: either one character, or a backslash
    together with the character that follows it."""

    def __init__(self, string):
        self.istext = isinstance(string, str)
        self.string = string
        # Non-str input is decoded as latin-1 and scanned as text.
        self.decoded_string = string if self.istext else str(string, 'latin1')
        self.index = 0
        self.next = None
        self.__next()

    def __next(self):
        # Load the token starting at self.index into self.next (None at
        # the end of the input) and advance self.index past it.
        position = self.index
        text = self.decoded_string
        try:
            token = text[position]
        except IndexError:
            self.next = None
            return
        if token == "\\":
            position += 1
            try:
                token += text[position]
            except IndexError:
                raise error("bad escape (end of pattern)",
                            self.string, len(self.string) - 1) from None
        self.index = position + 1
        self.next = token

    def match(self, char):
        """Consume the pending token and return True if it equals *char*;
        otherwise leave it in place and return False."""
        if char != self.next:
            return False
        self.__next()
        return True

    def get(self):
        """Return the pending token (None at end of input) and advance."""
        token = self.next
        self.__next()
        return token

    def getwhile(self, n, charset):
        """Consume up to *n* tokens while they are in *charset* and return
        them concatenated."""
        taken = ''
        for _ in range(n):
            token = self.next
            if token not in charset:
                break
            taken += token
            self.__next()
        return taken

    def getuntil(self, terminator):
        """Consume tokens up to and including *terminator* and return the
        text before it.

        Raises self.error() if the input ends first or if the text before
        the terminator is empty.
        """
        collected = ''
        while True:
            token = self.next
            self.__next()
            if token is None:
                if collected:
                    raise self.error("missing %s, unterminated name" % terminator,
                                     len(collected))
                raise self.error("missing group name")
            if token == terminator:
                if not collected:
                    raise self.error("missing group name", 1)
                return collected
            collected += token

    @property
    def pos(self):
        """Same value as tell(), exposed as an attribute."""
        return self.tell()

    def tell(self):
        """Return the index at which the pending token starts."""
        return self.index - len(self.next or '')

    def seek(self, index):
        """Restart tokenizing at *index*."""
        self.index = index
        self.__next()

    def error(self, msg, offset=0):
        """Build (not raise) an error located *offset* characters before
        the pending token."""
        return error(msg, self.string, self.tell() - offset)
293n/a
def _class_escape(source, escape):
    """Translate an escape found inside a character class.

    *escape* is the backslash token already read from *source*; any
    further digits belonging to it are consumed here.  Returns an
    (opcode, argument) pair, or raises source.error() for an escape that
    is incomplete, out of range or unknown.
    """
    known = ESCAPES.get(escape)
    if known:
        return known
    known = CATEGORIES.get(escape)
    # Only category escapes are accepted here, not position assertions.
    if known and known[0] is IN:
        return known
    try:
        kind = escape[1:2]
        if kind == "x":
            # hexadecimal escape (exactly two digits)
            escape += source.getwhile(2, HEXDIGITS)
            if len(escape) != 4:
                raise source.error("incomplete escape %s" % escape, len(escape))
            return LITERAL, int(escape[2:], 16)
        if kind == "u" and source.istext:
            # unicode escape (exactly four digits)
            escape += source.getwhile(4, HEXDIGITS)
            if len(escape) != 6:
                raise source.error("incomplete escape %s" % escape, len(escape))
            return LITERAL, int(escape[2:], 16)
        if kind == "U" and source.istext:
            # unicode escape (exactly eight digits)
            escape += source.getwhile(8, HEXDIGITS)
            if len(escape) != 10:
                raise source.error("incomplete escape %s" % escape, len(escape))
            codepoint = int(escape[2:], 16)
            chr(codepoint)  # raise ValueError for invalid code
            return LITERAL, codepoint
        if kind in OCTDIGITS:
            # octal escape (up to three digits)
            escape += source.getwhile(2, OCTDIGITS)
            value = int(escape[1:], 8)
            if value > 0o377:
                raise source.error('octal escape value %s outside of '
                                   'range 0-0o377' % escape, len(escape))
            return LITERAL, value
        if kind in DIGITS:
            # A non-octal digit: reported as a bad escape below.
            raise ValueError
        if len(escape) == 2:
            if kind in ASCIILETTERS:
                raise source.error('bad escape %s' % escape, len(escape))
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise source.error("bad escape %s" % escape, len(escape))
341n/a
def _escape(source, escape, state):
    """Translate an escape found outside a character class.

    *escape* is the backslash token already read from *source*; further
    digits belonging to it are consumed here.  Returns an
    (opcode, argument) pair: a category or assertion, a literal, or a
    group reference checked against *state*.  Raises source.error() for
    invalid escapes and invalid group references.
    """
    known = CATEGORIES.get(escape)
    if known:
        return known
    known = ESCAPES.get(escape)
    if known:
        return known
    try:
        kind = escape[1:2]
        if kind == "x":
            # hexadecimal escape
            escape += source.getwhile(2, HEXDIGITS)
            if len(escape) != 4:
                raise source.error("incomplete escape %s" % escape, len(escape))
            return LITERAL, int(escape[2:], 16)
        if kind == "u" and source.istext:
            # unicode escape (exactly four digits)
            escape += source.getwhile(4, HEXDIGITS)
            if len(escape) != 6:
                raise source.error("incomplete escape %s" % escape, len(escape))
            return LITERAL, int(escape[2:], 16)
        if kind == "U" and source.istext:
            # unicode escape (exactly eight digits)
            escape += source.getwhile(8, HEXDIGITS)
            if len(escape) != 10:
                raise source.error("incomplete escape %s" % escape, len(escape))
            codepoint = int(escape[2:], 16)
            chr(codepoint)  # raise ValueError for invalid code
            return LITERAL, codepoint
        if kind == "0":
            # octal escape
            escape += source.getwhile(2, OCTDIGITS)
            return LITERAL, int(escape[1:], 8)
        if kind in DIGITS:
            # octal escape *or* decimal group reference (sigh)
            if source.next in DIGITS:
                escape += source.get()
                if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
                        source.next in OCTDIGITS):
                    # got three octal digits; this is an octal escape
                    escape += source.get()
                    value = int(escape[1:], 8)
                    if value > 0o377:
                        raise source.error('octal escape value %s outside of '
                                           'range 0-0o377' % escape,
                                           len(escape))
                    return LITERAL, value
            # not an octal escape, so this is a group reference
            group = int(escape[1:])
            if group >= state.groups:
                raise source.error("invalid group reference %d" % group,
                                   len(escape) - 1)
            if not state.checkgroup(group):
                raise source.error("cannot refer to an open group",
                                   len(escape))
            state.checklookbehindgroup(group, source)
            return GROUPREF, group
        if len(escape) == 2:
            if kind in ASCIILETTERS:
                raise source.error("bad escape %s" % escape, len(escape))
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise source.error("bad escape %s" % escape, len(escape))
406n/a
def _parse_sub(source, state, verbose, nested=True):
    """Parse an alternation: a|b|c.

    Calls _parse() once per alternative, stopping at the first token
    that is not "|".  A single alternative is returned unchanged.
    Otherwise a new SubPattern is built: leading items shared by every
    alternative are moved in front of the branch, and the result is
    either an IN set (when every remaining alternative is exactly one
    LITERAL) or a BRANCH item.

    *nested* is accepted for the callers' benefit but is not used here.
    """
    items = []
    while True:
        items.append(_parse(source, state, verbose))
        if not source.match("|"):
            break

    if len(items) == 1:
        return items[0]

    subpattern = SubPattern(state)

    # check if all items share a common prefix
    while True:
        prefix = None
        for item in items:
            if not item:
                break
            if prefix is None:
                prefix = item[0]
            elif item[0] != prefix:
                break
        else:
            # all subitems start with a common "prefix".
            # move it out of the branch
            for item in items:
                del item[0]
            subpattern.append(prefix)
            continue  # check next one
        break

    # check if the branch can be replaced by a character set
    if all(len(item) == 1 and item[0][0] is LITERAL for item in items):
        # we can store this as a character set instead of a
        # branch (the compiler may optimize this even more)
        subpattern.append((IN, [item[0] for item in items]))
        return subpattern

    subpattern.append((BRANCH, (None, items)))
    return subpattern
456n/a
def _parse_sub_cond(source, state, condgroup, verbose):
    """Parse the body of a conditional group: a "yes" alternative and an
    optional "no" alternative separated by "|".

    Returns a SubPattern holding one GROUPREF_EXISTS item; the "no" part
    is None when absent.  Raises source.error() if a further "|" follows
    the second alternative.
    """
    item_yes = _parse(source, state, verbose)
    item_no = None
    if source.match("|"):
        item_no = _parse(source, state, verbose)
        if source.next == "|":
            raise source.error("conditional backref with more than two branches")
    result = SubPattern(state)
    result.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
    return result
468n/a
def _parse(source, state, verbose):
    """Parse a simple pattern (one alternative) into a SubPattern.

    Tokens are read from *source* until the end of the input or until a
    "|" or ")" token, which is left unconsumed for the caller.  *state*
    is the shared pattern state used for group bookkeeping and flags.
    When *verbose* is true, whitespace tokens and "#" comments are
    skipped.

    Raises source.error() for malformed input, and OverflowError when a
    repetition count is not below MAXREPEAT.
    """
    subpattern = SubPattern(state)

    # precompute constants into local variables
    subpatternappend = subpattern.append
    sourceget = source.get
    sourcematch = source.match
    _len = len
    _ord = ord

    while True:

        this = source.next
        if this is None:
            break  # end of pattern
        if this in "|)":
            break  # end of subpattern
        sourceget()

        if verbose:
            # skip whitespace and comments
            if this in WHITESPACE:
                continue
            if this == "#":
                while True:
                    this = sourceget()
                    if this is None or this == "\n":
                        break
                continue

        if this[0] == "\\":
            code = _escape(source, this, state)
            subpatternappend(code)

        elif this not in SPECIAL_CHARS:
            subpatternappend((LITERAL, _ord(this)))

        elif this == "[":
            here = source.tell() - 1
            # character set
            members = []
            setappend = members.append
            if sourcematch("^"):
                setappend((NEGATE, None))
            # check remaining characters; a "]" only closes the set once
            # something has been added after this snapshot
            initial_members = members[:]
            while True:
                this = sourceget()
                if this is None:
                    raise source.error("unterminated character set",
                                       source.tell() - here)
                if this == "]" and members != initial_members:
                    break
                elif this[0] == "\\":
                    code1 = _class_escape(source, this)
                else:
                    code1 = LITERAL, _ord(this)
                if sourcematch("-"):
                    # potential range
                    that = sourceget()
                    if that is None:
                        raise source.error("unterminated character set",
                                           source.tell() - here)
                    if that == "]":
                        # trailing "-" is a literal hyphen
                        if code1[0] is IN:
                            code1 = code1[1][0]
                        setappend(code1)
                        setappend((LITERAL, _ord("-")))
                        break
                    if that[0] == "\\":
                        code2 = _class_escape(source, that)
                    else:
                        code2 = LITERAL, _ord(that)
                    if code1[0] != LITERAL or code2[0] != LITERAL:
                        msg = "bad character range %s-%s" % (this, that)
                        raise source.error(msg, len(this) + 1 + len(that))
                    lo = code1[1]
                    hi = code2[1]
                    if hi < lo:
                        msg = "bad character range %s-%s" % (this, that)
                        raise source.error(msg, len(this) + 1 + len(that))
                    setappend((RANGE, (lo, hi)))
                else:
                    if code1[0] is IN:
                        code1 = code1[1][0]
                    setappend(code1)

            # XXX: <fl> should move set optimization to compiler!
            if _len(members) == 1 and members[0][0] is LITERAL:
                subpatternappend(members[0])  # optimization
            elif (_len(members) == 2 and members[0][0] is NEGATE
                  and members[1][0] is LITERAL):
                subpatternappend((NOT_LITERAL, members[1][1]))  # optimization
            else:
                # XXX: <fl> should add charmap optimization here
                subpatternappend((IN, members))

        elif this in REPEAT_CHARS:
            # repeat previous item
            here = source.tell()
            if this == "?":
                min_count, max_count = 0, 1
            elif this == "*":
                min_count, max_count = 0, MAXREPEAT
            elif this == "+":
                min_count, max_count = 1, MAXREPEAT
            elif this == "{":
                if source.next == "}":
                    # "{}" is taken literally
                    subpatternappend((LITERAL, _ord(this)))
                    continue
                min_count, max_count = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
                    lo += sourceget()
                if sourcematch(","):
                    while source.next in DIGITS:
                        hi += sourceget()
                else:
                    hi = lo
                if not sourcematch("}"):
                    # not a well-formed {m,n}: emit "{" as a literal and
                    # re-read what followed it
                    subpatternappend((LITERAL, _ord(this)))
                    source.seek(here)
                    continue
                if lo:
                    min_count = int(lo)
                    if min_count >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                if hi:
                    max_count = int(hi)
                    if max_count >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                    if max_count < min_count:
                        raise source.error("min repeat greater than max repeat",
                                           source.tell() - here)
            else:
                # Report the token that was actually read.
                raise AssertionError("unsupported quantifier %r" % (this,))
            # figure out which item to repeat
            if subpattern:
                item = subpattern[-1:]
            else:
                item = None
            if not item or (_len(item) == 1 and item[0][0] is AT):
                raise source.error("nothing to repeat",
                                   source.tell() - here + len(this))
            if item[0][0] in _REPEATCODES:
                raise source.error("multiple repeat",
                                   source.tell() - here + len(this))
            if sourcematch("?"):
                subpattern[-1] = (MIN_REPEAT, (min_count, max_count, item))
            else:
                subpattern[-1] = (MAX_REPEAT, (min_count, max_count, item))

        elif this == ".":
            subpatternappend((ANY, None))

        elif this == "(":
            start = source.tell() - 1
            group = True
            name = None
            condgroup = None
            add_flags = 0
            del_flags = 0
            if sourcematch("?"):
                # options
                char = sourceget()
                if char is None:
                    raise source.error("unexpected end of pattern")
                if char == "P":
                    # python extensions
                    if sourcematch("<"):
                        # named group: skip forward to end of name
                        name = source.getuntil(">")
                        if not name.isidentifier():
                            msg = "bad character in group name %r" % name
                            raise source.error(msg, len(name) + 1)
                    elif sourcematch("="):
                        # named backreference
                        name = source.getuntil(")")
                        if not name.isidentifier():
                            msg = "bad character in group name %r" % name
                            raise source.error(msg, len(name) + 1)
                        gid = state.groupdict.get(name)
                        if gid is None:
                            msg = "unknown group name %r" % name
                            raise source.error(msg, len(name) + 1)
                        if not state.checkgroup(gid):
                            raise source.error("cannot refer to an open group",
                                               len(name) + 1)
                        state.checklookbehindgroup(gid, source)
                        subpatternappend((GROUPREF, gid))
                        continue
                    else:
                        char = sourceget()
                        if char is None:
                            raise source.error("unexpected end of pattern")
                        raise source.error("unknown extension ?P" + char,
                                           len(char) + 2)
                elif char == ":":
                    # non-capturing group
                    group = None
                elif char == "#":
                    # comment
                    while True:
                        if source.next is None:
                            raise source.error("missing ), unterminated comment",
                                               source.tell() - start)
                        if sourceget() == ")":
                            break
                    continue
                elif char in "=!<":
                    # lookahead assertions
                    direction = 1
                    if char == "<":
                        char = sourceget()
                        if char is None:
                            raise source.error("unexpected end of pattern")
                        if char not in "=!":
                            raise source.error("unknown extension ?<" + char,
                                               len(char) + 2)
                        direction = -1  # lookbehind
                        # Only the outermost lookbehind sets (and later
                        # clears) the marker on the shared state.
                        lookbehindgroups = state.lookbehindgroups
                        if lookbehindgroups is None:
                            state.lookbehindgroups = state.groups
                    p = _parse_sub(source, state, verbose)
                    if direction < 0:
                        if lookbehindgroups is None:
                            state.lookbehindgroups = None
                    if not sourcematch(")"):
                        raise source.error("missing ), unterminated subpattern",
                                           source.tell() - start)
                    if char == "=":
                        subpatternappend((ASSERT, (direction, p)))
                    else:
                        subpatternappend((ASSERT_NOT, (direction, p)))
                    continue
                elif char == "(":
                    # conditional backreference group
                    condname = source.getuntil(")")
                    group = None
                    if condname.isidentifier():
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
                            msg = "unknown group name %r" % condname
                            raise source.error(msg, len(condname) + 1)
                    else:
                        try:
                            condgroup = int(condname)
                            if condgroup < 0:
                                raise ValueError
                        except ValueError:
                            msg = "bad character in group name %r" % condname
                            raise source.error(msg, len(condname) + 1) from None
                        if not condgroup:
                            raise source.error("bad group number",
                                               len(condname) + 1)
                        if condgroup >= MAXGROUPS:
                            msg = "invalid group reference %d" % condgroup
                            raise source.error(msg, len(condname) + 1)
                    state.checklookbehindgroup(condgroup, source)
                elif char in FLAGS or char == "-":
                    # flags
                    pos = source.pos
                    flags = _parse_flags(source, state, char)
                    if flags is None:  # global flags
                        if pos != 3:  # "(?x"
                            import warnings
                            warnings.warn(
                                'Flags not at the start of the expression %s%s' % (
                                    source.string[:20],  # truncate long regexes
                                    ' (truncated)' if len(source.string) > 20 else '',
                                ),
                                DeprecationWarning, stacklevel=7
                            )
                        continue
                    add_flags, del_flags = flags
                    group = None
                else:
                    raise source.error("unknown extension ?" + char,
                                       len(char) + 1)

            # parse group contents
            if group is not None:
                try:
                    group = state.opengroup(name)
                except error as err:
                    if name is not None:
                        offset = len(name) + 1
                    else:
                        # Unnamed group: there is no name to measure, so
                        # point at the opening parenthesis instead.
                        offset = source.tell() - start
                    raise source.error(err.msg, offset) from None
            if condgroup:
                p = _parse_sub_cond(source, state, condgroup, verbose)
            else:
                sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
                               not (del_flags & SRE_FLAG_VERBOSE))
                p = _parse_sub(source, state, sub_verbose)
            if not source.match(")"):
                raise source.error("missing ), unterminated subpattern",
                                   source.tell() - start)
            if group is not None:
                state.closegroup(group, p)
            subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))

        elif this == "^":
            subpatternappend((AT, AT_BEGINNING))

        elif this == "$":
            subpatternappend((AT, AT_END))

        else:
            # Report the token that was actually read.
            raise AssertionError("unsupported special character %r" % (this,))

    return subpattern
781n/a
def _parse_flags(source, state, char):
    """Parse the inline flags of a "(?...)" group.

    *char* is the first token after "(?" (a flag letter or "-").  Reads
    up to and including the closing ")" or ":".

    Returns None for a global group "(?flags)", after OR-ing the flags
    into state.flags, or an (add_flags, del_flags) pair for a scoped
    group "(?flags-flags:".  Raises Verbose when a global group turns on
    VERBOSE while state.flags does not have it, and source.error() for
    malformed or conflicting flags.
    """
    read = source.get
    turn_on = 0
    turn_off = 0
    if char != "-":
        while True:
            turn_on |= FLAGS[char]
            char = read()
            if char is None:
                raise source.error("missing -, : or )")
            if char in ")-:":
                break
            if char not in FLAGS:
                if char.isalpha():
                    msg = "unknown flag"
                else:
                    msg = "missing -, : or )"
                raise source.error(msg, len(char))
    if char == ")":
        verbose_requested = turn_on & SRE_FLAG_VERBOSE
        if verbose_requested and not (state.flags & SRE_FLAG_VERBOSE):
            raise Verbose
        state.flags |= turn_on
        return None
    if turn_on & GLOBAL_FLAGS:
        raise source.error("bad inline flags: cannot turn on global flag", 1)
    if char == "-":
        char = read()
        if char is None:
            raise source.error("missing flag")
        if char not in FLAGS:
            if char.isalpha():
                msg = "unknown flag"
            else:
                msg = "missing flag"
            raise source.error(msg, len(char))
        while True:
            turn_off |= FLAGS[char]
            char = read()
            if char is None:
                raise source.error("missing :")
            if char == ":":
                break
            if char not in FLAGS:
                if char.isalpha():
                    msg = "unknown flag"
                else:
                    msg = "missing :"
                raise source.error(msg, len(char))
    assert char == ":"
    if turn_off & GLOBAL_FLAGS:
        raise source.error("bad inline flags: cannot turn off global flag", 1)
    if turn_on & turn_off:
        raise source.error("bad inline flags: flag turned on and off", 1)
    return turn_on, turn_off
828n/a
def fix_flags(src, flags):
    """Check and fix flags according to the type of pattern (str or bytes).

    For a str pattern UNICODE is added unless ASCII is set.  Returns the
    resulting flags; raises ValueError for combinations that are not
    allowed for the pattern type.
    """
    if not isinstance(src, str):
        if flags & SRE_FLAG_UNICODE:
            raise ValueError("cannot use UNICODE flag with a bytes pattern")
        if (flags & SRE_FLAG_LOCALE) and (flags & SRE_FLAG_ASCII):
            raise ValueError("ASCII and LOCALE flags are incompatible")
        return flags

    if flags & SRE_FLAG_LOCALE:
        raise ValueError("cannot use LOCALE flag with a str pattern")
    if flags & SRE_FLAG_ASCII:
        if flags & SRE_FLAG_UNICODE:
            raise ValueError("ASCII and UNICODE flags are incompatible")
        return flags
    return flags | SRE_FLAG_UNICODE
844n/a
def parse(str, flags=0, pattern=None):
    """Parse an 're' pattern into a SubPattern of (opcode, argument) items.

    *str* is the pattern, *flags* the initial flags and *pattern* an
    optional Pattern state to use (a new one is created when None).
    Raises source.error() if input remains after parsing.  When *flags*
    includes SRE_FLAG_DEBUG the result is dumped to standard output.
    """
    tokens = Tokenizer(str)

    state = Pattern() if pattern is None else pattern
    state.flags = flags
    state.str = str

    try:
        parsed = _parse_sub(tokens, state, flags & SRE_FLAG_VERBOSE, False)
    except Verbose:
        # the VERBOSE flag was switched on inside the pattern.  to be
        # on the safe side, we'll parse the whole thing again...
        state = Pattern()
        state.flags = flags | SRE_FLAG_VERBOSE
        state.str = str
        tokens.seek(0)
        parsed = _parse_sub(tokens, state, True, False)

    parsed.pattern.flags = fix_flags(str, parsed.pattern.flags)

    if tokens.next is not None:
        # _parse() only stops early at "|" or ")", and _parse_sub()
        # consumes every "|", so anything left is a stray ")".
        assert tokens.next == ")"
        raise tokens.error("unbalanced parenthesis")

    if flags & SRE_FLAG_DEBUG:
        parsed.dump()

    return parsed
876n/a
def parse_template(source, pattern):
    """Parse an 're' replacement string into literals and group references.

    Returns (groups, literals): *literals* is a list of text chunks with
    None in every slot to be filled by a group, and *groups* is a list of
    (index into literals, group number) pairs.  For a non-str *source*
    the literal chunks are encoded back to bytes as latin-1.

    Raises IndexError for an unknown group name and the tokenizer's
    error for malformed escapes or invalid group references.
    """
    tokens = Tokenizer(source)
    next_token = tokens.get
    groups = []
    literals = []
    pending = []  # literal text collected since the last group reference
    add_text = pending.append

    def addgroup(index, pos):
        if index > pattern.groups:
            raise tokens.error("invalid group reference %d" % index, pos)
        if pending:
            literals.append(''.join(pending))
            pending.clear()
        groups.append((len(literals), index))
        literals.append(None)

    groupindex = pattern.groupindex
    # get() returns None at the end of the replacement string
    for this in iter(next_token, None):
        if this[0] != "\\":
            add_text(this)
            continue
        # group
        c = this[1]
        if c == "g":
            if not tokens.match("<"):
                raise tokens.error("missing <")
            name = tokens.getuntil(">")
            if name.isidentifier():
                try:
                    index = groupindex[name]
                except KeyError:
                    raise IndexError("unknown group name %r" % name)
            else:
                try:
                    index = int(name)
                    if index < 0:
                        raise ValueError
                except ValueError:
                    raise tokens.error("bad character in group name %r" % name,
                                       len(name) + 1) from None
                if index >= MAXGROUPS:
                    raise tokens.error("invalid group reference %d" % index,
                                       len(name) + 1)
            addgroup(index, len(name) + 1)
        elif c == "0":
            # octal escape of up to three digits starting with 0
            if tokens.next in OCTDIGITS:
                this += next_token()
                if tokens.next in OCTDIGITS:
                    this += next_token()
            add_text(chr(int(this[1:], 8) & 0xff))
        elif c in DIGITS:
            # three octal digits form an octal escape; anything else is
            # a one- or two-digit group reference
            isoctal = False
            if tokens.next in DIGITS:
                this += next_token()
                if (c in OCTDIGITS and this[2] in OCTDIGITS and
                        tokens.next in OCTDIGITS):
                    this += next_token()
                    isoctal = True
                    c = int(this[1:], 8)
                    if c > 0o377:
                        raise tokens.error('octal escape value %s outside of '
                                           'range 0-0o377' % this, len(this))
                    add_text(chr(c))
            if not isoctal:
                addgroup(int(this[1:]), len(this) - 1)
        else:
            try:
                this = chr(ESCAPES[this][1])
            except KeyError:
                if c in ASCIILETTERS:
                    raise tokens.error('bad escape %s' % this, len(this))
            add_text(this)
    if pending:
        literals.append(''.join(pending))
    if not isinstance(source, str):
        # The tokenizer implicitly decodes bytes objects as latin-1, we must
        # therefore re-encode the final representation.
        literals = [None if text is None else text.encode('latin-1')
                    for text in literals]
    return groups, literals
961n/a
def expand_template(template, match):
    """Fill a parsed replacement template with the groups of *match*.

    *template* is a (groups, literals) pair in which *groups* holds
    (index into literals, group number) pairs.  Each referenced slot of
    a copy of *literals* receives the text of its group (an empty
    string of the subject's type when the group is empty or did not
    participate), and the chunks are joined.  The template itself is
    not modified.

    Raises error naming the offending group number when a lookup raises
    IndexError.
    """
    g = match.group
    empty = match.string[:0]
    groups, literals = template
    literals = literals[:]
    for index, group in groups:
        try:
            literals[index] = g(group) or empty
        except IndexError:
            # Report the group number, not the slot position in literals.
            raise error("invalid group reference %d" % group) from None
    return empty.join(literals)