1 | n/a | # |
---|
2 | n/a | # Secret Labs' Regular Expression Engine |
---|
3 | n/a | # |
---|
4 | n/a | # convert re-style regular expression to sre pattern |
---|
5 | n/a | # |
---|
6 | n/a | # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. |
---|
7 | n/a | # |
---|
8 | n/a | # See the sre.py file for information on usage and redistribution. |
---|
9 | n/a | # |
---|
10 | n/a | |
---|
11 | n/a | """Internal support module for sre""" |
---|
12 | n/a | |
---|
13 | n/a | # XXX: show string offset and offending character for all errors |
---|
14 | n/a | |
---|
15 | n/a | from sre_constants import * |
---|
16 | n/a | |
---|
# Characters the parser does not treat as plain literals; anything outside
# this string (and not a backslash escape) becomes a LITERAL item.
SPECIAL_CHARS = ".\\[{()*+?^$|"
# Characters that start a repetition of the preceding item.
REPEAT_CHARS = "*+?{"

# Character classes used while decoding numeric escapes and repeat counts.
DIGITS = frozenset("0123456789")

OCTDIGITS = frozenset("01234567")
HEXDIGITS = frozenset("0123456789abcdefABCDEF")
# A two-character escape whose second character is in this set, and which
# has no other meaning, is rejected as a "bad escape".
ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")

# Characters skipped by the parser when verbose mode is active.
WHITESPACE = frozenset(" \t\n\r\v\f")

# Opcodes of repetition items; used to reject a repeat applied to a repeat
# and to compute widths of repeated subpatterns.
_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
# Opcodes that SubPattern.getwidth() counts as exactly one character wide.
_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})

# Escapes that denote a single literal character.
# NOTE: r"\b" also appears in CATEGORIES; _escape() consults CATEGORIES
# first (word boundary), _class_escape() consults ESCAPES first (backspace).
ESCAPES = {
    r"\a": (LITERAL, ord("\a")),
    r"\b": (LITERAL, ord("\b")),
    r"\f": (LITERAL, ord("\f")),
    r"\n": (LITERAL, ord("\n")),
    r"\r": (LITERAL, ord("\r")),
    r"\t": (LITERAL, ord("\t")),
    r"\v": (LITERAL, ord("\v")),
    r"\\": (LITERAL, ord("\\"))
}

# Escapes that denote a position assertion (AT) or a character category
# wrapped in a one-member set (IN).  Only the IN entries are accepted
# inside a character class.
CATEGORIES = {
    r"\A": (AT, AT_BEGINNING_STRING), # start of string
    r"\b": (AT, AT_BOUNDARY),
    r"\B": (AT, AT_NON_BOUNDARY),
    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
    r"\Z": (AT, AT_END_STRING), # end of string
}

# Inline flag letters (as written after "(?") mapped to their flag bits.
FLAGS = {
    # standard flags
    "i": SRE_FLAG_IGNORECASE,
    "L": SRE_FLAG_LOCALE,
    "m": SRE_FLAG_MULTILINE,
    "s": SRE_FLAG_DOTALL,
    "x": SRE_FLAG_VERBOSE,
    # extensions
    "a": SRE_FLAG_ASCII,
    "t": SRE_FLAG_TEMPLATE,
    "u": SRE_FLAG_UNICODE,
}

# NOTE(review): presumably the flags that may only be set for the whole
# pattern rather than for a sub-group; the code that reads this mask is not
# visible in this part of the file -- confirm against _parse_flags().
GLOBAL_FLAGS = (SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE |
                SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE)
---|
70 | n/a | |
---|
class Verbose(Exception):
    """Marker exception declared by the parser; it adds nothing to Exception."""
---|
73 | n/a | |
---|
class Pattern:
    """Master pattern object: keeps track of attributes global to a pattern.

    Attributes:
        flags: integer flag bits, initially 0.
        groupdict: maps group names to group ids.
        groupwidths: one entry per group id; ``None`` until the group has
            been closed, then the ``getwidth()`` result of its contents.
            Index 0 stands for group 0.
        lookbehindgroups: ``None`` by default; when set to a number,
            ``checklookbehindgroup`` rejects group ids at or above it.
    """

    def __init__(self):
        self.flags = 0
        self.groupdict = {}
        self.groupwidths = [None]  # group 0
        self.lookbehindgroups = None

    @property
    def groups(self):
        """Number of group slots allocated so far (group 0 included)."""
        return len(self.groupwidths)

    def opengroup(self, name=None):
        """Allocate the next group id, optionally registering *name* for it.

        Returns the new group id.  Raises ``error`` when the group count
        exceeds MAXGROUPS or when *name* is already registered.
        """
        new_gid = self.groups
        self.groupwidths.append(None)
        if self.groups > MAXGROUPS:
            raise error("too many groups")
        if name is None:
            return new_gid
        previous_gid = self.groupdict.get(name, None)
        if previous_gid is not None:
            raise error("redefinition of group name %r as group %d; "
                        "was group %d" % (name, new_gid, previous_gid))
        self.groupdict[name] = new_gid
        return new_gid

    def closegroup(self, gid, p):
        """Mark group *gid* as closed by recording the width of *p*."""
        self.groupwidths[gid] = p.getwidth()

    def checkgroup(self, gid):
        """Return True if *gid* is an allocated group that has been closed."""
        if not gid < self.groups:
            return False
        return self.groupwidths[gid] is not None

    def checklookbehindgroup(self, gid, source):
        """Validate a reference to *gid* while a lookbehind limit is set.

        Does nothing when ``lookbehindgroups`` is None.  Otherwise raises
        ``source.error(...)`` if the group is still open or if its id is
        not below ``lookbehindgroups``.
        """
        limit = self.lookbehindgroups
        if limit is None:
            return
        if not self.checkgroup(gid):
            raise source.error('cannot refer to an open group')
        if gid >= limit:
            raise source.error('cannot refer to group defined in the same '
                               'lookbehind subpattern')
---|
108 | n/a | |
---|
class SubPattern:
    """A subpattern in intermediate form: a list of ``(opcode, argument)`` items.

    Attributes:
        pattern: the object passed as *pattern*; ``getwidth`` reads its
            ``groupwidths`` for GROUPREF items.
        data: the list of items (shared with the caller when passed in).
        width: cached ``(min, max)`` result of ``getwidth``, or None.
    """

    def __init__(self, pattern, data=None):
        self.pattern = pattern
        self.data = [] if data is None else data
        self.width = None

    def dump(self, level=0):
        """Print an indented listing of the items to standard output."""
        pad = level*" "
        deeper = level + 1
        for op, av in self.data:
            print(pad + str(op), end='')
            if op is IN:
                # member sublanguage: one line per set member
                print()
                for member_op, member_arg in av:
                    print(deeper*" " + str(member_op), member_arg)
            elif op is BRANCH:
                print()
                for position, alternative in enumerate(av[1]):
                    if position:
                        print(pad + "OR")
                    alternative.dump(deeper)
            elif op is GROUPREF_EXISTS:
                condgroup, item_yes, item_no = av
                print('', condgroup)
                item_yes.dump(deeper)
                if item_no:
                    print(pad + "ELSE")
                    item_no.dump(deeper)
            elif isinstance(av, (tuple, list)):
                # scalars are printed on the opcode's line, nested
                # subpatterns on their own lines
                at_line_start = False
                for arg in av:
                    if isinstance(arg, SubPattern):
                        if not at_line_start:
                            print()
                        arg.dump(deeper)
                        at_line_start = True
                    else:
                        if not at_line_start:
                            print(' ', end='')
                        print(arg, end='')
                        at_line_start = False
                if not at_line_start:
                    print()
            else:
                print('', av)

    def __repr__(self):
        return repr(self.data)

    def __len__(self):
        return len(self.data)

    def __delitem__(self, index):
        del self.data[index]

    def __getitem__(self, index):
        # a slice yields a new SubPattern bound to the same pattern object
        if not isinstance(index, slice):
            return self.data[index]
        return SubPattern(self.pattern, self.data[index])

    def __setitem__(self, index, code):
        self.data[index] = code

    def insert(self, index, code):
        self.data.insert(index, code)

    def append(self, code):
        self.data.append(code)

    def getwidth(self):
        """Return the ``(min, max)`` width of this subpattern.

        The result is computed once and cached in ``self.width``.  The
        minimum is capped at ``MAXREPEAT - 1`` and the maximum at
        ``MAXREPEAT``.  Items whose opcode is not handled below add nothing.
        """
        cached = self.width
        if cached is not None:
            return cached
        total_lo = total_hi = 0
        for op, av in self.data:
            if op is BRANCH:
                # narrowest minimum and widest maximum over all alternatives
                branch_lo = MAXREPEAT - 1
                branch_hi = 0
                for alternative in av[1]:
                    alt_lo, alt_hi = alternative.getwidth()
                    branch_lo = min(branch_lo, alt_lo)
                    branch_hi = max(branch_hi, alt_hi)
                total_lo += branch_lo
                total_hi += branch_hi
            elif op is CALL:
                call_lo, call_hi = av.getwidth()
                total_lo += call_lo
                total_hi += call_hi
            elif op is SUBPATTERN:
                sub_lo, sub_hi = av[-1].getwidth()
                total_lo += sub_lo
                total_hi += sub_hi
            elif op in _REPEATCODES:
                # av is (min count, max count, repeated subpattern)
                body_lo, body_hi = av[2].getwidth()
                total_lo += body_lo * av[0]
                total_hi += body_hi * av[1]
            elif op in _UNITCODES:
                total_lo += 1
                total_hi += 1
            elif op is GROUPREF:
                ref_lo, ref_hi = self.pattern.groupwidths[av]
                total_lo += ref_lo
                total_hi += ref_hi
            elif op is GROUPREF_EXISTS:
                cond_lo, cond_hi = av[1].getwidth()
                if av[2] is not None:
                    no_lo, no_hi = av[2].getwidth()
                    cond_lo = min(cond_lo, no_lo)
                    cond_hi = max(cond_hi, no_hi)
                else:
                    # no "else" part: the conditional may match nothing
                    cond_lo = 0
                total_lo += cond_lo
                total_hi += cond_hi
            elif op is SUCCESS:
                break
        self.width = min(total_lo, MAXREPEAT - 1), min(total_hi, MAXREPEAT)
        return self.width
---|
221 | n/a | |
---|
class Tokenizer:
    """Reads a pattern one token at a time with a single token of lookahead.

    A token is one character, or a backslash together with the character
    that follows it.  ``self.next`` holds the upcoming token (None at the
    end of the pattern) and ``self.index`` the position just past it.
    """

    def __init__(self, string):
        is_text = isinstance(string, str)
        self.istext = is_text
        self.string = string
        # non-str patterns are scanned through a latin1-decoded copy
        self.decoded_string = string if is_text else str(string, 'latin1')
        self.index = 0
        self.next = None
        self.__next()

    def __next(self):
        # load the token at self.index into self.next and step past it
        text = self.decoded_string
        at = self.index
        try:
            token = text[at]
        except IndexError:
            self.next = None
            return
        if token == "\\":
            at += 1
            try:
                token += text[at]
            except IndexError:
                raise error("bad escape (end of pattern)",
                            self.string, len(self.string) - 1) from None
        self.next = token
        self.index = at + 1

    def match(self, char):
        """Consume the upcoming token if it equals *char*; report whether it did."""
        if char != self.next:
            return False
        self.__next()
        return True

    def get(self):
        """Return the upcoming token (None at the end) and advance."""
        token = self.next
        self.__next()
        return token

    def getwhile(self, n, charset):
        """Consume at most *n* tokens that are in *charset*; return them joined."""
        taken = []
        for _ in range(n):
            token = self.next
            if token not in charset:
                break
            taken.append(token)
            self.__next()
        return ''.join(taken)

    def getuntil(self, terminator):
        """Consume tokens up to and including *terminator*; return those before it.

        Raises ``self.error(...)`` when nothing precedes the terminator or
        when the pattern ends first.
        """
        collected = ''
        while True:
            token = self.next
            self.__next()
            if token is None:
                if collected:
                    raise self.error("missing %s, unterminated name" % terminator,
                                     len(collected))
                raise self.error("missing group name")
            if token == terminator:
                if collected:
                    return collected
                raise self.error("missing group name", 1)
            collected += token

    @property
    def pos(self):
        """Position of the upcoming token (same value as ``tell()``)."""
        pending = self.next
        return self.index - (len(pending) if pending else 0)

    def tell(self):
        """Return the position of the upcoming token."""
        pending = self.next
        return self.index - (len(pending) if pending else 0)

    def seek(self, index):
        """Move to *index* and reload the lookahead token from there."""
        self.index = index
        self.__next()

    def error(self, msg, offset=0):
        """Build (not raise) an ``error`` located *offset* before the current position."""
        position = self.tell() - offset
        return error(msg, self.string, position)
---|
293 | n/a | |
---|
def _class_escape(source, escape):
    """Translate a backslash escape found inside a character class.

    Args:
        source: tokenizer positioned just after *escape*; further digits
            are consumed from it for numeric escapes.
        escape: the two-character escape token.

    Returns:
        The ESCAPES entry, a CATEGORIES entry whose opcode is IN, or a
        ``(LITERAL, code)`` pair.

    Raises:
        whatever ``source.error`` builds, for incomplete, out-of-range or
        otherwise unsupported escapes.
    """
    literal = ESCAPES.get(escape)
    if literal:
        return literal
    category = CATEGORIES.get(escape)
    if category and category[0] is IN:
        return category
    try:
        kind = escape[1:2]
        if kind == "x":
            # hexadecimal escape (exactly two digits)
            escape += source.getwhile(2, HEXDIGITS)
            if len(escape) != 4:
                raise source.error("incomplete escape %s" % escape, len(escape))
            return LITERAL, int(escape[2:], 16)
        if kind == "u" and source.istext:
            # unicode escape (exactly four digits)
            escape += source.getwhile(4, HEXDIGITS)
            if len(escape) != 6:
                raise source.error("incomplete escape %s" % escape, len(escape))
            return LITERAL, int(escape[2:], 16)
        if kind == "U" and source.istext:
            # unicode escape (exactly eight digits)
            escape += source.getwhile(8, HEXDIGITS)
            if len(escape) != 10:
                raise source.error("incomplete escape %s" % escape, len(escape))
            codepoint = int(escape[2:], 16)
            chr(codepoint)  # raise ValueError for invalid code
            return LITERAL, codepoint
        if kind in OCTDIGITS:
            # octal escape (up to three digits)
            escape += source.getwhile(2, OCTDIGITS)
            value = int(escape[1:], 8)
            if value > 0o377:
                raise source.error('octal escape value %s outside of '
                                   'range 0-0o377' % escape, len(escape))
            return LITERAL, value
        if kind in DIGITS:
            # the digits 8 and 9 are not valid here: report "bad escape" below
            raise ValueError
        if len(escape) == 2:
            if kind in ASCIILETTERS:
                raise source.error('bad escape %s' % escape, len(escape))
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise source.error("bad escape %s" % escape, len(escape))
---|
341 | n/a | |
---|
def _escape(source, escape, state):
    """Translate a backslash escape found outside a character class.

    Args:
        source: tokenizer positioned just after *escape*; further digits
            are consumed from it for numeric escapes and group references.
        escape: the two-character escape token.
        state: pattern state used to validate numeric group references.

    Returns:
        A CATEGORIES or ESCAPES entry, a ``(LITERAL, code)`` pair, or a
        ``(GROUPREF, group)`` pair.

    Raises:
        whatever ``source.error`` builds, for incomplete, out-of-range or
        unsupported escapes and for invalid group references.
    """
    category = CATEGORIES.get(escape)
    if category:
        return category
    literal = ESCAPES.get(escape)
    if literal:
        return literal
    try:
        kind = escape[1:2]
        if kind == "x":
            # hexadecimal escape
            escape += source.getwhile(2, HEXDIGITS)
            if len(escape) != 4:
                raise source.error("incomplete escape %s" % escape, len(escape))
            return LITERAL, int(escape[2:], 16)
        if kind == "u" and source.istext:
            # unicode escape (exactly four digits)
            escape += source.getwhile(4, HEXDIGITS)
            if len(escape) != 6:
                raise source.error("incomplete escape %s" % escape, len(escape))
            return LITERAL, int(escape[2:], 16)
        if kind == "U" and source.istext:
            # unicode escape (exactly eight digits)
            escape += source.getwhile(8, HEXDIGITS)
            if len(escape) != 10:
                raise source.error("incomplete escape %s" % escape, len(escape))
            codepoint = int(escape[2:], 16)
            chr(codepoint)  # raise ValueError for invalid code
            return LITERAL, codepoint
        if kind == "0":
            # octal escape
            escape += source.getwhile(2, OCTDIGITS)
            return LITERAL, int(escape[1:], 8)
        if kind in DIGITS:
            # octal escape *or* decimal group reference (sigh)
            if source.next in DIGITS:
                escape += source.get()
                if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
                        source.next in OCTDIGITS):
                    # got three octal digits; this is an octal escape
                    escape += source.get()
                    value = int(escape[1:], 8)
                    if value > 0o377:
                        raise source.error('octal escape value %s outside of '
                                           'range 0-0o377' % escape,
                                           len(escape))
                    return LITERAL, value
            # not an octal escape, so this is a group reference
            group = int(escape[1:])
            if group < state.groups:
                if not state.checkgroup(group):
                    raise source.error("cannot refer to an open group",
                                       len(escape))
                state.checklookbehindgroup(group, source)
                return GROUPREF, group
            raise source.error("invalid group reference %d" % group, len(escape) - 1)
        if len(escape) == 2:
            if kind in ASCIILETTERS:
                raise source.error("bad escape %s" % escape, len(escape))
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise source.error("bad escape %s" % escape, len(escape))
---|
406 | n/a | |
---|
def _parse_sub(source, state, verbose, nested=True):
    """Parse an alternation (``a|b|c``) and return it as a single SubPattern.

    Args:
        source: tokenizer to read from.
        state: pattern state handed to every SubPattern created here.
        verbose: passed through to ``_parse`` for each alternative.
        nested: accepted but not read in this function.

    Returns:
        The lone alternative itself when there is no ``|``; otherwise a new
        SubPattern holding any items common to the start of every
        alternative, followed by either an IN set (when every remaining
        alternative is a single LITERAL) or a BRANCH item.
    """
    start = source.tell()  # recorded before parsing; not read below

    alternatives = [_parse(source, state, verbose)]
    while source.match("|"):
        alternatives.append(_parse(source, state, verbose))

    if len(alternatives) == 1:
        return alternatives[0]

    result = SubPattern(state)

    # hoist leading items shared by all alternatives out of the branch,
    # one item per pass, until some alternative is empty or differs
    while True:
        shared = None
        for alt in alternatives:
            if not alt:
                break
            if shared is None:
                shared = alt[0]
            elif alt[0] != shared:
                break
        else:
            for alt in alternatives:
                del alt[0]
            result.append(shared)
            continue  # look for another shared item
        break

    # a branch of single literals can be stored as a character set instead
    # (the compiler may optimize this even more)
    if all(len(alt) == 1 and alt[0][0] is LITERAL for alt in alternatives):
        result.append((IN, [alt[0] for alt in alternatives]))
        return result

    result.append((BRANCH, (None, alternatives)))
    return result
---|
456 | n/a | |
---|
def _parse_sub_cond(source, state, condgroup, verbose):
    """Parse the body of a conditional group: ``yes`` or ``yes|no``.

    Returns a SubPattern containing one GROUPREF_EXISTS item whose argument
    is ``(condgroup, item_yes, item_no)``; ``item_no`` is None when no
    ``|`` follows the first part.  Raises ``source.error(...)`` if a
    further ``|`` follows the second part.
    """
    item_yes = _parse(source, state, verbose)
    item_no = None
    if source.match("|"):
        item_no = _parse(source, state, verbose)
        if source.next == "|":
            raise source.error("conditional backref with more than two branches")
    conditional = SubPattern(state)
    conditional.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
    return conditional
---|
468 | n/a | |
---|
def _parse(source, state, verbose):
    """Parse a simple pattern (one alternative) into a SubPattern.

    Tokens are consumed from *source* until the end of the pattern or an
    unconsumed ``|`` or ``)`` is reached.

    Args:
        source: tokenizer to read from.
        state: pattern state; groups are opened/closed on it and group
            references are validated against it.
        verbose: when true, whitespace and ``#`` comments (up to the end of
            the line) are skipped.

    Returns:
        A SubPattern bound to *state* holding the parsed items.

    Raises:
        whatever ``source.error`` builds, for malformed patterns;
        OverflowError when a ``{m,n}`` count is not below MAXREPEAT;
        AssertionError if a special character has no handler (should not
        happen with the SPECIAL_CHARS / REPEAT_CHARS defined in this file).
    """
    subpattern = SubPattern(state)

    # precompute constants into local variables
    subpatternappend = subpattern.append
    sourceget = source.get
    sourcematch = source.match
    _len = len
    _ord = ord

    while True:

        this = source.next
        if this is None:
            break # end of pattern
        if this in "|)":
            break # end of subpattern
        sourceget()

        if verbose:
            # skip whitespace and comments
            if this in WHITESPACE:
                continue
            if this == "#":
                while True:
                    this = sourceget()
                    if this is None or this == "\n":
                        break
                continue

        if this[0] == "\\":
            code = _escape(source, this, state)
            subpatternappend(code)

        elif this not in SPECIAL_CHARS:
            subpatternappend((LITERAL, _ord(this)))

        elif this == "[":
            here = source.tell() - 1
            # character set
            charset = []
            charsetappend = charset.append
##          if sourcematch(":"):
##              pass # handle character classes
            if sourcematch("^"):
                charsetappend((NEGATE, None))
            # check remaining characters; a "]" only closes the set once
            # something has been added after the optional "^"
            charset_start = charset[:]
            while True:
                this = sourceget()
                if this is None:
                    raise source.error("unterminated character set",
                                       source.tell() - here)
                if this == "]" and charset != charset_start:
                    break
                elif this[0] == "\\":
                    code1 = _class_escape(source, this)
                else:
                    code1 = LITERAL, _ord(this)
                if sourcematch("-"):
                    # potential range
                    that = sourceget()
                    if that is None:
                        raise source.error("unterminated character set",
                                           source.tell() - here)
                    if that == "]":
                        # trailing "-" is a literal hyphen
                        if code1[0] is IN:
                            code1 = code1[1][0]
                        charsetappend(code1)
                        charsetappend((LITERAL, _ord("-")))
                        break
                    if that[0] == "\\":
                        code2 = _class_escape(source, that)
                    else:
                        code2 = LITERAL, _ord(that)
                    if code1[0] != LITERAL or code2[0] != LITERAL:
                        msg = "bad character range %s-%s" % (this, that)
                        raise source.error(msg, len(this) + 1 + len(that))
                    lo = code1[1]
                    hi = code2[1]
                    if hi < lo:
                        msg = "bad character range %s-%s" % (this, that)
                        raise source.error(msg, len(this) + 1 + len(that))
                    charsetappend((RANGE, (lo, hi)))
                else:
                    if code1[0] is IN:
                        # unwrap the one-member set of a category escape
                        code1 = code1[1][0]
                    charsetappend(code1)

            # XXX: <fl> should move set optimization to compiler!
            if _len(charset)==1 and charset[0][0] is LITERAL:
                subpatternappend(charset[0]) # optimization
            elif (_len(charset)==2 and charset[0][0] is NEGATE
                  and charset[1][0] is LITERAL):
                subpatternappend((NOT_LITERAL, charset[1][1])) # optimization
            else:
                # XXX: <fl> should add charmap optimization here
                subpatternappend((IN, charset))

        elif this in REPEAT_CHARS:
            # repeat previous item
            here = source.tell()
            if this == "?":
                min_count, max_count = 0, 1
            elif this == "*":
                min_count, max_count = 0, MAXREPEAT

            elif this == "+":
                min_count, max_count = 1, MAXREPEAT
            elif this == "{":
                if source.next == "}":
                    # "{}" is not a repeat: keep the brace as a literal
                    subpatternappend((LITERAL, _ord(this)))
                    continue
                min_count, max_count = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
                    lo += sourceget()
                if sourcematch(","):
                    while source.next in DIGITS:
                        hi += sourceget()
                else:
                    hi = lo
                if not sourcematch("}"):
                    # not a well-formed {m,n}: treat "{" as a literal and
                    # re-read whatever followed it
                    subpatternappend((LITERAL, _ord(this)))
                    source.seek(here)
                    continue
                if lo:
                    min_count = int(lo)
                    if min_count >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                if hi:
                    max_count = int(hi)
                    if max_count >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                    if max_count < min_count:
                        raise source.error("min repeat greater than max repeat",
                                           source.tell() - here)
            else:
                # report the quantifier actually being handled (`this`);
                # `char` is only bound by the "(" branch below
                raise AssertionError("unsupported quantifier %r" % (this,))
            # figure out which item to repeat
            if subpattern:
                item = subpattern[-1:]
            else:
                item = None
            if not item or (_len(item) == 1 and item[0][0] is AT):
                raise source.error("nothing to repeat",
                                   source.tell() - here + len(this))
            if item[0][0] in _REPEATCODES:
                raise source.error("multiple repeat",
                                   source.tell() - here + len(this))
            if sourcematch("?"):
                subpattern[-1] = (MIN_REPEAT, (min_count, max_count, item))
            else:
                subpattern[-1] = (MAX_REPEAT, (min_count, max_count, item))

        elif this == ".":
            subpatternappend((ANY, None))

        elif this == "(":
            group_start = source.tell() - 1
            group = True
            name = None
            condgroup = None
            add_flags = 0
            del_flags = 0
            if sourcematch("?"):
                # options
                char = sourceget()
                if char is None:
                    raise source.error("unexpected end of pattern")
                if char == "P":
                    # python extensions
                    if sourcematch("<"):
                        # named group: skip forward to end of name
                        name = source.getuntil(">")
                        if not name.isidentifier():
                            msg = "bad character in group name %r" % name
                            raise source.error(msg, len(name) + 1)
                    elif sourcematch("="):
                        # named backreference
                        name = source.getuntil(")")
                        if not name.isidentifier():
                            msg = "bad character in group name %r" % name
                            raise source.error(msg, len(name) + 1)
                        gid = state.groupdict.get(name)
                        if gid is None:
                            msg = "unknown group name %r" % name
                            raise source.error(msg, len(name) + 1)
                        if not state.checkgroup(gid):
                            raise source.error("cannot refer to an open group",
                                               len(name) + 1)
                        state.checklookbehindgroup(gid, source)
                        subpatternappend((GROUPREF, gid))
                        continue
                    else:
                        char = sourceget()
                        if char is None:
                            raise source.error("unexpected end of pattern")
                        raise source.error("unknown extension ?P" + char,
                                           len(char) + 2)
                elif char == ":":
                    # non-capturing group
                    group = None
                elif char == "#":
                    # comment
                    while True:
                        if source.next is None:
                            raise source.error("missing ), unterminated comment",
                                               source.tell() - group_start)
                        if sourceget() == ")":
                            break
                    continue
                elif char in "=!<":
                    # lookahead assertions
                    direction = 1
                    if char == "<":
                        char = sourceget()
                        if char is None:
                            raise source.error("unexpected end of pattern")
                        if char not in "=!":
                            raise source.error("unknown extension ?<" + char,
                                               len(char) + 2)
                        direction = -1 # lookbehind
                        # only the outermost lookbehind sets (and later
                        # clears) the limit on referable groups
                        lookbehindgroups = state.lookbehindgroups
                        if lookbehindgroups is None:
                            state.lookbehindgroups = state.groups
                    p = _parse_sub(source, state, verbose)
                    if direction < 0:
                        if lookbehindgroups is None:
                            state.lookbehindgroups = None
                    if not sourcematch(")"):
                        raise source.error("missing ), unterminated subpattern",
                                           source.tell() - group_start)
                    if char == "=":
                        subpatternappend((ASSERT, (direction, p)))
                    else:
                        subpatternappend((ASSERT_NOT, (direction, p)))
                    continue
                elif char == "(":
                    # conditional backreference group
                    condname = source.getuntil(")")
                    group = None
                    if condname.isidentifier():
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
                            msg = "unknown group name %r" % condname
                            raise source.error(msg, len(condname) + 1)
                    else:
                        try:
                            condgroup = int(condname)
                            if condgroup < 0:
                                raise ValueError
                        except ValueError:
                            msg = "bad character in group name %r" % condname
                            raise source.error(msg, len(condname) + 1) from None
                        if not condgroup:
                            raise source.error("bad group number",
                                               len(condname) + 1)
                        if condgroup >= MAXGROUPS:
                            msg = "invalid group reference %d" % condgroup
                            raise source.error(msg, len(condname) + 1)
                    state.checklookbehindgroup(condgroup, source)
                elif char in FLAGS or char == "-":
                    # flags
                    pos = source.pos
                    flags = _parse_flags(source, state, char)
                    if flags is None:  # global flags
                        if pos != 3:  # "(?x"
                            import warnings
                            warnings.warn(
                                'Flags not at the start of the expression %s%s' % (
                                    source.string[:20],  # truncate long regexes
                                    ' (truncated)' if len(source.string) > 20 else '',
                                ),
                                DeprecationWarning, stacklevel=7
                            )
                        continue
                    add_flags, del_flags = flags
                    group = None
                else:
                    raise source.error("unknown extension ?" + char,
                                       len(char) + 1)

            # parse group contents
            if group is not None:
                try:
                    group = state.opengroup(name)
                except error as err:
                    # opengroup can also fail for an unnamed group ("too
                    # many groups"); name is None then, so there is no
                    # name length to step back over
                    offset = len(name) + 1 if name is not None else 0
                    raise source.error(err.msg, offset) from None
            if condgroup:
                p = _parse_sub_cond(source, state, condgroup, verbose)
            else:
                sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
                               not (del_flags & SRE_FLAG_VERBOSE))
                p = _parse_sub(source, state, sub_verbose)
            if not source.match(")"):
                raise source.error("missing ), unterminated subpattern",
                                   source.tell() - group_start)
            if group is not None:
                state.closegroup(group, p)
            subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))

        elif this == "^":
            subpatternappend((AT, AT_BEGINNING))

        elif this == "$":
            subpattern.append((AT, AT_END))

        else:
            # report the character actually being handled (`this`);
            # `char` may be unbound or left over from an earlier group
            raise AssertionError("unsupported special character %r" % (this,))

    return subpattern
---|
781 | n/a | |
---|
782 | n/a | def _parse_flags(source, state, char): |
---|
783 | n/a | sourceget = source.get |
---|
784 | n/a | add_flags = 0 |
---|
785 | n/a | del_flags = 0 |
---|
786 | n/a | if char != "-": |
---|
787 | n/a | while True: |
---|
788 | n/a | add_flags |= FLAGS[char] |
---|
789 | n/a | char = sourceget() |
---|
790 | n/a | if char is None: |
---|
791 | n/a | raise source.error("missing -, : or )") |
---|
792 | n/a | if char in ")-:": |
---|
793 | n/a | break |
---|
794 | n/a | if char not in FLAGS: |
---|
795 | n/a | msg = "unknown flag" if char.isalpha() else "missing -, : or )" |
---|
796 | n/a | raise source.error(msg, len(char)) |
---|
797 | n/a | if char == ")": |
---|
798 | n/a | if ((add_flags & SRE_FLAG_VERBOSE) and |
---|
799 | n/a | not (state.flags & SRE_FLAG_VERBOSE)): |
---|
800 | n/a | raise Verbose |
---|
801 | n/a | state.flags |= add_flags |
---|
802 | n/a | return None |
---|
803 | n/a | if add_flags & GLOBAL_FLAGS: |
---|
804 | n/a | raise source.error("bad inline flags: cannot turn on global flag", 1) |
---|
805 | n/a | if char == "-": |
---|
806 | n/a | char = sourceget() |
---|
807 | n/a | if char is None: |
---|
808 | n/a | raise source.error("missing flag") |
---|
809 | n/a | if char not in FLAGS: |
---|
810 | n/a | msg = "unknown flag" if char.isalpha() else "missing flag" |
---|
811 | n/a | raise source.error(msg, len(char)) |
---|
812 | n/a | while True: |
---|
813 | n/a | del_flags |= FLAGS[char] |
---|
814 | n/a | char = sourceget() |
---|
815 | n/a | if char is None: |
---|
816 | n/a | raise source.error("missing :") |
---|
817 | n/a | if char == ":": |
---|
818 | n/a | break |
---|
819 | n/a | if char not in FLAGS: |
---|
820 | n/a | msg = "unknown flag" if char.isalpha() else "missing :" |
---|
821 | n/a | raise source.error(msg, len(char)) |
---|
822 | n/a | assert char == ":" |
---|
823 | n/a | if del_flags & GLOBAL_FLAGS: |
---|
824 | n/a | raise source.error("bad inline flags: cannot turn off global flag", 1) |
---|
825 | n/a | if add_flags & del_flags: |
---|
826 | n/a | raise source.error("bad inline flags: flag turned on and off", 1) |
---|
827 | n/a | return add_flags, del_flags |
---|
828 | n/a | |
---|
def fix_flags(src, flags):
    """Validate `flags` against the pattern type and return the final flags.

    For a str pattern: LOCALE is rejected, UNICODE is added when ASCII is
    not set, and ASCII together with UNICODE is rejected.  For any other
    pattern type (bytes): UNICODE is rejected, as is LOCALE combined with
    ASCII.  Raises ValueError for every rejected combination.
    """
    if not isinstance(src, str):
        # bytes pattern
        if flags & SRE_FLAG_UNICODE:
            raise ValueError("cannot use UNICODE flag with a bytes pattern")
        if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII:
            raise ValueError("ASCII and LOCALE flags are incompatible")
        return flags

    # str pattern
    if flags & SRE_FLAG_LOCALE:
        raise ValueError("cannot use LOCALE flag with a str pattern")
    if flags & SRE_FLAG_ASCII:
        if flags & SRE_FLAG_UNICODE:
            raise ValueError("ASCII and UNICODE flags are incompatible")
        return flags
    # Neither LOCALE nor ASCII requested: str patterns are UNICODE.
    return flags | SRE_FLAG_UNICODE
---|
844 | n/a | |
---|
def parse(str, flags=0, pattern=None):
    """Parse an 're' pattern into a tree of (opcode, argument) tuples.

    `str` is the pattern text, `flags` the initial flag bits and `pattern`
    an optional state object to parse into (a new Pattern() is created
    when it is None).  Returns the parsed subpattern after its state's
    flags have been passed through fix_flags().

    Raises `source.error("unbalanced parenthesis")` when input is left
    over after parsing.  With SRE_FLAG_DEBUG set in `flags`, the result is
    dumped before it is returned.
    """
    tokens = Tokenizer(str)

    state = Pattern() if pattern is None else pattern
    state.flags = flags
    state.str = str

    try:
        tree = _parse_sub(tokens, state, flags & SRE_FLAG_VERBOSE, False)
    except Verbose:
        # VERBOSE was turned on from inside the pattern.  Start over from
        # the beginning with a fresh state that has the flag set up front.
        state = Pattern()
        state.flags = flags | SRE_FLAG_VERBOSE
        state.str = str
        tokens.seek(0)
        tree = _parse_sub(tokens, state, True, False)

    tree.pattern.flags = fix_flags(str, tree.pattern.flags)

    leftover = tokens.next
    if leftover is not None:
        # The sub-parser stopped early; only a stray ")" can cause that.
        assert leftover == ")"
        raise tokens.error("unbalanced parenthesis")

    if flags & SRE_FLAG_DEBUG:
        tree.dump()

    return tree
---|
876 | n/a | |
---|
def parse_template(source, pattern):
    """Parse an 're' replacement string into literals and group references.

    `source` is the replacement template (str or bytes) and `pattern`
    supplies `groups` (upper bound for group numbers) and `groupindex`
    (group name -> number).

    Returns (groups, literals).  `literals` is a list of literal chunks
    with None in every position where a group's text must be inserted;
    `groups` is a list of (position in literals, group number) pairs.
    For non-str input the literal chunks are encoded back with latin-1.

    Raises `s.error(...)` (from the tokenizer) for malformed templates and
    IndexError for an unknown group name.
    """
    tokens = Tokenizer(source)
    next_token = tokens.get
    group_slots = []    # (slot position in `pieces`, group number)
    pieces = []         # finished literal runs; None marks a group slot
    pending = []        # parts of the literal run currently being built
    known_names = pattern.groupindex

    def flush_pending():
        # Close the current literal run, if there is one.
        if pending:
            pieces.append(''.join(pending))
            del pending[:]

    def add_group_ref(number, pos):
        if number > pattern.groups:
            raise tokens.error("invalid group reference %d" % number, pos)
        flush_pending()
        group_slots.append((len(pieces), number))
        pieces.append(None)

    while True:
        this = next_token()
        if this is None:
            break  # end of replacement string
        if this[0] != "\\":
            pending.append(this)
            continue

        c = this[1]
        if c == "g":
            # \g<name> or \g<number>
            if not tokens.match("<"):
                raise tokens.error("missing <")
            name = tokens.getuntil(">")
            if name.isidentifier():
                try:
                    number = known_names[name]
                except KeyError:
                    raise IndexError("unknown group name %r" % name)
            else:
                try:
                    number = int(name)
                    if number < 0:
                        raise ValueError
                except ValueError:
                    raise tokens.error(
                        "bad character in group name %r" % name,
                        len(name) + 1) from None
                if number >= MAXGROUPS:
                    raise tokens.error(
                        "invalid group reference %d" % number,
                        len(name) + 1)
            add_group_ref(number, len(name) + 1)
        elif c == "0":
            # \0 followed by up to two more octal digits: octal escape
            if tokens.next in OCTDIGITS:
                this += next_token()
                if tokens.next in OCTDIGITS:
                    this += next_token()
            pending.append(chr(int(this[1:], 8) & 0xff))
        elif c in DIGITS:
            # three octal digits form an octal escape; anything else
            # (one or two digits) is a numbered group reference
            octal = False
            if tokens.next in DIGITS:
                this += next_token()
                if (c in OCTDIGITS and this[2] in OCTDIGITS and
                        tokens.next in OCTDIGITS):
                    this += next_token()
                    octal = True
                    code = int(this[1:], 8)
                    if code > 0o377:
                        raise tokens.error('octal escape value %s outside of '
                                           'range 0-0o377' % this, len(this))
                    pending.append(chr(code))
            if not octal:
                add_group_ref(int(this[1:]), len(this) - 1)
        else:
            try:
                this = chr(ESCAPES[this][1])
            except KeyError:
                # Unknown escape: letters are rejected, anything else is
                # kept verbatim (backslash included).
                if c in ASCIILETTERS:
                    raise tokens.error('bad escape %s' % this, len(this))
            pending.append(this)

    flush_pending()
    if not isinstance(source, str):
        # The tokenizer implicitly decodes bytes objects as latin-1, we must
        # therefore re-encode the final representation.
        pieces = [piece if piece is None else piece.encode('latin-1')
                  for piece in pieces]
    return group_slots, pieces
---|
961 | n/a | |
---|
def expand_template(template, match):
    """Fill a parsed replacement template with text from `match`.

    `template` is a (groups, literals) pair as returned by
    parse_template(): `literals` holds the literal chunks with a
    placeholder wherever a group goes, and `groups` holds
    (position in literals, group number) pairs.

    Returns the literal chunks joined together, with every placeholder
    replaced by the text of its group.  A group whose value is falsy
    (e.g. it did not participate in the match) contributes an empty
    string of the same type as `match.string`.

    Raises `error` when filling a placeholder raises IndexError.
    """
    g = match.group
    # Zero-length slice: an empty str or bytes matching the subject type.
    empty = match.string[:0]
    groups, literals = template
    literals = literals[:]  # work on a copy; the template may be reused
    try:
        for index, group in groups:
            literals[index] = g(group) or empty
    except IndexError:
        # Report the offending group number, not the position of its
        # placeholder inside `literals`.
        raise error("invalid group reference %d" % group) from None
    return empty.join(literals)
---|