Core Development > Code coverage > Lib/sre_compile.py

Python code coverage for Lib/sre_compile.py

# count | content
1n/a#
2n/a# Secret Labs' Regular Expression Engine
3n/a#
4n/a# convert template to internal format
5n/a#
6n/a# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
7n/a#
8n/a# See the sre.py file for information on usage and redistribution.
9n/a#
10n/a
11n/a"""Internal support module for sre"""
12n/a
13n/aimport _sre
14n/aimport sre_parse
15n/afrom sre_constants import *
16n/a
# The C accelerator (_sre) and the Python-level constants must agree on the
# bytecode format version; fail loudly on a mismatched build.
assert _sre.MAGIC == MAGIC, "SRE module mismatch"

# Opcode groups used by _compile() to dispatch on parsed pattern nodes.
_LITERAL_CODES = {LITERAL, NOT_LITERAL}
_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
_SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT}

# Sets of lowercase characters which have the same uppercase.
# Used to widen IGNORECASE matching beyond simple tolower() for full
# Unicode case folding (see the `fixes` handling in _compile()).
_equivalences = (
    # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
    (0x69, 0x131), # iı
    # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
    (0x73, 0x17f), # sſ
    # MICRO SIGN, GREEK SMALL LETTER MU
    (0xb5, 0x3bc), # µμ
    # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
    (0x345, 0x3b9, 0x1fbe), # \u0345ιι
    # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
    (0x390, 0x1fd3), # ΐΐ
    # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
    (0x3b0, 0x1fe3), # ΰΰ
    # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
    (0x3b2, 0x3d0), # βϐ
    # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
    (0x3b5, 0x3f5), # εϵ
    # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
    (0x3b8, 0x3d1), # θϑ
    # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
    (0x3ba, 0x3f0), # κϰ
    # GREEK SMALL LETTER PI, GREEK PI SYMBOL
    (0x3c0, 0x3d6), # πϖ
    # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
    (0x3c1, 0x3f1), # ρϱ
    # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
    (0x3c2, 0x3c3), # ςσ
    # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
    (0x3c6, 0x3d5), # φϕ
    # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
    (0x1e61, 0x1e9b), # ṡẛ
    # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
    (0xfb05, 0xfb06), # ſtst
)

# Maps the lowercase code to lowercase codes which have the same uppercase.
# Each code in an equivalence class maps to a tuple of its siblings.
_ignorecase_fixes = {i: tuple(j for j in t if i != j)
                     for t in _equivalences for i in t}
63n/a
def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern
    #
    # Appends _sre bytecode for the parsed (sub)pattern to `code`, a flat
    # list of code words.  `pattern` is a sequence of (op, av) pairs as
    # produced by sre_parse; `flags` are the SRE_FLAG_* bits in effect.
    # Raises `error` on unsupported constructs.
    emit = code.append
    _len = len
    # Bind module-level constants to locals; this function recurses over
    # every pattern node, so lookups are kept cheap.
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    # The extra case-equivalence table applies only for full Unicode
    # casefolding: IGNORECASE without LOCALE and without ASCII.
    if (flags & SRE_FLAG_IGNORECASE and
            not (flags & SRE_FLAG_LOCALE) and
            flags & SRE_FLAG_UNICODE and
            not (flags & SRE_FLAG_ASCII)):
        fixes = _ignorecase_fixes
    else:
        fixes = None
    for op, av in pattern:
        if op in LITERAL_CODES:
            if flags & SRE_FLAG_IGNORECASE:
                lo = _sre.getlower(av, flags)
                if fixes and lo in fixes:
                    # The lowercase form has case-equivalent siblings
                    # (e.g. 's' and long s); match any of them with an
                    # inline IN_IGNORE set instead of a single literal.
                    emit(IN_IGNORE)
                    skip = _len(code); emit(0)  # placeholder, patched below
                    if op is NOT_LITERAL:
                        emit(NEGATE)
                    for k in (lo,) + fixes[lo]:
                        emit(LITERAL)
                        emit(k)
                    emit(FAILURE)
                    code[skip] = _len(code) - skip  # patch skip offset
                else:
                    emit(OP_IGNORE[op])
                    emit(lo)
            else:
                emit(op)
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OP_IGNORE[op])
                # Closure lowercases set members; flags captured as default.
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(op)
                fixup = None
            skip = _len(code); emit(0)
            _compile_charset(av, flags, code, fixup, fixes)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(ANY_ALL)
            else:
                emit(ANY)
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                raise error("internal: unsupported template operator %r" % (op,))
            elif _simple(av) and op is not REPEAT:
                # Single-width repeat target: use the cheaper *_ONE forms,
                # which need no backtracking marks.
                if op is MAX_REPEAT:
                    emit(REPEAT_ONE)
                else:
                    emit(MIN_REPEAT_ONE)
                skip = _len(code); emit(0)
                emit(av[0])  # min count
                emit(av[1])  # max count
                _compile(code, av[2], flags)
                emit(SUCCESS)
                code[skip] = _len(code) - skip
            else:
                # General repeat: body followed by MAX_UNTIL/MIN_UNTIL.
                emit(REPEAT)
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(MAX_UNTIL)
                else:
                    emit(MIN_UNTIL)
        elif op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            if group:
                # MARK opcodes record group start/end slots (2 per group).
                emit(MARK)
                emit((group-1)*2)
            # _compile_info(code, p, (flags | add_flags) & ~del_flags)
            # Inline flags like (?i:...) are applied only to the subpattern.
            _compile(code, p, (flags | add_flags) & ~del_flags)
            if group:
                emit(MARK)
                emit((group-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(op)
        elif op in ASSERT_CODES:
            emit(op)
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                # Look-behind: the engine needs a fixed width to back up by.
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error("look-behind requires fixed-width pattern")
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(op)
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(op)
            # Remap anchors for MULTILINE, then LOCALE/UNICODE variants.
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                av = AT_UNICODE.get(av, av)
            emit(av)
        elif op is BRANCH:
            emit(op)
            tail = []
            tailappend = tail.append
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(JUMP)
                tailappend(_len(code)); emit(0)  # JUMP target patched below
                code[skip] = _len(code) - skip
            emit(FAILURE) # end of branch
            # Patch every alternative's JUMP to land just past the branch.
            for tail in tail:
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            emit(op)
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                av = CH_UNICODE[av]
            emit(av)
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OP_IGNORE[op])
            else:
                emit(op)
            emit(av-1)  # group numbers are 1-based in the parse tree
        elif op is GROUPREF_EXISTS:
            emit(op)
            emit(av[0]-1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                # Conditional with a "no" branch: yes-path jumps over it.
                emit(JUMP)
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise error("internal: unsupported operand type %r" % (op,))
222n/a
def _compile_charset(charset, flags, code, fixup=None, fixes=None):
    """Emit the bytecode subprogram for a character set.

    The set is first optimized, then each (op, av) item is serialized
    into `code`, ending with a FAILURE sentinel.
    """
    append = code.append
    for op, av in _optimize_charset(charset, fixup, fixes):
        append(op)
        if op is NEGATE:
            continue  # NEGATE carries no argument
        if op is LITERAL:
            append(av)
        elif op in (RANGE, RANGE_IGNORE):
            append(av[0])
            append(av[1])
        elif op in (CHARSET, BIGCHARSET):
            # av is already a list of code words for the bitmap
            code.extend(av)
        elif op is CATEGORY:
            # Pick the locale/unicode variant of the category opcode.
            if flags & SRE_FLAG_LOCALE:
                append(CH_LOCALE[av])
            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                append(CH_UNICODE[av])
            else:
                append(av)
        else:
            raise error("internal: unsupported set operator %r" % (op,))
    append(FAILURE)
249n/a
def _optimize_charset(charset, fixup, fixes):
    # internal: optimize character set
    #
    # Returns a list of (op, av) pairs equivalent to `charset`, preferring
    # (in order) a couple of literals/ranges, a 256-bit CHARSET bitmap, or
    # a compressed BIGCHARSET.  `fixup` lowercases members for IGNORECASE;
    # `fixes` maps a lowercase code to its case-equivalent siblings.
    out = []
    tail = []   # items that cannot go into the bitmap (categories etc.)
    charmap = bytearray(256)
    for op, av in charset:
        while True:
            try:
                if op is LITERAL:
                    if fixup:
                        lo = fixup(av)
                        charmap[lo] = 1
                        if fixes and lo in fixes:
                            for k in fixes[lo]:
                                charmap[k] = 1
                    else:
                        charmap[av] = 1
                elif op is RANGE:
                    r = range(av[0], av[1]+1)
                    if fixup:
                        r = map(fixup, r)
                    if fixup and fixes:
                        for i in r:
                            charmap[i] = 1
                            if i in fixes:
                                for k in fixes[i]:
                                    charmap[k] = 1
                    else:
                        for i in r:
                            charmap[i] = 1
                elif op is NEGATE:
                    # NEGATE must stay in front of the set items
                    out.append((op, av))
                else:
                    tail.append((op, av))
            except IndexError:
                # A code point fell outside the current charmap; grow it
                # once (to 64K) and retry the same item.
                if len(charmap) == 256:
                    # character set contains non-UCS1 character codes
                    charmap += b'\0' * 0xff00
                    continue
                # Character set contains non-BMP character codes.
                # There are only two ranges of cased non-BMP characters:
                # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
                # and for both ranges RANGE_IGNORE works.
                if fixup and op is RANGE:
                    op = RANGE_IGNORE
                tail.append((op, av))
            break

    # compress character map
    # Scan for runs of set bits; keep at most two, else fall back to a
    # bitmap representation below.
    runs = []
    q = 0
    while True:
        p = charmap.find(1, q)
        if p < 0:
            break
        if len(runs) >= 2:
            runs = None
            break
        q = charmap.find(0, p)
        if q < 0:
            runs.append((p, len(charmap)))
            break
        runs.append((p, q))
    if runs is not None:
        # use literal/range
        for p, q in runs:
            if q - p == 1:
                out.append((LITERAL, p))
            else:
                out.append((RANGE, (p, q - 1)))
        out += tail
        # if the case was changed or new representation is more compact
        if fixup or len(out) < len(charset):
            return out
        # else original character set is good enough
        return charset

    # use bitmap
    if len(charmap) == 256:
        data = _mk_bitmap(charmap)
        out.append((CHARSET, data))
        out += tail
        return out

    # To represent a big charset, first a bitmap of all characters in the
    # set is constructed. Then, this bitmap is sliced into chunks of 256
    # characters, duplicate chunks are eliminated, and each chunk is
    # given a number. In the compiled expression, the charset is
    # represented by a 32-bit word sequence, consisting of one word for
    # the number of different chunks, a sequence of 256 bytes (64 words)
    # of chunk numbers indexed by their original chunk position, and a
    # sequence of 256-bit chunks (8 words each).

    # Compression is normally good: in a typical charset, large ranges of
    # Unicode will be either completely excluded (e.g. if only cyrillic
    # letters are to be matched), or completely included (e.g. if large
    # subranges of Kanji match). These ranges will be represented by
    # chunks of all one-bits or all zero-bits.

    # Matching can be also done efficiently: the more significant byte of
    # the Unicode character is an index into the chunk number, and the
    # less significant byte is a bit index in the chunk (just like the
    # CHARSET matching).

    charmap = bytes(charmap) # should be hashable
    comps = {}
    mapping = bytearray(256)
    block = 0
    data = bytearray()
    for i in range(0, 65536, 256):
        chunk = charmap[i: i + 256]
        if chunk in comps:
            mapping[i // 256] = comps[chunk]
        else:
            mapping[i // 256] = comps[chunk] = block
            block += 1
            data += chunk
    data = _mk_bitmap(data)
    data[0:0] = [block] + _bytes_to_codes(mapping)
    out.append((BIGCHARSET, data))
    out += tail
    return out
372n/a
373n/a_CODEBITS = _sre.CODESIZE * 8
374n/aMAXCODE = (1 << _CODEBITS) - 1
375n/a_BITS_TRANS = b'0' + b'1' * 255
376n/adef _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
377n/a s = bits.translate(_BITS_TRANS)[::-1]
378n/a return [_int(s[i - _CODEBITS: i], 2)
379n/a for i in range(len(s), 0, -_CODEBITS)]
380n/a
381n/adef _bytes_to_codes(b):
382n/a # Convert block indices to word array
383n/a a = memoryview(b).cast('I')
384n/a assert a.itemsize == _sre.CODESIZE
385n/a assert len(a) * a.itemsize == len(b)
386n/a return a.tolist()
387n/a
388n/adef _simple(av):
389n/a # check if av is a "simple" operator
390n/a lo, hi = av[2].getwidth()
391n/a return lo == hi == 1 and av[2][0][0] != SUBPATTERN
392n/a
393n/adef _generate_overlap_table(prefix):
394n/a """
395n/a Generate an overlap table for the following prefix.
396n/a An overlap table is a table of the same size as the prefix which
397n/a informs about the potential self-overlap for each index in the prefix:
398n/a - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
399n/a - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
400n/a prefix[0:k]
401n/a """
402n/a table = [0] * len(prefix)
403n/a for i in range(1, len(prefix)):
404n/a idx = table[i - 1]
405n/a while prefix[i] != prefix[idx]:
406n/a if idx == 0:
407n/a table[i] = 0
408n/a break
409n/a idx = table[idx - 1]
410n/a else:
411n/a table[i] = idx + 1
412n/a return table
413n/a
414n/adef _get_literal_prefix(pattern):
415n/a # look for literal prefix
416n/a prefix = []
417n/a prefixappend = prefix.append
418n/a prefix_skip = None
419n/a for op, av in pattern.data:
420n/a if op is LITERAL:
421n/a prefixappend(av)
422n/a elif op is SUBPATTERN:
423n/a group, add_flags, del_flags, p = av
424n/a if add_flags & SRE_FLAG_IGNORECASE:
425n/a break
426n/a prefix1, prefix_skip1, got_all = _get_literal_prefix(p)
427n/a if prefix_skip is None:
428n/a if group is not None:
429n/a prefix_skip = len(prefix)
430n/a elif prefix_skip1 is not None:
431n/a prefix_skip = len(prefix) + prefix_skip1
432n/a prefix.extend(prefix1)
433n/a if not got_all:
434n/a break
435n/a else:
436n/a break
437n/a else:
438n/a return prefix, prefix_skip, True
439n/a return prefix, prefix_skip, False
440n/a
441n/adef _get_charset_prefix(pattern):
442n/a charset = [] # not used
443n/a charsetappend = charset.append
444n/a if pattern.data:
445n/a op, av = pattern.data[0]
446n/a if op is SUBPATTERN:
447n/a group, add_flags, del_flags, p = av
448n/a if p and not (add_flags & SRE_FLAG_IGNORECASE):
449n/a op, av = p[0]
450n/a if op is LITERAL:
451n/a charsetappend((op, av))
452n/a elif op is BRANCH:
453n/a c = []
454n/a cappend = c.append
455n/a for p in av[1]:
456n/a if not p:
457n/a break
458n/a op, av = p[0]
459n/a if op is LITERAL:
460n/a cappend((op, av))
461n/a else:
462n/a break
463n/a else:
464n/a charset = c
465n/a elif op is BRANCH:
466n/a c = []
467n/a cappend = c.append
468n/a for p in av[1]:
469n/a if not p:
470n/a break
471n/a op, av = p[0]
472n/a if op is LITERAL:
473n/a cappend((op, av))
474n/a else:
475n/a break
476n/a else:
477n/a charset = c
478n/a elif op is IN:
479n/a charset = av
480n/a return charset
481n/a
def _compile_info(code, pattern, flags):
    # internal: compile an info block. in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    lo, hi = pattern.getwidth()
    if hi > MAXCODE:
        hi = MAXCODE
    if lo == 0:
        # Pattern can match the empty string; prefix/charset hints would
        # not help, so emit a minimal info block and return.
        code.extend([INFO, 4, 0, lo, hi])
        return
    # look for a literal prefix
    prefix = []
    prefix_skip = 0
    charset = [] # not used
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        # NOTE: got_all is only bound on this path, but it is only read
        # below when prefix is non-empty, which implies this path ran.
        prefix, prefix_skip, got_all = _get_literal_prefix(pattern)
        # if no prefix, look for charset prefix
        if not prefix:
            charset = _get_charset_prefix(pattern)
##     if prefix:
##         print("*** PREFIX", prefix, prefix_skip)
##     if charset:
##         print("*** CHARSET", charset)
    # add an info block
    emit = code.append
    emit(INFO)
    skip = len(code); emit(0)   # placeholder for block length, patched below
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if prefix_skip is None and got_all:
            # The whole pattern is one literal run: search can be a
            # plain substring scan.
            mask = mask | SRE_INFO_LITERAL
    elif charset:
        mask = mask | SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    emit(min(hi, MAXCODE))
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        if prefix_skip is None:
            prefix_skip = len(prefix)
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table
        code.extend(_generate_overlap_table(prefix))
    elif charset:
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip
538n/a
def isstring(obj):
    """Return True if obj can be used as a pattern (text or bytes)."""
    return isinstance(obj, str) or isinstance(obj, bytes)
541n/a
def _code(p, flags):
    """Turn a parsed pattern object into a flat list of code words:
    info block, compiled pattern body, terminating SUCCESS."""
    flags |= p.pattern.flags
    ops = []

    # compile info block
    _compile_info(ops, p, flags)

    # compile the pattern
    _compile(ops, p.data, flags)
    ops.append(SUCCESS)

    return ops
556n/a
def compile(p, flags=0):
    # internal: convert pattern list to internal format
    #
    # Accepts either a pattern string/bytes (parsed here) or an already
    # parsed sre_parse pattern object.
    if isstring(p):
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    # Build the name->index map and its inverse for group access.
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for name, num in groupindex.items():
        indexgroup[num] = name

    return _sre.compile(
        pattern, flags | p.pattern.flags, code,
        p.pattern.groups - 1,
        groupindex, tuple(indexgroup))