"""
Tests for the html module functions.
"""

import html
import unittest


class HtmlTests(unittest.TestCase):
    """Tests for ``html.escape`` and ``html.unescape``.

    Non-ASCII expected values are spelled with ``\\u`` escapes so that the
    source stays pure ASCII and cannot be damaged by a re-encoding step.
    """

    def test_escape(self):
        """``escape`` converts ``&``, ``<``, ``>`` and, optionally, quotes."""
        raw = '\'<script>"&foo;"</script>\''
        # Default (quote=True): both quote characters are escaped as well.
        self.assertEqual(
            html.escape(raw),
            '&#x27;&lt;script&gt;&quot;&amp;foo;&quot;&lt;/script&gt;&#x27;')
        # quote=False: quote characters are passed through untouched.
        self.assertEqual(
            html.escape(raw, False),
            '\'&lt;script&gt;"&amp;foo;"&lt;/script&gt;\'')

    def test_unescape(self):
        """``unescape`` resolves named and numeric character references."""
        numeric_formats = ['&#%d', '&#%d;', '&#x%x', '&#x%x;']
        errmsg = 'unescape(%r) should have returned %r'

        def check(text, expected):
            """Assert that unescaping *text* yields *expected*."""
            self.assertEqual(html.unescape(text), expected,
                             msg=errmsg % (text, expected))

        def check_num(num, expected):
            """Check *num* as a decimal and hex reference, with/without ';'."""
            for fmt in numeric_formats:
                check(fmt % num, expected)

        # check text with no character references
        check('no character references', 'no character references')
        # check & followed by invalid chars
        check('&\n&\t& &&', '&\n&\t& &&')
        # check & followed by numbers and letters
        check('&0 &9 &a &0; &9; &a;', '&0 &9 &a &0; &9; &a;')
        # check incomplete entities at the end of the string
        for x in ['&', '&#', '&#x', '&#X', '&#y', '&#xy', '&#Xy']:
            check(x, x)
            check(x + ';', x + ';')
        # check several combinations of numeric character references,
        # possibly followed by different characters
        formats = ['&#%d', '&#%07d', '&#%d;', '&#%07d;',
                   '&#x%x', '&#x%06x', '&#x%x;', '&#x%06x;',
                   '&#x%X', '&#x%06X', '&#X%x;', '&#X%06x;']
        for num, char in zip([65, 97, 34, 38, 0x2603, 0x101234],
                             ['A', 'a', '"', '&', '\u2603', '\U00101234']):
            for s in formats:
                check(s % num, char)
                for end in [' ', 'X']:
                    check((s + end) % num, char + end)
        # check invalid code points
        for cp in [0xD800, 0xDB00, 0xDC00, 0xDFFF, 0x110000]:
            check_num(cp, '\uFFFD')
        # check more invalid code points
        for cp in [0x1, 0xb, 0xe, 0x7f, 0xfffe, 0xffff, 0x10fffe, 0x10ffff]:
            check_num(cp, '')
        # check invalid numbers
        for num, ch in zip([0x0d, 0x80, 0x95, 0x9d], '\r\u20ac\u2022\x9d'):
            check_num(num, ch)
        # check small numbers
        check_num(0, '\uFFFD')
        check_num(9, '\t')
        # check a big number
        check_num(1000000000000000000, '\uFFFD')
        # check that multiple trailing semicolons are handled correctly
        for e in ['&quot;;', '&#34;;', '&#x22;;', '&#X22;;']:
            check(e, '";')
        # check that semicolons in the middle don't create problems
        for e in ['&quot;quot;', '&#34;quot;', '&#x22;quot;', '&#X22;quot;']:
            check(e, '"quot;')
        # check triple adjacent charrefs
        for e in ['&quot', '&#34', '&#x22', '&#X22']:
            check(e * 3, '"""')
            check((e + ';') * 3, '"""')
        # check that the case is respected
        for e in ['&amp', '&amp;', '&AMP', '&AMP;']:
            check(e, '&')
        for e in ['&Amp', '&Amp;']:
            check(e, e)
        # check that non-existent named entities are returned unchanged
        check('&svadilfari;', '&svadilfari;')
        # the following examples are in the html5 specs
        # (U+00AC is NOT SIGN, U+2209 is NOT AN ELEMENT OF)
        check('&notit', '\u00ACit')
        check('&notit;', '\u00ACit;')
        check('&notin', '\u00ACin')
        check('&notin;', '\u2209')
        # a similar example with a long name
        check('&notReallyAnExistingNamedCharacterReference;',
              '\u00ACReallyAnExistingNamedCharacterReference;')
        # longest valid name (U+2233 is ANTICLOCKWISE CONTOUR INTEGRAL)
        check('&CounterClockwiseContourIntegral;', '\u2233')
        # check a charref that maps to two unicode chars
        check('&acE;', '\u223E\u0333')
        check('&acE', '&acE')
        # see #12888
        check('&#123; ' * 1050, '{ ' * 1050)
        # see #15156
        # '&Eacute' is recognised without its semicolon, '&alpha' is not.
        check('&Eacuteric&Eacute;ric&alphacentauri&alpha;centauri',
              '\u00C9ric\u00C9ric&alphacentauri\u03B1centauri')
        check('&co;', '&co;')


if __name__ == '__main__':
    unittest.main()