"""
Tests for the html module functions.
"""
| 4 | n/a | |
|---|
| 5 | n/a | import html |
|---|
| 6 | n/a | import unittest |
|---|
| 7 | n/a | |
|---|
| 8 | n/a | |
|---|
class HtmlTests(unittest.TestCase):
    """Tests for ``html.escape`` and ``html.unescape``.

    Every character reference below is spelled out literally (``&lt;``,
    ``&#34;``, ...) because those exact byte sequences are the test data.
    Non-ASCII expected values use ``\\u`` escapes so that a re-encoding of
    this file cannot silently change what the tests assert.
    """

    def test_escape(self):
        """Check ``html.escape`` with and without quote escaping."""
        raw = '\'<script>"&foo;"</script>\''
        # Default call: the single and double quotes are escaped too.
        self.assertEqual(
            html.escape(raw),
            '&#x27;&lt;script&gt;&quot;&amp;foo;&quot;&lt;/script&gt;&#x27;')
        # Second argument False: quotes stay as they are, while '&', '<'
        # and '>' are still replaced.
        self.assertEqual(
            html.escape(raw, False),
            '\'&lt;script&gt;"&amp;foo;"&lt;/script&gt;\'')

    def test_unescape(self):
        """Check ``html.unescape`` on numeric and named character references."""
        numeric_formats = ['&#%d', '&#%d;', '&#x%x', '&#x%x;']
        errmsg = 'unescape(%r) should have returned %r'

        def check(text, expected):
            # Assert that unescaping `text` yields exactly `expected`.
            self.assertEqual(html.unescape(text), expected,
                             msg=errmsg % (text, expected))

        def check_num(num, expected):
            # Try `num` as decimal and as hex, with and without the ';'.
            for fmt in numeric_formats:
                check(fmt % num, expected)

        # check text with no character references
        check('no character references', 'no character references')
        # check & followed by invalid chars
        check('&\n&\t& &&', '&\n&\t& &&')
        # check & followed by numbers and letters
        check('&0 &9 &a &0; &9; &a;', '&0 &9 &a &0; &9; &a;')
        # check incomplete entities at the end of the string
        for x in ['&', '&#', '&#x', '&#X', '&#y', '&#xy', '&#Xy']:
            check(x, x)
            check(x+';', x+';')
        # check several combinations of numeric character references,
        # possibly followed by different characters
        formats = ['&#%d', '&#%07d', '&#%d;', '&#%07d;',
                   '&#x%x', '&#x%06x', '&#x%x;', '&#x%06x;',
                   '&#x%X', '&#x%06X', '&#X%x;', '&#X%06x;']
        for num, char in zip([65, 97, 34, 38, 0x2603, 0x101234],
                             ['A', 'a', '"', '&', '\u2603', '\U00101234']):
            for s in formats:
                check(s % num, char)
                for end in [' ', 'X']:
                    check((s+end) % num, char+end)
        # check invalid code points
        for cp in [0xD800, 0xDB00, 0xDC00, 0xDFFF, 0x110000]:
            check_num(cp, '\uFFFD')
        # check more invalid code points
        for cp in [0x1, 0xb, 0xe, 0x7f, 0xfffe, 0xffff, 0x10fffe, 0x10ffff]:
            check_num(cp, '')
        # check invalid numbers
        for num, ch in zip([0x0d, 0x80, 0x95, 0x9d], '\r\u20ac\u2022\x9d'):
            check_num(num, ch)
        # check small numbers
        check_num(0, '\uFFFD')
        check_num(9, '\t')
        # check a big number
        check_num(1000000000000000000, '\uFFFD')
        # check that multiple trailing semicolons are handled correctly
        for e in ['&quot;;', '&#34;;', '&#x22;;', '&#X22;;']:
            check(e, '";')
        # check that semicolons in the middle don't create problems
        for e in ['&quot;quot;', '&#34;quot;', '&#x22;quot;', '&#X22;quot;']:
            check(e, '"quot;')
        # check triple adjacent charrefs
        for e in ['&quot', '&#34', '&#x22', '&#X22']:
            check(e*3, '"""')
            check((e+';')*3, '"""')
        # check that the case is respected
        for e in ['&amp', '&amp;', '&AMP', '&AMP;']:
            check(e, '&')
        for e in ['&Amp', '&Amp;']:
            check(e, e)
        # check that non-existent named entities are returned unchanged
        check('&svadilfari;', '&svadilfari;')
        # the following examples are in the html5 specs
        # (U+00AC is NOT SIGN, U+2209 is NOT AN ELEMENT OF)
        check('&notit', '\u00acit')
        check('&notit;', '\u00acit;')
        check('&notin', '\u00acin')
        check('&notin;', '\u2209')
        # a similar example with a long name
        check('&notReallyAnExistingNamedCharacterReference;',
              '\u00acReallyAnExistingNamedCharacterReference;')
        # longest valid name (U+2233 is ANTICLOCKWISE CONTOUR INTEGRAL)
        check('&CounterClockwiseContourIntegral;', '\u2233')
        # check a charref that maps to two unicode chars
        check('&acE;', '\u223E\u0333')
        check('&acE', '&acE')
        # see #12888
        check('&#123; ' * 1050, '{ ' * 1050)
        # see #15156
        # '&Eacute' is accepted without ';' but '&alpha' is not, so only
        # the '&alpha;' occurrence is converted (to U+03B1).
        check('&Eacuteric&Eacute;ric&alphacentauri&alpha;centauri',
              '\u00c9ric\u00c9ric&alphacentauri\u03b1centauri')
        check('&co;', '&co;')
|---|
| 100 | n/a | |
|---|
| 101 | n/a | |
|---|
# Run the test suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
|---|