»Core Development>Code coverage>Lib/test/test_unicode.py

Python code coverage for Lib/test/test_unicode.py

""" Test script for the Unicode implementation.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""
8n/aimport _string
9n/aimport codecs
10n/aimport itertools
11n/aimport operator
12n/aimport struct
13n/aimport string
14n/aimport sys
15n/aimport unittest
16n/aimport warnings
17n/afrom test import support, string_tests
19n/a# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search function that hands out deliberately broken codecs.

    For "test.unicode1" the encoder and decoder return a bare int rather
    than a tuple; for "test.unicode2" they return a tuple whose first
    item is not a string.  Every other encoding name yields None.
    """
    def decode1(input, errors="strict"):
        return 42 # not a tuple

    def encode1(input, errors="strict"):
        return 42 # not a tuple

    def encode2(input, errors="strict"):
        return (42, 42) # no unicode

    def decode2(input, errors="strict"):
        return (42, 42) # no unicode

    if encoding == "test.unicode1":
        return (encode1, decode1, None, None)
    if encoding == "test.unicode2":
        return (encode2, decode2, None, None)
    return None
def duplicate_string(text):
    """
    Try to get a fresh clone of the specified text:
    new object with a reference count of 1.

    This is a best-effort: latin1 single letters and the empty
    string ('') are singletons and cannot be cloned.
    """
    # Round-trip through UTF-8 to force a new object.  'surrogatepass'
    # keeps lone surrogates intact; the strict handler would raise
    # UnicodeEncodeError on them instead of returning a clone.
    return text.encode('utf-8', 'surrogatepass').decode('utf-8', 'surrogatepass')
class StrSubclass(str):
    """Plain subclass of str that adds no attributes or methods."""
50n/aclass UnicodeTest(string_tests.CommonTest,
51n/a string_tests.MixinStrUnicodeUserStringTest,
52n/a string_tests.MixinStrUnicodeTest,
53n/a unittest.TestCase):
55n/a type2test = str
def checkequalnofix(self, result, object, methodname, *args):
    # Call object.methodname(*args) and require that the value equals
    # `result` and has exactly the same type as `result`.
    actual = getattr(object, methodname)(*args)
    self.assertEqual(actual, result)
    self.assertIs(type(actual), type(result))

    # Nothing more to verify unless the method handed back its receiver.
    if actual is not object:
        return

    # if the original is returned make sure that
    # this doesn't happen with subclasses
    class usub(str):
        def __repr__(self):
            return 'usub(%r)' % str.__repr__(self)

    wrapped = usub(object)
    actual = getattr(wrapped, methodname)(*args)
    self.assertEqual(actual, result)
    self.assertIsNot(wrapped, actual)
75n/a def test_literals(self):
76n/a self.assertEqual('\xff', '\u00ff')
77n/a self.assertEqual('\uffff', '\U0000ffff')
78n/a self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
79n/a self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
80n/a self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
81n/a # raw strings should not have unicode escapes
82n/a self.assertNotEqual(r"\u0020", " ")
84n/a def test_ascii(self):
85n/a if not sys.platform.startswith('java'):
86n/a # Test basic sanity of repr()
87n/a self.assertEqual(ascii('abc'), "'abc'")
88n/a self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
89n/a self.assertEqual(ascii('ab\\'), "'ab\\\\'")
90n/a self.assertEqual(ascii('\\c'), "'\\\\c'")
91n/a self.assertEqual(ascii('\\'), "'\\\\'")
92n/a self.assertEqual(ascii('\n'), "'\\n'")
93n/a self.assertEqual(ascii('\r'), "'\\r'")
94n/a self.assertEqual(ascii('\t'), "'\\t'")
95n/a self.assertEqual(ascii('\b'), "'\\x08'")
96n/a self.assertEqual(ascii("'\""), """'\\'"'""")
97n/a self.assertEqual(ascii("'\""), """'\\'"'""")
98n/a self.assertEqual(ascii("'"), '''"'"''')
99n/a self.assertEqual(ascii('"'), """'"'""")
100n/a latin1repr = (
101n/a "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
102n/a "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
103n/a "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
104n/a "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
105n/a "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
106n/a "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
107n/a "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
108n/a "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
109n/a "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
110n/a "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
111n/a "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
112n/a "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
113n/a "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
114n/a "\\xfe\\xff'")
115n/a testrepr = ascii(''.join(map(chr, range(256))))
116n/a self.assertEqual(testrepr, latin1repr)
117n/a # Test ascii works on wide unicode escapes without overflow.
118n/a self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
119n/a ascii("\U00010000" * 39 + "\uffff" * 4096))
121n/a class WrongRepr:
122n/a def __repr__(self):
123n/a return b'byte-repr'
124n/a self.assertRaises(TypeError, ascii, WrongRepr())
126n/a def test_repr(self):
127n/a if not sys.platform.startswith('java'):
128n/a # Test basic sanity of repr()
129n/a self.assertEqual(repr('abc'), "'abc'")
130n/a self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
131n/a self.assertEqual(repr('ab\\'), "'ab\\\\'")
132n/a self.assertEqual(repr('\\c'), "'\\\\c'")
133n/a self.assertEqual(repr('\\'), "'\\\\'")
134n/a self.assertEqual(repr('\n'), "'\\n'")
135n/a self.assertEqual(repr('\r'), "'\\r'")
136n/a self.assertEqual(repr('\t'), "'\\t'")
137n/a self.assertEqual(repr('\b'), "'\\x08'")
138n/a self.assertEqual(repr("'\""), """'\\'"'""")
139n/a self.assertEqual(repr("'\""), """'\\'"'""")
140n/a self.assertEqual(repr("'"), '''"'"''')
141n/a self.assertEqual(repr('"'), """'"'""")
142n/a latin1repr = (
143n/a "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
144n/a "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
145n/a "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
146n/a "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
147n/a "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
148n/a "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
149n/a "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
150n/a "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
151n/a "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
152n/a "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
153n/a "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
154n/a "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
155n/a "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
156n/a "\xfe\xff'")
157n/a testrepr = repr(''.join(map(chr, range(256))))
158n/a self.assertEqual(testrepr, latin1repr)
159n/a # Test repr works on wide unicode escapes without overflow.
160n/a self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
161n/a repr("\U00010000" * 39 + "\uffff" * 4096))
163n/a class WrongRepr:
164n/a def __repr__(self):
165n/a return b'byte-repr'
166n/a self.assertRaises(TypeError, repr, WrongRepr())
168n/a def test_iterators(self):
169n/a # Make sure unicode objects have an __iter__ method
170n/a it = "\u1111\u2222\u3333".__iter__()
171n/a self.assertEqual(next(it), "\u1111")
172n/a self.assertEqual(next(it), "\u2222")
173n/a self.assertEqual(next(it), "\u3333")
174n/a self.assertRaises(StopIteration, next, it)
def test_count(self):
    string_tests.CommonTest.test_count(self)
    # check mixed argument types
    # (verbatim repeats of the first two cases were dropped)
    for expected, *call_args in (
            (3, 'a'),
            (0, 'b'),
            (1, 'a', -1),
            (3, 'a', -10),
            (2, 'a', 0, -1),
            (0, 'a', 0, -10)):
        self.checkequalnofix(expected, 'aaa', 'count', *call_args)
    # test mixed kinds
    for expected, haystack, needle in (
            (10, '\u0102' + 'a' * 10, 'a'),
            (10, '\U00100304' + 'a' * 10, 'a'),
            (10, '\U00100304' + '\u0102' * 10, '\u0102'),
            (0, 'a' * 10, '\u0102'),
            (0, 'a' * 10, '\U00100304'),
            (0, '\u0102' * 10, '\U00100304'),
            (10, '\u0102' + 'a_' * 10, 'a_'),
            (10, '\U00100304' + 'a_' * 10, 'a_'),
            (10, '\U00100304' + '\u0102_' * 10, '\u0102_'),
            (0, 'a' * 10, 'a\u0102'),
            (0, 'a' * 10, 'a\U00100304'),
            (0, '\u0102' * 10, '\u0102\U00100304')):
        self.checkequal(expected, haystack, 'count', needle)
def test_find(self):
    string_tests.CommonTest.test_find(self)
    # test implementation details of the memchr fast path
    for expected, haystack, needle in (
            (100, 'a' * 100 + '\u0102', '\u0102'),
            (-1, 'a' * 100 + '\u0102', '\u0201'),
            (-1, 'a' * 100 + '\u0102', '\u0120'),
            (-1, 'a' * 100 + '\u0102', '\u0220'),
            (100, 'a' * 100 + '\U00100304', '\U00100304'),
            (-1, 'a' * 100 + '\U00100304', '\U00100204'),
            (-1, 'a' * 100 + '\U00100304', '\U00102004')):
        self.checkequal(expected, haystack, 'find', needle)
    # check mixed argument types
    for expected, *call_args in (
            (0, 'abc'),
            (9, 'abc', 1),
            (-1, 'def', 4)):
        self.checkequalnofix(expected, 'abcdefghiabc', 'find', *call_args)

    self.assertRaises(TypeError, 'hello'.find)
    self.assertRaises(TypeError, 'hello'.find, 42)
    # test mixed kinds
    for expected, haystack, needle in (
            (100, '\u0102' * 100 + 'a', 'a'),
            (100, '\U00100304' * 100 + 'a', 'a'),
            (100, '\U00100304' * 100 + '\u0102', '\u0102'),
            (-1, 'a' * 100, '\u0102'),
            (-1, 'a' * 100, '\U00100304'),
            (-1, '\u0102' * 100, '\U00100304'),
            (100, '\u0102' * 100 + 'a_', 'a_'),
            (100, '\U00100304' * 100 + 'a_', 'a_'),
            (100, '\U00100304' * 100 + '\u0102_', '\u0102_'),
            (-1, 'a' * 100, 'a\u0102'),
            (-1, 'a' * 100, 'a\U00100304'),
            (-1, '\u0102' * 100, '\u0102\U00100304')):
        self.checkequal(expected, haystack, 'find', needle)
def test_rfind(self):
    string_tests.CommonTest.test_rfind(self)
    # test implementation details of the memrchr fast path
    for expected, haystack, needle in (
            (0, '\u0102' + 'a' * 100, '\u0102'),
            (-1, '\u0102' + 'a' * 100, '\u0201'),
            (-1, '\u0102' + 'a' * 100, '\u0120'),
            (-1, '\u0102' + 'a' * 100, '\u0220'),
            (0, '\U00100304' + 'a' * 100, '\U00100304'),
            (-1, '\U00100304' + 'a' * 100, '\U00100204'),
            (-1, '\U00100304' + 'a' * 100, '\U00102004')):
        self.checkequal(expected, haystack, 'rfind', needle)
    # check mixed argument types
    # (a verbatim repeat of the empty-needle case was dropped)
    self.checkequalnofix(9, 'abcdefghiabc', 'rfind', 'abc')
    self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
    # test mixed kinds
    for expected, haystack, needle in (
            (0, 'a' + '\u0102' * 100, 'a'),
            (0, 'a' + '\U00100304' * 100, 'a'),
            (0, '\u0102' + '\U00100304' * 100, '\u0102'),
            (-1, 'a' * 100, '\u0102'),
            (-1, 'a' * 100, '\U00100304'),
            (-1, '\u0102' * 100, '\U00100304'),
            (0, '_a' + '\u0102' * 100, '_a'),
            (0, '_a' + '\U00100304' * 100, '_a'),
            (0, '_\u0102' + '\U00100304' * 100, '_\u0102'),
            (-1, 'a' * 100, '\u0102a'),
            (-1, 'a' * 100, '\U00100304a'),
            (-1, '\u0102' * 100, '\U00100304\u0102')):
        self.checkequal(expected, haystack, 'rfind', needle)
def test_index(self):
    string_tests.CommonTest.test_index(self)
    for expected, *call_args in (
            (0, ''),
            (3, 'def'),
            (0, 'abc'),
            (9, 'abc', 1)):
        self.checkequalnofix(expected, 'abcdefghiabc', 'index', *call_args)
    # Lookups that cannot succeed must raise ValueError.
    for haystack, *call_args in (
            ('abcdefghiabc', 'hib'),
            ('abcdefghiab', 'abc', 1),
            ('abcdefghi', 'ghi', 8),
            ('abcdefghi', 'ghi', -1)):
        self.assertRaises(ValueError, haystack.index, *call_args)
    # test mixed kinds
    for expected, haystack, needle in (
            (100, '\u0102' * 100 + 'a', 'a'),
            (100, '\U00100304' * 100 + 'a', 'a'),
            (100, '\U00100304' * 100 + '\u0102', '\u0102')):
        self.checkequal(expected, haystack, 'index', needle)
    for haystack, needle in (
            ('a' * 100, '\u0102'),
            ('a' * 100, '\U00100304'),
            ('\u0102' * 100, '\U00100304')):
        self.assertRaises(ValueError, haystack.index, needle)
    for expected, haystack, needle in (
            (100, '\u0102' * 100 + 'a_', 'a_'),
            (100, '\U00100304' * 100 + 'a_', 'a_'),
            (100, '\U00100304' * 100 + '\u0102_', '\u0102_')):
        self.checkequal(expected, haystack, 'index', needle)
    for haystack, needle in (
            ('a' * 100, 'a\u0102'),
            ('a' * 100, 'a\U00100304'),
            ('\u0102' * 100, '\u0102\U00100304')):
        self.assertRaises(ValueError, haystack.index, needle)
def test_rindex(self):
    string_tests.CommonTest.test_rindex(self)
    for expected, *call_args in (
            (12, ''),
            (3, 'def'),
            (9, 'abc'),
            (0, 'abc', 0, -1)):
        self.checkequalnofix(expected, 'abcdefghiabc', 'rindex', *call_args)

    # Lookups that cannot succeed must raise ValueError.
    for haystack, *call_args in (
            ('abcdefghiabc', 'hib'),
            ('defghiabc', 'def', 1),
            ('defghiabc', 'abc', 0, -1),
            ('abcdefghi', 'ghi', 0, 8),
            ('abcdefghi', 'ghi', 0, -1)):
        self.assertRaises(ValueError, haystack.rindex, *call_args)
    # test mixed kinds
    for expected, haystack, needle in (
            (0, 'a' + '\u0102' * 100, 'a'),
            (0, 'a' + '\U00100304' * 100, 'a'),
            (0, '\u0102' + '\U00100304' * 100, '\u0102')):
        self.checkequal(expected, haystack, 'rindex', needle)
    for haystack, needle in (
            ('a' * 100, '\u0102'),
            ('a' * 100, '\U00100304'),
            ('\u0102' * 100, '\U00100304')):
        self.assertRaises(ValueError, haystack.rindex, needle)
    for expected, haystack, needle in (
            (0, '_a' + '\u0102' * 100, '_a'),
            (0, '_a' + '\U00100304' * 100, '_a'),
            (0, '_\u0102' + '\U00100304' * 100, '_\u0102')):
        self.checkequal(expected, haystack, 'rindex', needle)
    for haystack, needle in (
            ('a' * 100, '\u0102a'),
            ('a' * 100, '\U00100304a'),
            ('\u0102' * 100, '\U00100304\u0102')):
        self.assertRaises(ValueError, haystack.rindex, needle)
def test_maketrans_translate(self):
    make_table = self.type2test.maketrans

    # these work with plain translate()
    for expected, text, table in (
            ('bbbc', 'abababc', {ord('a'): None}),
            ('iiic', 'abababc', {ord('a'): None, ord('b'): ord('i')}),
            ('iiix', 'abababc',
             {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'}),
            ('c', 'abababc', {ord('a'): None, ord('b'): ''}),
            ('xyyx', 'xzx', {ord('z'): 'yy'})):
        self.checkequalnofix(expected, text, 'translate', table)

    # this needs maketrans()
    self.checkequalnofix('abababc', 'abababc', 'translate',
                         {'b': '<i>'})
    tbl = make_table({'a': None, 'b': '<i>'})
    self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl)
    # test alternative way of calling maketrans()
    tbl = make_table('abc', 'xyz', 'd')
    self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)

    # various tests switching from ASCII to latin1 or the opposite;
    # same length, remove a letter, or replace with a longer string.
    self.assertEqual("[a]".translate(str.maketrans('a', 'X')),
                     "[X]")
    for text, mapping, expected in (
            ("[a]", {'a': 'X'}, "[X]"),
            ("[a]", {'a': None}, "[]"),
            ("[a]", {'a': 'XXX'}, "[XXX]"),
            ("[a]", {'a': '\xe9'}, "[\xe9]"),
            ('axb', {'a': None, 'b': '123'}, "x123"),
            ('axb', {'a': None, 'b': '\xe9'}, "x\xe9")):
        self.assertEqual(text.translate(str.maketrans(mapping)), expected)

    # test non-ASCII (don't take the fast-path)
    for text, mapping, expected in (
            ("[a]", {'a': '<\xe9>'}, "[<\xe9>]"),
            ("[\xe9]", {'\xe9': 'a'}, "[a]"),
            ("[\xe9]", {'\xe9': None}, "[]"),
            ("[\xe9]", {'\xe9': '123'}, "[123]"),
            ("[a\xe9]", {'a': '<\u20ac>'}, "[<\u20ac>\xe9]")):
        self.assertEqual(text.translate(str.maketrans(mapping)), expected)

    # invalid Unicode characters
    invalid_char = 0x10ffff+1
    for before in "a\xe9\u20ac\U0010ffff":
        mapping = str.maketrans({before: invalid_char})
        text = "[%s]" % before
        self.assertRaises(ValueError, text.translate, mapping)

    # errors
    self.assertRaises(TypeError, make_table)
    for exc, *call_args in (
            (ValueError, 'abc', 'defg'),
            (TypeError, 2, 'def'),
            (TypeError, 'abc', 2),
            (TypeError, 'abc', 'def', 2),
            (ValueError, {'xy': 2}),
            (TypeError, {(1,): 2})):
        self.assertRaises(exc, make_table, *call_args)

    self.assertRaises(TypeError, 'hello'.translate)
    self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
def test_split(self):
    string_tests.CommonTest.test_split(self)

    # test mixed kinds
    delimiters = ('c', '\u0102', '\U00010302')
    for pair in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
        left, right = pair[0] * 9, pair[1] * 9
        joined = left + right
        for delim in delimiters:
            double = delim * 2
            # Delimiter absent: the whole text comes back as one piece.
            self.checkequal([joined], joined, 'split', delim)
            self.checkequal([left, right],
                            left + delim + right, 'split', delim)
            self.checkequal([joined], joined, 'split', double)
            self.checkequal([left, right],
                            left + double + right, 'split', double)
def test_rsplit(self):
    string_tests.CommonTest.test_rsplit(self)
    # test mixed kinds
    delimiters = ('c', '\u0102', '\U00010302')
    for pair in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
        left, right = pair[0] * 9, pair[1] * 9
        joined = left + right
        for delim in delimiters:
            double = delim * 2
            # Delimiter absent: the whole text comes back as one piece.
            self.checkequal([joined], joined, 'rsplit', delim)
            self.checkequal([left, right],
                            left + delim + right, 'rsplit', delim)
            self.checkequal([joined], joined, 'rsplit', double)
            self.checkequal([left, right],
                            left + double + right, 'rsplit', double)
def test_partition(self):
    string_tests.MixinStrUnicodeUserStringTest.test_partition(self)
    # test mixed kinds
    self.checkequal(('ABCDEFGH', '', ''), 'ABCDEFGH', 'partition', '\u4200')
    delimiters = ('c', '\u0102', '\U00010302')
    for pair in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
        left, right = pair[0] * 9, pair[1] * 9
        joined = left + right
        for delim in delimiters:
            double = delim * 2
            # Separator absent: everything lands in the first slot.
            self.checkequal((joined, '', ''), joined, 'partition', delim)
            self.checkequal((left, delim, right),
                            left + delim + right, 'partition', delim)
            self.checkequal((joined, '', ''), joined, 'partition', double)
            self.checkequal((left, double, right),
                            left + double + right, 'partition', double)
def test_rpartition(self):
    string_tests.MixinStrUnicodeUserStringTest.test_rpartition(self)
    # test mixed kinds
    self.checkequal(('', '', 'ABCDEFGH'), 'ABCDEFGH', 'rpartition', '\u4200')
    delimiters = ('c', '\u0102', '\U00010302')
    for pair in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
        left, right = pair[0] * 9, pair[1] * 9
        joined = left + right
        for delim in delimiters:
            double = delim * 2
            # Separator absent: everything lands in the last slot.
            self.checkequal(('', '', joined), joined, 'rpartition', delim)
            self.checkequal((left, delim, right),
                            left + delim + right, 'rpartition', delim)
            self.checkequal(('', '', joined), joined, 'rpartition', double)
            self.checkequal((left, double, right),
                            left + double + right, 'rpartition', double)
def test_join(self):
    string_tests.MixinStrUnicodeUserStringTest.test_join(self)

    # Not a str, even though it defines __str__; join() must refuse it.
    class MyWrapper:
        def __init__(self, sval): self.sval = sval
        def __str__(self): return self.sval

    # mixed arguments
    # (each positive check below appeared two or three times verbatim;
    # the repeats were dropped)
    self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
    self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
    self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
    # Any non-str element makes join() raise TypeError.
    for bad_items in (
            ['1', '2', MyWrapper('foo')],
            ['1', '2', '3', bytes()],
            [1, 2, 3],
            ['1', '2', 3]):
        self.checkraises(TypeError, ' ', 'join', bad_items)
468n/a @unittest.skipIf(sys.maxsize > 2**32,
469n/a 'needs too much memory on a 64-bit platform')
470n/a def test_join_overflow(self):
471n/a size = int(sys.maxsize**0.5) + 1
472n/a seq = ('A' * size,) * size
473n/a self.assertRaises(OverflowError, ''.join, seq)
def test_replace(self):
    string_tests.CommonTest.test_replace(self)

    # method call forwarded from str implementation because of unicode argument
    self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
    self.assertRaises(TypeError, 'replace'.replace, "r", 42)
    # test mixed kinds
    delimiters = ('c', '\u0102', '\U00010302')
    replacements = ('d', '\u0103', '\U00010303')
    for pair in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
        left, right = pair[0] * 9, pair[1] * 9
        joined = left + right
        replaced = None
        for delim, repl in itertools.product(delimiters, replacements):
            double = delim * 2
            replaced = left + repl + right
            # Pattern absent: the text must come back unchanged.
            self.checkequal(joined, joined, 'replace', delim, repl)
            self.checkequal(replaced,
                            left + delim + right,
                            'replace', delim, repl)
            self.checkequal(joined, joined, 'replace', double, repl)
            self.checkequal(replaced,
                            left + double + right,
                            'replace', double, repl)
498n/a @support.cpython_only
499n/a def test_replace_id(self):
500n/a pattern = 'abc'
501n/a text = 'abc def'
502n/a self.assertIs(text.replace(pattern, pattern), text)
504n/a def test_bytes_comparison(self):
505n/a with support.check_warnings():
506n/a warnings.simplefilter('ignore', BytesWarning)
507n/a self.assertEqual('abc' == b'abc', False)
508n/a self.assertEqual('abc' != b'abc', True)
509n/a self.assertEqual('abc' == bytearray(b'abc'), False)
510n/a self.assertEqual('abc' != bytearray(b'abc'), True)
512n/a def test_comparison(self):
513n/a # Comparisons:
514n/a self.assertEqual('abc', 'abc')
515n/a self.assertTrue('abcd' > 'abc')
516n/a self.assertTrue('abc' < 'abcd')
518n/a if 0:
519n/a # Move these tests to a Unicode collation module test...
520n/a # Testing UTF-16 code point order comparisons...
522n/a # No surrogates, no fixup required.
523n/a self.assertTrue('\u0061' < '\u20ac')
524n/a # Non surrogate below surrogate value, no fixup required
525n/a self.assertTrue('\u0061' < '\ud800\udc02')
527n/a # Non surrogate above surrogate value, fixup required
528n/a def test_lecmp(s, s2):
529n/a self.assertTrue(s < s2)
531n/a def test_fixup(s):
532n/a s2 = '\ud800\udc01'
533n/a test_lecmp(s, s2)
534n/a s2 = '\ud900\udc01'
535n/a test_lecmp(s, s2)
536n/a s2 = '\uda00\udc01'
537n/a test_lecmp(s, s2)
538n/a s2 = '\udb00\udc01'
539n/a test_lecmp(s, s2)
540n/a s2 = '\ud800\udd01'
541n/a test_lecmp(s, s2)
542n/a s2 = '\ud900\udd01'
543n/a test_lecmp(s, s2)
544n/a s2 = '\uda00\udd01'
545n/a test_lecmp(s, s2)
546n/a s2 = '\udb00\udd01'
547n/a test_lecmp(s, s2)
548n/a s2 = '\ud800\ude01'
549n/a test_lecmp(s, s2)
550n/a s2 = '\ud900\ude01'
551n/a test_lecmp(s, s2)
552n/a s2 = '\uda00\ude01'
553n/a test_lecmp(s, s2)
554n/a s2 = '\udb00\ude01'
555n/a test_lecmp(s, s2)
556n/a s2 = '\ud800\udfff'
557n/a test_lecmp(s, s2)
558n/a s2 = '\ud900\udfff'
559n/a test_lecmp(s, s2)
560n/a s2 = '\uda00\udfff'
561n/a test_lecmp(s, s2)
562n/a s2 = '\udb00\udfff'
563n/a test_lecmp(s, s2)
565n/a test_fixup('\ue000')
566n/a test_fixup('\uff61')
568n/a # Surrogates on both sides, no fixup required
569n/a self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
def test_islower(self):
    super().test_islower()
    self.checkequalnofix(False, '\u1FFc', 'islower')
    self.assertFalse('\u2167'.islower())
    self.assertTrue('\u2177'.islower())
    # non-BMP, uppercase
    for ch in ('\U00010401', '\U00010427'):
        self.assertFalse(ch.islower())
    # non-BMP, lowercase
    for ch in ('\U00010429', '\U0001044E'):
        self.assertTrue(ch.islower())
    # non-BMP, non-cased
    for ch in ('\U0001F40D', '\U0001F46F'):
        self.assertFalse(ch.islower())
def test_isupper(self):
    super().test_isupper()
    # NOTE(review): the scraped source lost its indentation; only the
    # first check is assumed to be guarded by the platform test - confirm.
    on_java = sys.platform.startswith('java')
    if not on_java:
        self.checkequalnofix(False, '\u1FFc', 'isupper')
    self.assertTrue('\u2167'.isupper())
    self.assertFalse('\u2177'.isupper())
    # non-BMP, uppercase
    for ch in ('\U00010401', '\U00010427'):
        self.assertTrue(ch.isupper())
    # non-BMP, lowercase
    for ch in ('\U00010429', '\U0001044E'):
        self.assertFalse(ch.isupper())
    # non-BMP, non-cased
    for ch in ('\U0001F40D', '\U0001F46F'):
        self.assertFalse(ch.isupper())
def test_istitle(self):
    super().test_istitle()
    for text in ('\u1FFc', 'Greek \u1FFcitlecases ...'):
        self.checkequalnofix(True, text, 'istitle')

    # non-BMP, uppercase + lowercase
    for text in ('\U00010401\U00010429', '\U00010427\U0001044E'):
        self.assertTrue(text.istitle())
    # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
    not_title = ['\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']
    for ch in not_title:
        self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch))
def test_isspace(self):
    super().test_isspace()
    for expected, ch in (
            (True, '\u2000'),
            (True, '\u200a'),
            (False, '\u2014')):
        self.checkequalnofix(expected, ch, 'isspace')
    # apparently there are no non-BMP spaces chars in Unicode 6
    non_bmp = ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
               '\U0001F40D', '\U0001F46F']
    for ch in non_bmp:
        self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))
def test_isalnum(self):
    super().test_isalnum()
    # Non-BMP characters that must all count as alphanumeric.
    non_bmp_alnum = ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                     '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']
    for ch in non_bmp_alnum:
        self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch))
def test_isalpha(self):
    super().test_isalpha()
    self.checkequalnofix(True, '\u1FFc', 'isalpha')
    # non-BMP, cased
    for ch in ('\U00010401', '\U00010427', '\U00010429', '\U0001044E'):
        self.assertTrue(ch.isalpha())
    # non-BMP, non-cased
    for ch in ('\U0001F40D', '\U0001F46F'):
        self.assertFalse(ch.isalpha())
def test_isdecimal(self):
    for expected, text in (
            (False, ''),
            (False, 'a'),
            (True, '0'),
            (False, '\u2460'), # CIRCLED DIGIT ONE
            (False, '\xbc'), # VULGAR FRACTION ONE QUARTER
            (True, '\u0660'), # ARABIC-INDIC DIGIT ZERO
            (True, '0123456789'),
            (False, '0123456789a')):
        self.checkequalnofix(expected, text, 'isdecimal')

    # isdecimal() takes no arguments.
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)

    not_decimal = ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107']
    for ch in not_decimal:
        self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch))
    decimal = ['\U0001D7F6', '\U00011066', '\U000104A0']
    for ch in decimal:
        self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch))
def test_isdigit(self):
    super().test_isdigit()
    for expected, ch in (
            (True, '\u2460'),
            (False, '\xbc'),
            (True, '\u0660')):
        self.checkequalnofix(expected, ch, 'isdigit')

    not_digits = ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                  '\U0001F40D', '\U0001F46F', '\U00011065']
    for ch in not_digits:
        self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch))
    digits = ['\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']
    for ch in digits:
        self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch))
def test_isnumeric(self):
    for expected, text in (
            (False, ''),
            (False, 'a'),
            (True, '0'),
            (True, '\u2460'),
            (True, '\xbc'),
            (True, '\u0660'),
            (True, '0123456789'),
            (False, '0123456789a')):
        self.checkequalnofix(expected, text, 'isnumeric')

    # isnumeric() takes no arguments.
    self.assertRaises(TypeError, "abc".isnumeric, 42)

    not_numeric = ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F']
    for ch in not_numeric:
        self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch))
    numeric = ['\U00011065', '\U0001D7F6', '\U00011066',
               '\U000104A0', '\U0001F107']
    for ch in numeric:
        self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))
691n/a def test_isidentifier(self):
692n/a self.assertTrue("a".isidentifier())
693n/a self.assertTrue("Z".isidentifier())
694n/a self.assertTrue("_".isidentifier())
695n/a self.assertTrue("b0".isidentifier())
696n/a self.assertTrue("bc".isidentifier())
697n/a self.assertTrue("b_".isidentifier())
698n/a self.assertTrue("µ".isidentifier())
699n/a self.assertTrue("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".isidentifier())
701n/a self.assertFalse(" ".isidentifier())
702n/a self.assertFalse("[".isidentifier())
703n/a self.assertFalse("©".isidentifier())
704n/a self.assertFalse("0".isidentifier())
706n/a def test_isprintable(self):
707n/a self.assertTrue("".isprintable())
708n/a self.assertTrue(" ".isprintable())
709n/a self.assertTrue("abcdefg".isprintable())
710n/a self.assertFalse("abcdefg\n".isprintable())
711n/a # some defined Unicode character
712n/a self.assertTrue("\u0374".isprintable())
713n/a # undefined character
714n/a self.assertFalse("\u0378".isprintable())
715n/a # single surrogate character
716n/a self.assertFalse("\ud800".isprintable())
718n/a self.assertTrue('\U0001F46F'.isprintable())
719n/a self.assertFalse('\U000E0020'.isprintable())
721n/a def test_surrogates(self):
722n/a for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
723n/a 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
724n/a self.assertTrue(s.islower())
725n/a self.assertFalse(s.isupper())
726n/a self.assertFalse(s.istitle())
727n/a for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
728n/a 'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
729n/a self.assertFalse(s.islower())
730n/a self.assertTrue(s.isupper())
731n/a self.assertTrue(s.istitle())
733n/a for meth_name in ('islower', 'isupper', 'istitle'):
734n/a meth = getattr(str, meth_name)
735n/a for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
736n/a self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
738n/a for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
739n/a 'isdecimal', 'isnumeric',
740n/a 'isidentifier', 'isprintable'):
741n/a meth = getattr(str, meth_name)
742n/a for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
743n/a 'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
744n/a 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
745n/a self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
748n/a def test_lower(self):
749n/a string_tests.CommonTest.test_lower(self)
750n/a self.assertEqual('\U00010427'.lower(), '\U0001044F')
751n/a self.assertEqual('\U00010427\U00010427'.lower(),
752n/a '\U0001044F\U0001044F')
753n/a self.assertEqual('\U00010427\U0001044F'.lower(),
754n/a '\U0001044F\U0001044F')
755n/a self.assertEqual('X\U00010427x\U0001044F'.lower(),
756n/a 'x\U0001044Fx\U0001044F')
757n/a self.assertEqual('fi'.lower(), 'fi')
758n/a self.assertEqual('\u0130'.lower(), '\u0069\u0307')
759n/a # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
760n/a self.assertEqual('\u03a3'.lower(), '\u03c3')
761n/a self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3')
762n/a self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
763n/a self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a')
764n/a self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
765n/a self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345')
766n/a self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ')
767n/a self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
768n/a self.assertEqual('\u2177'.lower(), '\u2177')
770n/a def test_casefold(self):
771n/a self.assertEqual('hello'.casefold(), 'hello')
772n/a self.assertEqual('hELlo'.casefold(), 'hello')
773n/a self.assertEqual('ß'.casefold(), 'ss')
774n/a self.assertEqual('fi'.casefold(), 'fi')
775n/a self.assertEqual('\u03a3'.casefold(), '\u03c3')
776n/a self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
777n/a self.assertEqual('\u00b5'.casefold(), '\u03bc')
779n/a def test_upper(self):
780n/a string_tests.CommonTest.test_upper(self)
781n/a self.assertEqual('\U0001044F'.upper(), '\U00010427')
782n/a self.assertEqual('\U0001044F\U0001044F'.upper(),
783n/a '\U00010427\U00010427')
784n/a self.assertEqual('\U00010427\U0001044F'.upper(),
785n/a '\U00010427\U00010427')
786n/a self.assertEqual('X\U00010427x\U0001044F'.upper(),
787n/a 'X\U00010427X\U00010427')
788n/a self.assertEqual('fi'.upper(), 'FI')
789n/a self.assertEqual('\u0130'.upper(), '\u0130')
790n/a self.assertEqual('\u03a3'.upper(), '\u03a3')
791n/a self.assertEqual('ß'.upper(), 'SS')
792n/a self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300')
793n/a self.assertEqual('\U0008fffe'.upper(), '\U0008fffe')
794n/a self.assertEqual('\u2177'.upper(), '\u2167')
796n/a def test_capitalize(self):
797n/a string_tests.CommonTest.test_capitalize(self)
798n/a self.assertEqual('\U0001044F'.capitalize(), '\U00010427')
799n/a self.assertEqual('\U0001044F\U0001044F'.capitalize(),
800n/a '\U00010427\U0001044F')
801n/a self.assertEqual('\U00010427\U0001044F'.capitalize(),
802n/a '\U00010427\U0001044F')
803n/a self.assertEqual('\U0001044F\U00010427'.capitalize(),
804n/a '\U00010427\U0001044F')
805n/a self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
806n/a 'X\U0001044Fx\U0001044F')
807n/a self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
808n/a exp = '\u0399\u0308\u0300\u0069\u0307'
809n/a self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
810n/a self.assertEqual('finnish'.capitalize(), 'FInnish')
811n/a self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
813n/a def test_title(self):
814n/a super().test_title()
815n/a self.assertEqual('\U0001044F'.title(), '\U00010427')
816n/a self.assertEqual('\U0001044F\U0001044F'.title(),
817n/a '\U00010427\U0001044F')
818n/a self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
819n/a '\U00010427\U0001044F \U00010427\U0001044F')
820n/a self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(),
821n/a '\U00010427\U0001044F \U00010427\U0001044F')
822n/a self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(),
823n/a '\U00010427\U0001044F \U00010427\U0001044F')
824n/a self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
825n/a 'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
826n/a self.assertEqual('fiNNISH'.title(), 'Finnish')
827n/a self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
828n/a self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
830n/a def test_swapcase(self):
831n/a string_tests.CommonTest.test_swapcase(self)
832n/a self.assertEqual('\U0001044F'.swapcase(), '\U00010427')
833n/a self.assertEqual('\U00010427'.swapcase(), '\U0001044F')
834n/a self.assertEqual('\U0001044F\U0001044F'.swapcase(),
835n/a '\U00010427\U00010427')
836n/a self.assertEqual('\U00010427\U0001044F'.swapcase(),
837n/a '\U0001044F\U00010427')
838n/a self.assertEqual('\U0001044F\U00010427'.swapcase(),
839n/a '\U00010427\U0001044F')
840n/a self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
841n/a 'x\U0001044FX\U00010427')
842n/a self.assertEqual('fi'.swapcase(), 'FI')
843n/a self.assertEqual('\u0130'.swapcase(), '\u0069\u0307')
844n/a # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
845n/a self.assertEqual('\u03a3'.swapcase(), '\u03c3')
846n/a self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3')
847n/a self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
848n/a self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A')
849n/a self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
850n/a self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399')
851n/a self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ')
852n/a self.assertEqual('\u03a3'.swapcase(), '\u03c3')
853n/a self.assertEqual('ß'.swapcase(), 'SS')
854n/a self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300')
856n/a def test_center(self):
857n/a string_tests.CommonTest.test_center(self)
858n/a self.assertEqual('x'.center(2, '\U0010FFFF'),
859n/a 'x\U0010FFFF')
860n/a self.assertEqual('x'.center(3, '\U0010FFFF'),
861n/a '\U0010FFFFx\U0010FFFF')
862n/a self.assertEqual('x'.center(4, '\U0010FFFF'),
863n/a '\U0010FFFFx\U0010FFFF\U0010FFFF')
865n/a @unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system")
866n/a @support.cpython_only
867n/a def test_case_operation_overflow(self):
868n/a # Issue #22643
869n/a size = 2**32//12 + 1
870n/a try:
871n/a s = "ü" * size
872n/a except MemoryError:
873n/a self.skipTest('no enough memory (%.0f MiB required)' % (size / 2**20))
874n/a try:
875n/a self.assertRaises(OverflowError, s.upper)
876n/a finally:
877n/a del s
879n/a def test_contains(self):
880n/a # Testing Unicode contains method
881n/a self.assertIn('a', 'abdb')
882n/a self.assertIn('a', 'bdab')
883n/a self.assertIn('a', 'bdaba')
884n/a self.assertIn('a', 'bdba')
885n/a self.assertNotIn('a', 'bdb')
886n/a self.assertIn('a', 'bdba')
887n/a self.assertIn('a', ('a',1,None))
888n/a self.assertIn('a', (1,None,'a'))
889n/a self.assertIn('a', ('a',1,None))
890n/a self.assertIn('a', (1,None,'a'))
891n/a self.assertNotIn('a', ('x',1,'y'))
892n/a self.assertNotIn('a', ('x',1,None))
893n/a self.assertNotIn('abcd', 'abcxxxx')
894n/a self.assertIn('ab', 'abcd')
895n/a self.assertIn('ab', 'abc')
896n/a self.assertIn('ab', (1,None,'ab'))
897n/a self.assertIn('', 'abc')
898n/a self.assertIn('', '')
899n/a self.assertIn('', 'abc')
900n/a self.assertNotIn('\0', 'abc')
901n/a self.assertIn('\0', '\0abc')
902n/a self.assertIn('\0', 'abc\0')
903n/a self.assertIn('a', '\0abc')
904n/a self.assertIn('asdf', 'asdf')
905n/a self.assertNotIn('asdf', 'asd')
906n/a self.assertNotIn('asdf', '')
908n/a self.assertRaises(TypeError, "abc".__contains__)
909n/a # test mixed kinds
910n/a for fill in ('a', '\u0100', '\U00010300'):
911n/a fill *= 9
912n/a for delim in ('c', '\u0102', '\U00010302'):
913n/a self.assertNotIn(delim, fill)
914n/a self.assertIn(delim, fill + delim)
915n/a self.assertNotIn(delim * 2, fill)
916n/a self.assertIn(delim * 2, fill + delim * 2)
918n/a def test_issue18183(self):
919n/a '\U00010000\U00100000'.lower()
920n/a '\U00010000\U00100000'.casefold()
921n/a '\U00010000\U00100000'.upper()
922n/a '\U00010000\U00100000'.capitalize()
923n/a '\U00010000\U00100000'.title()
924n/a '\U00010000\U00100000'.swapcase()
925n/a '\U00100000'.center(3, '\U00010000')
926n/a '\U00100000'.ljust(3, '\U00010000')
927n/a '\U00100000'.rjust(3, '\U00010000')
929n/a def test_format(self):
930n/a self.assertEqual(''.format(), '')
931n/a self.assertEqual('a'.format(), 'a')
932n/a self.assertEqual('ab'.format(), 'ab')
933n/a self.assertEqual('a{{'.format(), 'a{')
934n/a self.assertEqual('a}}'.format(), 'a}')
935n/a self.assertEqual('{{b'.format(), '{b')
936n/a self.assertEqual('}}b'.format(), '}b')
937n/a self.assertEqual('a{{b'.format(), 'a{b')
939n/a # examples from the PEP:
940n/a import datetime
941n/a self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
942n/a self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
943n/a "My name is Fred")
944n/a self.assertEqual("My name is {0} :-{{}}".format('Fred'),
945n/a "My name is Fred :-{}")
947n/a d = datetime.date(2007, 8, 18)
948n/a self.assertEqual("The year is {0.year}".format(d),
949n/a "The year is 2007")
951n/a # classes we'll use for testing
952n/a class C:
953n/a def __init__(self, x=100):
954n/a self._x = x
955n/a def __format__(self, spec):
956n/a return spec
958n/a class D:
959n/a def __init__(self, x):
960n/a self.x = x
961n/a def __format__(self, spec):
962n/a return str(self.x)
964n/a # class with __str__, but no __format__
965n/a class E:
966n/a def __init__(self, x):
967n/a self.x = x
968n/a def __str__(self):
969n/a return 'E(' + self.x + ')'
971n/a # class with __repr__, but no __format__ or __str__
972n/a class F:
973n/a def __init__(self, x):
974n/a self.x = x
975n/a def __repr__(self):
976n/a return 'F(' + self.x + ')'
978n/a # class with __format__ that forwards to string, for some format_spec's
979n/a class G:
980n/a def __init__(self, x):
981n/a self.x = x
982n/a def __str__(self):
983n/a return "string is " + self.x
984n/a def __format__(self, format_spec):
985n/a if format_spec == 'd':
986n/a return 'G(' + self.x + ')'
987n/a return object.__format__(self, format_spec)
989n/a class I(datetime.date):
990n/a def __format__(self, format_spec):
991n/a return self.strftime(format_spec)
993n/a class J(int):
994n/a def __format__(self, format_spec):
995n/a return int.__format__(self * 2, format_spec)
997n/a class M:
998n/a def __init__(self, x):
999n/a self.x = x
1000n/a def __repr__(self):
1001n/a return 'M(' + self.x + ')'
1002n/a __str__ = None
1004n/a class N:
1005n/a def __init__(self, x):
1006n/a self.x = x
1007n/a def __repr__(self):
1008n/a return 'N(' + self.x + ')'
1009n/a __format__ = None
1011n/a self.assertEqual(''.format(), '')
1012n/a self.assertEqual('abc'.format(), 'abc')
1013n/a self.assertEqual('{0}'.format('abc'), 'abc')
1014n/a self.assertEqual('{0:}'.format('abc'), 'abc')
1015n/a# self.assertEqual('{ 0 }'.format('abc'), 'abc')
1016n/a self.assertEqual('X{0}'.format('abc'), 'Xabc')
1017n/a self.assertEqual('{0}X'.format('abc'), 'abcX')
1018n/a self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
1019n/a self.assertEqual('{1}'.format(1, 'abc'), 'abc')
1020n/a self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
1021n/a self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
1022n/a self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
1023n/a self.assertEqual('{0}'.format(-15), '-15')
1024n/a self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
1025n/a self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
1026n/a self.assertEqual('{{'.format(), '{')
1027n/a self.assertEqual('}}'.format(), '}')
1028n/a self.assertEqual('{{}}'.format(), '{}')
1029n/a self.assertEqual('{{x}}'.format(), '{x}')
1030n/a self.assertEqual('{{{0}}}'.format(123), '{123}')
1031n/a self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
1032n/a self.assertEqual('}}{{'.format(), '}{')
1033n/a self.assertEqual('}}x{{'.format(), '}x{')
1035n/a # weird field names
1036n/a self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
1037n/a self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
1038n/a self.assertEqual("{0[ ]}".format({' ':3}), '3')
1040n/a self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
1041n/a self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
1042n/a self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
1043n/a self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
1044n/a self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
1045n/a self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
1046n/a self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')
1048n/a # strings
1049n/a self.assertEqual('{0:.3s}'.format('abc'), 'abc')
1050n/a self.assertEqual('{0:.3s}'.format('ab'), 'ab')
1051n/a self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
1052n/a self.assertEqual('{0:.0s}'.format('abcdef'), '')
1053n/a self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
1054n/a self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
1055n/a self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
1056n/a self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
1057n/a self.assertEqual('{0:x<0s}'.format('result'), 'result')
1058n/a self.assertEqual('{0:x<5s}'.format('result'), 'result')
1059n/a self.assertEqual('{0:x<6s}'.format('result'), 'result')
1060n/a self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
1061n/a self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
1062n/a self.assertEqual('{0: <7s}'.format('result'), 'result ')
1063n/a self.assertEqual('{0:<7s}'.format('result'), 'result ')
1064n/a self.assertEqual('{0:>7s}'.format('result'), ' result')
1065n/a self.assertEqual('{0:>8s}'.format('result'), ' result')
1066n/a self.assertEqual('{0:^8s}'.format('result'), ' result ')
1067n/a self.assertEqual('{0:^9s}'.format('result'), ' result ')
1068n/a self.assertEqual('{0:^10s}'.format('result'), ' result ')
1069n/a self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
1070n/a self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
1071n/a self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)
1073n/a # issue 12546: use \x00 as a fill character
1074n/a self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
1075n/a self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
1076n/a self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
1077n/a self.assertEqual('{0:^6s}'.format('foo'), ' foo ')
1079n/a self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
1080n/a self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
1081n/a self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
1082n/a self.assertEqual('{0:<6}'.format(3), '3 ')
1084n/a self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
1085n/a self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
1086n/a self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
1087n/a self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')
1089n/a self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00')
1090n/a self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01')
1091n/a self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00')
1092n/a self.assertEqual('{0:^12}'.format(3+2.0j), ' (3+2j) ')
1094n/a # format specifiers for user defined type
1095n/a self.assertEqual('{0:abc}'.format(C()), 'abc')
1097n/a # !r, !s and !a coercions
1098n/a self.assertEqual('{0!s}'.format('Hello'), 'Hello')
1099n/a self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
1100n/a self.assertEqual('{0!s:15}'.format('Hello'), 'Hello ')
1101n/a self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello ')
1102n/a self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
1103n/a self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
1104n/a self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
1105n/a self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
1106n/a self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'") # printable
1107n/a self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
1108n/a self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
1109n/a self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
1110n/a self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
1111n/a self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
1112n/a self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
1113n/a self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')
1115n/a # test fallback to object.__format__
1116n/a self.assertEqual('{0}'.format({}), '{}')
1117n/a self.assertEqual('{0}'.format([]), '[]')
1118n/a self.assertEqual('{0}'.format([1]), '[1]')
1120n/a self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
1121n/a self.assertEqual('{0!s}'.format(G('data')), 'string is data')
1123n/a self.assertRaises(TypeError, '{0:^10}'.format, E('data'))
1124n/a self.assertRaises(TypeError, '{0:^10s}'.format, E('data'))
1125n/a self.assertRaises(TypeError, '{0:>15s}'.format, G('data'))
1127n/a self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
1128n/a month=8,
1129n/a day=27)),
1130n/a "date: 2007-08-27")
1132n/a # test deriving from a builtin type and overriding __format__
1133n/a self.assertEqual("{0}".format(J(10)), "20")
1136n/a # string format specifiers
1137n/a self.assertEqual('{0:}'.format('a'), 'a')
1139n/a # computed format specifiers
1140n/a self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
1141n/a self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
1142n/a self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
1143n/a self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello ')
1144n/a self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello ')
1146n/a # test various errors
1147n/a self.assertRaises(ValueError, '{'.format)
1148n/a self.assertRaises(ValueError, '}'.format)
1149n/a self.assertRaises(ValueError, 'a{'.format)
1150n/a self.assertRaises(ValueError, 'a}'.format)
1151n/a self.assertRaises(ValueError, '{a'.format)
1152n/a self.assertRaises(ValueError, '}a'.format)
1153n/a self.assertRaises(IndexError, '{0}'.format)
1154n/a self.assertRaises(IndexError, '{1}'.format, 'abc')
1155n/a self.assertRaises(KeyError, '{x}'.format)
1156n/a self.assertRaises(ValueError, "}{".format)
1157n/a self.assertRaises(ValueError, "abc{0:{}".format)
1158n/a self.assertRaises(ValueError, "{0".format)
1159n/a self.assertRaises(IndexError, "{0.}".format)
1160n/a self.assertRaises(ValueError, "{0.}".format, 0)
1161n/a self.assertRaises(ValueError, "{0[}".format)
1162n/a self.assertRaises(ValueError, "{0[}".format, [])
1163n/a self.assertRaises(KeyError, "{0]}".format)
1164n/a self.assertRaises(ValueError, "{0.[]}".format, 0)
1165n/a self.assertRaises(ValueError, "{0..foo}".format, 0)
1166n/a self.assertRaises(ValueError, "{0[0}".format, 0)
1167n/a self.assertRaises(ValueError, "{0[0:foo}".format, 0)
1168n/a self.assertRaises(KeyError, "{c]}".format)
1169n/a self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
1170n/a self.assertRaises(ValueError, "{0}}".format, 0)
1171n/a self.assertRaises(KeyError, "{foo}".format, bar=3)
1172n/a self.assertRaises(ValueError, "{0!x}".format, 3)
1173n/a self.assertRaises(ValueError, "{0!}".format, 0)
1174n/a self.assertRaises(ValueError, "{0!rs}".format, 0)
1175n/a self.assertRaises(ValueError, "{!}".format)
1176n/a self.assertRaises(IndexError, "{:}".format)
1177n/a self.assertRaises(IndexError, "{:s}".format)
1178n/a self.assertRaises(IndexError, "{}".format)
1179n/a big = "23098475029384702983476098230754973209482573"
1180n/a self.assertRaises(ValueError, ("{" + big + "}").format)
1181n/a self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])
1183n/a # issue 6089
1184n/a self.assertRaises(ValueError, "{0[0]x}".format, [None])
1185n/a self.assertRaises(ValueError, "{0[0](10)}".format, [None])
1187n/a # can't have a replacement on the field name portion
1188n/a self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)
1190n/a # exceed maximum recursion depth
1191n/a self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
1192n/a self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
1193n/a 0, 1, 2, 3, 4, 5, 6, 7)
1195n/a # string format spec errors
1196n/a self.assertRaises(ValueError, "{0:-s}".format, '')
1197n/a self.assertRaises(ValueError, format, "", "-")
1198n/a self.assertRaises(ValueError, "{0:=s}".format, '')
1200n/a # Alternate formatting is not supported
1201n/a self.assertRaises(ValueError, format, '', '#')
1202n/a self.assertRaises(ValueError, format, '', '#20')
1204n/a # Non-ASCII
1205n/a self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"),
1206n/a 'ABC\u0410\u0411\u0412')
1207n/a self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"),
1208n/a 'ABC')
1209n/a self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"),
1210n/a '')
1212n/a self.assertEqual("{[{}]}".format({"{}": 5}), "5")
1213n/a self.assertEqual("{[{}]}".format({"{}" : "a"}), "a")
1214n/a self.assertEqual("{[{]}".format({"{" : "a"}), "a")
1215n/a self.assertEqual("{[}]}".format({"}" : "a"}), "a")
1216n/a self.assertEqual("{[[]}".format({"[" : "a"}), "a")
1217n/a self.assertEqual("{[!]}".format({"!" : "a"}), "a")
1218n/a self.assertRaises(ValueError, "{a{}b}".format, 42)
1219n/a self.assertRaises(ValueError, "{a{b}".format, 42)
1220n/a self.assertRaises(ValueError, "{[}".format, 42)
1222n/a self.assertEqual("0x{:0{:d}X}".format(0x0,16), "0x0000000000000000")
1224n/a # Blocking fallback
1225n/a m = M('data')
1226n/a self.assertEqual("{!r}".format(m), 'M(data)')
1227n/a self.assertRaises(TypeError, "{!s}".format, m)
1228n/a self.assertRaises(TypeError, "{}".format, m)
1229n/a n = N('data')
1230n/a self.assertEqual("{!r}".format(n), 'N(data)')
1231n/a self.assertEqual("{!s}".format(n), 'N(data)')
1232n/a self.assertRaises(TypeError, "{}".format, n)
1234n/a def test_format_map(self):
1235n/a self.assertEqual(''.format_map({}), '')
1236n/a self.assertEqual('a'.format_map({}), 'a')
1237n/a self.assertEqual('ab'.format_map({}), 'ab')
1238n/a self.assertEqual('a{{'.format_map({}), 'a{')
1239n/a self.assertEqual('a}}'.format_map({}), 'a}')
1240n/a self.assertEqual('{{b'.format_map({}), '{b')
1241n/a self.assertEqual('}}b'.format_map({}), '}b')
1242n/a self.assertEqual('a{{b'.format_map({}), 'a{b')
1244n/a # using mappings
1245n/a class Mapping(dict):
1246n/a def __missing__(self, key):
1247n/a return key
1248n/a self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
1249n/a self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
1251n/a class InternalMapping:
1252n/a def __init__(self):
1253n/a self.mapping = {'a': 'hello'}
1254n/a def __getitem__(self, key):
1255n/a return self.mapping[key]
1256n/a self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
1259n/a class C:
1260n/a def __init__(self, x=100):
1261n/a self._x = x
1262n/a def __format__(self, spec):
1263n/a return spec
1264n/a self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
1266n/a # test various errors
1267n/a self.assertRaises(TypeError, ''.format_map)
1268n/a self.assertRaises(TypeError, 'a'.format_map)
1270n/a self.assertRaises(ValueError, '{'.format_map, {})
1271n/a self.assertRaises(ValueError, '}'.format_map, {})
1272n/a self.assertRaises(ValueError, 'a{'.format_map, {})
1273n/a self.assertRaises(ValueError, 'a}'.format_map, {})
1274n/a self.assertRaises(ValueError, '{a'.format_map, {})
1275n/a self.assertRaises(ValueError, '}a'.format_map, {})
1277n/a # issue #12579: can't supply positional params to format_map
1278n/a self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
1279n/a self.assertRaises(ValueError, '{}'.format_map, 'a')
1280n/a self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
1282n/a def test_format_huge_precision(self):
1283n/a format_string = ".{}f".format(sys.maxsize + 1)
1284n/a with self.assertRaises(ValueError):
1285n/a result = format(2.34, format_string)
1287n/a def test_format_huge_width(self):
1288n/a format_string = "{}f".format(sys.maxsize + 1)
1289n/a with self.assertRaises(ValueError):
1290n/a result = format(2.34, format_string)
1292n/a def test_format_huge_item_number(self):
1293n/a format_string = "{{{}:.6f}}".format(sys.maxsize + 1)
1294n/a with self.assertRaises(ValueError):
1295n/a result = format_string.format(2.34)
1297n/a def test_format_auto_numbering(self):
1298n/a class C:
1299n/a def __init__(self, x=100):
1300n/a self._x = x
1301n/a def __format__(self, spec):
1302n/a return spec
1304n/a self.assertEqual('{}'.format(10), '10')
1305n/a self.assertEqual('{:5}'.format('s'), 's ')
1306n/a self.assertEqual('{!r}'.format('s'), "'s'")
1307n/a self.assertEqual('{._x}'.format(C(10)), '10')
1308n/a self.assertEqual('{[1]}'.format([1, 2]), '2')
1309n/a self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
1310n/a self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
1312n/a self.assertEqual('a{:{}}b'.format('x', '^10'), 'a x b')
1313n/a self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
1315n/a # can't mix and match numbering and auto-numbering
1316n/a self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
1317n/a self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
1318n/a self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
1319n/a self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
1321n/a # can mix and match auto-numbering and named
1322n/a self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
1323n/a self.assertEqual('{}{f}'.format(4, f='test'), '4test')
1324n/a self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
1325n/a self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
1327n/a def test_formatting(self):
1328n/a string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
1329n/a # Testing Unicode formatting strings...
1330n/a self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
1331n/a self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000, 3.00')
1332n/a self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000, 3.00')
1333n/a self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000, 3.50')
1334n/a self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000, 3.57')
1335n/a self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
1336n/a if not sys.platform.startswith('java'):
1337n/a self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
1338n/a self.assertEqual("%r" % ("\u1234",), "'\u1234'")
1339n/a self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
1340n/a self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
1341n/a self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')
1343n/a self.assertEqual('%c' % 0x1234, '\u1234')
1344n/a self.assertEqual('%c' % 0x21483, '\U00021483')
1345n/a self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
1346n/a self.assertEqual('%c' % '\U00021483', '\U00021483')
1347n/a self.assertRaises(TypeError, "%c".__mod__, "aa")
1348n/a self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
1349n/a self.assertRaises(TypeError, "%i".__mod__, "aa")
1351n/a # formatting jobs delegated from the string implementation:
1352n/a self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1353n/a self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1354n/a self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1355n/a self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1356n/a self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
1357n/a self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
1358n/a self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
1359n/a self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
1360n/a self.assertEqual('...%s...' % "abc", '...abc...')
1361n/a self.assertEqual('%*s' % (5,'abc',), ' abc')
1362n/a self.assertEqual('%*s' % (-5,'abc',), 'abc ')
1363n/a self.assertEqual('%*.*s' % (5,2,'abc',), ' ab')
1364n/a self.assertEqual('%*.*s' % (5,3,'abc',), ' abc')
1365n/a self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10 abc')
1366n/a self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103 abc')
1367n/a self.assertEqual('%c' % 'a', 'a')
1368n/a class Wrapper:
1369n/a def __str__(self):
1370n/a return '\u1234'
1371n/a self.assertEqual('%s' % Wrapper(), '\u1234')
1373n/a # issue 3382
1374n/a NAN = float('nan')
1375n/a INF = float('inf')
1376n/a self.assertEqual('%f' % NAN, 'nan')
1377n/a self.assertEqual('%F' % NAN, 'NAN')
1378n/a self.assertEqual('%f' % INF, 'inf')
1379n/a self.assertEqual('%F' % INF, 'INF')
1381n/a # PEP 393
1382n/a self.assertEqual('%.1s' % "a\xe9\u20ac", 'a')
1383n/a self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9')
1385n/a #issue 19995
1386n/a class PseudoInt:
1387n/a def __init__(self, value):
1388n/a self.value = int(value)
1389n/a def __int__(self):
1390n/a return self.value
1391n/a def __index__(self):
1392n/a return self.value
1393n/a class PseudoFloat:
1394n/a def __init__(self, value):
1395n/a self.value = float(value)
1396n/a def __int__(self):
1397n/a return int(self.value)
1398n/a pi = PseudoFloat(3.1415)
1399n/a letter_m = PseudoInt(109)
1400n/a self.assertEqual('%x' % 42, '2a')
1401n/a self.assertEqual('%X' % 15, 'F')
1402n/a self.assertEqual('%o' % 9, '11')
1403n/a self.assertEqual('%c' % 109, 'm')
1404n/a self.assertEqual('%x' % letter_m, '6d')
1405n/a self.assertEqual('%X' % letter_m, '6D')
1406n/a self.assertEqual('%o' % letter_m, '155')
1407n/a self.assertEqual('%c' % letter_m, 'm')
1408n/a self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float', operator.mod, '%x', 3.14),
1409n/a self.assertRaisesRegex(TypeError, '%X format: an integer is required, not float', operator.mod, '%X', 2.11),
1410n/a self.assertRaisesRegex(TypeError, '%o format: an integer is required, not float', operator.mod, '%o', 1.79),
1411n/a self.assertRaisesRegex(TypeError, '%x format: an integer is required, not PseudoFloat', operator.mod, '%x', pi),
1412n/a self.assertRaises(TypeError, operator.mod, '%c', pi),
1414n/a def test_formatting_with_enum(self):
1415n/a # issue18780
1416n/a import enum
1417n/a class Float(float, enum.Enum):
1418n/a PI = 3.1415926
1419n/a class Int(enum.IntEnum):
1420n/a IDES = 15
1421n/a class Str(str, enum.Enum):
1422n/a ABC = 'abc'
1423n/a # Testing Unicode formatting strings...
1424n/a self.assertEqual("%s, %s" % (Str.ABC, Str.ABC),
1425n/a 'Str.ABC, Str.ABC')
1426n/a self.assertEqual("%s, %s, %d, %i, %u, %f, %5.2f" %
1427n/a (Str.ABC, Str.ABC,
1428n/a Int.IDES, Int.IDES, Int.IDES,
1429n/a Float.PI, Float.PI),
1430n/a 'Str.ABC, Str.ABC, 15, 15, 15, 3.141593, 3.14')
1432n/a # formatting jobs delegated from the string implementation:
1433n/a self.assertEqual('...%(foo)s...' % {'foo':Str.ABC},
1434n/a '...Str.ABC...')
1435n/a self.assertEqual('...%(foo)s...' % {'foo':Int.IDES},
1436n/a '...Int.IDES...')
1437n/a self.assertEqual('...%(foo)i...' % {'foo':Int.IDES},
1438n/a '...15...')
1439n/a self.assertEqual('...%(foo)d...' % {'foo':Int.IDES},
1440n/a '...15...')
1441n/a self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI},
1442n/a '...15...')
1443n/a self.assertEqual('...%(foo)f...' % {'foo':Float.PI,'def':123},
1444n/a '...3.141593...')
1446n/a def test_formatting_huge_precision(self):
1447n/a format_string = "%.{}f".format(sys.maxsize + 1)
1448n/a with self.assertRaises(ValueError):
1449n/a result = format_string % 2.34
@support.cpython_only
def test_formatting_huge_precision_c_limits(self):
    """A %-format precision of INT_MAX + 1 must raise ValueError.

    INT_MAX is taken from the _testcapi helper module.
    """
    from _testcapi import INT_MAX
    template = "%.{}f".format(INT_MAX + 1)
    self.assertRaises(ValueError, operator.mod, template, 2.34)
1458n/a def test_formatting_huge_width(self):
1459n/a format_string = "%{}f".format(sys.maxsize + 1)
1460n/a with self.assertRaises(ValueError):
1461n/a result = format_string % 2.34
1463n/a def test_startswith_endswith_errors(self):
1464n/a for meth in ('foo'.startswith, 'foo'.endswith):
1465n/a with self.assertRaises(TypeError) as cm:
1466n/a meth(['f'])
1467n/a exc = str(cm.exception)
1468n/a self.assertIn('str', exc)
1469n/a self.assertIn('tuple', exc)
@support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_format_float(self):
    """'%.1f' must render 1.0 with a '.' decimal point.

    The result should follow the C locale (no comma), whatever locale the
    decorator has put in effect.
    """
    rendered = '%.1f' % 1.0
    self.assertEqual('1.0', rendered)
1476n/a def test_constructor(self):
1477n/a # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
1479n/a self.assertEqual(
1480n/a str('unicode remains unicode'),
1481n/a 'unicode remains unicode'
1482n/a )
1484n/a for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
1485n/a subclass = StrSubclass(text)
1486n/a self.assertEqual(str(subclass), text)
1487n/a self.assertEqual(len(subclass), len(text))
1488n/a if text == 'ascii':
1489n/a self.assertEqual(subclass.encode('ascii'), b'ascii')
1490n/a self.assertEqual(subclass.encode('utf-8'), b'ascii')
1492n/a self.assertEqual(
1493n/a str('strings are converted to unicode'),
1494n/a 'strings are converted to unicode'
1495n/a )
1497n/a class StringCompat:
1498n/a def __init__(self, x):
1499n/a self.x = x
1500n/a def __str__(self):
1501n/a return self.x
1503n/a self.assertEqual(
1504n/a str(StringCompat('__str__ compatible objects are recognized')),
1505n/a '__str__ compatible objects are recognized'
1506n/a )
1508n/a # unicode(obj) is compatible to str():
1510n/a o = StringCompat('unicode(obj) is compatible to str()')
1511n/a self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
1512n/a self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
1514n/a for obj in (123, 123.45, 123):
1515n/a self.assertEqual(str(obj), str(str(obj)))
1517n/a # unicode(obj, encoding, error) tests (this maps to
1518n/a # PyUnicode_FromEncodedObject() at C level)
1520n/a if not sys.platform.startswith('java'):
1521n/a self.assertRaises(
1522n/a TypeError,
1523n/a str,
1524n/a 'decoding unicode is not supported',
1525n/a 'utf-8',
1526n/a 'strict'
1527n/a )
1529n/a self.assertEqual(
1530n/a str(b'strings are decoded to unicode', 'utf-8', 'strict'),
1531n/a 'strings are decoded to unicode'
1532n/a )
1534n/a if not sys.platform.startswith('java'):
1535n/a self.assertEqual(
1536n/a str(
1537n/a memoryview(b'character buffers are decoded to unicode'),
1538n/a 'utf-8',
1539n/a 'strict'
1540n/a ),
1541n/a 'character buffers are decoded to unicode'
1542n/a )
1544n/a self.assertRaises(TypeError, str, 42, 42, 42)
1546n/a def test_constructor_keyword_args(self):
1547n/a """Pass various keyword argument combinations to the constructor."""
1548n/a # The object argument can be passed as a keyword.
1549n/a self.assertEqual(str(object='foo'), 'foo')
1550n/a self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo')
1551n/a # The errors argument without encoding triggers "decode" mode.
1552n/a self.assertEqual(str(b'foo', errors='strict'), 'foo') # not "b'foo'"
1553n/a self.assertEqual(str(object=b'foo', errors='strict'), 'foo')
1555n/a def test_constructor_defaults(self):
1556n/a """Check the constructor argument defaults."""
1557n/a # The object argument defaults to '' or b''.
1558n/a self.assertEqual(str(), '')
1559n/a self.assertEqual(str(errors='strict'), '')
1560n/a utf8_cent = '¢'.encode('utf-8')
1561n/a # The encoding argument defaults to utf-8.
1562n/a self.assertEqual(str(utf8_cent, errors='strict'), '¢')
1563n/a # The errors argument defaults to strict.
1564n/a self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii')
1566n/a def test_codecs_utf7(self):
1567n/a utfTests = [
1568n/a ('A\u2262\u0391.', b'A+ImIDkQ.'), # RFC2152 example
1569n/a ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'), # RFC2152 example
1570n/a ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'), # RFC2152 example
1571n/a ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
1572n/a ('+', b'+-'),
1573n/a ('+-', b'+--'),
1574n/a ('+?', b'+-?'),
1575n/a (r'\?', b'+AFw?'),
1576n/a ('+?', b'+-?'),
1577n/a (r'\\?', b'+AFwAXA?'),
1578n/a (r'\\\?', b'+AFwAXABc?'),
1579n/a (r'++--', b'+-+---'),
1580n/a ('\U000abcde', b'+2m/c3g-'), # surrogate pairs
1581n/a ('/', b'/'),
1582n/a ]
1584n/a for (x, y) in utfTests:
1585n/a self.assertEqual(x.encode('utf-7'), y)
1587n/a # Unpaired surrogates are passed through
1588n/a self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
1589n/a self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
1590n/a self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
1591n/a self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
1592n/a self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
1593n/a self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
1594n/a self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
1595n/a self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
1597n/a self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
1598n/a self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
1600n/a # Issue #2242: crash on some Windows/MSVC versions
1601n/a self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')
1603n/a # Direct encoded characters
1604n/a set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
1605n/a # Optional direct characters
1606n/a set_o = '!"#$%&*;<=>@[]^_`{|}'
1607n/a for c in set_d:
1608n/a self.assertEqual(c.encode('utf7'), c.encode('ascii'))
1609n/a self.assertEqual(c.encode('ascii').decode('utf7'), c)
1610n/a for c in set_o:
1611n/a self.assertEqual(c.encode('ascii').decode('utf7'), c)
1613n/a def test_codecs_utf8(self):
1614n/a self.assertEqual(''.encode('utf-8'), b'')
1615n/a self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
1616n/a self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82')
1617n/a self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96')
1618n/a self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1619n/a self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
1620n/a self.assertEqual(('\U00010002'*10).encode('utf-8'),
1621n/a b'\xf0\x90\x80\x82'*10)
1622n/a self.assertEqual(
1623n/a '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1624n/a '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1625n/a '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1626n/a '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1627n/a '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1628n/a ' Nunstuck git und'.encode('utf-8'),
1629n/a b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1630n/a b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1631n/a b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1632n/a b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1633n/a b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1634n/a b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1635n/a b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1636n/a b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1637n/a b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1638n/a b'\xe3\x80\x8cWenn ist das Nunstuck git und'
1639n/a )
1641n/a # UTF-8 specific decoding tests
1642n/a self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1643n/a self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1644n/a self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
1646n/a # Other possible utf-8 test cases:
1647n/a # * strict decoding testing for all of the
1648n/a # UTF8_ERROR cases in PyUnicode_DecodeUTF8
1650n/a def test_utf8_decode_valid_sequences(self):
1651n/a sequences = [
1652n/a # single byte
1653n/a (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1654n/a # 2 bytes
1655n/a (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1656n/a # 3 bytes
1657n/a (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1658n/a (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1659n/a # 4 bytes
1660n/a (b'\xF0\x90\x80\x80', '\U00010000'),
1661n/a (b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1662n/a ]
1663n/a for seq, res in sequences:
1664n/a self.assertEqual(seq.decode('utf-8'), res)
1667n/a def test_utf8_decode_invalid_sequences(self):
1668n/a # continuation bytes in a sequence of 2, 3, or 4 bytes
1669n/a continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
1670n/a # start bytes of a 2-byte sequence equivalent to code points < 0x7F
1671n/a invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
1672n/a # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
1673n/a invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1674n/a invalid_start_bytes = (
1675n/a continuation_bytes + invalid_2B_seq_start_bytes +
1676n/a invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1677n/a )
1679n/a for byte in invalid_start_bytes:
1680n/a self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1682n/a for sb in invalid_2B_seq_start_bytes:
1683n/a for cb in continuation_bytes:
1684n/a self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1686n/a for sb in invalid_4B_seq_start_bytes:
1687n/a for cb1 in continuation_bytes[:3]:
1688n/a for cb3 in continuation_bytes[:3]:
1689n/a self.assertRaises(UnicodeDecodeError,
1690n/a (sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1692n/a for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1693n/a self.assertRaises(UnicodeDecodeError,
1694n/a (b'\xE0'+cb+b'\x80').decode, 'utf-8')
1695n/a self.assertRaises(UnicodeDecodeError,
1696n/a (b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1697n/a # surrogates
1698n/a for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1699n/a self.assertRaises(UnicodeDecodeError,
1700n/a (b'\xED'+cb+b'\x80').decode, 'utf-8')
1701n/a self.assertRaises(UnicodeDecodeError,
1702n/a (b'\xED'+cb+b'\xBF').decode, 'utf-8')
1703n/a for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1704n/a self.assertRaises(UnicodeDecodeError,
1705n/a (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1706n/a self.assertRaises(UnicodeDecodeError,
1707n/a (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1708n/a for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1709n/a self.assertRaises(UnicodeDecodeError,
1710n/a (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1711n/a self.assertRaises(UnicodeDecodeError,
1712n/a (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1714n/a def test_issue8271(self):
1715n/a # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1716n/a # only the start byte and the continuation byte(s) are now considered
1717n/a # invalid, instead of the number of bytes specified by the start byte.
1718n/a # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1719n/a # table 3-8, Row 2) for more information about the algorithm used.
1720n/a FFFD = '\ufffd'
1721n/a sequences = [
1722n/a # invalid start bytes
1723n/a (b'\x80', FFFD), # continuation byte
1724n/a (b'\x80\x80', FFFD*2), # 2 continuation bytes
1725n/a (b'\xc0', FFFD),
1726n/a (b'\xc0\xc0', FFFD*2),
1727n/a (b'\xc1', FFFD),
1728n/a (b'\xc1\xc0', FFFD*2),
1729n/a (b'\xc0\xc1', FFFD*2),
1730n/a # with start byte of a 2-byte sequence
1731n/a (b'\xc2', FFFD), # only the start byte
1732n/a (b'\xc2\xc2', FFFD*2), # 2 start bytes
1733n/a (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes
1734n/a (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1735n/a # with start byte of a 3-byte sequence
1736n/a (b'\xe1', FFFD), # only the start byte
1737n/a (b'\xe1\xe1', FFFD*2), # 2 start bytes
1738n/a (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1739n/a (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1740n/a (b'\xe1\x80', FFFD), # only 1 continuation byte
1741n/a (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1742n/a (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1743n/a (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1744n/a (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1745n/a (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1746n/a (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1747n/a # with start byte of a 4-byte sequence
1748n/a (b'\xf1', FFFD), # only the start byte
1749n/a (b'\xf1\xf1', FFFD*2), # 2 start bytes
1750n/a (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1751n/a (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1752n/a (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1753n/a (b'\xf1\x80', FFFD), # only 1 continuation bytes
1754n/a (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1755n/a (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1756n/a (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1757n/a (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1758n/a (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1759n/a (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1760n/a (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1761n/a (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1762n/a (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1763n/a (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1764n/a (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1765n/a (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1766n/a # with invalid start byte of a 4-byte sequence (rfc2279)
1767n/a (b'\xf5', FFFD), # only the start byte
1768n/a (b'\xf5\xf5', FFFD*2), # 2 start bytes
1769n/a (b'\xf5\x80', FFFD*2), # only 1 continuation byte
1770n/a (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1771n/a (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1772n/a (b'\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
1773n/a (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1774n/a (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1775n/a # with invalid start byte of a 5-byte sequence (rfc2279)
1776n/a (b'\xf8', FFFD), # only the start byte
1777n/a (b'\xf8\xf8', FFFD*2), # 2 start bytes
1778n/a (b'\xf8\x80', FFFD*2), # only one continuation byte
1779n/a (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1780n/a (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1781n/a # with invalid start byte of a 6-byte sequence (rfc2279)
1782n/a (b'\xfc', FFFD), # only the start byte
1783n/a (b'\xfc\xfc', FFFD*2), # 2 start bytes
1784n/a (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1785n/a (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1786n/a # invalid start byte
1787n/a (b'\xfe', FFFD),
1788n/a (b'\xfe\x80\x80', FFFD*3),
1789n/a # other sequences
1790n/a (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1791n/a (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1792n/a (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1793n/a (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1794n/a '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1795n/a ]
1796n/a for n, (seq, res) in enumerate(sequences):
1797n/a self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1798n/a self.assertEqual(seq.decode('utf-8', 'replace'), res)
1799n/a self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1800n/a self.assertEqual(seq.decode('utf-8', 'ignore'),
1801n/a res.replace('\uFFFD', ''))
def assertCorrectUTF8Decoding(self, seq, res, err):
    """
    Check how the invalid UTF-8 byte string seq decodes under each error
    handler: the default (strict) handler must raise a UnicodeDecodeError
    whose message contains err, 'replace' must give res, and 'ignore' must
    give res with every U+FFFD removed.  The 'replace' and 'ignore' checks
    are repeated with seq embedded between b'aaaa' and b'bbbb'.
    """
    with self.assertRaises(UnicodeDecodeError) as caught:
        seq.decode('utf-8')
    self.assertIn(err, str(caught.exception))

    embedded = b'aaaa' + seq + b'bbbb'
    without_markers = res.replace('\ufffd', '')
    for handler, expected in (('replace', res), ('ignore', without_markers)):
        self.assertEqual(seq.decode('utf-8', handler), expected)
        self.assertEqual(embedded.decode('utf-8', handler),
                         'aaaa' + expected + 'bbbb')
1822n/a def test_invalid_start_byte(self):
1823n/a """
1824n/a Test that an 'invalid start byte' error is raised when the first byte
1825n/a is not in the ASCII range or is not a valid start byte of a 2-, 3-, or
1826n/a 4-bytes sequence. The invalid start byte is replaced with a single
1827n/a U+FFFD when errors='replace'.
1828n/a E.g. <80> is a continuation byte and can appear only after a start byte.
1829n/a """
1830n/a FFFD = '\ufffd'
1831n/a for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
1832n/a self.assertCorrectUTF8Decoding(bytes([byte]), '\ufffd',
1833n/a 'invalid start byte')
1835n/a def test_unexpected_end_of_data(self):
1836n/a """
1837n/a Test that an 'unexpected end of data' error is raised when the string
1838n/a ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
1839n/a enough continuation bytes. The incomplete sequence is replaced with a
1840n/a single U+FFFD when errors='replace'.
1841n/a E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
1842n/a sequence, but it's followed by only 2 valid continuation bytes and the
1843n/a last continuation bytes is missing.
1844n/a Note: the continuation bytes must be all valid, if one of them is
1845n/a invalid another error will be raised.
1846n/a """
1847n/a sequences = [
1848n/a 'C2', 'DF',
1849n/a 'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
1850n/a 'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
1851n/a 'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
1852n/a 'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
1853n/a 'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
1854n/a 'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
1855n/a ]
1856n/a FFFD = '\ufffd'
1857n/a for seq in sequences:
1858n/a self.assertCorrectUTF8Decoding(bytes.fromhex(seq), '\ufffd',
1859n/a 'unexpected end of data')
1861n/a def test_invalid_cb_for_2bytes_seq(self):
1862n/a """
1863n/a Test that an 'invalid continuation byte' error is raised when the
1864n/a continuation byte of a 2-bytes sequence is invalid. The start byte
1865n/a is replaced by a single U+FFFD and the second byte is handled
1866n/a separately when errors='replace'.
1867n/a E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes
1868n/a sequence, but 41 is not a valid continuation byte because it's the
1869n/a ASCII letter 'A'.
1870n/a """
1871n/a FFFD = '\ufffd'
1872n/a FFFDx2 = FFFD * 2
1873n/a sequences = [
1874n/a ('C2 00', FFFD+'\x00'), ('C2 7F', FFFD+'\x7f'),
1875n/a ('C2 C0', FFFDx2), ('C2 FF', FFFDx2),
1876n/a ('DF 00', FFFD+'\x00'), ('DF 7F', FFFD+'\x7f'),
1877n/a ('DF C0', FFFDx2), ('DF FF', FFFDx2),
1878n/a ]
1879n/a for seq, res in sequences:
1880n/a self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
1881n/a 'invalid continuation byte')
1883n/a def test_invalid_cb_for_3bytes_seq(self):
1884n/a """
1885n/a Test that an 'invalid continuation byte' error is raised when the
1886n/a continuation byte(s) of a 3-bytes sequence are invalid. When
1887n/a errors='replace', if the first continuation byte is valid, the first
1888n/a two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
1889n/a third byte is handled separately, otherwise only the start byte is
1890n/a replaced with a U+FFFD and the other continuation bytes are handled
1891n/a separately.
1892n/a E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
1893n/a sequence, 80 is a valid continuation byte, but 41 is not a valid cb
1894n/a because it's the ASCII letter 'A'.
1895n/a Note: when the start byte is E0 or ED, the valid ranges for the first
1896n/a continuation byte are limited to A0..BF and 80..9F respectively.
1897n/a Python 2 used to consider all the bytes in range 80..BF valid when the
1898n/a start byte was ED. This is fixed in Python 3.
1899n/a """
1900n/a FFFD = '\ufffd'
1901n/a FFFDx2 = FFFD * 2
1902n/a sequences = [
1903n/a ('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2),
1904n/a ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2),
1905n/a ('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'),
1906n/a ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2),
1907n/a ('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'),
1908n/a ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'),
1909n/a ('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2),
1910n/a ('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'),
1911n/a ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2),
1912n/a ('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'),
1913n/a ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'),
1914n/a ('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2),
1915n/a ('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'),
1916n/a ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2),
1917n/a ('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'),
1918n/a ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'),
1919n/a ('ED 7F', FFFD+'\x7f'),
1920n/a ('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^
1921n/a ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'),
1922n/a ('ED 80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2),
1923n/a ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'),
1924n/a ('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2),
1925n/a ('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'),
1926n/a ('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2),
1927n/a ('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'),
1928n/a ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2),
1929n/a ('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'),
1930n/a ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'),
1931n/a ('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2),
1932n/a ('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'),
1933n/a ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2),
1934n/a ('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'),
1935n/a ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2),
1936n/a ]
1937n/a for seq, res in sequences:
1938n/a self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
1939n/a 'invalid continuation byte')
1941n/a def test_invalid_cb_for_4bytes_seq(self):
1942n/a """
1943n/a Test that an 'invalid continuation byte' error is raised when the
1944n/a continuation byte(s) of a 4-bytes sequence are invalid. When
1945n/a errors='replace',the start byte and all the following valid
1946n/a continuation bytes are replaced with a single U+FFFD, and all the bytes
1947n/a starting from the first invalid continuation bytes (included) are
1948n/a handled separately.
1949n/a E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
1950n/a sequence, 80 is a valid continuation byte, but 41 is not a valid cb
1951n/a because it's the ASCII letter 'A'.
1952n/a Note: when the start byte is E0 or ED, the valid ranges for the first
1953n/a continuation byte are limited to A0..BF and 80..9F respectively.
1954n/a However, when the start byte is ED, Python 2 considers all the bytes
1955n/a in range 80..BF valid. This is fixed in Python 3.
1956n/a """
1957n/a FFFD = '\ufffd'
1958n/a FFFDx2 = FFFD * 2
1959n/a sequences = [
1960n/a ('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2),
1961n/a ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
1962n/a ('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'),
1963n/a ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
1964n/a ('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'),
1965n/a ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
1966n/a ('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'),
1967n/a ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
1968n/a ('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'),
1969n/a ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
1970n/a ('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'),
1971n/a ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
1972n/a ('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'),
1973n/a ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
1974n/a ('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2),
1975n/a ('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'),
1976n/a ('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2),
1977n/a ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'),
1978n/a ('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2),
1979n/a ('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'),
1980n/a ('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2),
1981n/a ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'),
1982n/a ('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2),
1983n/a ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'),
1984n/a ('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2),
1985n/a ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'),
1986n/a ('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2),
1987n/a ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'),
1988n/a ('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
1989n/a ('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'),
1990n/a ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
1991n/a ('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'),
1992n/a ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
1993n/a ('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'),
1994n/a ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
1995n/a ('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'),
1996n/a ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
1997n/a ('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'),
1998n/a ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
1999n/a ('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'),
2000n/a ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
2001n/a ('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2),
2002n/a ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
2003n/a ('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'),
2004n/a ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
2005n/a ('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'),
2006n/a ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
2007n/a ('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'),
2008n/a ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
2009n/a ('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'),
2010n/a ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
2011n/a ('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'),
2012n/a ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
2013n/a ('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'),
2014n/a ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
2015n/a ]
2016n/a for seq, res in sequences:
2017n/a self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
2018n/a 'invalid continuation byte')
2020n/a def test_codecs_idna(self):
2021n/a # Test whether trailing dot is preserved
2022n/a self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
def test_codecs_errors(self):
    """Check error handling when encoding, decoding and looking up codecs.

    NOTE(review): the 'test.unicode1'/'test.unicode2' checks presumably
    rely on the module-level search_function having been registered with
    the codecs machinery elsewhere in this file -- confirm.
    """
    text = 'Andr\202 x'
    data = b'Andr\202 x'

    # Error handling (encoding)
    self.assertRaises(UnicodeError, text.encode, 'ascii')
    self.assertRaises(UnicodeError, text.encode, 'ascii', 'strict')
    self.assertEqual(text.encode('ascii', 'ignore'), b"Andr x")
    self.assertEqual(text.encode('ascii', 'replace'), b"Andr? x")
    # Positional and keyword spellings of the arguments must agree.
    self.assertEqual(text.encode('ascii', 'replace'),
                     text.encode('ascii', errors='replace'))
    self.assertEqual(text.encode('ascii', 'ignore'),
                     text.encode(encoding='ascii', errors='ignore'))

    # Error handling (decoding)
    self.assertRaises(UnicodeError, str, data, 'ascii')
    self.assertRaises(UnicodeError, str, data, 'ascii', 'strict')
    self.assertEqual(str(data, 'ascii', 'ignore'), "Andr x")
    self.assertEqual(str(data, 'ascii', 'replace'), 'Andr\uFFFD x')
    self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')

    # Error handling (unknown character names)
    unknown_name = b"\\N{foo}xx"
    self.assertEqual(unknown_name.decode("unicode-escape", "ignore"), "xx")

    # Error handling (truncated escape sequence)
    self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape")

    # Codecs whose encode/decode functions return malformed results.
    self.assertRaises(TypeError, b"hello".decode, "test.unicode1")
    self.assertRaises(TypeError, str, b"hello", "test.unicode2")
    for broken_codec in ("test.unicode1", "test.unicode2"):
        self.assertRaises(TypeError, "hello".encode, broken_codec)

    # Error handling (wrong arguments)
    self.assertRaises(TypeError, "hello".encode, 42, 42, 42)

    # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
    for converter in (float, complex):
        for lone_surrogate in ("\ud800", "\udf00"):
            self.assertRaises(UnicodeError, converter, lone_surrogate)
2062n/a def test_codecs(self):
2063n/a # Encoding
2064n/a self.assertEqual('hello'.encode('ascii'), b'hello')
2065n/a self.assertEqual('hello'.encode('utf-7'), b'hello')
2066n/a self.assertEqual('hello'.encode('utf-8'), b'hello')
2067n/a self.assertEqual('hello'.encode('utf-8'), b'hello')
2068n/a self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
2069n/a self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
2070n/a self.assertEqual('hello'.encode('latin-1'), b'hello')
2072n/a # Default encoding is utf-8
2073n/a self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
2075n/a # Roundtrip safety for BMP (just the first 1024 chars)
2076n/a for c in range(1024):
2077n/a u = chr(c)
2078n/a for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
2079n/a 'utf-16-be', 'raw_unicode_escape',
2080n/a 'unicode_escape', 'unicode_internal'):
2081n/a with warnings.catch_warnings():
2082n/a # unicode-internal has been deprecated
2083n/a warnings.simplefilter("ignore", DeprecationWarning)
2085n/a self.assertEqual(str(u.encode(encoding),encoding), u)
2087n/a # Roundtrip safety for BMP (just the first 256 chars)
2088n/a for c in range(256):
2089n/a u = chr(c)
2090n/a for encoding in ('latin-1',):
2091n/a self.assertEqual(str(u.encode(encoding),encoding), u)
2093n/a # Roundtrip safety for BMP (just the first 128 chars)
2094n/a for c in range(128):
2095n/a u = chr(c)
2096n/a for encoding in ('ascii',):
2097n/a self.assertEqual(str(u.encode(encoding),encoding), u)
2099n/a # Roundtrip safety for non-BMP (just a few chars)
2100n/a with warnings.catch_warnings():
2101n/a # unicode-internal has been deprecated
2102n/a warnings.simplefilter("ignore", DeprecationWarning)
2104n/a u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
2105n/a for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
2106n/a 'raw_unicode_escape',
2107n/a 'unicode_escape', 'unicode_internal'):
2108n/a self.assertEqual(str(u.encode(encoding),encoding), u)
2110n/a # UTF-8 must be roundtrip safe for all code points
2111n/a # (except surrogates, which are forbidden).
2112n/a u = ''.join(map(chr, list(range(0, 0xd800)) +
2113n/a list(range(0xe000, 0x110000))))
2114n/a for encoding in ('utf-8',):
2115n/a self.assertEqual(str(u.encode(encoding),encoding), u)
2117n/a def test_codecs_charmap(self):
2118n/a # 0-127
2119n/a s = bytes(range(128))
2120n/a for encoding in (
2121n/a 'cp037', 'cp1026', 'cp273',
2122n/a 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2123n/a 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
2124n/a 'cp863', 'cp865', 'cp866', 'cp1125',
2125n/a 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2126n/a 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
2127n/a 'iso8859_7', 'iso8859_9',
2128n/a 'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'latin_1',
2129n/a 'mac_cyrillic', 'mac_latin2',
2131n/a 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2132n/a 'cp1256', 'cp1257', 'cp1258',
2133n/a 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
2135n/a 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2136n/a 'cp1006', 'iso8859_8',
2138n/a ### These have undefined mappings:
2139n/a #'cp424',
2141n/a ### These fail the round-trip:
2142n/a #'cp875'
2144n/a ):
2145n/a self.assertEqual(str(s, encoding).encode(encoding), s)
2147n/a # 128-255
2148n/a s = bytes(range(128, 256))
2149n/a for encoding in (
2150n/a 'cp037', 'cp1026', 'cp273',
2151n/a 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2152n/a 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
2153n/a 'cp863', 'cp865', 'cp866', 'cp1125',
2154n/a 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2155n/a 'iso8859_2', 'iso8859_4', 'iso8859_5',
2156n/a 'iso8859_9', 'koi8_r', 'koi8_u', 'latin_1',
2157n/a 'mac_cyrillic', 'mac_latin2',
2159n/a ### These have undefined mappings:
2160n/a #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2161n/a #'cp1256', 'cp1257', 'cp1258',
2162n/a #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
2163n/a #'iso8859_3', 'iso8859_6', 'iso8859_7', 'koi8_t', 'kz1048',
2164n/a #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2166n/a ### These fail the round-trip:
2167n/a #'cp1006', 'cp875', 'iso8859_8',
2169n/a ):
2170n/a self.assertEqual(str(s, encoding).encode(encoding), s)
2172n/a def test_concatenation(self):
2173n/a self.assertEqual(("abc" "def"), "abcdef")
2174n/a self.assertEqual(("abc" "def"), "abcdef")
2175n/a self.assertEqual(("abc" "def"), "abcdef")
2176n/a self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2177n/a self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2179n/a def test_printing(self):
2180n/a class BitBucket:
2181n/a def write(self, text):
2182n/a pass
2184n/a out = BitBucket()
2185n/a print('abc', file=out)
2186n/a print('abc', 'def', file=out)
2187n/a print('abc', 'def', file=out)
2188n/a print('abc', 'def', file=out)
2189n/a print('abc\n', file=out)
2190n/a print('abc\n', end=' ', file=out)
2191n/a print('abc\n', end=' ', file=out)
2192n/a print('def\n', file=out)
2193n/a print('def\n', file=out)
2195n/a def test_ucs4(self):
2196n/a x = '\U00100000'
2197n/a y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
2198n/a self.assertEqual(x, y)
2200n/a y = br'\U00100000'
2201n/a x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2202n/a self.assertEqual(x, y)
2203n/a y = br'\U00010000'
2204n/a x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2205n/a self.assertEqual(x, y)
2207n/a try:
2208n/a br'\U11111111'.decode("raw-unicode-escape")
2209n/a except UnicodeDecodeError as e:
2210n/a self.assertEqual(e.start, 0)
2211n/a self.assertEqual(e.end, 10)
2212n/a else:
2213n/a self.fail("Should have raised UnicodeDecodeError")
def test_conversion(self):
    """str() and str-subclass construction must honour __str__()."""
    # Plain object whose __str__() returns a fixed string.
    class ObjectToStr:
        def __str__(self):
            return "foo"

    # str subclass whose __str__() ignores the stored content.
    class StrSubclassToStr(str):
        def __str__(self):
            return "foo"

    # str subclass that doubles its content and returns itself from
    # __str__(), i.e. an instance of the subclass rather than exact str.
    class StrSubclassToStrSubclass(str):
        def __new__(cls, content=""):
            return str.__new__(cls, 2*content)
        def __str__(self):
            return self

    self.assertEqual(str(ObjectToStr()), "foo")
    self.assertEqual(str(StrSubclassToStr("bar")), "foo")

    via_str = str(StrSubclassToStrSubclass("foo"))
    self.assertEqual(via_str, "foofoo")
    self.assertIs(type(via_str), StrSubclassToStrSubclass)

    via_subclass = StrSubclass(StrSubclassToStrSubclass("foo"))
    self.assertEqual(via_subclass, "foofoo")
    self.assertIs(type(via_subclass), StrSubclass)
2240n/a def test_unicode_repr(self):
2241n/a class s1:
2242n/a def __repr__(self):
2243n/a return '\\n'
2245n/a class s2:
2246n/a def __repr__(self):
2247n/a return '\\n'
2249n/a self.assertEqual(repr(s1()), '\\n')
2250n/a self.assertEqual(repr(s2()), '\\n')
2252n/a def test_printable_repr(self):
2253n/a self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
2254n/a self.assertEqual(repr('\U00014000'), "'\\U00014000'") # nonprintable
2256n/a # This test only affects 32-bit platforms because expandtabs can only take
2257n/a # an int as the max value, not a 64-bit C long. If expandtabs is changed
2258n/a # to take a 64-bit long, this test should apply to all platforms.
2259n/a @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4,
2260n/a 'only applies to 32-bit platforms')
2261n/a def test_expandtabs_overflows_gracefully(self):
2262n/a self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
@support.cpython_only
def test_expandtabs_optimization(self):
    """expandtabs() on a tab-free string returns the very same object."""
    original = 'abc'
    expanded = original.expandtabs()
    self.assertIs(expanded, original)
2269n/a def test_raiseMemError(self):
2270n/a if struct.calcsize('P') == 8:
2271n/a # 64 bits pointers
2272n/a ascii_struct_size = 48
2273n/a compact_struct_size = 72
2274n/a else:
2275n/a # 32 bits pointers
2276n/a ascii_struct_size = 24
2277n/a compact_struct_size = 36
2279n/a for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
2280n/a code = ord(char)
2281n/a if code < 0x100:
2282n/a char_size = 1 # sizeof(Py_UCS1)
2283n/a struct_size = ascii_struct_size
2284n/a elif code < 0x10000:
2285n/a char_size = 2 # sizeof(Py_UCS2)
2286n/a struct_size = compact_struct_size
2287n/a else:
2288n/a char_size = 4 # sizeof(Py_UCS4)
2289n/a struct_size = compact_struct_size
2290n/a # Note: sys.maxsize is half of the actual max allocation because of
2291n/a # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
2292n/a # be allocatable, given enough memory.
2293n/a maxlen = ((sys.maxsize - struct_size) // char_size)
2294n/a alloc = lambda: char * maxlen
2295n/a self.assertRaises(MemoryError, alloc)
2296n/a self.assertRaises(MemoryError, alloc)
2298n/a def test_format_subclass(self):
2299n/a class S(str):
2300n/a def __str__(self):
2301n/a return '__str__ overridden'
2302n/a s = S('xxx')
2303n/a self.assertEqual("%s" % s, '__str__ overridden')
2304n/a self.assertEqual("{}".format(s), '__str__ overridden')
2306n/a def test_subclass_add(self):
2307n/a class S(str):
2308n/a def __add__(self, o):
2309n/a return "3"
2310n/a self.assertEqual(S("4") + S("5"), "3")
2311n/a class S(str):
2312n/a def __iadd__(self, o):
2313n/a return "3"
2314n/a s = S("1")
2315n/a s += "4"
2316n/a self.assertEqual(s, "3")
2318n/a def test_getnewargs(self):
2319n/a text = 'abc'
2320n/a args = text.__getnewargs__()
2321n/a self.assertIsNot(args[0], text)
2322n/a self.assertEqual(args[0], text)
2323n/a self.assertEqual(len(args), 1)
def test_resize(self):
    """Encoding with unicode_internal stays correct after growing a string."""
    expected_warning = ('unicode_internal codec has been '
                        'deprecated', DeprecationWarning)
    codec = 'unicode_internal'
    for length in range(1, 100, 7):
        # generate a fresh string (refcount=1)
        text = 'a' * length + 'b'

        with support.check_warnings(expected_warning):
            # fill wstr internal field
            before = text.encode(codec)
            self.assertEqual(before.decode(codec), text)

            # resize text: wstr field must be cleared and then recomputed
            text += 'c'
            after = text.encode(codec)
            self.assertNotEqual(before, after)
            self.assertEqual(after.decode(codec), text)
def test_compare(self):
    """Compare strings whose characters need 1, 2 or 4 bytes of storage.

    Every pair of sample strings, including each string paired with
    itself, is checked with == and !=.  For identical pairs, fresh copies
    are compared as well.  The ordering operators are then checked
    explicitly across the different storage widths.
    """
    # Issue #17615
    N = 10
    ascii = 'a' * N
    ascii2 = 'z' * N
    latin = '\x80' * N
    latin2 = '\xff' * N
    bmp = '\u0100' * N
    bmp2 = '\uffff' * N
    astral = '\U00100000' * N
    astral2 = '\U0010ffff' * N
    strings = (
        ascii, ascii2,
        latin, latin2,
        bmp, bmp2,
        astral, astral2)
    # combinations_with_replacement() also yields (x, x) pairs; plain
    # combinations() never does, which would leave the "equal" branch
    # below unreachable.
    for text1, text2 in itertools.combinations_with_replacement(strings, 2):
        equal = (text1 is text2)
        self.assertEqual(text1 == text2, equal)
        self.assertEqual(text1 != text2, not equal)

        if equal:
            self.assertTrue(text1 <= text2)
            self.assertTrue(text1 >= text2)

            # text1 is text2: duplicate strings to skip the "str1 == str2"
            # optimization in unicode_compare_eq() and really compare
            # character per character
            copy1 = duplicate_string(text1)
            copy2 = duplicate_string(text2)
            self.assertIsNot(copy1, copy2)

            self.assertTrue(copy1 == copy2)
            self.assertFalse(copy1 != copy2)

            self.assertTrue(copy1 <= copy2)
            self.assertTrue(copy1 >= copy2)

    self.assertTrue(ascii < ascii2)
    self.assertTrue(ascii < latin)
    self.assertTrue(ascii < bmp)
    self.assertTrue(ascii < astral)
    self.assertFalse(ascii >= ascii2)
    self.assertFalse(ascii >= latin)
    self.assertFalse(ascii >= bmp)
    self.assertFalse(ascii >= astral)

    self.assertFalse(latin < ascii)
    self.assertTrue(latin < latin2)
    self.assertTrue(latin < bmp)
    self.assertTrue(latin < astral)
    self.assertTrue(latin >= ascii)
    self.assertFalse(latin >= latin2)
    self.assertFalse(latin >= bmp)
    self.assertFalse(latin >= astral)

    self.assertFalse(bmp < ascii)
    self.assertFalse(bmp < latin)
    self.assertTrue(bmp < bmp2)
    self.assertTrue(bmp < astral)
    self.assertTrue(bmp >= ascii)
    self.assertTrue(bmp >= latin)
    self.assertFalse(bmp >= bmp2)
    self.assertFalse(bmp >= astral)

    self.assertFalse(astral < ascii)
    self.assertFalse(astral < latin)
    self.assertFalse(astral < bmp2)
    self.assertTrue(astral < astral2)
    self.assertTrue(astral >= ascii)
    self.assertTrue(astral >= latin)
    self.assertTrue(astral >= bmp2)
    self.assertFalse(astral >= astral2)
def test_free_after_iterating(self):
    """Run support.check_free_after_iterating() for iter() and reversed() on str."""
    for make_iterator in (iter, reversed):
        support.check_free_after_iterating(self, make_iterator, str)
2421n/aclass CAPITest(unittest.TestCase):
2423n/a # Test PyUnicode_FromFormat()
def test_from_format(self):
    """Check PyUnicode_FromFormat() by calling it through ctypes.

    Covers plain and non-ASCII arguments, "%c", literal "%", truncation
    by precision, width and precision modifiers for the string and
    integer conversions, and a few unsupported formats that must merely
    not crash.
    """
    support.import_module('ctypes')
    from ctypes import (
        pythonapi, py_object, sizeof,
        c_int, c_long, c_longlong, c_ssize_t,
        c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
    name = "PyUnicode_FromFormat"
    _PyUnicode_FromFormat = getattr(pythonapi, name)
    _PyUnicode_FromFormat.restype = py_object

    def PyUnicode_FromFormat(format, *args):
        # str arguments are wrapped in py_object; every other argument
        # (bytes, ctypes scalars, None) is forwarded unchanged.
        cargs = tuple(
            py_object(arg) if isinstance(arg, str) else arg
            for arg in args)
        return _PyUnicode_FromFormat(format, *cargs)

    def check_format(expected, format, *args):
        text = PyUnicode_FromFormat(format, *args)
        self.assertEqual(expected, text)

    # ascii format, non-ascii argument
    check_format('ascii\x7f=unicode\xe9',
                 b'ascii\x7f=%U', 'unicode\xe9')

    # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
    # raises an error
    self.assertRaisesRegex(ValueError,
        r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
        'string, got a non-ASCII byte: 0xe9$',
        PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')

    # test "%c"
    check_format('\uabcd',
                 b'%c', c_int(0xabcd))
    check_format('\U0010ffff',
                 b'%c', c_int(0x10ffff))
    with self.assertRaises(OverflowError):
        PyUnicode_FromFormat(b'%c', c_int(0x110000))
    # Issue #18183
    check_format('\U00010000\U00100000',
                 b'%c%c', c_int(0x10000), c_int(0x100000))

    # test "%"
    check_format('%',
                 b'%')
    check_format('%',
                 b'%%')
    check_format('%s',
                 b'%%s')
    check_format('[%]',
                 b'[%%]')
    check_format('%abc',
                 b'%%%s', b'abc')

    # truncated string
    check_format('abc',
                 b'%.3s', b'abcdef')
    check_format('abc[\ufffd',
                 b'%.5s', 'abc[\u20ac]'.encode('utf8'))
    check_format("'\\u20acABC'",
                 b'%A', '\u20acABC')
    check_format("'\\u20",
                 b'%.5A', '\u20acABCDEF')
    check_format("'\u20acABC'",
                 b'%R', '\u20acABC')
    check_format("'\u20acA",
                 b'%.3R', '\u20acABCDEF')
    check_format('\u20acAB',
                 b'%.3S', '\u20acABCDEF')
    check_format('\u20acAB',
                 b'%.3U', '\u20acABCDEF')
    check_format('\u20acAB',
                 b'%.3V', '\u20acABCDEF', None)
    check_format('abc[\ufffd',
                 b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))

    # following tests comes from #7330
    # In the width tests below the expected value is built with rjust()
    # so the padding is explicit: a width of N right-aligns the
    # (possibly precision-truncated) text in a field of N characters.

    # test width modifier and precision modifier with %S
    check_format('repr=' + 'abc'.rjust(5),
                 b'repr=%5S', 'abc')
    check_format('repr=ab',
                 b'repr=%.2S', 'abc')
    check_format('repr=' + 'ab'.rjust(5),
                 b'repr=%5.2S', 'abc')

    # test width modifier and precision modifier with %R
    check_format('repr=' + "'abc'".rjust(8),
                 b'repr=%8R', 'abc')
    check_format("repr='ab",
                 b'repr=%.3R', 'abc')
    check_format('repr=' + "'ab".rjust(5),
                 b'repr=%5.3R', 'abc')

    # test width modifier and precision modifier with %A
    check_format('repr=' + "'abc'".rjust(8),
                 b'repr=%8A', 'abc')
    check_format("repr='ab",
                 b'repr=%.3A', 'abc')
    check_format('repr=' + "'ab".rjust(5),
                 b'repr=%5.3A', 'abc')

    # test width modifier and precision modifier with %s
    check_format('repr=' + 'abc'.rjust(5),
                 b'repr=%5s', b'abc')
    check_format('repr=ab',
                 b'repr=%.2s', b'abc')
    check_format('repr=' + 'ab'.rjust(5),
                 b'repr=%5.2s', b'abc')

    # test width modifier and precision modifier with %U
    check_format('repr=' + 'abc'.rjust(5),
                 b'repr=%5U', 'abc')
    check_format('repr=ab',
                 b'repr=%.2U', 'abc')
    check_format('repr=' + 'ab'.rjust(5),
                 b'repr=%5.2U', 'abc')

    # test width modifier and precision modifier with %V
    check_format('repr=' + 'abc'.rjust(5),
                 b'repr=%5V', 'abc', b'123')
    check_format('repr=ab',
                 b'repr=%.2V', 'abc', b'123')
    check_format('repr=' + 'ab'.rjust(5),
                 b'repr=%5.2V', 'abc', b'123')
    check_format('repr=' + '123'.rjust(5),
                 b'repr=%5V', None, b'123')
    check_format('repr=12',
                 b'repr=%.2V', None, b'123')
    check_format('repr=' + '12'.rjust(5),
                 b'repr=%5.2V', None, b'123')

    # test integer formats (%i, %d, %u)
    check_format('010',
                 b'%03i', c_int(10))
    check_format('0010',
                 b'%0.4i', c_int(10))
    check_format('-123',
                 b'%i', c_int(-123))
    check_format('-123',
                 b'%li', c_long(-123))
    check_format('-123',
                 b'%lli', c_longlong(-123))
    check_format('-123',
                 b'%zi', c_ssize_t(-123))

    check_format('-123',
                 b'%d', c_int(-123))
    check_format('-123',
                 b'%ld', c_long(-123))
    check_format('-123',
                 b'%lld', c_longlong(-123))
    check_format('-123',
                 b'%zd', c_ssize_t(-123))

    check_format('123',
                 b'%u', c_uint(123))
    check_format('123',
                 b'%lu', c_ulong(123))
    check_format('123',
                 b'%llu', c_ulonglong(123))
    check_format('123',
                 b'%zu', c_size_t(123))

    # test long output
    min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
    max_longlong = -min_longlong - 1
    check_format(str(min_longlong),
                 b'%lld', c_longlong(min_longlong))
    check_format(str(max_longlong),
                 b'%lld', c_longlong(max_longlong))
    max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
    check_format(str(max_ulonglong),
                 b'%llu', c_ulonglong(max_ulonglong))
    # The result of %p is not checked; the call only has to succeed.
    PyUnicode_FromFormat(b'%p', c_void_p(-1))

    # test padding (width and/or precision)
    check_format('123'.rjust(10, '0'),
                 b'%010i', c_int(123))
    check_format('123'.rjust(100),
                 b'%100i', c_int(123))
    check_format('123'.rjust(100, '0'),
                 b'%.100i', c_int(123))
    check_format('123'.rjust(80, '0').rjust(100),
                 b'%100.80i', c_int(123))

    check_format('123'.rjust(10, '0'),
                 b'%010u', c_uint(123))
    check_format('123'.rjust(100),
                 b'%100u', c_uint(123))
    check_format('123'.rjust(100, '0'),
                 b'%.100u', c_uint(123))
    check_format('123'.rjust(80, '0').rjust(100),
                 b'%100.80u', c_uint(123))

    check_format('123'.rjust(10, '0'),
                 b'%010x', c_int(0x123))
    check_format('123'.rjust(100),
                 b'%100x', c_int(0x123))
    check_format('123'.rjust(100, '0'),
                 b'%.100x', c_int(0x123))
    check_format('123'.rjust(80, '0').rjust(100),
                 b'%100.80x', c_int(0x123))

    # test %A
    check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
                 b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')

    # test %V
    check_format('repr=abc',
                 b'repr=%V', 'abc', b'xyz')

    # Test string decode from parameter of %s using utf-8.
    # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
    # '\u4eba\u6c11'
    check_format('repr=\u4eba\u6c11',
                 b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')

    # Test replace error handler.
    check_format('repr=abc\ufffd',
                 b'repr=%V', None, b'abc\xff')

    # not supported: copy the raw format string. these tests are just here
    # to check for crashes and should not be considered as specifications
    check_format('%s',
                 b'%1%s', b'abc')
    check_format('%1abc',
                 b'%1abc')
    check_format('%+i',
                 b'%+i', c_int(10))
    check_format('%.%s',
                 b'%.%s', b'abc')
2656n/a # Test PyUnicode_AsWideChar()
@support.cpython_only
def test_aswidechar(self):
    """Check _testcapi.unicode_aswidechar() for several buffer lengths."""
    from _testcapi import unicode_aswidechar
    support.import_module('ctypes')
    from ctypes import c_wchar, sizeof

    # (text, buffer length, expected size, expected buffer content)
    cases = [
        ('abcdef', 2, 2, 'ab'),
        ('abc', 3, 3, 'abc'),
        ('abc', 4, 3, 'abc\0'),
        ('abc', 10, 3, 'abc\0'),
        ('abc\0def', 20, 7, 'abc\0def\0'),
    ]

    # A non-BMP character takes two wchar_t units when wchar_t is 2 bytes
    # wide and one unit otherwise; the buffer leaves room for one more.
    nonbmp = chr(0x10ffff)
    if sizeof(c_wchar) == 2:
        cases.append((nonbmp, 3, 2, nonbmp + '\0'))
    else:  # sizeof(c_wchar) == 4
        cases.append((nonbmp, 2, 1, nonbmp + '\0'))

    for text, buflen, expected_size, expected_wchar in cases:
        wchar, size = unicode_aswidechar(text, buflen)
        self.assertEqual(size, expected_size)
        self.assertEqual(wchar, expected_wchar)
2694n/a # Test PyUnicode_AsWideCharString()
@support.cpython_only
def test_aswidecharstring(self):
    """Check _testcapi.unicode_aswidecharstring() results and sizes."""
    from _testcapi import unicode_aswidecharstring
    support.import_module('ctypes')
    from ctypes import c_wchar, sizeof

    nonbmp = chr(0x10ffff)
    # Two wchar_t units for a non-BMP character when wchar_t is 2 bytes
    # wide, a single unit otherwise.
    nonbmp_units = 2 if sizeof(c_wchar) == 2 else 1

    # (text, expected size); the returned string is the text plus '\0'.
    cases = (
        ('abc', 3),
        ('abc\0def', 7),
        (nonbmp, nonbmp_units),
    )
    for text, expected_size in cases:
        wchar, size = unicode_aswidecharstring(text)
        self.assertEqual(size, expected_size)
        self.assertEqual(wchar, text + '\0')
2718n/a # Test PyUnicode_AsUCS4()
@support.cpython_only
def test_asucs4(self):
    """Check _testcapi.unicode_asucs4() for exact, larger and too-small lengths."""
    from _testcapi import unicode_asucs4

    samples = ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600',
               'a\ud800b\udfffc', '\ud834\udd1e']
    for sample in samples:
        size = len(sample)
        self.assertEqual(unicode_asucs4(sample, size, 1), sample+'\0')
        self.assertEqual(unicode_asucs4(sample, size, 0), sample+'\uffff')
        self.assertEqual(unicode_asucs4(sample, size+1, 1), sample+'\0\uffff')
        self.assertEqual(unicode_asucs4(sample, size+1, 0), sample+'\0\uffff')
        # A length that is too small must be rejected.
        with self.assertRaises(SystemError):
            unicode_asucs4(sample, size-1, 1)
        with self.assertRaises(SystemError):
            unicode_asucs4(sample, size-2, 0)
        # Same checks on a string with an embedded null character.
        joined = '\0'.join([sample, sample])
        self.assertEqual(unicode_asucs4(joined, len(joined), 1), joined+'\0')
        self.assertEqual(unicode_asucs4(joined, len(joined), 0), joined+'\uffff')
2735n/a # Test PyUnicode_FindChar()
@support.cpython_only
def test_findchar(self):
    """Check _testcapi.unicode_findchar() in both search directions.

    Covers strings of different character widths, a code point outside
    the Unicode range, and start/end arguments that are out of range,
    reversed or negative.
    """
    from _testcapi import unicode_findchar

    # The local name is "text" rather than "str" so the builtin type is
    # not shadowed inside this method.
    for text in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1":
        for i, ch in enumerate(text):
            self.assertEqual(unicode_findchar(text, ord(ch), 0, len(text), 1), i)
            self.assertEqual(unicode_findchar(text, ord(ch), 0, len(text), -1), i)

    text = "!>_<!"
    self.assertEqual(unicode_findchar(text, 0x110000, 0, len(text), 1), -1)
    self.assertEqual(unicode_findchar(text, 0x110000, 0, len(text), -1), -1)
    # start < end
    self.assertEqual(unicode_findchar(text, ord('!'), 1, len(text)+1, 1), 4)
    self.assertEqual(unicode_findchar(text, ord('!'), 1, len(text)+1, -1), 4)
    # start >= end
    self.assertEqual(unicode_findchar(text, ord('!'), 0, 0, 1), -1)
    self.assertEqual(unicode_findchar(text, ord('!'), len(text), 0, 1), -1)
    # negative
    self.assertEqual(unicode_findchar(text, ord('!'), -len(text), -1, 1), 0)
    self.assertEqual(unicode_findchar(text, ord('!'), -len(text), -1, -1), 0)
2758n/a # Test PyUnicode_CopyCharacters()
@support.cpython_only
def test_copycharacters(self):
    """Check _testcapi.unicode_copycharacters() results and error cases."""
    from _testcapi import unicode_copycharacters

    samples = [
        'abcde', '\xa1\xa2\xa3\xa4\xa5',
        '\u4f60\u597d\u4e16\u754c\uff01',
        '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604'
    ]

    for position, source in enumerate(samples):
        # wide -> narrow: exceed maxchar limitation
        for narrower in samples[:position]:
            with self.assertRaises(SystemError):
                unicode_copycharacters(narrower, 0, source, 0, 5)
        # same kind
        for from_start in range(5):
            expected = (source[from_start:from_start+5].ljust(5, '\0'),
                        5-from_start)
            self.assertEqual(
                unicode_copycharacters(source, 0, source, from_start, 5),
                expected)
        for to_start in range(5):
            expected = (source[to_start:to_start+5].rjust(5, '\0'),
                        5-to_start)
            self.assertEqual(
                unicode_copycharacters(source, to_start, source, to_start, 5),
                expected)
        # narrow -> wide
        # Tests omitted since this creates invalid strings.

    s = samples[0]
    # (expected exception, arguments) for calls that must be rejected.
    rejected = (
        (IndexError, (s, 6, s, 0, 5)),
        (IndexError, (s, -1, s, 0, 5)),
        (IndexError, (s, 0, s, 6, 5)),
        (IndexError, (s, 0, s, -1, 5)),
        (SystemError, (s, 1, s, 0, 5)),
        (SystemError, (s, 0, s, 0, -1)),
        (SystemError, (s, 0, b'', 0, 0)),
    )
    for exc_type, call_args in rejected:
        self.assertRaises(exc_type, unicode_copycharacters, *call_args)
@support.cpython_only
def test_encode_decimal(self):
    """Check _testcapi.unicode_encodedecimal() results and errors."""
    from _testcapi import unicode_encodedecimal

    # (input text, expected bytes)
    conversions = (
        ('123', b'123'),
        ('\u0663.\u0661\u0664', b'3.14'),
        ("\N{EM SPACE}3.14\N{EN SPACE}", b' 3.14 '),
    )
    for text, expected in conversions:
        self.assertEqual(unicode_encodedecimal(text), expected)

    with self.assertRaises(UnicodeEncodeError):
        unicode_encodedecimal("123\u20ac", "strict")
    with self.assertRaisesRegex(
            ValueError,
            "^'decimal' codec can't encode character"):
        unicode_encodedecimal("123\u20ac", "replace")
@support.cpython_only
def test_transform_decimal(self):
    """Check _testcapi.unicode_transformdecimaltoascii() results."""
    from _testcapi import unicode_transformdecimaltoascii as transform_decimal

    # (input text, expected text)
    conversions = (
        ('123', '123'),
        ('\u0663.\u0661\u0664', '3.14'),
        ("\N{EM SPACE}3.14\N{EN SPACE}", "\N{EM SPACE}3.14\N{EN SPACE}"),
        ('123\u20ac', '123\u20ac'),
    )
    for text, expected in conversions:
        self.assertEqual(transform_decimal(text), expected)
@support.cpython_only
def test_pep393_utf8_caching_bug(self):
    """Regression test: grow a string step by step and re-read its UTF-8 form.

    For each code point, a string is extended one character at a time
    and passed to getargs_s_hash() twice after every step; both calls
    must return the UTF-8 encoding of the current string.
    """
    # Issue #25709: Problem with string concatenation and utf-8 cache
    from _testcapi import getargs_s_hash
    for k in 0x24, 0xa4, 0x20ac, 0x1f40d:
        s = ''
        for i in range(5):
            # Due to CPython specific optimization the 's' string can be
            # resized in-place.
            s += chr(k)
            # Parsing with the "s#" format code calls indirectly
            # PyUnicode_AsUTF8AndSize() which creates the UTF-8
            # encoded string cached in the Unicode object.
            self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
            # Check that the second call returns the same result
            self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
class StringModuleTest(unittest.TestCase):
    """Tests for the helper functions of the ``_string`` module."""

    def test_formatter_parser(self):
        """formatter_parser() splits a format string into 4-tuples."""
        # (format string, expected list of parsed tuples)
        expectations = [
            ("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}", [
                ('prefix ', '2', '', 's'),
                ('xxx', '0', '^+10.3f', None),
                ('', 'obj.attr', '', 's'),
                (' ', 'z[0]', '10', 's'),
            ]),
            ("prefix {} suffix", [
                ('prefix ', '', '', None),
                (' suffix', None, None, None),
            ]),
            ("str", [
                ('str', None, None, None),
            ]),
            ("", []),
            ("{0}", [
                ('', '0', '', None),
            ]),
        ]
        for template, expected in expectations:
            parsed = list(_string.formatter_parser(template))
            self.assertEqual(parsed, expected)

        # A non-string argument is rejected.
        with self.assertRaises(TypeError):
            _string.formatter_parser(1)

    def test_formatter_field_name_split(self):
        """formatter_field_name_split() separates the first name from the rest."""
        def split(name):
            # Materialize the result and its second item as lists so
            # they can be compared with list literals.
            pieces = list(_string.formatter_field_name_split(name))
            pieces[1] = list(pieces[1])
            return pieces

        # (field name, expected [first, [(is_attribute, key), ...]])
        expectations = (
            ("obj", ["obj", []]),
            ("obj.arg", ["obj", [(True, 'arg')]]),
            ("obj[key]", ["obj", [(False, 'key')]]),
            ("obj.arg[key1][key2]", [
                "obj",
                [(True, 'arg'),
                 (False, 'key1'),
                 (False, 'key2'),
                ]]),
        )
        for field_name, expected in expectations:
            self.assertEqual(split(field_name), expected)

        # A non-string argument is rejected.
        with self.assertRaises(TypeError):
            _string.formatter_field_name_split(1)
# Run every test case in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()