»Core Development>Code coverage>Lib/test/test_codecs.py

Python code coverage for Lib/test/test_codecs.py

# count content
1n/aimport codecs
2n/aimport contextlib
3n/aimport io
4n/aimport locale
5n/aimport sys
6n/aimport unittest
7n/aimport encodings
8n/a
9n/afrom test import support
10n/a
# ctypes is optional.  When it cannot be imported the name is bound to
# None and SIZEOF_WCHAR_T falls back to the placeholder value -1.
try:
    import ctypes
except ImportError:
    ctypes = None

SIZEOF_WCHAR_T = -1 if ctypes is None else ctypes.sizeof(ctypes.c_wchar)
18n/a
def coding_checker(self, coder):
    """Build an assertion helper for the codec function *coder*.

    The returned callable takes ``(input, expect)`` and uses
    ``self.assertEqual`` to check that ``coder(input)`` equals
    ``(expect, len(input))``.
    """
    def check(input, expect):
        actual = coder(input)
        self.assertEqual(actual, (expect, len(input)))
    return check
23n/a
24n/a
class Queue(object):
    """
    FIFO buffer: data is appended at one end and consumed from the other.
    """
    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the pending data."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items; a negative *size* drains all."""
        if size < 0:
            # buffer[:0] gives an empty buffer of the same type
            data, self._buffer = self._buffer, self._buffer[:0]
            return data
        data, self._buffer = self._buffer[:size], self._buffer[size:]
        return data
44n/a
45n/a
class MixInCheckStateHandling:
    """Helpers checking that incremental codec state survives a
    getstate()/setstate() round trip at every split position."""

    def check_state_handling_decode(self, encoding, u, s):
        """Decode *s* split at every offset, moving the state to a fresh
        decoder at the split, and check that the result equals *u*."""
        for split in range(len(s) + 1):
            decoder = codecs.getincrementaldecoder(encoding)()
            head = decoder.decode(s[:split])
            state = decoder.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                empty_buffer = state[0][:0]
                decoder.setstate((empty_buffer, 0))
                # Feeding the previous input may not produce any output
                self.assertTrue(not decoder.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, decoder.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            decoder = codecs.getincrementaldecoder(encoding)()
            decoder.setstate(state)
            tail = decoder.decode(s[split:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        """Encode *u* split at every offset, moving the state to a fresh
        encoder at the split, and check that the result equals *s*."""
        for split in range(len(u) + 1):
            encoder = codecs.getincrementalencoder(encoding)()
            head = encoder.encode(u[:split])
            state = encoder.getstate()
            encoder = codecs.getincrementalencoder(encoding)()
            encoder.setstate(state)
            tail = encoder.encode(u[split:], True)
            self.assertEqual(s, head + tail)
78n/a
79n/a
class ReadTest(MixInCheckStateHandling):
    # Reader tests shared by the codec test classes.  The methods read
    # ``self.encoding``; test_lone_surrogates also reads
    # ``self.ill_formed_sequence``.

    def check_partial(self, input, partialresults):
        # Encode input and feed it one byte at a time to a StreamReader,
        # to an incremental decoder (fresh, then after reset()) and to
        # codecs.iterdecode().  After every byte the text decoded so far
        # must equal the matching entry of partialresults.
        encoded = input.encode(self.encoding)
        chunks = [bytes([byte]) for byte in encoded]

        queue = Queue(b"")
        reader = codecs.getreader(self.encoding)(queue)
        decoded = ""
        for chunk, expected in zip(chunks, partialresults):
            queue.write(chunk)
            decoded += reader.read()
            self.assertEqual(decoded, expected)
        # check that there's nothing left in the buffers
        self.assertEqual(reader.read(), "")
        self.assertEqual(reader.bytebuffer, b"")

        # do the check again, this time using an incremental decoder;
        # the second pass checks whether the reset method works properly
        decoder = codecs.getincrementaldecoder(self.encoding)()
        for after_reset in (False, True):
            if after_reset:
                decoder.reset()
            decoded = ""
            for chunk, expected in zip(chunks, partialresults):
                decoded += decoder.decode(chunk)
                self.assertEqual(decoded, expected)
            # check that there's nothing left in the buffers
            self.assertEqual(decoder.decode(b"", True), "")
            self.assertEqual(decoder.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode(chunks, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            raw = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(raw)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty string and join
            # the lines with "|" so line boundaries show up in failures.
            reader = getreader(input)
            collected = []
            line = reader.readline(size=size, keepends=keepends)
            while line:
                collected.append(line)
                line = reader.readline(size=size, keepends=keepends)
            return "|".join(collected)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vwo = [(i*200+200)*"\u3042" for i in range(len(lineends))]
        vw = [body + lineend for body, lineend in zip(vwo, lineends)]
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                for keepends, first, second in (
                        (True, size*"a" + lineend, "xxx\n"),
                        (False, size*"a", "xxx"),
                ):
                    reader = getreader(s)
                    for _ in range(10):
                        self.assertEqual(
                            reader.readline(keepends=keepends),
                            first,
                        )
                        self.assertEqual(
                            reader.readline(keepends=keepends),
                            second,
                        )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        remaining_lines = lines[1:]

        def getreader():
            raw = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(raw)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(remaining_lines))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), remaining_lines)
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + remaining_lines)
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterate a reader over a multi-line \r\n document and compare
        # every line it yields with the corresponding source line.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        document = "".join(s)
        stream = io.BytesIO(document.encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (lineno, line) in enumerate(reader):
            self.assertEqual(line, s[lineno])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so data written between
        # readline() calls becomes available to the reader incrementally.
        queue = Queue(b"")
        writer = codecs.getwriter(self.encoding)(queue)
        reader = codecs.getreader(self.encoding)(queue)

        def readline_without_ends():
            return reader.readline(keepends=False)

        def readline_with_ends():
            return reader.readline(keepends=True)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(readline_without_ends(), "foo")
        writer.write("\nbar\r")
        self.assertEqual(readline_without_ends(), "")
        self.assertEqual(readline_without_ends(), "bar")
        writer.write("baz")
        self.assertEqual(readline_without_ends(), "baz")
        self.assertEqual(readline_without_ends(), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(readline_with_ends(), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(readline_with_ends(), "\n")
        self.assertEqual(readline_with_ends(), "bar\r")
        writer.write("baz")
        self.assertEqual(readline_with_ends(), "baz")
        self.assertEqual(readline_with_ends(), "")
        writer.write("foo\r\n")
        self.assertEqual(readline_with_ends(), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        reader = codecs.getreader(self.encoding)(io.BytesIO(s))
        # each line must come back intact, followed by "" at end of input
        for expected in (s1, s2, s3, ""):
            self.assertEqual(reader.readline(), expected)

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        reader = codecs.getreader(self.encoding)(io.BytesIO(s))
        # each line must come back intact, followed by "" at end of input
        for expected in (s1, s2, s3, s4, s5, ""):
            self.assertEqual(reader.readline(), expected)

    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        enc = self.encoding
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, enc)

        # encode error handlers applied to a single lone surrogate
        for errors, substitute in (
                ("backslashreplace", "[\\udc80]"),
                ("namereplace", "[\\udc80]"),
                ("xmlcharrefreplace", "[&#56448;]"),
                ("ignore", "[]"),
                ("replace", "[?]"),
        ):
            self.assertEqual("[\uDC80]".encode(enc, errors),
                             substitute.encode(enc))

        # sequential surrogate characters
        for errors, substitute in (
                ("ignore", "[]"),
                ("replace", "[??]"),
        ):
            self.assertEqual("[\uD800\uDC80]".encode(enc, errors),
                             substitute.encode(enc))

        # Encoding the empty string leaves only what the codec emits
        # up front; it is stripped from the encoded neighbours below.
        bom = "".encode(enc)
        neighbours = [("\U00010fff", "A"), ("[", "]"),
                      ("A", "\U00010fff")]
        for before, after in neighbours:
            before_sequence = before.encode(enc)[len(bom):]
            after_sequence = after.encode(enc)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              enc)
            self.assertEqual(test_string.encode(enc, "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(enc, "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(enc, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(enc, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            backslashreplace = ''.join('\\x%02x' % b
                                       for b in self.ill_formed_sequence)
            self.assertEqual(test_sequence.decode(enc, "backslashreplace"),
                             before + backslashreplace + after)
388n/a
389n/a
class UTF32Test(ReadTest, unittest.TestCase):
    # Tests for the "utf-32" codec.
    encoding = "utf-32"
    # Byte form of the lone surrogate U+DC80 in the platform's byte order;
    # read by ReadTest.test_lone_surrogates.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by a single BOM, little- and big-endian.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it; assertIn shows
        # the bytes actually written when the check fails
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # input that starts with an invalid BOM must be rejected
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # a truncated code unit with final=True goes through the handler
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
484n/a
485n/a
class UTF32LETest(ReadTest, unittest.TestCase):
    # Tests for the "utf-32-le" codec.
    encoding = "utf-32-le"
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # Each character takes four bytes, so a character only shows up
        # in the output once its fourth byte has been fed.
        expected = (
            3 * [""] +
            4 * ["\x00"] +
            4 * ["\x00\xff"] +
            4 * ["\x00\xff\u0100"] +
            4 * ["\x00\xff\u0100\uffff"] +
            ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        # a single stray byte with final=True is a decode error
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        encoded = b'\x00\x00\x01\x00' * repeat
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
530n/a
531n/a
class UTF32BETest(ReadTest, unittest.TestCase):
    # Tests for the "utf-32-be" codec.
    encoding = "utf-32-be"
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # Each character takes four bytes, so a character only shows up
        # in the output once its fourth byte has been fed.
        expected = (
            3 * [""] +
            4 * ["\x00"] +
            4 * ["\x00\xff"] +
            4 * ["\x00\xff\u0100"] +
            4 * ["\x00\xff\u0100\uffff"] +
            ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        # a single stray byte with final=True is a decode error
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        encoded = b'\x00\x01\x00\x00' * repeat
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
576n/a
577n/a
class UTF16Test(ReadTest, unittest.TestCase):
    # Tests for the "utf-16" codec.
    encoding = "utf-16"
    # Byte form of the lone surrogate U+DC80 in the platform's byte order;
    # read by ReadTest.test_lone_surrogates.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by a single BOM, little- and big-endian.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it; assertIn shows
        # the bytes actually written when the check fails
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # input that starts with an invalid BOM must be rejected
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # a truncated code unit with final=True goes through the handler
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # opening with mode 'U' is expected to emit a DeprecationWarning
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)
663n/a
class UTF16LETest(ReadTest, unittest.TestCase):
    # Tests for the "utf-16-le" codec.
    encoding = "utf-16-le"
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # The first four characters take two bytes each, the last one
        # takes four.
        expected = (
            [""] +
            2 * ["\x00"] +
            2 * ["\x00\xff"] +
            2 * ["\x00\xff\u0100"] +
            4 * ["\x00\xff\u0100\uffff"] +
            ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        # (malformed input, result expected with the 'replace' handler)
        cases = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for malformed, replaced in cases:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              malformed, 'strict', True)
            self.assertEqual(malformed.decode('utf-16le', 'replace'), replaced)

    def test_nonbmp(self):
        # a non-BMP character round-trips through a surrogate pair
        text = "\U00010203"
        encoded = b'\x00\xd8\x03\xde'
        self.assertEqual(text.encode(self.encoding), encoded)
        self.assertEqual(encoded.decode(self.encoding), text)
707n/a
class UTF16BETest(ReadTest, unittest.TestCase):
    # Tests for the "utf-16-be" codec.
    encoding = "utf-16-be"
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # The first four characters take two bytes each, the last one
        # takes four.
        expected = (
            [""] +
            2 * ["\x00"] +
            2 * ["\x00\xff"] +
            2 * ["\x00\xff\u0100"] +
            4 * ["\x00\xff\u0100\uffff"] +
            ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        # (malformed input, result expected with the 'replace' handler)
        cases = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for malformed, replaced in cases:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              malformed, 'strict', True)
            self.assertEqual(malformed.decode('utf-16be', 'replace'), replaced)

    def test_nonbmp(self):
        # a non-BMP character round-trips through a surrogate pair
        text = "\U00010203"
        encoded = b'\xd8\x00\xde\x03'
        self.assertEqual(text.encode(self.encoding), encoded)
        self.assertEqual(encoded.decode(self.encoding), text)
751n/a
class UTF8Test(ReadTest, unittest.TestCase):
    # Tests for the "utf-8" codec.
    encoding = "utf-8"
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3
    # Prefix expected in front of encoded output; empty for this codec.
    BOM = b''

    def test_partial(self):
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        # One entry per encoded byte fed to the decoder.
        expected = (
            2 * ["\x00"] +
            2 * ["\x00\xff"] +
            3 * ["\x00\xff\u07ff"] +
            3 * ["\x00\xff\u07ff\u0800"] +
            4 * ["\x00\xff\u07ff\u0800\uffff"] +
            ["\x00\xff\u07ff\u0800\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = u.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, u, encoded)

    def test_decode_error(self):
        # (invalid input, error handler, expected decoded text)
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                decoded = data.decode(self.encoding, error_handler)
                self.assertEqual(decoded, expected)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode(self.encoding, "surrogateescape")
        self.assertEqual(escaped, self.BOM + b'[\x80]')

        with self.assertRaises(UnicodeEncodeError) as cm:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        exc = cm.exception
        # the error must cover the two surrogates that cannot be escaped
        self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        # encoding with "surrogatepass"
        for text, encoded in (
                ("abc\ud800def", b"abc\xed\xa0\x80def"),
                ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
                ("[\uD800\uDC80]", b'[\xed\xa0\x80\xed\xb2\x80]'),
        ):
            self.assertEqual(text.encode(self.encoding, "surrogatepass"),
                             self.BOM + encoded)

        # decoding with "surrogatepass"
        for encoded, text in (
                (b"abc\xed\xa0\x80def", "abc\ud800def"),
                (b"\xf0\x90\xbf\xbf\xed\xa0\x80", "\U00010fff\uD800"),
        ):
            self.assertEqual(encoded.decode(self.encoding, "surrogatepass"),
                             text)

        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # truncated or interrupted sequences are still decode errors
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode(self.encoding, "surrogatepass")
827n/a
828n/a
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    # Tests for the "cp65001" codec; skipped unless running on win32.
    encoding = "cp65001"

    def test_encode(self):
        # (text, error handler, expected bytes); None means that
        # encoding must raise UnicodeEncodeError
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
            ('\udc80', 'strict', None),
            ('\udc80', 'ignore', b''),
            ('\udc80', 'replace', b'?'),
            ('\udc80', 'backslashreplace', b'\\udc80'),
            ('\udc80', 'namereplace', b'\\udc80'),
            ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
        ]
        for text, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  text.encode, "cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            self.assertEqual(encoded, expected,
                             '%a.encode("cp65001", %r)=%a != %a'
                             % (text, errors, encoded, expected))

    def test_decode(self):
        # (bytes, error handler, expected text); None means that
        # decoding must raise UnicodeDecodeError
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xed\xb2\x80]', 'strict', None),
            (b'[\xed\xb2\x80]', 'ignore', '[]'),
            (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
        ]
        for raw, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  raw.decode, 'cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            self.assertEqual(decoded, expected,
                             '%a.decode("cp65001", %r)=%a != %a'
                             % (raw, errors, decoded, expected))

    def test_lone_surrogates(self):
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
        # encode error handlers applied to a single lone surrogate
        for errors, expected in (
                ("backslashreplace", b'[\\udc80]'),
                ("namereplace", b'[\\udc80]'),
                ("xmlcharrefreplace", b'[&#56448;]'),
                ("surrogateescape", b'[\x80]'),
                ("ignore", b'[]'),
                ("replace", b'[?]'),
        ):
            self.assertEqual("[\uDC80]".encode("cp65001", errors),
                             expected)

    def test_surrogatepass_handler(self):
        # each pair is checked in both directions: encode, then decode
        for text, encoded in (
                ("abc\ud800def", b"abc\xed\xa0\x80def"),
                ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        ):
            self.assertEqual(text.encode("cp65001", "surrogatepass"),
                             encoded)
            self.assertEqual(encoded.decode("cp65001", "surrogatepass"),
                             text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
916n/a
917n/a
class UTF7Test(ReadTest, unittest.TestCase):
    encoding = "utf-7"

    def test_ascii(self):
        codec = self.encoding

        def check_passthrough(chars):
            # These characters are represented by their own ASCII bytes,
            # so encoding and decoding must both leave them unchanged.
            self.assertEqual(chars.encode(codec), chars.encode('ascii'))
            self.assertEqual(chars.encode('ascii').decode(codec), chars)

        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        check_passthrough(set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        check_passthrough(set_o)
        # A literal "+" is written as "+-"
        self.assertEqual('a+b'.encode(codec), b'a+-b')
        self.assertEqual(b'a+-b'.decode(codec), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        check_passthrough(ws)
        # Other ASCII characters: everything below 0x80 not covered above
        all_ascii = set(bytes(range(0x80)).decode())
        covered = set(set_d + set_o + '+' + ws)
        other_ascii = ''.join(sorted(all_ascii - covered))
        self.assertEqual(other_ascii.encode(codec),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        # One expected entry per step of the inherited check_partial()
        # helper, for the text given as its first argument.
        text = 'a+-b\x00c\x80d\u0100e\U00010000f'
        expected_steps = [
            'a',
            'a',
            'a+',
            'a+-',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b\x00',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c\x80',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d\u0100',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e\U00010000',
            'a+-b\x00c\x80d\u0100e\U00010000f',
        ]
        self.check_partial(text, expected_steps)

    def test_errors(self):
        # (malformed input, expected text when decoded with "replace")
        malformed = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
        ]
        for raw, replaced in malformed:
            with self.subTest(raw=raw):
                # Strict decoding of the complete input must fail ...
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                # ... and "replace" must produce exactly this text.
                self.assertEqual(raw.decode('utf-7', 'replace'), replaced)

    def test_nonbmp(self):
        codec = self.encoding
        # A non-BMP character and its surrogate-pair spelling encode alike.
        self.assertEqual('\U000104A0'.encode(codec), b'+2AHcoA-')
        self.assertEqual('\ud801\udca0'.encode(codec), b'+2AHcoA-')
        # Decoding accepts the run with or without the closing "-".
        self.assertEqual(b'+2AHcoA-'.decode(codec), '\U000104A0')
        self.assertEqual(b'+2AHcoA'.decode(codec), '\U000104A0')
        self.assertEqual('\u20ac\U000104A0'.encode(codec), b'+IKzYAdyg-')
        self.assertEqual(b'+IKzYAdyg-'.decode(codec), '\u20ac\U000104A0')
        self.assertEqual(b'+IKzYAdyg'.decode(codec), '\u20ac\U000104A0')
        self.assertEqual('\u20ac\u20ac\U000104A0'.encode(codec),
                         b'+IKwgrNgB3KA-')
        self.assertEqual(b'+IKwgrNgB3KA-'.decode(codec),
                         '\u20ac\u20ac\U000104A0')
        self.assertEqual(b'+IKwgrNgB3KA'.decode(codec),
                         '\u20ac\u20ac\U000104A0')

    def test_lone_surrogates(self):
        # (input, expected text when decoded with "replace")
        cases = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, replaced in cases:
            with self.subTest(raw=raw):
                self.assertEqual(raw.decode('utf-7', 'replace'), replaced)
1049n/a
1050n/a
class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        # A single byte passed with final=True must be rejected in
        # strict mode.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling the decoder without any argument is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
1058n/a
class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        # Any buffer-protocol object is accepted, not just bytes.
        import array
        spam = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(spam), (b"spam", 4))

    def test_empty(self):
        # An empty str yields empty bytes and a consumed length of zero.
        encoded = codecs.readbuffer_encode("")
        self.assertEqual(encoded, (b"", 0))

    def test_bad_args(self):
        # Both a missing argument and a non-buffer argument are TypeErrors.
        for args in ((), (42,)):
            with self.assertRaises(TypeError):
                codecs.readbuffer_encode(*args)
1074n/a
class UTF8SigTest(UTF8Test, unittest.TestCase):
    encoding = "utf-8-sig"
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        # One expected entry per step of the inherited check_partial()
        # helper, for the text given as its first argument.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder must drop the BOM that the encoder adds.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read *bytestring* through a utf-8-sig StreamReader with a range
        # of read sizes (None means one unbounded read() per call) and
        # check that the concatenated chunks equal *unistring* each time.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    if not data:
                        break
                    ostream.write(data)

                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM: the BOM must not appear in the text.
        self._check_stream_read(
            codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ")

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        self._check_stream_read(
            b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ")
1159n/a
class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        # Empty input of either bytes-like type decodes to empty bytes.
        for empty in (b"", bytearray()):
            self.assertEqual(codecs.escape_decode(empty), (b"", 0))

    def test_raw(self):
        # Any byte other than a backslash passes through unchanged.
        decode = codecs.escape_decode
        for value in range(256):
            byte = bytes([value])
            if byte == b'\\':
                continue
            self.assertEqual(decode(byte + b'0'), (byte + b'0', 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        # (escaped input, expected decoded bytes)
        recognised = [
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", b"[\\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\x410]", b"[A0]"),
        ]
        for escaped, plain in recognised:
            check(escaped, plain)
        # Unrecognised escapes are kept verbatim but must warn.
        for code in range(97, 123):
            lower = bytes([code])
            if lower not in b'abfnrtvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + lower, b"\\" + lower)
            upper = lower.upper()
            with self.assertWarns(DeprecationWarning):
                check(b"\\" + upper, b"\\" + upper)
        for escaped, kept in ((br"\8", b"\\8"), (br"\9", b"\\9")):
            with self.assertWarns(DeprecationWarning):
                check(escaped, kept)

    def test_errors(self):
        decode = codecs.escape_decode
        # A "\x" escape followed by zero or by one hex digit is truncated.
        for hexdigits in (b"", b"0"):
            bad = b"\\x" + hexdigits
            bracketed = b"[" + bad + b"]"
            self.assertRaises(ValueError, decode, bad)
            self.assertRaises(ValueError, decode, bracketed)
            doubled = bracketed + bad
            self.assertEqual(decode(doubled, "ignore"),
                             (b"[]", len(doubled)))
            self.assertEqual(decode(doubled, "replace"),
                             (b"[?]?", len(doubled)))
1217n/a
1218n/a
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        # The unicode_internal codec is deprecated; skip cleanly rather
        # than error out on an interpreter that no longer provides it.
        try:
            codecs.lookup("unicode_internal")
        except LookupError:
            self.skipTest('test needs the unicode_internal codec')
        f = io.BytesIO()
        # Use the recoder as a context manager so that the underlying
        # stream is closed even if the write raises.
        with codecs.EncodedFile(f, "unicode_internal", "utf-8") as f2:
            f2.write("a")
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

        self.assertTrue(f.closed)
1229n/a
1230n/a# From RFC 3492
1231n/apunycode_testcases = [
1232n/a # A Arabic (Egyptian):
1233n/a ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
1234n/a "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
1235n/a b"egbpdaj6bu4bxfgehfvwxn"),
1236n/a # B Chinese (simplified):
1237n/a ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
1238n/a b"ihqwcrb4cv8a8dqg056pqjye"),
1239n/a # C Chinese (traditional):
1240n/a ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
1241n/a b"ihqwctvzc91f659drss3x8bo0yb"),
1242n/a # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
1243n/a ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
1244n/a "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
1245n/a "\u0065\u0073\u006B\u0079",
1246n/a b"Proprostnemluvesky-uyb24dma41a"),
1247n/a # E Hebrew:
1248n/a ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
1249n/a "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
1250n/a "\u05D1\u05E8\u05D9\u05EA",
1251n/a b"4dbcagdahymbxekheh6e0a7fei0b"),
1252n/a # F Hindi (Devanagari):
1253n/a ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
1254n/a "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
1255n/a "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
1256n/a "\u0939\u0948\u0902",
1257n/a b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
1258n/a
1259n/a #(G) Japanese (kanji and hiragana):
1260n/a ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
1261n/a "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
1262n/a b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
1263n/a
1264n/a # (H) Korean (Hangul syllables):
1265n/a ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
1266n/a "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
1267n/a "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
1268n/a b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
1269n/a b"psd879ccm6fea98c"),
1270n/a
1271n/a # (I) Russian (Cyrillic):
1272n/a ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
1273n/a "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
1274n/a "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
1275n/a "\u0438",
1276n/a b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
1277n/a
1278n/a # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
1279n/a ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
1280n/a "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
1281n/a "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
1282n/a "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
1283n/a "\u0061\u00F1\u006F\u006C",
1284n/a b"PorqunopuedensimplementehablarenEspaol-fmd56a"),
1285n/a
1286n/a # (K) Vietnamese:
1287n/a # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
1288n/a # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
1289n/a ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
1290n/a "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
1291n/a "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
1292n/a "\u0056\u0069\u1EC7\u0074",
1293n/a b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
1294n/a
1295n/a #(L) 3<nen>B<gumi><kinpachi><sensei>
1296n/a ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
1297n/a b"3B-ww4c5e180e575a65lsy2b"),
1298n/a
1299n/a # (M) <amuro><namie>-with-SUPER-MONKEYS
1300n/a ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
1301n/a "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
1302n/a "\u004F\u004E\u004B\u0045\u0059\u0053",
1303n/a b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
1304n/a
1305n/a # (N) Hello-Another-Way-<sorezore><no><basho>
1306n/a ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
1307n/a "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
1308n/a "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
1309n/a b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),
1310n/a
1311n/a # (O) <hitotsu><yane><no><shita>2
1312n/a ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
1313n/a b"2-u9tlzr9756bt3uc0v"),
1314n/a
1315n/a # (P) Maji<de>Koi<suru>5<byou><mae>
1316n/a ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
1317n/a "\u308B\u0035\u79D2\u524D",
1318n/a b"MajiKoi5-783gue6qz075azm5e"),
1319n/a
1320n/a # (Q) <pafii>de<runba>
1321n/a ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
1322n/a b"de-jg4avhby1noc0d"),
1323n/a
1324n/a # (R) <sono><supiido><de>
1325n/a ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
1326n/a b"d9juau41awczczp"),
1327n/a
1328n/a # (S) -> $1.00 <-
1329n/a ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
1330n/a "\u003C\u002D",
1331n/a b"-> $1.00 <--")
1332n/a ]
1333n/a
# Sanity check at import time: every entry must be a (text, punycode)
# pair; print any entry that is not.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
1337n/a
1338n/a
class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = uni.encode("punycode").decode("ascii")
            wanted = puny.decode("ascii")
            self.assertEqual(produced.lower(), wanted.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode again after a round trip through ASCII text.
            round_tripped = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, round_tripped.decode("punycode"))
1357n/a
1358n/a
class UnicodeInternalTest(unittest.TestCase):
    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        little_endian = sys.byteorder == "little"
        deprecated = ('unicode_internal codec has been deprecated',
                      DeprecationWarning)

        def native(big_endian):
            # The tables below are written big-endian; reverse the bytes
            # on a little-endian host.
            if little_endian:
                return bytes(reversed(big_endian))
            return big_endian

        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for internal, uni in ok:
            internal = native(internal)
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            internal = native(internal)
            with support.check_warnings(deprecated):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        # 0x110000 is one past the last valid code point.
        if little_endian:
            invalid = b"\x00\x00\x11\x00"
            invalid_backslashreplace = r"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
            invalid_backslashreplace = r"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            replaced = invalid.decode("unicode_internal", "replace")
            self.assertEqual(replaced, '\ufffd')
        with support.check_warnings():
            escaped = invalid.decode("unicode_internal", "backslashreplace")
            self.assertEqual(escaped, invalid_backslashreplace)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        # The error must identify the codec, the whole input and the
        # offending range (bytes 4 to 8).
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        deprecated = ('unicode_internal codec has been deprecated',
                      DeprecationWarning)
        with self.assertRaises(UnicodeDecodeError) as caught:
            with support.check_warnings(deprecated):
                data.decode("unicode_internal")
        ex = caught.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(data, ex.object)
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # A custom error handler that ignores errors must drop the four
        # invalid bytes inserted between "a" and "b".
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        deprecated = ('unicode_internal codec has been deprecated',
                      DeprecationWarning)
        with support.check_warnings(deprecated):
            ab = "ab".encode("unicode_internal").decode()
            payload = bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                            "ascii")
            ignored = decoder(payload, "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        deprecated = ('unicode_internal codec has been deprecated',
                      DeprecationWarning)
        with support.check_warnings(deprecated):
            # Issue 3739
            encoder = codecs.getencoder("unicode_internal")
            self.assertEqual(encoder("a")[1], 1)
            self.assertEqual(encoder("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
1439n/a
1440n/a# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
1441n/anameprep_tests = [
1442n/a # 3.1 Map to nothing.
1443n/a (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
1444n/a b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
1445n/a b'\xb8\x8f\xef\xbb\xbf',
1446n/a b'foobarbaz'),
1447n/a # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
1448n/a (b'CAFE',
1449n/a b'cafe'),
1450n/a # 3.3 Case folding 8bit U+00DF (german sharp s).
1451n/a # The original test case is bogus; it says \xc3\xdf
1452n/a (b'\xc3\x9f',
1453n/a b'ss'),
1454n/a # 3.4 Case folding U+0130 (turkish capital I with dot).
1455n/a (b'\xc4\xb0',
1456n/a b'i\xcc\x87'),
1457n/a # 3.5 Case folding multibyte U+0143 U+037A.
1458n/a (b'\xc5\x83\xcd\xba',
1459n/a b'\xc5\x84 \xce\xb9'),
1460n/a # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
1461n/a # XXX: skip this as it fails in UCS-2 mode
1462n/a #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
1463n/a # 'telc\xe2\x88\x95kg\xcf\x83'),
1464n/a (None, None),
1465n/a # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
1466n/a (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
1467n/a b'\xc7\xb0 a'),
1468n/a # 3.8 Case folding U+1FB7 and normalization.
1469n/a (b'\xe1\xbe\xb7',
1470n/a b'\xe1\xbe\xb6\xce\xb9'),
1471n/a # 3.9 Self-reverting case folding U+01F0 and normalization.
1472n/a # The original test case is bogus, it says `\xc7\xf0'
1473n/a (b'\xc7\xb0',
1474n/a b'\xc7\xb0'),
1475n/a # 3.10 Self-reverting case folding U+0390 and normalization.
1476n/a (b'\xce\x90',
1477n/a b'\xce\x90'),
1478n/a # 3.11 Self-reverting case folding U+03B0 and normalization.
1479n/a (b'\xce\xb0',
1480n/a b'\xce\xb0'),
1481n/a # 3.12 Self-reverting case folding U+1E96 and normalization.
1482n/a (b'\xe1\xba\x96',
1483n/a b'\xe1\xba\x96'),
1484n/a # 3.13 Self-reverting case folding U+1F56 and normalization.
1485n/a (b'\xe1\xbd\x96',
1486n/a b'\xe1\xbd\x96'),
1487n/a # 3.14 ASCII space character U+0020.
1488n/a (b' ',
1489n/a b' '),
1490n/a # 3.15 Non-ASCII 8bit space character U+00A0.
1491n/a (b'\xc2\xa0',
1492n/a b' '),
1493n/a # 3.16 Non-ASCII multibyte space character U+1680.
1494n/a (b'\xe1\x9a\x80',
1495n/a None),
1496n/a # 3.17 Non-ASCII multibyte space character U+2000.
1497n/a (b'\xe2\x80\x80',
1498n/a b' '),
1499n/a # 3.18 Zero Width Space U+200b.
1500n/a (b'\xe2\x80\x8b',
1501n/a b''),
1502n/a # 3.19 Non-ASCII multibyte space character U+3000.
1503n/a (b'\xe3\x80\x80',
1504n/a b' '),
1505n/a # 3.20 ASCII control characters U+0010 U+007F.
1506n/a (b'\x10\x7f',
1507n/a b'\x10\x7f'),
1508n/a # 3.21 Non-ASCII 8bit control character U+0085.
1509n/a (b'\xc2\x85',
1510n/a None),
1511n/a # 3.22 Non-ASCII multibyte control character U+180E.
1512n/a (b'\xe1\xa0\x8e',
1513n/a None),
1514n/a # 3.23 Zero Width No-Break Space U+FEFF.
1515n/a (b'\xef\xbb\xbf',
1516n/a b''),
1517n/a # 3.24 Non-ASCII control character U+1D175.
1518n/a (b'\xf0\x9d\x85\xb5',
1519n/a None),
1520n/a # 3.25 Plane 0 private use character U+F123.
1521n/a (b'\xef\x84\xa3',
1522n/a None),
1523n/a # 3.26 Plane 15 private use character U+F1234.
1524n/a (b'\xf3\xb1\x88\xb4',
1525n/a None),
1526n/a # 3.27 Plane 16 private use character U+10F234.
1527n/a (b'\xf4\x8f\x88\xb4',
1528n/a None),
1529n/a # 3.28 Non-character code point U+8FFFE.
1530n/a (b'\xf2\x8f\xbf\xbe',
1531n/a None),
1532n/a # 3.29 Non-character code point U+10FFFF.
1533n/a (b'\xf4\x8f\xbf\xbf',
1534n/a None),
1535n/a # 3.30 Surrogate code U+DF42.
1536n/a (b'\xed\xbd\x82',
1537n/a None),
1538n/a # 3.31 Non-plain text character U+FFFD.
1539n/a (b'\xef\xbf\xbd',
1540n/a None),
1541n/a # 3.32 Ideographic description character U+2FF5.
1542n/a (b'\xe2\xbf\xb5',
1543n/a None),
1544n/a # 3.33 Display property character U+0341.
1545n/a (b'\xcd\x81',
1546n/a b'\xcc\x81'),
1547n/a # 3.34 Left-to-right mark U+200E.
1548n/a (b'\xe2\x80\x8e',
1549n/a None),
1550n/a # 3.35 Deprecated U+202A.
1551n/a (b'\xe2\x80\xaa',
1552n/a None),
1553n/a # 3.36 Language tagging character U+E0001.
1554n/a (b'\xf3\xa0\x80\x81',
1555n/a None),
1556n/a # 3.37 Language tagging character U+E0042.
1557n/a (b'\xf3\xa0\x81\x82',
1558n/a None),
1559n/a # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
1560n/a (b'foo\xd6\xbebar',
1561n/a None),
1562n/a # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
1563n/a (b'foo\xef\xb5\x90bar',
1564n/a None),
1565n/a # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
1566n/a (b'foo\xef\xb9\xb6bar',
1567n/a b'foo \xd9\x8ebar'),
1568n/a # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
1569n/a (b'\xd8\xa71',
1570n/a None),
1571n/a # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
1572n/a (b'\xd8\xa71\xd8\xa8',
1573n/a b'\xd8\xa71\xd8\xa8'),
1574n/a # 3.43 Unassigned code point U+E0002.
1575n/a # Skip this test as we allow unassigned
1576n/a #(b'\xf3\xa0\x80\x82',
1577n/a # None),
1578n/a (None, None),
1579n/a # 3.44 Larger test (shrinking).
1580n/a # Original test case reads \xc3\xdf
1581n/a (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
1582n/a b'\xaa\xce\xb0\xe2\x80\x80',
1583n/a b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
1584n/a # 3.45 Larger test (expanding).
1585n/a # Original test case reads \xc3\x9f
1586n/a (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
1587n/a b'\x80',
1588n/a b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
1589n/a b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
1590n/a b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
1591n/a ]
1592n/a
1593n/a
class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            text = orig.decode("utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                with self.assertRaises(UnicodeError):
                    nameprep(text)
                continue
            wanted = prepped.decode("utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(text), wanted)
            except Exception as e:
                # Report which numbered test vector failed.
                raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
1612n/a
1613n/a
class IDNACodecTest(unittest.TestCase):
    def test_builtin_decode(self):
        # ASCII and punycode names, each with and without a trailing dot.
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        # Mirror image of test_builtin_decode.
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # Reading past the end of an exhausted stream returns "".
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        def decode_bytewise(raw):
            # Feed the input to the decoder one byte at a time.
            chunks = (bytes([c]) for c in raw)
            return "".join(codecs.iterdecode(chunks, "idna"))

        # Same four variants as test_builtin_decode.  The third case used
        # to be an exact duplicate of the fourth, which left the punycode
        # name without a trailing dot untested.
        self.assertEqual(decode_bytewise(b"python.org"), "python.org")
        self.assertEqual(decode_bytewise(b"python.org."), "python.org.")
        self.assertEqual(decode_bytewise(b"xn--pythn-mua.org"),
                         "pyth\xf6n.org")
        self.assertEqual(decode_bytewise(b"xn--pythn-mua.org."),
                         "pyth\xf6n.org.")

        # A label is only emitted once its terminating dot (or the final
        # call) has been seen.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        def encode_charwise(text):
            # iterencode() over a str feeds the encoder one character
            # at a time.
            return b"".join(codecs.iterencode(text, "idna"))

        # Same four variants as test_builtin_encode.  The third case used
        # to be an exact duplicate of the fourth, which left the non-ASCII
        # name without a trailing dot untested.
        self.assertEqual(encode_charwise("python.org"), b"python.org")
        self.assertEqual(encode_charwise("python.org."), b"python.org.")
        self.assertEqual(encode_charwise("pyth\xf6n.org"),
                         b"xn--pythn-mua.org")
        self.assertEqual(encode_charwise("pyth\xf6n.org."),
                         b"xn--pythn-mua.org.")

        # A label is only emitted once its terminating dot (or the final
        # call) has been seen.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                              b"python.org".decode, "idna", errors)
1699n/a
1700n/a
class CodecsModuleTest(unittest.TestCase):
    """Checks for the module-level convenience functions of ``codecs``."""

    def test_decode(self):
        umlauts = '\xe4\xf6\xfc'
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'), umlauts)
        with self.assertRaises(TypeError):
            codecs.decode()
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

        # test keywords
        by_keyword = codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1')
        self.assertEqual(by_keyword, umlauts)
        lenient = codecs.decode(b'[\xff]', 'ascii', errors='ignore')
        self.assertEqual(lenient, '[]')

    def test_encode(self):
        umlauts = b'\xe4\xf6\xfc'
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'), umlauts)
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

        # test keywords
        by_keyword = codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1')
        self.assertEqual(by_keyword, umlauts)
        lenient = codecs.encode('[\xff]', 'ascii', errors='ignore')
        self.assertEqual(lenient, b'[]')

    def test_register(self):
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        with self.assertRaises(TypeError):
            codecs.lookup()
        for unknown in ("__spam__", " "):
            with self.assertRaises(LookupError):
                codecs.lookup(unknown)

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved = locale.setlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')

    def test_all(self):
        expected = (
            "encode", "decode",
            "register", "CodecInfo", "Codec", "IncrementalEncoder",
            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
            "getencoder", "getdecoder", "getincrementalencoder",
            "getincrementaldecoder", "getreader", "getwriter",
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            "namereplace_errors",
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
            "StreamReaderWriter", "StreamRecoder",
        )
        self.assertCountEqual(expected, codecs.__all__)
        # Every advertised name must actually resolve on the module.
        for name in codecs.__all__:
            getattr(codecs, name)

    def test_open(self):
        self.addCleanup(support.unlink, support.TESTFN)
        for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
            with self.subTest(mode):
                with codecs.open(support.TESTFN, mode, 'ascii') as stream:
                    self.assertIsInstance(stream, codecs.StreamReaderWriter)

    def test_undefined(self):
        # The "undefined" codec must refuse every input, even empty ones.
        for text, raw in (('abc', b'abc'), ('', b'')):
            with self.assertRaises(UnicodeError):
                codecs.encode(text, 'undefined')
            with self.assertRaises(UnicodeError):
                codecs.decode(raw, 'undefined')
        # ... and no error handler may rescue it.
        for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
            with self.assertRaises(UnicodeError):
                codecs.encode('abc', 'undefined', errors)
            with self.assertRaises(UnicodeError):
                codecs.decode(b'abc', 'undefined', errors)
1808n/a
1809n/a
class StreamReaderTest(unittest.TestCase):
    """Line-oriented reading through a UTF-8 ``StreamReader``."""

    def setUp(self):
        # Two UTF-8 encoded characters separated by a newline.
        encoded = b'\xed\x95\x9c\n\xea\xb8\x80'
        self.stream = io.BytesIO(encoded)
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        wrapped = self.reader(self.stream)
        lines = wrapped.readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
1819n/a
1820n/a
class EncodedFileTest(unittest.TestCase):
    """Recoding of byte streams through ``codecs.EncodedFile``."""

    def test_basic(self):
        # Reading: the file holds UTF-8, the caller receives UTF-16-LE.
        backing = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(backing, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the file receives Latin-1.
        backing = io.BytesIO()
        recoder = codecs.EncodedFile(backing, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(backing.getvalue(), b'\xfc')
1832n/a
# Text encodings iterated by the generic codec tests in this file.  The
# entries are kept in their original order because that is the order in
# which the tests visit them.
all_unicode_encodings = [
    "ascii", "big5", "big5hkscs", "charmap",
    "cp037", "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866",
    "cp869", "cp874", "cp875", "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_3", "iso8859_4",
    "iso8859_5", "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_t", "koi8_u", "kz1048", "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2",
    "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode", "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620", "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" and "oem" are added only when the codecs module exposes the
# corresponding encode function.
all_unicode_encodings += [
    codec_name
    for probe, codec_name in (("mbcs_encode", "mbcs"), ("oem_encode", "oem"))
    if hasattr(codecs, probe)
]
1942n/a
1943n/a# The following encoding is not tested, because it's not supposed
1944n/a# to work:
1945n/a# "undefined"
1946n/a
1947n/a# The following encodings don't work in stateful mode
# Codecs skipped by the stateful (stream / incremental) checks.
broken_unicode_with_stateful = ["punycode", "unicode_internal"]
1952n/a
1953n/a
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks run against every codec in ``all_unicode_encodings``."""

    def test_basics(self):
        text = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            label = "encoding=%r" % encoding

            # The looked-up codec name must agree with the list entry once
            # "_" and "-" are treated as the same character.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"),
                             name.replace("_", "-"))

            with support.check_warnings():
                # unicode-internal has been deprecated
                encoded, consumed = codecs.getencoder(encoding)(text)
                self.assertEqual(consumed, len(text), label)
                decoded, consumed = codecs.getdecoder(encoding)(encoded)
                self.assertEqual(decoded, text, label)

            # Everything below exercises the stateful interfaces.
            if encoding in broken_unicode_with_stateful:
                continue

            # check stream reader/writer
            sink = Queue(b"")
            writer = codecs.getwriter(encoding)(sink)
            chunks = []
            for char in text:
                writer.write(char)
                chunk = sink.read()
                self.assertTrue(type(chunk) is bytes, type(chunk))
                chunks.append(chunk)
            streamed = b"".join(chunks)
            source = Queue(b"")
            reader = codecs.getreader(encoding)(source)
            decoded = ""
            for byte in streamed:
                source.write(bytes([byte]))
                decoded += reader.read()
            self.assertEqual(decoded, text, label)

            # check incremental decoder/encoder and iterencode()/iterdecode()
            try:
                encoder = codecs.getincrementalencoder(encoding)()
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check incremental decoder/encoder
                pieces = [encoder.encode(char) for char in text]
                pieces.append(encoder.encode("", True))
                decoder = codecs.getincrementaldecoder(encoding)()
                parts = [decoder.decode(bytes([byte]))
                         for byte in b"".join(pieces)]
                parts.append(decoder.decode(b"", True))
                self.assertEqual("".join(parts), text, label)

                # check iterencode()/iterdecode()
                roundtrip = codecs.iterdecode(
                    codecs.iterencode(text, encoding), encoding)
                self.assertEqual("".join(roundtrip), text, label)

                # check iterencode()/iterdecode() with empty string
                roundtrip = codecs.iterdecode(
                    codecs.iterencode("", encoding), encoding)
                self.assertEqual("".join(roundtrip), "")

            if encoding not in ("idna", "mbcs"):
                # check incremental decoder/encoder with errors argument
                try:
                    encoder = codecs.getincrementalencoder(encoding)("ignore")
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    encoded = b""
                    for char in text:
                        encoded += encoder.encode(char)
                    decoder = codecs.getincrementaldecoder(encoding)("ignore")
                    decoded = ""
                    for byte in encoded:
                        decoded += decoder.decode(bytes([byte]))
                    self.assertEqual(decoded, text, label)

    @support.cpython_only
    def test_basics_capi(self):
        from _testcapi import (codec_incrementalencoder,
                               codec_incrementaldecoder)
        text = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding in broken_unicode_with_stateful:
                continue
            label = "encoding=%r" % encoding

            # check incremental decoder/encoder (fetched via the C API)
            try:
                cencoder = codec_incrementalencoder(encoding)
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check C API
                pieces = [cencoder.encode(char) for char in text]
                pieces.append(cencoder.encode("", True))
                cdecoder = codec_incrementaldecoder(encoding)
                parts = [cdecoder.decode(bytes([byte]))
                         for byte in b"".join(pieces)]
                parts.append(cdecoder.decode(b"", True))
                self.assertEqual("".join(parts), text, label)

            if encoding not in ("idna", "mbcs"):
                # check incremental decoder/encoder with errors argument
                try:
                    cencoder = codec_incrementalencoder(encoding, "ignore")
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    encoded = b""
                    for char in text:
                        encoded += cencoder.encode(char)
                    cdecoder = codec_incrementaldecoder(encoding, "ignore")
                    decoded = ""
                    for byte in encoded:
                        decoded += cdecoder.decode(bytes([byte]))
                    self.assertEqual(decoded, text, label)

    def test_seek(self):
        # all codecs should be able to encode these
        text = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            # FIXME: idna is skipped, see SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_stateful:
                continue
            backing = io.BytesIO(text.encode(encoding))
            reader = codecs.getreader(encoding)(backing)
            for _ in range(5):
                # Test that calling seek resets the internal codec state
                # and buffers
                reader.seek(0, 0)
                self.assertEqual(text, reader.read())

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            with self.assertRaises(TypeError):
                decoder()
            if encoding in ("idna", "punycode"):
                continue
            with self.assertRaises(TypeError):
                decoder(42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                with self.assertRaises(TypeError):
                    encoder()

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        map_type = type(cp1140.encoding_table)
        self.assertEqual(map_type, map_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        text = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_unicode_with_stateful:
                continue
            self.check_state_handling_decode(encoding, text,
                                             text.encode(encoding))
            self.check_state_handling_encode(encoding, text,
                                             text.encode(encoding))
2115n/a
2116n/a
class CharmapTest(unittest.TestCase):
    """``codecs.charmap_decode`` with string, int->str and int->int maps."""

    def test_decode_with_string_map(self):
        raw = b"\x00\x01\x02"
        decode = codecs.charmap_decode

        # Tables that cover all three input bytes decode under "strict".
        for table in ("abc", "\U0010FFFFbc"):
            self.assertEqual(decode(raw, "strict", table), (table, 3))

        # Byte 2 has no usable entry: the table is either too short or
        # holds U+FFFE at that position.
        incomplete = ("ab", "ab\ufffe")
        for table in incomplete:
            with self.assertRaises(UnicodeDecodeError):
                decode(raw, "strict", table)

        recoveries = (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        )
        for errors, expected in recoveries:
            for table in incomplete:
                self.assertEqual(decode(raw, errors, table), (expected, 3))

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", ""),
                         ("", len(allbytes)))

    def test_decode_with_int2str_map(self):
        raw = b"\x00\x01\x02"
        decode = codecs.charmap_decode

        # Complete mappings; values may be empty or longer than one char.
        complete = (
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        )
        for mapping, expected in complete:
            self.assertEqual(decode(raw, "strict", mapping), (expected, 3))

        # Byte 2 is missing, mapped to None, or mapped to U+FFFE
        # (the last case is Issue #14850).
        incomplete = (
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        )
        for mapping in incomplete:
            with self.assertRaises(UnicodeDecodeError):
                decode(raw, "strict", mapping)

        recoveries = (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        )
        for errors, expected in recoveries:
            for mapping in incomplete:
                self.assertEqual(decode(raw, errors, mapping), (expected, 3))

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", {}),
                         ("", len(allbytes)))

    def test_decode_with_int2int_map(self):
        raw = b"\x00\x01\x02"
        decode = codecs.charmap_decode
        a, b, c = (ord(char) for char in "abc")

        self.assertEqual(decode(raw, "strict", {0: a, 1: b, 2: c}),
                         ("abc", 3))

        # Issue #15379
        self.assertEqual(decode(raw, "strict", {0: 0x10FFFF, 1: b, 2: c}),
                         ("\U0010FFFFbc", 3))

        self.assertEqual(
            decode(raw, "strict", {0: sys.maxunicode, 1: b, 2: c}),
            (chr(sys.maxunicode) + "bc", 3))

        # A code point beyond sys.maxunicode is rejected with TypeError.
        with self.assertRaises(TypeError):
            decode(raw, "strict", {0: sys.maxunicode + 1, 1: b, 2: c})

        # Byte 2 is missing or mapped to 0xFFFE.
        incomplete = (
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        )
        for mapping in incomplete:
            with self.assertRaises(UnicodeDecodeError):
                decode(raw, "strict", mapping)

        recoveries = (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        )
        for errors, expected in recoveries:
            for mapping in incomplete:
                self.assertEqual(decode(raw, errors, mapping), (expected, 3))
2351n/a
2352n/a
class WithStmtTest(unittest.TestCase):
    """Codec stream wrappers used as context managers."""

    def test_encodedfile(self):
        backing = io.BytesIO(b"\xc3\xbc")
        recoder = codecs.EncodedFile(backing, "latin-1", "utf-8")
        with recoder as ef:
            self.assertEqual(ef.read(), b"\xfc")
        # Leaving the with-block must have closed the underlying stream.
        self.assertTrue(backing.closed)

    def test_streamreaderwriter(self):
        backing = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(
            backing, utf8.streamreader, utf8.streamwriter, 'strict')
        with wrapper as srw:
            self.assertEqual(srw.read(), "\xfc")
2366n/a
2367n/a
class TypesTest(unittest.TestCase):
    """Which low-level decoders accept ``str`` input and which do not."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        if hasattr(codecs, "mbcs_decode"):
            decoders += [codecs.mbcs_decode]
        for decode in decoders:
            with self.assertRaises(TypeError):
                decode("xxx")

    def test_unicode_escape(self):
        escape_decoders = (codecs.unicode_escape_decode,
                           codecs.raw_unicode_escape_decode)

        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        for decode in escape_decoders:
            self.assertEqual(decode(r"\u1234"), ("\u1234", 6))
            self.assertEqual(decode(br"\u1234"), ("\u1234", 6))

        # An escape above U+10FFFF is an error unless a handler recovers.
        for decode in escape_decoders:
            with self.assertRaises(UnicodeDecodeError):
                decode(br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
            self.assertEqual(
                decode(r"\U00110000", "backslashreplace"),
                (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
2407n/a
2408n/a
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for ``codecs.unicode_escape_encode`` / ``unicode_escape_decode``."""

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Printable ASCII other than the backslash is passed through as is.
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != b'\\'[0]:
                self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash decodes to the code point with
        # the same value.
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != b'\\'[0]:
                self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check('\t', br'\t')
        check('\n', br'\n')
        check('\r', br'\r')
        check('\\', br'\\')
        # Remaining control characters and 127..255 use the \xNN form.
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(chr(b), ('\\x%02x' % b).encode())
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", "[]")
        check(br'[\"]', '["]')
        check(br"[\']", "[']")
        check(br"[\\]", r"[\]")
        check(br"[\a]", "[\x07]")
        check(br"[\b]", "[\x08]")
        check(br"[\t]", "[\x09]")
        check(br"[\n]", "[\x0a]")
        check(br"[\v]", "[\x0b]")
        check(br"[\f]", "[\x0c]")
        check(br"[\r]", "[\x0d]")
        check(br"[\7]", "[\x07]")
        check(br"[\78]", "[\x078]")
        check(br"[\41]", "[!]")
        check(br"[\418]", "[!8]")
        check(br"[\101]", "[A]")
        check(br"[\1010]", "[A0]")
        check(br"[\x41]", "[A]")
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
        # Unrecognised letter escapes are kept verbatim, with a
        # DeprecationWarning.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtuvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, "\\" + chr(i))
            if b.upper() not in b'UN':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b.upper(), "\\" + chr(i-32))
        with self.assertWarns(DeprecationWarning):
            check(br"\8", "\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", "\\9")

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # Each pair is (escape letter, number of hex digits it requires).
        # \U takes eight digits, so every truncation from 0 to 7 digits
        # must be rejected, not only the first four.
        for c, d in (b'x', 2), (b'u', 4), (b'U', 8):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # A complete escape naming a code point above U+10FFFF is an error.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2493n/a
2494n/a
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for ``codecs.raw_unicode_escape_encode`` / ``..._decode``."""

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Code points 0..255 are emitted as the single byte of equal value.
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Every single byte decodes to the code point of equal value.
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        # A backslash followed by anything but "u"/"U" passes through.
        for b in range(256):
            if b not in b'uU':
                check('\\' + chr(b), b'\\' + bytes([b]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        # A backslash followed by anything but "u"/"U" passes through.
        for b in range(256):
            if b not in b'uU':
                check(b'\\' + bytes([b]), '\\' + chr(b))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # Each pair is (escape letter, number of hex digits it requires).
        # \U takes eight digits, so every truncation from 0 to 7 digits
        # must be rejected, not only the first four.
        for c, d in (b'u', 4), (b'U', 8):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # A complete escape naming a code point above U+10FFFF is an error.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2543n/a
2544n/a
class EscapeEncodeTest(unittest.TestCase):
    """Tests for ``codecs.escape_encode`` on bytes input."""

    def test_escape_encode(self):
        # (input, escaped form); the reported length is always len(input).
        cases = [
            (b'', b''),
            (b'foobar', b'foobar'),
            (b'spam\0eggs', b'spam\\x00eggs'),
            (b'a\'b', b"a\\'b"),
            (b'b\\c', b'b\\\\c'),
            (b'c\nd', b'c\\nd'),
            (b'd\re', b'd\\re'),
            (b'f\x7fg', b'f\\x7fg'),
        ]
        for data, escaped in cases:
            with self.subTest(data=data):
                result = codecs.escape_encode(data)
                self.assertEqual(result, (escaped, len(data)))
        # Only real bytes objects are accepted.
        for wrong_type in ('spam', bytearray(b'spam')):
            with self.assertRaises(TypeError):
                codecs.escape_encode(wrong_type)
2563n/a
2564n/a
class SurrogateEscapeTest(unittest.TestCase):
    """Round trips through the "surrogateescape" error handler."""

    def test_utf8(self):
        pairs = (
            # Bad byte
            (b"foo\x80bar", "foo\udc80bar"),
            # bad-utf-8 encoded surrogate
            (b"\xed\xb0\x80", "\udced\udcb0\udc80"),
        )
        for raw, text in pairs:
            self.assertEqual(raw.decode("utf-8", "surrogateescape"), text)
            self.assertEqual(text.encode("utf-8", "surrogateescape"), raw)

    def test_ascii(self):
        # bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", "surrogateescape"), text)
        self.assertEqual(text.encode("ascii", "surrogateescape"), raw)

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        raw, text = b"foo\xa5bar", "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", "surrogateescape"), text)
        self.assertEqual(text.encode("iso-8859-3", "surrogateescape"), raw)

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2597n/a
2598n/a
class BomTest(unittest.TestCase):
    """BOM handling of ``codecs.open`` streams around ``seek()`` calls."""

    def test_seek0(self):
        data = "1234567890"
        doubled = data * 2
        encodings = (
            "utf-16", "utf-16-le", "utf-16-be",
            "utf-32", "utf-32-le", "utf-32-be",
        )
        self.addCleanup(support.unlink, support.TESTFN)

        def reopen(encoding):
            # Open the scratch file in 'w+' mode with the given encoding.
            return codecs.open(support.TESTFN, 'w+', encoding=encoding)

        for encoding in encodings:
            # Check if the BOM is written only once
            with reopen(encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), doubled)
                f.seek(0)
                self.assertEqual(f.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with reopen(encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen(encoding) as f:
                writer = f.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen(encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen(encoding) as f:
                writer = f.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), doubled)
2654n/a
2655n/a
# Bytes-to-bytes codecs exercised by the transform tests, and the alias
# names associated with each transform codec.
bytes_transform_encodings = [
    "base64_codec", "uu_codec", "quopri_codec", "hex_codec",
]

transform_aliases = dict(
    base64_codec=["base64", "base_64"],
    uu_codec=["uu"],
    quopri_codec=["quopri", "quoted_printable", "quotedprintable"],
    hex_codec=["hex"],
    rot_13=["rot13"],
)

# The compression codecs are registered only when their module imports.
try:
    import zlib
except ImportError:
    zlib = None
if zlib is not None:
    bytes_transform_encodings += ["zlib_codec"]
    transform_aliases.update(zlib_codec=["zip", "zlib"])
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
    transform_aliases.update(bz2_codec=["bz2"])
2685n/a
2686n/a
class TransformCodecTest(unittest.TestCase):
    # Tests for the transform codecs named in bytes_transform_encodings
    # (plus rot_13): round-trips through the codecs module functions, and
    # the errors raised when they are used through str.encode() and
    # bytes.decode()/bytearray.decode().

    def test_basics(self):
        # The stateless encoder/decoder pair must round-trip every byte
        # value and report the whole input as consumed.
        every_byte = bytes(range(256))
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # generic codecs interface
                encoded, consumed = codecs.getencoder(encoding)(every_byte)
                self.assertEqual(consumed, len(every_byte))
                decoded, consumed = codecs.getdecoder(encoding)(encoded)
                self.assertEqual(consumed, len(encoded))
                self.assertEqual(decoded, every_byte)

    def test_read(self):
        # StreamReader.read() must give back the original byte.
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded = codecs.encode(b"\x80", encoding)
                make_reader = codecs.getreader(encoding)
                stream_reader = make_reader(io.BytesIO(encoded))
                self.assertEqual(stream_reader.read(), b"\x80")

    def test_readline(self):
        # StreamReader.readline() must give back the original byte.
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded = codecs.encode(b"\x80", encoding)
                make_reader = codecs.getreader(encoding)
                stream_reader = make_reader(io.BytesIO(encoded))
                self.assertEqual(stream_reader.readline(), b"\x80")

    def test_buffer_api_usage(self):
        # Every transform codec has to accept memoryview input for both
        # encoding and decoding, with the same result as for bytes input,
        # and the encode/decode pair has to round-trip.
        plain = b"12345\x80"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded = codecs.encode(plain, encoding)
                encoded_from_view = codecs.encode(memoryview(plain), encoding)
                self.assertEqual(encoded_from_view, encoded)
                decoded = codecs.decode(encoded, encoding)
                self.assertEqual(decoded, plain)
                decoded_from_view = codecs.decode(memoryview(encoded),
                                                  encoding)
                self.assertEqual(decoded_from_view, decoded)

    def test_text_to_binary_blacklists_binary_transforms(self):
        # str.encode() with a binary -> binary codec must fail with a
        # helpful LookupError that has no chained cause.
        template = (r"{!r} is not a text encoding; "
                    r"use codecs.encode\(\) to handle arbitrary codecs")
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                expected = template.format(encoding)
                with self.assertRaisesRegex(LookupError, expected) as failure:
                    "bad input type".encode(encoding)
                self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_blacklists_text_transforms(self):
        # str.encode() with a str -> str codec must fail with a helpful
        # error message.
        expected = (r"^'rot_13' is not a text encoding; "
                    r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, expected):
            "just an example message".encode("rot_13")

    def test_binary_to_text_blacklists_binary_transforms(self):
        # bytes.decode() and bytearray.decode() with a binary -> binary
        # codec must fail with a helpful error message.
        template = (r"{!r} is not a text encoding; "
                    r"use codecs.decode\(\) to handle arbitrary codecs")
        payload = b"encode first to ensure we meet any format restrictions"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded = codecs.encode(payload, encoding)
                expected = template.format(encoding)
                with self.assertRaisesRegex(LookupError, expected):
                    encoded.decode(encoding)
                with self.assertRaisesRegex(LookupError, expected):
                    bytearray(encoded).decode(encoding)

    def test_binary_to_text_blacklists_text_transforms(self):
        # Decoding binary input with a str -> str codec must fail with a
        # helpful LookupError that has no chained cause.
        expected = (r"^'rot_13' is not a text encoding; "
                    r"use codecs.decode\(\) to handle arbitrary codecs")
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                with self.assertRaisesRegex(LookupError, expected) as failure:
                    bad_input.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)

    @unittest.skipUnless(zlib, "Requires zlib support")
    def test_custom_zlib_error_is_wrapped(self):
        # Malformed zlib input: the raised error names the codec and is
        # chained to a cause of the same exception type.
        expected = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, expected) as failure:
            codecs.decode(b"hello", "zlib_codec")
        wrapper = failure.exception
        self.assertIsInstance(wrapper.__cause__, type(wrapper))

    def test_custom_hex_error_is_wrapped(self):
        # Malformed hex input: the raised error names the codec and is
        # chained to a cause of the same exception type.
        expected = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, expected) as failure:
            codecs.decode(b"hello", "hex_codec")
        wrapper = failure.exception
        self.assertIsInstance(wrapper.__cause__, type(wrapper))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        for codec_name, aliases in transform_aliases.items():
            canonical = codecs.lookup(codec_name).name
            for alias in aliases:
                with self.subTest(alias=alias):
                    self.assertEqual(codecs.lookup(alias).name, canonical)

    def test_quopri_stateless(self):
        # Should encode with quotetabs=True
        quoted = codecs.encode(b"space tab\teol \n", "quopri-codec")
        self.assertEqual(quoted, b"space=20tab=09eol=20\n")
        # But should still support unescaped tabs and spaces
        literal = b"space tab eol\n"
        self.assertEqual(codecs.decode(literal, "quopri-codec"), literal)

    def test_uu_invalid(self):
        # Missing "begin" line
        with self.assertRaises(ValueError):
            codecs.decode(b"", "uu-codec")
2818n/a
2819n/a
2820n/a# The codec system tries to wrap exceptions in order to ensure the error
2821n/a# mentions the operation being performed and the codec involved. We
2822n/a# currently *only* want this to happen for relatively stateless
2823n/a# exceptions, where the only significant information they contain is their
2824n/a# type and a single str argument.
2825n/a
2826n/a# Use a local codec registry to avoid appearing to leak objects when
2827n/a# registering multiple search functions
_TEST_CODECS = {}


def _get_test_codec(codec_name):
    # Codec search function backed by _TEST_CODECS: returns the stored
    # entry for codec_name, or None when there is no such entry.
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None


# codecs.register() returns None, so it is not usable as a decorator.
codecs.register(_get_test_codec)

try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    def _forget_codec(codec_name):
        # Fallback used when _codecs does not provide _forget_codec:
        # deliberately does nothing.
        return None
2840n/a
2841n/a
class ExceptionChainingTest(unittest.TestCase):
    # Checks which exceptions raised by a codec's encode/decode callables
    # get wrapped (chained via __cause__ with a message naming the
    # operation and the codec) and which are propagated unchanged.

    def setUp(self):
        # A codec search function cannot be unregistered, so the codec is
        # made fairly harmless after the test by naming it after the test
        # case repr.  The codecs module normalizes codec names (this does
        # not appear to be formally documented), hence the explicit
        # normalization here.  id(self) is appended so the name is truly
        # unique and the codec cache causes no trouble when the tests run
        # several times (e.g. when hunting for refleaks).
        unique_id = "{!r}{}".format(self, id(self))
        self.codec_name = encodings.normalize_encoding(unique_id).lower()

        # The object to raise lives on the instance: codec caching means
        # the codec entry cannot be recreated, and regrtest refleak hunting
        # runs the same test instance repeatedly.  The codec therefore has
        # to call back into the instance to learn what to raise, instead of
        # closing over an object that may differ on the next run.
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        name = self.codec_name
        _TEST_CODECS.pop(name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(name, None)
        with contextlib.suppress(KeyError):
            _forget_codec(name)

    def set_codec(self, encode, decode):
        # Publish a CodecInfo built from the given callables under this
        # test's codec name in the local registry.
        _TEST_CODECS[self.codec_name] = codecs.CodecInfo(
            encode, decode, name=self.codec_name)

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        # Context manager: the body must raise exc_type with the wrapped
        # message format, chained to a cause of type exc_type that has a
        # traceback.
        pattern = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, pattern) as caught:
            yield caught
        cause = caught.exception.__cause__
        self.assertIsInstance(cause, exc_type)
        self.assertIsNotNone(cause.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        # Install a codec that raises obj_to_raise and verify the error is
        # wrapped for all four encode/decode entry points.
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        name = self.codec_name
        attempts = (
            ("encoding", lambda: "str_input".encode(name)),
            ("encoding", lambda: codecs.encode("str_input", name)),
            ("decoding", lambda: b"bytes input".decode(name)),
            ("decoding", lambda: codecs.decode(b"bytes input", name)),
        )
        for operation, attempt in attempts:
            with self.assertWrapped(operation, exc_type, msg):
                attempt()

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        text = "This should be wrapped"
        self.check_wrapped(RuntimeError(text), text)

    def test_raise_grandchild_subclass_exact_size(self):
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        text = "This should be wrapped"
        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        class MyRuntimeError(RuntimeError):
            pass
        text = "This should be wrapped"
        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        # Install a codec that raises obj_to_raise and verify a
        # RuntimeError matching msg comes out of all four entry points.
        def raise_obj(*args, **kwds):
            raise obj_to_raise
        self.set_codec(raise_obj, raise_obj)
        name = self.codec_name
        attempts = (
            lambda: "str input".encode(name),
            lambda: codecs.encode("str input", name),
            lambda: b"bytes input".decode(name),
            lambda: codecs.decode(b"bytes input", name),
        )
        for attempt in attempts:
            with self.assertRaisesRegex(RuntimeError, msg):
                attempt()

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        text = "This should NOT be wrapped"
        error = RuntimeError(text)
        error.attr = 1
        self.check_not_wrapped(error, "^{}$".format(text))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        pattern = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), pattern)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        name = self.codec_name
        pattern = "^unknown encoding: {}$".format(name)
        # The initial codec lookup should not be wrapped
        attempts = (
            lambda: "str input".encode(name),
            lambda: codecs.encode("str input", name),
            lambda: b"bytes input".decode(name),
            lambda: codecs.decode(b"bytes input", name),
        )
        for attempt in attempts:
            with self.assertRaisesRegex(LookupError, pattern):
                attempt()

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so the text model
        # related methods skip them pre-emptively.  Third party codecs
        # won't be flagged, so the case where a codec produces an
        # inappropriate output type still has to be handled properly.
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0

        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0

        self.set_codec(encode_to_str, decode_to_bytes)
        name = self.codec_name
        # No input or output type checks on the codecs module functions
        self.assertEqual(codecs.encode(None, name), "not bytes!")
        self.assertEqual(codecs.decode(None, name), b"not str!")
        # Text model methods should complain
        encode_template = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
                           r"use codecs.encode\(\) to encode to arbitrary types$")
        with self.assertRaisesRegex(TypeError, encode_template.format(name)):
            "str_input".encode(name)
        decode_template = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
                           r"use codecs.decode\(\) to decode to arbitrary types$")
        with self.assertRaisesRegex(TypeError, decode_template.format(name)):
            b"bytes input".decode(name)
3001n/a
3002n/a
3003n/a
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    # Tests for codecs.code_page_encode() / codecs.code_page_decode().

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # A negative code page is a ValueError, code page 123 an OSError.
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        with self.assertRaises(OSError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(OSError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        # The code page name has to show up in the error message.
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00', 'strict', True)
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        # tests is an iterable of (raw bytes, error handler, expected);
        # expected is the decoded text, or None when decoding must raise
        # UnicodeDecodeError.
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors, True)
                continue
            try:
                result = codecs.code_page_decode(cp, raw, errors, True)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text, consumed = result[0], result[1]
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # assert 0 <= consumed <= len(raw)
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        # tests is an iterable of (text, error handler, expected);
        # expected is the encoded bytes, or None when encoding must raise
        # UnicodeEncodeError.
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                result = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            data, consumed = result[0], result[1]
            self.assertEqual(data, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, data, expected))
            self.assertEqual(consumed, len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        )
        self.check_encode(932, encode_cases)
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        )
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        )
        self.check_encode(1252, encode_cases)
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        self.check_encode(cp, encode_cases)
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        self.check_decode(932, cp932_decode_cases)
        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(self.CP_UTF8, utf8_decode_cases)
        utf8_encode_cases = (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        )
        self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # Decoding with final=False: each entry pairs the input with the
        # expected (text, bytes consumed) result.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)

    def test_mbcs_alias(self):
        # Check that looking up our 'default' codepage will return
        # mbcs when we don't have a more specific one available
        import _bootlocale

        def fake_getpreferredencoding(*a):
            return 'cp123'

        saved = _bootlocale.getpreferredencoding
        _bootlocale.getpreferredencoding = fake_getpreferredencoding
        try:
            self.assertEqual(codecs.lookup('cp123').name, 'mbcs')
        finally:
            # Always restore the real function.
            _bootlocale.getpreferredencoding = saved
3175n/a
3176n/a
class ASCIITest(unittest.TestCase):
    # Encoding and decoding with the 'ascii' codec, including the
    # behaviour of the error handlers.

    def test_encode(self):
        encoded = 'abc123'.encode('ascii')
        self.assertEqual(encoded, b'abc123')

    def test_encode_error(self):
        # (text, error handler, expected bytes)
        cases = (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                encoded = data.encode('ascii', error_handler)
                self.assertEqual(encoded, expected)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        text = '\udc80\xff'
        with self.assertRaises(UnicodeEncodeError):
            text.encode('ascii', 'surrogateescape')

    def test_decode(self):
        decoded = b'abc'.decode('ascii')
        self.assertEqual(decoded, 'abc')

    def test_decode_error(self):
        # (raw bytes, error handler, expected text)
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                decoded = data.decode('ascii', error_handler)
                self.assertEqual(decoded, expected)
3214n/a
3215n/a
class Latin1Test(unittest.TestCase):
    # Encoding and decoding with the 'latin1' codec, including the
    # behaviour of the error handlers when encoding.

    def test_encode(self):
        # (text, expected bytes)
        cases = (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        )
        for data, expected in cases:
            with self.subTest(data=data, expected=expected):
                encoded = data.encode('latin1')
                self.assertEqual(encoded, expected)

    def test_encode_errors(self):
        # (text, error handler, expected bytes)
        cases = (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                encoded = data.encode('latin1', error_handler)
                self.assertEqual(encoded, expected)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        text = '\udc80\u20ac'
        with self.assertRaises(UnicodeEncodeError):
            text.encode('latin1', 'surrogateescape')

    def test_decode(self):
        # (raw bytes, expected text)
        cases = (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        )
        for data, expected in cases:
            with self.subTest(data=data, expected=expected):
                decoded = data.decode('latin1')
                self.assertEqual(decoded, expected)
3251n/a
3252n/a
# Run the tests only when this file is executed directly.
if __name__ == "__main__":
    unittest.main()