| 1 | n/a | # |
|---|
| 2 | n/a | # test_multibytecodec.py |
|---|
| 3 | n/a | # Unit test for multibytecodec itself |
|---|
| 4 | n/a | # |
|---|
| 5 | n/a | |
|---|
| 6 | n/a | from test import support |
|---|
| 7 | n/a | from test.support import TESTFN |
|---|
| 8 | n/a | import unittest, io, codecs, sys |
|---|
| 9 | n/a | import _multibytecodec |
|---|
| 10 | n/a | |
|---|
# Names of every CJK codec that the tests in this file iterate over.  Each
# group is preceded by the label the list uses for it.
ALL_CJKENCODINGS = [
    # _codecs_cn
    'gb2312',
    'gbk',
    'gb18030',
    'hz',
    # _codecs_hk
    'big5hkscs',
    # _codecs_jp
    'cp932',
    'shift_jis',
    'euc_jp',
    'euc_jisx0213',
    'shift_jisx0213',
    'euc_jis_2004',
    'shift_jis_2004',
    # _codecs_kr
    'cp949',
    'euc_kr',
    'johab',
    # _codecs_tw
    'big5',
    'cp950',
    # _codecs_iso2022
    'iso2022_jp',
    'iso2022_jp_1',
    'iso2022_jp_2',
    'iso2022_jp_2004',
    'iso2022_jp_3',
    'iso2022_jp_ext',
    'iso2022_kr',
]
|---|
| 27 | n/a | |
|---|
class Test_MultibyteCodec(unittest.TestCase):
    """Checks that are run against the multibyte codecs as a whole.

    Loops over ALL_CJKENCODINGS are wrapped in subTest() so that a failure
    report names the encoding that triggered it and the remaining
    encodings are still exercised.
    """

    def test_nullcoding(self):
        # Empty input must map to empty output in both directions.
        for enc in ALL_CJKENCODINGS:
            with self.subTest(encoding=enc):
                self.assertEqual(b''.decode(enc), '')
                self.assertEqual(str(b'', enc), '')
                self.assertEqual(''.encode(enc), b'')

    def test_str_decode(self):
        # Plain ASCII text must encode to the same bytes, and those bytes
        # must decode back to the same text.
        for enc in ALL_CJKENCODINGS:
            with self.subTest(encoding=enc):
                self.assertEqual('abcd'.encode(enc), b'abcd')
                self.assertEqual(b'abcd'.decode(enc), 'abcd')

    def test_errorcallback_longindex(self):
        # An error handler that asks to resume at sys.maxsize+1 must make
        # the decoder raise IndexError.
        def myreplace(exc):
            return ('', sys.maxsize+1)

        dec = codecs.getdecoder('euc-kr')
        codecs.register_error('test.cjktest', myreplace)
        self.assertRaises(IndexError, dec,
                          b'apple\x92ham\x93spam', 'test.cjktest')

    def test_errorcallback_custom_ignore(self):
        # Issue #23215: MemoryError with custom error handlers and multibyte codecs
        data = 100 * "\udc00"
        codecs.register_error("test.ignore", codecs.ignore_errors)
        for enc in ALL_CJKENCODINGS:
            with self.subTest(encoding=enc):
                self.assertEqual(data.encode(enc, "test.ignore"), b'')

    def test_codingspec(self):
        # Execute a source string consisting only of a coding declaration
        # for each encoding.
        # NOTE(review): nothing visible in this test creates TESTFN, so the
        # unlink below looks like leftover cleanup; it is kept so behaviour
        # does not change -- confirm before removing.
        # NOTE(review): exec() is given a str here; whether the coding
        # declaration is honoured for str source should be confirmed.
        try:
            for enc in ALL_CJKENCODINGS:
                with self.subTest(encoding=enc):
                    code = '# coding: {}\n'.format(enc)
                    exec(code)
        finally:
            support.unlink(TESTFN)

    def test_init_segfault(self):
        # bug #3305: this used to segfault
        self.assertRaises(AttributeError,
                          _multibytecodec.MultibyteStreamReader, None)
        self.assertRaises(AttributeError,
                          _multibytecodec.MultibyteStreamWriter, None)

    def test_decode_unicode(self):
        # Trying to decode a unicode string should raise a TypeError
        for enc in ALL_CJKENCODINGS:
            with self.subTest(encoding=enc):
                self.assertRaises(TypeError, codecs.getdecoder(enc), "")
|---|
| 73 | n/a | |
|---|
class Test_IncrementalEncoder(unittest.TestCase):
    """Incremental encoder checks for a stateless and a stateful codec."""

    def test_stateless(self):
        # cp949 encoder isn't stateful at all.
        make_encoder = codecs.getincrementalencoder('cp949')
        enc = make_encoder()

        hangul = '\ud30c\uc774\uc36c \ub9c8\uc744'
        self.assertEqual(enc.encode(hangul),
                         b'\xc6\xc4\xc0\xcc\xbd\xe3 \xb8\xb6\xc0\xbb')
        self.assertIsNone(enc.reset())

        symbols = '\u2606\u223c\u2606'
        self.assertEqual(enc.encode(symbols, True),
                         b'\xa1\xd9\xa1\xad\xa1\xd9')
        self.assertIsNone(enc.reset())

        # Empty input yields empty output whether or not it is final.
        for final in (True, False):
            self.assertEqual(enc.encode('', final), b'')
        self.assertIsNone(enc.reset())

    def test_stateful(self):
        # jisx0213 encoder is stateful for a few code points, e.g.
        #   U+00E6        => A9DC
        #   U+00E6 U+0300 => ABC4
        #   U+0300        => ABDC
        enc = codecs.getincrementalencoder('jisx0213')()

        self.assertEqual(enc.encode('\u00e6\u0300'), b'\xab\xc4')
        # A lone U+00E6 produces no output until the following call.
        pending = enc.encode('\u00e6')
        self.assertEqual(pending, b'')
        self.assertEqual(enc.encode('\u0300'), b'\xab\xc4')
        self.assertEqual(enc.encode('\u00e6', True), b'\xa9\xdc')

        self.assertIsNone(enc.reset())
        self.assertEqual(enc.encode('\u0300'), b'\xab\xdc')

        pending = enc.encode('\u00e6')
        self.assertEqual(pending, b'')
        # The first final call flushes the held character; the second has
        # nothing left to flush.
        self.assertEqual(enc.encode('', True), b'\xa9\xdc')
        self.assertEqual(enc.encode('', True), b'')

    def test_stateful_keep_buffer(self):
        # A failed encode() call must leave the pending character in place.
        enc = codecs.getincrementalencoder('jisx0213')()
        unencodable = '\u0123'

        self.assertEqual(enc.encode('\u00e6'), b'')
        with self.assertRaises(UnicodeEncodeError):
            enc.encode(unencodable)
        self.assertEqual(enc.encode('\u0300\u00e6'), b'\xab\xc4')
        with self.assertRaises(UnicodeEncodeError):
            enc.encode(unencodable)

        self.assertIsNone(enc.reset())
        self.assertEqual(enc.encode('\u0300'), b'\xab\xdc')
        self.assertEqual(enc.encode('\u00e6'), b'')
        with self.assertRaises(UnicodeEncodeError):
            enc.encode(unencodable)
        self.assertEqual(enc.encode('', True), b'\xa9\xdc')

    def test_issue5640(self):
        enc = codecs.getincrementalencoder('shift-jis')('backslashreplace')
        for text, encoded in (('\xff', b'\\xff'), ('\n', b'\n')):
            self.assertEqual(enc.encode(text), encoded)
|---|
| 124 | n/a | |
|---|
class Test_IncrementalDecoder(unittest.TestCase):
    """Incremental decoder checks, including input split mid-character."""

    def test_dbcs(self):
        # cp949 decoder is simple with only 1 or 2 bytes sequences.
        dec = codecs.getincrementaldecoder('cp949')()
        feed = (
            (b'\xc6\xc4\xc0\xcc\xbd', '\ud30c\uc774'),
            (b'\xe3 \xb8\xb6\xc0\xbb', '\uc36c \ub9c8\uc744'),
            (b'', ''),
        )
        for chunk, text in feed:
            self.assertEqual(dec.decode(chunk), text)

    def test_dbcs_keep_buffer(self):
        # A failed final decode() must leave the pending byte in place so
        # that the next call can still complete the character.
        dec = codecs.getincrementaldecoder('cp949')()

        self.assertEqual(dec.decode(b'\xc6\xc4\xc0'), '\ud30c')
        with self.assertRaises(UnicodeDecodeError):
            dec.decode(b'', True)
        self.assertEqual(dec.decode(b'\xcc'), '\uc774')

        self.assertEqual(dec.decode(b'\xc6\xc4\xc0'), '\ud30c')
        with self.assertRaises(UnicodeDecodeError):
            dec.decode(b'\xcc\xbd', True)
        self.assertEqual(dec.decode(b'\xcc'), '\uc774')

    def test_iso2022(self):
        # Escape sequences and two-byte characters are fed in pieces that
        # do not line up with their boundaries.
        dec = codecs.getincrementaldecoder('iso2022-jp')()
        esc = b'\x1b'

        self.assertEqual(dec.decode(esc + b'('), '')
        self.assertEqual(dec.decode(b'B', True), '')
        self.assertEqual(dec.decode(esc + b'$'), '')
        self.assertEqual(dec.decode(b'B@$'), '\u4e16')
        self.assertEqual(dec.decode(b'@$@'), '\u4e16')
        self.assertEqual(dec.decode(b'$', True), '\u4e16')

        self.assertIsNone(dec.reset())
        self.assertEqual(dec.decode(b'@$'), '@$')
        self.assertEqual(dec.decode(esc + b'$'), '')
        with self.assertRaises(UnicodeDecodeError):
            dec.decode(b'', True)
        self.assertEqual(dec.decode(b'B@$'), '\u4e16')

    def test_decode_unicode(self):
        # Trying to decode a unicode string should raise a TypeError
        for enc in ALL_CJKENCODINGS:
            dec = codecs.getincrementaldecoder(enc)()
            with self.assertRaises(TypeError):
                dec.decode("")
|---|
| 167 | n/a | |
|---|
class Test_StreamReader(unittest.TestCase):
    """Stream reader regression test for the cp949 codec."""

    def test_bug1728403(self):
        # A file holding only the single byte b'\xa1' must make a cp949
        # read raise UnicodeDecodeError.
        # Registered first so TESTFN is removed however the test ends.
        self.addCleanup(support.unlink, TESTFN)

        with open(TESTFN, 'wb') as raw:
            raw.write(b'\xa1')

        with codecs.open(TESTFN, encoding='cp949') as reader:
            self.assertRaises(UnicodeDecodeError, reader.read, 2)
|---|
| 183 | n/a | |
|---|
class Test_StreamWriter(unittest.TestCase):
    """Stream writer checks: each write must append the expected bytes."""

    def test_gb18030(self):
        sink = io.BytesIO()
        writer = codecs.getwriter('gb18030')(sink)
        # (text to write, full contents of the sink after that write)
        steps = (
            ('123', b'123'),
            ('\U00012345', b'123\x907\x959'),
            ('\uac00\u00ac', b'123\x907\x959\x827\xcf5\x810\x851'),
        )
        for text, written_so_far in steps:
            writer.write(text)
            self.assertEqual(sink.getvalue(), written_so_far)

    def test_utf_8(self):
        sink = io.BytesIO()
        writer = codecs.getwriter('utf-8')(sink)
        # (text to write, full contents of the sink after that write)
        steps = (
            ('123', b'123'),
            ('\U00012345', b'123\xf0\x92\x8d\x85'),
            ('\uac00\u00ac', b'123\xf0\x92\x8d\x85'
                             b'\xea\xb0\x80\xc2\xac'),
        )
        for text, written_so_far in steps:
            writer.write(text)
            self.assertEqual(sink.getvalue(), written_so_far)

    def test_streamwriter_strwrite(self):
        sink = io.BytesIO()
        codecs.getwriter('gb18030')(sink).write('abcd')
        self.assertEqual(sink.getvalue(), b'abcd')
|---|
| 213 | n/a | |
|---|
class Test_ISO2022(unittest.TestCase):
    """Regression tests specific to the ISO 2022 codecs."""

    def test_g2(self):
        # A byte string that contains several escape sequences must decode
        # to text containing U+00E9.
        iso2022jp2 = b'\x1b(B:hu4:unit\x1b.A\x1bNi de famille'
        uni = ':hu4:unit\xe9 de famille'
        self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni)

    def test_iso2022_jp_g0(self):
        # The encoded soft hyphen must not contain the byte 0x0E.
        self.assertNotIn(b'\x0e', '\N{SOFT HYPHEN}'.encode('iso-2022-jp-2'))
        for encoding in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
            e = '\u3406'.encode(encoding)
            # The output must be 7-bit clean: reject every byte with the
            # high bit set, 0x80 itself included.
            self.assertFalse(any(x >= 0x80 for x in e))

    def test_bug1572832(self):
        # Encode every code point from U+10000 up to U+10FFFF; the test
        # passes as long as nothing crashes or raises.
        for x in range(0x10000, 0x110000):
            # Any ISO 2022 codec will cause the segfault
            chr(x).encode('iso_2022_jp', 'ignore')
|---|
| 230 | n/a | |
|---|
class TestStateful(unittest.TestCase):
    """Encoder checks driven by class attributes.

    The attributes name an encoding, a sample text, the bytes expected
    before the encoder is finalised, and the bytes that finalising adds.
    """

    encoding = 'iso-2022-jp'
    text = '\u4E16\u4E16'
    # Output for `text` while the encoder has not been finalised.
    expected = b'\x1b$B@$@$'
    # Extra bytes emitted when the encoder is finalised.
    reset = b'\x1b(B'
    expected_reset = expected + reset

    def test_encode(self):
        # One-shot encoding includes the trailing reset sequence.
        encoded = self.text.encode(self.encoding)
        self.assertEqual(encoded, self.expected_reset)

    def test_incrementalencoder(self):
        enc = codecs.getincrementalencoder(self.encoding)()
        pieces = [enc.encode(ch) for ch in self.text]
        self.assertEqual(b''.join(pieces), self.expected)
        # Finalising emits the reset sequence once; a second final call
        # has nothing more to emit.
        self.assertEqual(enc.encode('', final=True), self.reset)
        self.assertEqual(enc.encode('', final=True), b'')

    def test_incrementalencoder_final(self):
        enc = codecs.getincrementalencoder(self.encoding)()
        final_pos = len(self.text) - 1
        pieces = []
        for pos, ch in enumerate(self.text):
            # Only the last character is passed with final set.
            pieces.append(enc.encode(ch, pos == final_pos))
        self.assertEqual(b''.join(pieces), self.expected_reset)
        self.assertEqual(enc.encode('', final=True), b'')
|---|
| 258 | n/a | |
|---|
class TestHZStateful(TestStateful):
    """Run the inherited TestStateful checks against the 'hz' codec."""

    encoding = 'hz'
    text = '\u804a\u804a'
    expected = b'~{ADAD'
    reset = b'~}'
    # Rebuilt here from this class's own `expected` and `reset`; the
    # inherited value was computed from the base class's attributes.
    expected_reset = expected + reset
|---|
| 265 | n/a | |
|---|
def test_main():
    """Run the tests of this module through support.run_unittest()."""
    this_module = __name__
    support.run_unittest(this_module)
|---|
| 268 | n/a | |
|---|
# Run the tests when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()
|---|