Python code coverage for Lib/encodings/idna.py

#	count	content
1	n/a	# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
2	n/a
3	n/a	import stringprep, re, codecs
4	n/a	from unicodedata import ucd_3_2_0 as unicodedata
5	n/a
# IDNA section 5: the ACE prefix, once as text and once as the
# equivalent ASCII bytes
sace_prefix = "xn--"
ace_prefix = sace_prefix.encode("ascii")

# IDNA section 3.1: all code points that are treated as label separators
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")
12	n/a
13	n/a	# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Apply the Nameprep (RFC 3491) steps to a single label.

    The label is mapped (characters in stringprep table B.1 are dropped,
    all others go through the table B.2 mapping), normalized with NFKC,
    scanned for prohibited characters and finally checked against the
    bidirectional requirements.  Unassigned code points are not checked
    (query-string behaviour, AllowUnassigned is true).

    Args:
        label: the label to prepare, as a str.

    Returns:
        The prepared label as a str.

    Raises:
        UnicodeError: if the label contains a prohibited character or
            violates bidi requirement 2 or 3.
    """
    # Map: drop "map to nothing" characters, case-fold everything else
    label = "".join(
        stringprep.map_table_b2(c)
        for c in label
        if not stringprep.in_table_b1(c)
    )

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c) or
                stringprep.in_table_c22(c) or
                stringprep.in_table_c3(c) or
                stringprep.in_table_c4(c) or
                stringprep.in_table_c5(c) or
                stringprep.in_table_c6(c) or
                stringprep.in_table_c7(c) or
                stringprep.in_table_c8(c) or
                stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi
    RandAL = [stringprep.in_table_d1(x) for x in label]
    # The requirements below concern the label as a whole, so they are
    # evaluated once.  Re-running them for every RandALCat character made
    # this step quadratic in the label length.
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(x) for x in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
61	n/a
def ToASCII(label):
    """Convert one label to its ASCII (ACE) form, following IDNA ToASCII.

    UseSTD3ASCIIRules is treated as false, so no character restrictions
    beyond nameprep are applied.

    Args:
        label: the label as a str.

    Returns:
        The encoded label as bytes, 1 to 63 bytes long.

    Raises:
        UnicodeError: if the resulting label is empty or longer than 63
            bytes, if the prepared non-ASCII label already starts with
            the ACE prefix, or if nameprep rejects the label.
    """
    def size_checked(encoded):
        # Step 8: only labels of 1..63 bytes are acceptable
        if 0 < len(encoded) < 64:
            return encoded
        raise UnicodeError("label empty or too long")

    # Step 1: a label that is already ASCII skips nameprep entirely
    # (step 3 is a no-op because UseSTD3ASCIIRules is false)
    try:
        as_ascii = label.encode("ascii")
    except UnicodeError:
        as_ascii = None
    if as_ascii is not None:
        return size_checked(as_ascii)

    # Step 2: nameprep
    prepared = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: nameprep may have turned the label into pure ASCII
    try:
        as_ascii = prepared.encode("ascii")
    except UnicodeError:
        as_ascii = None
    if as_ascii is not None:
        return size_checked(as_ascii)

    # Step 5: a label needing encoding must not already carry the prefix
    if prepared.startswith(sace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Steps 6 and 7: punycode-encode, then prepend the ACE prefix
    encoded = ace_prefix + prepared.encode("punycode")

    # Step 8: check size
    return size_checked(encoded)
104	n/a
def ToUnicode(label):
    """Convert one label to its Unicode form, following IDNA ToUnicode.

    Args:
        label: the label, either as bytes or as a str.

    Returns:
        The decoded label as a str.  A label without the ACE prefix is
        returned as its ASCII text unchanged.

    Raises:
        UnicodeError: if the label is not ASCII after nameprep, if the
            punycode payload cannot be decoded, or if re-encoding the
            decoded label with ToASCII does not reproduce the input.
    """
    # Step 1: Check for ASCII
    if isinstance(label, bytes):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix.
    # RFC 3490 section 5 requires the prefix to be recognized
    # case-insensitively, so "XN--..." must be treated like "xn--...".
    if not label.lower().startswith(ace_prefix):
        return str(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if str(label, "ascii").lower() != str(label2, "ascii"):
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result
143	n/a
144	n/a	### Codec APIs
145	n/a
class Codec(codecs.Codec):
    """Stateless IDNA codec working on whole, dot-separated names."""

    def encode(self, input, errors='strict'):
        """Encode a domain name (str) to its ASCII form.

        Args:
            input: the name as a str; labels may be separated by any of
                the IDNA dot characters.
            errors: must be 'strict'.

        Returns:
            A tuple (encoded bytes, number of input characters consumed).

        Raises:
            UnicodeError: for any error handler other than 'strict', or
                if a label is empty or too long.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return b'', 0

        try:
            result = input.encode('ascii')
        except UnicodeEncodeError:
            pass
        else:
            # ASCII name: fast path
            labels = result.split(b'.')
            for label in labels[:-1]:
                if not (0 < len(label) < 64):
                    raise UnicodeError("label empty or too long")
            # The last label may be empty: that is just a trailing dot
            if len(labels[-1]) >= 64:
                raise UnicodeError("label too long")
            return result, len(input)

        result = bytearray()
        labels = dots.split(input)
        if labels and not labels[-1]:
            trailing_dot = b'.'
            del labels[-1]
        else:
            trailing_dot = b''
        for label in labels:
            if result:
                # Join with U+002E
                result.extend(b'.')
            result.extend(ToASCII(label))
        return bytes(result+trailing_dot), len(input)

    def decode(self, input, errors='strict'):
        """Decode an ASCII-encoded domain name to str.

        Args:
            input: the encoded name as bytes or another object accepted
                by bytes().
            errors: must be 'strict'.

        Returns:
            A tuple (decoded str, number of input bytes consumed).

        Raises:
            UnicodeError: for any error handler other than 'strict', or
                if a label cannot be decoded.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return "", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if not isinstance(input, bytes):
            # XXX obviously wrong, see #3232
            # (bytes() without an encoding raises TypeError for a str)
            input = bytes(input)

        # The ACE prefix has to be recognized case-insensitively
        # (RFC 3490 section 5), so look for it in the lower-cased input.
        if ace_prefix not in input.lower():
            # Fast path
            try:
                return input.decode('ascii'), len(input)
            except UnicodeDecodeError:
                pass

        labels = input.split(b".")

        if labels and len(labels[-1]) == 0:
            trailing_dot = '.'
            del labels[-1]
        else:
            trailing_dot = ''

        result = []
        for label in labels:
            result.append(ToUnicode(label))

        return ".".join(result)+trailing_dot, len(input)
217	n/a
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that only emits complete labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode as many complete labels of *input* as possible.

        Returns a tuple (encoded bytes, number of input characters
        consumed).  Unless *final* is true, a last label that is not
        followed by a dot is left unconsumed.  Raises UnicodeError for
        any error handler other than 'strict'.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return (b'', 0)

        pieces = dots.split(input)
        suffix = b''
        if pieces:
            if not pieces[-1]:
                # The input ended with a dot
                suffix = b'.'
                pieces.pop()
            elif not final:
                # Keep potentially unfinished label until the next call
                pieces.pop()
                if pieces:
                    suffix = b'.'

        encoded = bytearray()
        consumed = 0
        for piece in pieces:
            if consumed:
                # Join with U+002E
                encoded.extend(b'.')
                consumed += 1
            encoded.extend(ToASCII(piece))
            consumed += len(piece)

        encoded += suffix
        consumed += len(suffix)
        return (bytes(encoded), consumed)
252	n/a
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that only emits complete labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode as many complete labels of *input* as possible.

        Args:
            input: the data to decode, as a str or as ASCII bytes.
            errors: must be 'strict'.
            final: if false, a last label that is not followed by a dot
                is left unconsumed.

        Returns:
            A tuple (decoded str, number of input items consumed).

        Raises:
            UnicodeError: for any error handler other than 'strict', or
                if the input or one of its labels cannot be decoded.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input, "ascii")
            labels = input.split(".")

        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for index, label in enumerate(labels):
            result.append(ToUnicode(label))
            # Count the separator in front of every label but the first.
            # Going by position (not by "size is non-zero") keeps the
            # consumed length right when the first label is empty.
            if index:
                size += 1
            size += len(label)

        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
291	n/a
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer that takes its encode/decode methods from Codec."""
294	n/a
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader that takes its encode/decode methods from Codec."""
297	n/a
298	n/a	### encodings module API
299	n/a
def getregentry():
    """Return the codecs.CodecInfo entry describing the 'idna' codec."""
    # Separate Codec instances back the two stateless entry points
    encode_func = Codec().encode
    decode_func = Codec().decode
    return codecs.CodecInfo(
        name='idna',
        encode=encode_func,
        decode=decode_func,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )