Python code coverage for Lib/email/_encoded_words.py

#	count	content
1	n/a	""" Routines for manipulating RFC2047 encoded words.
2	n/a
3	n/a	This is currently a package-private API, but will be considered for promotion
4	n/a	to a public API if there is demand.
5	n/a
6	n/a	"""
7	n/a
8	n/a	# An encoded word looks like this:
9	n/a	#
10	n/a	# =?charset[*lang]?cte?encoded_string?=
11	n/a	#
12	n/a	# for more information about charset see the charset module. Here it is one
13	n/a	# of the preferred MIME charset names (hopefully; you never know when parsing).
14	n/a	# cte (Content Transfer Encoding) is either 'q' or 'b' (ignoring case). In
15	n/a	# theory other letters could be used for other encodings, but in practice this
16	n/a	# (almost?) never happens. There could be a public API for adding entries
17	n/a	# to the CTE tables, but YAGNI for now. 'q' is Quoted Printable, 'b' is
18	n/a	# Base64. The meaning of encoded_string should be obvious. 'lang' is optional
19	n/a	# as indicated by the brackets (they are not part of the syntax) but is almost
20	n/a	# never encountered in practice.
21	n/a	#
22	n/a	# The general interface for a CTE decoder is that it takes the encoded_string
23	n/a	# as its argument, and returns a tuple (cte_decoded_string, defects). The
24	n/a	# cte_decoded_string is the original binary that was encoded using the
25	n/a	# specified cte. 'defects' is a list of MessageDefect instances indicating any
26	n/a	# problems encountered during conversion. 'charset' and 'lang' are the
27	n/a	# corresponding strings extracted from the EW, case preserved.
28	n/a	#
29	n/a	# The general interface for a CTE encoder is that it takes a binary sequence
30	n/a	# as input and returns the cte_encoded_string, which is an ascii-only string.
31	n/a	#
32	n/a	# Each encoder must also supply a length function that takes the binary
33	n/a	# sequence as its argument and returns the length of the resulting encoded
34	n/a	# string.
35	n/a	#
36	n/a	# The main API functions for the module are decode, which calls the decoder
37	n/a	# referenced by the cte specifier, and encode, which adds the appropriate
38	n/a	# RFC 2047 "chrome" to the encoded string, and can optionally automatically
39	n/a	# select the shortest possible encoding. See their docstrings below for
40	n/a	# details.
41	n/a
42	n/a	import re
43	n/a	import base64
44	n/a	import binascii
45	n/a	import functools
46	n/a	from string import ascii_letters, digits
47	n/a	from email import errors
48	n/a
# Public names of this module, listed as decoder/encoder/length helpers per
# CTE followed by the two top-level entry points.
__all__ = [
    'decode_q', 'encode_q',
    'decode_b', 'encode_b',
    'len_q', 'len_b',
    'decode', 'encode',
]
58	n/a
59	n/a	#
60	n/a	# Quoted Printable
61	n/a	#
62	n/a
63	n/a	# regex based decoder.
# Substitutes every '=XX' hex escape (two hex digits, either case) with the
# single byte it names; anything that is not a well formed escape is left
# untouched.
_q_byte_subber = functools.partial(
    re.compile(br'=([a-fA-F0-9]{2})').sub,
    lambda match: bytes((int(match.group(1), 16),)),
)


def decode_q(encoded):
    """Decode the 'q' (quoted-printable style) payload of an encoded word.

    'encoded' is a bytes object.  Underscores are turned into spaces first,
    then '=XX' escapes are expanded.  Returns a (decoded_bytes, defects)
    tuple; this decoder never reports defects, so the list is always empty.
    """
    unescaped = _q_byte_subber(encoded.replace(b'_', b' '))
    return unescaped, []
70	n/a
71	n/a
72	n/a	# dict mapping bytes to their encoded form
class _QByteMap(dict):
    """Lazily filled table from a byte value (int) to its 'q' encoded text."""

    # Bytes that are emitted as themselves; every other byte is written as
    # an '=XX' escape with upper case hex digits.
    safe = b'-!*+/' + ascii_letters.encode('ascii') + digits.encode('ascii')

    def __missing__(self, key):
        # Work out the encoding on first lookup and remember it so later
        # lookups of the same byte are plain dict hits.
        encoded = chr(key) if key in self.safe else "={:02X}".format(key)
        self[key] = encoded
        return encoded


# In headers spaces are mapped to '_', so seed the table with that entry
# before the generic rule in __missing__ can claim it.
_q_byte_map = _QByteMap({ord(' '): '_'})


def encode_q(bstring):
    """Return the 'q' encoding of the bytes in 'bstring' as a str."""
    return ''.join(map(_q_byte_map.__getitem__, bstring))


def len_q(bstring):
    """Return the length of the 'q' encoding of 'bstring'."""
    total = 0
    for byte in bstring:
        total += len(_q_byte_map[byte])
    return total
94	n/a
95	n/a
96	n/a	#
97	n/a	# Base64
98	n/a	#
99	n/a
def decode_b(encoded):
    """Decode the base64 payload of an encoded word, tolerating defects.

    'encoded' is a bytes object.  Returns a (decoded_bytes, defects) tuple
    where 'defects' is a list of MessageDefect instances:

      * InvalidBase64PaddingDefect if padding had to be added;
      * InvalidBase64CharactersDefect if non-alphabet characters had to be
        ignored;
      * if the input cannot be decoded with any amount of padding (the
        number of data characters is one more than a multiple of four),
        the input is returned unchanged together with a defect instead of
        raising.
    """
    defects = []
    pad_err = len(encoded) % 4
    if pad_err:
        defects.append(errors.InvalidBase64PaddingDefect())
        padded_encoded = encoded + b'==='[:4-pad_err]
    else:
        padded_encoded = encoded
    try:
        return base64.b64decode(padded_encoded, validate=True), defects
    except binascii.Error:
        # Since we had correct padding, this is most likely an invalid
        # char error.
        defects = [errors.InvalidBase64CharactersDefect()]
    # The non-alphabet characters are ignored as far as padding goes, but
    # we don't know how many there are.  So we'll just try various padding
    # lengths until something works.
    for i in 0, 1, 2, 3:
        try:
            return base64.b64decode(encoded + b'='*i, validate=False), defects
        except binascii.Error:
            if i == 0:
                defects.append(errors.InvalidBase64PaddingDefect())
    # No padding length worked, so the payload is not decodable at all.
    # Parsing malformed mail must not blow up: hand back the raw bytes and
    # report the problem as a defect.  Prefer the dedicated length defect
    # when this version of email.errors provides one; otherwise keep the
    # defects gathered so far.
    length_defect = getattr(errors, 'InvalidBase64LengthDefect', None)
    if length_defect is not None:
        return encoded, [length_defect()]
    return encoded, defects
125	n/a
def encode_b(bstring):
    """Return the base64 encoding of the bytes in 'bstring' as a str."""
    b64_bytes = base64.b64encode(bstring)
    return str(b64_bytes, 'ascii')
128	n/a
def len_b(bstring):
    """Return the length of the base64 encoding of 'bstring'."""
    # Every group of 3 input bytes, including a trailing partial group,
    # produces exactly 4 output characters: round the group count up.
    group_count = (len(bstring) + 2) // 3
    return 4 * group_count
133	n/a
134	n/a
# Decoder for each supported cte specifier, keyed by its lower case letter.
_cte_decoders = dict(q=decode_q, b=decode_b)
139	n/a
def decode(ew):
    """Decode encoded word and return (string, charset, lang, defects) tuple.

    An RFC 2047/2231 encoded word has the form:

        =?charset*lang?cte?encoded_string?=

    where '*lang' may be omitted but the other parts may not be.

    This function expects exactly such a string (that is, it does not check
    the syntax and may raise errors if the string is not well formed), and
    returns the encoded_string decoded first from its Content Transfer
    Encoding and then from the resulting bytes into unicode using the
    specified charset.

    If the cte-decoded bytes do not successfully decode using the specified
    charset, an UndecodableBytesDefect is added to the defects list and the
    decode is repeated with the 'surrogateescape' error handler.  If the
    charset is not known, or its name cannot be used to look up a codec
    (for example because it contains lone surrogates), the bytes are decoded
    as ascii with 'surrogateescape' and a CharsetError defect is added,
    unless the charset is 'unknown-8bit' (compared case-insensitively).

    The specified charset and language are returned, case preserved.  The
    default for language, which is rarely if ever encountered, is the empty
    string.

    """
    _, charset, cte, cte_string, _ = ew.split('?')
    charset, _, lang = charset.partition('*')
    cte = cte.lower()
    # Recover the original bytes and do CTE decoding.
    bstring = cte_string.encode('ascii', 'surrogateescape')
    bstring, defects = _cte_decoders[cte](bstring)
    # Turn the CTE decoded bytes into unicode.
    try:
        string = bstring.decode(charset)
    except (LookupError, UnicodeEncodeError):
        # LookupError: no such codec.  UnicodeEncodeError: the charset name
        # itself is not encodable, so no codec lookup is possible either.
        # This clause must precede the UnicodeError one: retrying the decode
        # with the same bad charset name would only raise again.
        string = bstring.decode('ascii', 'surrogateescape')
        if charset.lower() != 'unknown-8bit':
            defects.append(errors.CharsetError("Unknown charset {} "
                "in encoded word; decoded as unknown bytes".format(charset)))
    except UnicodeError:
        defects.append(errors.UndecodableBytesDefect("Encoded word "
            "contains bytes not decodable using {} charset".format(charset)))
        string = bstring.decode(charset, 'surrogateescape')
    return string, charset, lang, defects
180	n/a
181	n/a
# Encoder for each supported cte specifier.
_cte_encoders = dict(q=encode_q, b=encode_b)

# Function giving the encoded length for each supported cte specifier.
_cte_encode_length = dict(q=len_q, b=len_b)
191	n/a
def encode(string, charset='utf-8', encoding=None, lang=''):
    """Return 'string' wrapped up as an RFC 2047/2231 encoded word.

    The result has the form:

        =?charset*lang?cte?encoded_string?=

    where '*lang' only appears when 'lang' is given a non-empty value.

    'charset' (default utf-8) is used to turn the string into bytes before
    the CTE encoding is applied; for the charset 'unknown-8bit' the string
    is encoded as ascii with the 'surrogateescape' error handler instead.
    'encoding' is the cte specifier to use ('q' or 'b').  If it is None (the
    default) the encoding producing the shorter result is chosen, except
    that 'q' is still used as long as it is less than five characters
    longer than 'b'.  'lang' (default '') is the language string to place
    in the encoded word.

    """
    if charset == 'unknown-8bit':
        raw = string.encode('ascii', 'surrogateescape')
    else:
        raw = string.encode(charset)
    if encoding is None:
        q_cost = _cte_encode_length['q'](raw)
        b_cost = _cte_encode_length['b'](raw)
        # Bias toward q.  5 is arbitrary.
        encoding = 'b' if q_cost - b_cost >= 5 else 'q'
    payload = _cte_encoders[encoding](raw)
    lang_tag = '*' + lang if lang else lang
    return "=?{}{}?{}?{}?=".format(charset, lang_tag, encoding, payload)
219	n/a	if lang:
220	n/a	lang = '*' + lang
221	n/a	return "=?{}{}?{}?{}?=".format(charset, lang, encoding, encoded)