»Core Development>Code coverage>Lib/email/_encoded_words.py

Python code coverage for Lib/email/_encoded_words.py

#countcontent
1n/a""" Routines for manipulating RFC2047 encoded words.
2n/a
3n/aThis is currently a package-private API, but will be considered for promotion
4n/ato a public API if there is demand.
5n/a
6n/a"""
7n/a
8n/a# An encoded word looks like this:
9n/a#
10n/a# =?charset[*lang]?cte?encoded_string?=
11n/a#
12n/a# for more information about charset see the charset module. Here it is one
13n/a# of the preferred MIME charset names (hopefully; you never know when parsing).
14n/a# cte (Content Transfer Encoding) is either 'q' or 'b' (ignoring case). In
15n/a# theory other letters could be used for other encodings, but in practice this
16n/a# (almost?) never happens. There could be a public API for adding entries
17n/a# to the CTE tables, but YAGNI for now. 'q' is Quoted Printable, 'b' is
18n/a# Base64. The meaning of encoded_string should be obvious. 'lang' is optional
19n/a# as indicated by the brackets (they are not part of the syntax) but is almost
20n/a# never encountered in practice.
21n/a#
22n/a# The general interface for a CTE decoder is that it takes the encoded_string
23n/a# as its argument, and returns a tuple (cte_decoded_string, defects). The
24n/a# cte_decoded_string is the original binary that was encoded using the
25n/a# specified cte. 'defects' is a list of MessageDefect instances indicating any
26n/a# problems encountered during conversion. 'charset' and 'lang' are the
27n/a# corresponding strings extracted from the EW, case preserved.
28n/a#
29n/a# The general interface for a CTE encoder is that it takes a binary sequence
30n/a# as input and returns the cte_encoded_string, which is an ascii-only string.
31n/a#
32n/a# Each encoder must also supply a length function that takes the binary
33n/a# sequence as its argument and returns the length of the resulting encoded
34n/a# string.
35n/a#
36n/a# The main API functions for the module are decode, which calls the decoder
37n/a# referenced by the cte specifier, and encode, which adds the appropriate
38n/a# RFC 2047 "chrome" to the encoded string, and can optionally automatically
39n/a# select the shortest possible encoding. See their docstrings below for
40n/a# details.
41n/a
42n/aimport re
43n/aimport base64
44n/aimport binascii
45n/aimport functools
46n/afrom string import ascii_letters, digits
47n/afrom email import errors
48n/a
# Names exported by ``from email._encoded_words import *``.
__all__ = [
    'decode_q', 'encode_q',
    'decode_b', 'encode_b',
    'len_q', 'len_b',
    'decode', 'encode',
]
58n/a
59n/a#
60n/a# Quoted Printable
61n/a#
62n/a
# Regex based decoder: finds every "=XX" hex escape in a Q-encoded byte string
# and substitutes the single byte that the two hex digits denote.
_q_byte_subber = functools.partial(
    re.compile(br'=([a-fA-F0-9]{2})').sub,
    lambda match: binascii.unhexlify(match.group(1)))


def decode_q(encoded):
    """Decode a Q (quoted-printable style) encoded byte string.

    Returns a ``(decoded_bytes, defects)`` tuple.  The defects list is always
    empty; "=" sequences that are not followed by two hex digits are passed
    through unchanged.
    """
    # In encoded words an underscore stands for a space.
    with_spaces = encoded.replace(b'_', b' ')
    unescaped = _q_byte_subber(with_spaces)
    return unescaped, []
70n/a
71n/a
# Lazily populated dict mapping a byte value (an int) to its Q-encoded text.
class _QByteMap(dict):

    # Byte values that are emitted literally rather than as "=XX".
    safe = b'-!*+/' + ascii_letters.encode('ascii') + digits.encode('ascii')

    def __missing__(self, key):
        # Compute the encoded form once and cache it for later lookups.
        encoded = chr(key) if key in self.safe else "={:02X}".format(key)
        self[key] = encoded
        return encoded


_q_byte_map = _QByteMap()

# In headers spaces are mapped to '_'.
_q_byte_map[ord(' ')] = '_'


def encode_q(bstring):
    """Return the Q encoding of the bytes *bstring* as a str."""
    pieces = [_q_byte_map[octet] for octet in bstring]
    return ''.join(pieces)


def len_q(bstring):
    """Return the length of the Q encoding of *bstring* without building it."""
    total = 0
    for octet in bstring:
        total += len(_q_byte_map[octet])
    return total
94n/a
95n/a
96n/a#
97n/a# Base64
98n/a#
99n/a
def decode_b(encoded):
    """Decode a Base64 encoded byte string, tolerating common defects.

    Returns a ``(decoded_bytes, defects)`` tuple where ``defects`` is a list
    of MessageDefect instances describing the problems found:

    * missing padding -> InvalidBase64PaddingDefect
    * characters outside the Base64 alphabet -> InvalidBase64CharactersDefect
      (plus InvalidBase64PaddingDefect if padding also had to be added)
    * input that cannot be decoded with any amount of padding -> the input
      is returned unchanged together with a single defect.

    This function never raises for malformed input.
    """
    # First try strict decoding, supplying whatever padding the length says
    # is missing.  This succeeds only if there are no invalid characters.
    pad_err = len(encoded) % 4
    missing_padding = b'==='[:4 - pad_err] if pad_err else b''
    try:
        return (
            base64.b64decode(encoded + missing_padding, validate=True),
            [errors.InvalidBase64PaddingDefect()] if pad_err else [],
        )
    except binascii.Error:
        # Since we had correct padding, this is likely an invalid char error.
        pass

    # The non-alphabet characters are ignored as far as padding goes, but we
    # don't know how many there are.  Try first without adding any padding.
    try:
        return (
            base64.b64decode(encoded, validate=False),
            [errors.InvalidBase64CharactersDefect()],
        )
    except binascii.Error:
        pass

    # Add as much padding as could possibly be needed; excess padding is
    # ignored by the lenient decoder.
    try:
        return (
            base64.b64decode(encoded + b'==', validate=False),
            [errors.InvalidBase64CharactersDefect(),
             errors.InvalidBase64PaddingDefect()],
        )
    except binascii.Error:
        pass

    # Nothing worked: hand the raw bytes back with a defect rather than
    # raising.  Prefer the dedicated length defect when this version of
    # email.errors provides it.
    length_defect = getattr(errors, 'InvalidBase64LengthDefect',
                            errors.InvalidBase64PaddingDefect)
    return encoded, [length_defect()]
125n/a
def encode_b(bstring):
    """Return the Base64 encoding of the bytes *bstring* as an ASCII str."""
    b64_bytes = base64.b64encode(bstring)
    return str(b64_bytes, 'ascii')
128n/a
def len_b(bstring):
    """Return the length of the Base64 encoding of *bstring*."""
    # Ceiling division: each group of 3 input bytes, including a final
    # partial group, produces 4 output characters.
    group_count = (len(bstring) + 2) // 3
    return group_count * 4
133n/a
134n/a
# Maps a lower-case cte specifier to the function that decodes it.
_cte_decoders = dict(q=decode_q, b=decode_b)
139n/a
def decode(ew):
    """Decode encoded word and return (string, charset, lang, defects) tuple.

    An RFC 2047/2231 encoded word has the form:

        =?charset*lang?cte?encoded_string?=

    where '*lang' may be omitted but the other parts may not be.

    This function expects exactly such a string (that is, it does not check the
    syntax and may raise errors if the string is not well formed), and returns
    the encoded_string decoded first from its Content Transfer Encoding and
    then from the resulting bytes into unicode using the specified charset.  If
    the cte-decoded string does not successfully decode using the specified
    character set, a defect is added to the defects list and the string is
    decoded again with the 'surrogateescape' error handler so that the
    undecodable octets are preserved.

    If the charset is not known (or its name cannot even be looked up), the
    bytes are decoded as ascii with 'surrogateescape' and, unless the charset
    is 'unknown-8bit', a CharsetError defect is added.

    The specified charset and language are returned.  The default for language,
    which is rarely if ever encountered, is the empty string.

    """
    _, charset, cte, cte_string, _ = ew.split('?')
    charset, _, lang = charset.partition('*')
    cte = cte.lower()
    # Recover the original bytes and do CTE decoding.
    bstring = cte_string.encode('ascii', 'surrogateescape')
    bstring, defects = _cte_decoders[cte](bstring)
    # Turn the CTE decoded bytes into unicode.
    try:
        string = bstring.decode(charset)
    except (LookupError, UnicodeEncodeError):
        # LookupError: no such codec.  UnicodeEncodeError: the charset *name*
        # itself is not encodable (e.g. it contains surrogates), so the codec
        # lookup failed before any decoding happened.  This clause must come
        # before the UnicodeError one, because UnicodeEncodeError is a
        # subclass of UnicodeError and retrying with the same charset would
        # just raise again.
        string = bstring.decode('ascii', 'surrogateescape')
        if charset.lower() != 'unknown-8bit':
            defects.append(errors.CharsetError("Unknown charset {} "
                "in encoded word; decoded as unknown bytes".format(charset)))
    except UnicodeError:
        defects.append(errors.UndecodableBytesDefect("Encoded word "
            "contains bytes not decodable using {} charset".format(charset)))
        string = bstring.decode(charset, 'surrogateescape')
    return string, charset, lang, defects
180n/a
181n/a
# Maps a cte specifier to the function that performs that encoding.
_cte_encoders = dict(q=encode_q, b=encode_b)

# Maps a cte specifier to the function that computes the encoded length.
_cte_encode_length = dict(q=len_q, b=len_b)
191n/a
def encode(string, charset='utf-8', encoding=None, lang=''):
    """Encode string using the CTE encoding that produces the shorter result.

    Produces an RFC 2047/2231 encoded word of the form:

        =?charset*lang?cte?encoded_string?=

    where '*lang' is omitted unless the 'lang' parameter is given a value.
    Optional argument charset (defaults to utf-8) specifies the charset to use
    to encode the string to binary before CTE encoding it.  The charset
    'unknown-8bit' (compared case-insensitively) is special: the string is
    encoded as ascii with the 'surrogateescape' error handler.  Optional
    argument 'encoding' is the cte specifier for the encoding that should be
    used ('q' or 'b'); if it is None (the default) the encoding which produces
    the shortest encoded sequence is used, except that 'q' is preferred if it
    is fewer than five characters longer.  Optional argument 'lang' (default
    '') gives the RFC 2231 language string to specify in the encoded word.

    """
    # Compare case-insensitively, matching how decode() treats this charset.
    if charset.lower() == 'unknown-8bit':
        bstring = string.encode('ascii', 'surrogateescape')
    else:
        bstring = string.encode(charset)
    if encoding is None:
        qlen = _cte_encode_length['q'](bstring)
        blen = _cte_encode_length['b'](bstring)
        # Bias toward q.  5 is arbitrary.
        encoding = 'q' if qlen - blen < 5 else 'b'
    encoded = _cte_encoders[encoding](bstring)
    if lang:
        lang = '*' + lang
    return "=?{}{}?{}?{}?=".format(charset, lang, encoding, encoded)