1 | n/a | # Copyright (C) 2001-2007 Python Software Foundation |
---|
2 | n/a | # Author: Ben Gertzfield, Barry Warsaw |
---|
3 | n/a | # Contact: email-sig@python.org |
---|
4 | n/a | |
---|
# Public API of this module: the Charset class plus the registry helpers.
__all__ = ['Charset', 'add_alias', 'add_charset', 'add_codec']
---|
11 | n/a | |
---|
12 | n/a | from functools import partial |
---|
13 | n/a | |
---|
14 | n/a | import email.base64mime |
---|
15 | n/a | import email.quoprimime |
---|
16 | n/a | |
---|
17 | n/a | from email import errors |
---|
18 | n/a | from email.encoders import encode_7or8bit |
---|
19 | n/a | |
---|
20 | n/a | |
---|
21 | n/a | |
---|
# Selectors for how a charset's headers/bodies are transfer-encoded.  These
# are the values stored in the CHARSETS registry below.
QP = 1        # quoted-printable
BASE64 = 2    # base64
SHORTEST = 3  # whichever of QP/base64 is shorter; only valid for headers

# Overhead of the RFC 2047 encoded-word wrapper, not counting the charset
# name: in "=?charset?q?hello_world?=" the "=?", "?q?" and "?=" total 7.
RFC2047_CHROME_LEN = 7

DEFAULT_CHARSET = 'us-ascii'
UNKNOWN8BIT = 'unknown-8bit'
EMPTYSTRING = ''
---|
33 | n/a | |
---|
34 | n/a | |
---|
35 | n/a | |
---|
# Default registry of known character sets.  Each entry maps the canonical
# (lower-case) input charset name to a 3-tuple:
#
#     (header encoding, body encoding, output charset)
#
# An encoding of None means "do not encode"; an output charset of None means
# "output in the input charset".
CHARSETS = {
    # ISO-8859 family members for which quoted-printable stays readable.
    'iso-8859-1': (QP, QP, None),
    'iso-8859-2': (QP, QP, None),
    'iso-8859-3': (QP, QP, None),
    'iso-8859-4': (QP, QP, None),
    # Deliberately absent:
    #   iso-8859-5 (Cyrillic) and iso-8859-6 (Arabic) are not especially used;
    #   iso-8859-7 (Greek) and iso-8859-8 (Hebrew) are not readable as QP.
    'iso-8859-9': (QP, QP, None),
    'iso-8859-10': (QP, QP, None),
    # iso-8859-11 (Thai) is absent as well: QP will not make it readable.
    'iso-8859-13': (QP, QP, None),
    'iso-8859-14': (QP, QP, None),
    'iso-8859-15': (QP, QP, None),
    'iso-8859-16': (QP, QP, None),
    'windows-1252': (QP, QP, None),
    'viscii': (QP, QP, None),
    'us-ascii': (None, None, None),
    'big5': (BASE64, BASE64, None),
    'gb2312': (BASE64, BASE64, None),
    'euc-jp': (BASE64, None, 'iso-2022-jp'),
    'shift_jis': (BASE64, None, 'iso-2022-jp'),
    'iso-2022-jp': (BASE64, None, None),
    'koi8-r': (BASE64, BASE64, None),
    'utf-8': (SHORTEST, BASE64, 'utf-8'),
}
---|
65 | n/a | |
---|
# Other commonly used spellings of character set names, mapped to the
# canonical name used in email.  The underscore and hyphen variants of each
# "latin" alias are listed side by side.
ALIASES = {
    'latin_1': 'iso-8859-1', 'latin-1': 'iso-8859-1',
    'latin_2': 'iso-8859-2', 'latin-2': 'iso-8859-2',
    'latin_3': 'iso-8859-3', 'latin-3': 'iso-8859-3',
    'latin_4': 'iso-8859-4', 'latin-4': 'iso-8859-4',
    'latin_5': 'iso-8859-9', 'latin-5': 'iso-8859-9',
    'latin_6': 'iso-8859-10', 'latin-6': 'iso-8859-10',
    'latin_7': 'iso-8859-13', 'latin-7': 'iso-8859-13',
    'latin_8': 'iso-8859-14', 'latin-8': 'iso-8859-14',
    'latin_9': 'iso-8859-15', 'latin-9': 'iso-8859-15',
    'latin_10': 'iso-8859-16', 'latin-10': 'iso-8859-16',
    'cp949': 'ks_c_5601-1987',
    'euc_jp': 'euc-jp',
    'euc_kr': 'euc-kr',
    'ascii': 'us-ascii',
}
---|
94 | n/a | |
---|
95 | n/a | |
---|
# Charset name -> name of the Python codec used to convert to/from Unicode.
# Charsets without an entry fall back to a codec of the same name.
CODEC_MAP = {
    'gb2312': 'eucgb2312_cn',
    'big5': 'big5_tw',
    # Hack: anything labelled us-ascii gets *no* conversion at all, because
    # all sorts of garbage may arrive in the guise of 7-bit us-ascii.  Let it
    # pass through without a round trip to/from Unicode.
    'us-ascii': None,
}
---|
105 | n/a | |
---|
106 | n/a | |
---|
107 | n/a | |
---|
108 | n/a | # Convenience functions for extending the above mappings |
---|
def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
    """Register (or replace) a character set in the global CHARSETS registry.

    charset is the input character set and must be its canonical name.
    Charset instances look names up after lower-casing them and resolving
    aliases, so the key registered here should be in that form.

    header_enc and body_enc say how message headers and message bodies in
    this charset are to be encoded.  Each is one of the module constants QP
    (quoted-printable), BASE64 (base64 encoding) or None (no encoding, the
    default).  header_enc may additionally be SHORTEST, meaning the shorter
    of the QP and base64 encodings; SHORTEST is rejected for body_enc.

    output_charset is the character set the output should be in.  When
    Charset converts text it goes from the input charset, to Unicode, to the
    output charset.  The default (None) is to output in the same character
    set as the input.

    Both charset and output_charset need Unicode codec entries in the
    module's charset-to-codec mapping; use add_codec(charset, codecname) to
    add codecs the module does not know about.  See the codecs module's
    documentation for more information.

    Raises ValueError if body_enc is SHORTEST.
    """
    if body_enc == SHORTEST:
        raise ValueError('SHORTEST not allowed for body_enc')
    properties = (header_enc, body_enc, output_charset)
    CHARSETS[charset] = properties
---|
135 | n/a | |
---|
136 | n/a | |
---|
def add_alias(alias, canonical):
    """Register an alternative name for a character set.

    alias is the alternative name, e.g. latin-1.  Charset lower-cases the
    requested name before consulting the alias table, so aliases should be
    registered in lower case.
    canonical is the character set's canonical name, e.g. iso-8859-1.

    An existing alias with the same name is overwritten.
    """
    ALIASES[alias] = canonical
---|
144 | n/a | |
---|
145 | n/a | |
---|
def add_codec(charset, codecname):
    """Register the codec that maps a charset's characters to/from Unicode.

    charset is the canonical name of a character set.  codecname is the name
    of a Python codec, i.e. a value suitable as the encoding argument of
    str.encode() and bytes.decode().

    An existing entry for the same charset is overwritten.
    """
    CODEC_MAP[charset] = codecname
---|
154 | n/a | |
---|
155 | n/a | |
---|
156 | n/a | |
---|
# Helper for turning a str into bytes.  Text tagged as unknown-8bit may carry
# surrogate-escaped bytes, which only the 'surrogateescape' error handler can
# turn back into the original octets.
def _encode(string, codec):
    if codec != UNKNOWN8BIT:
        return string.encode(codec)
    return string.encode('ascii', 'surrogateescape')
---|
164 | n/a | |
---|
165 | n/a | |
---|
166 | n/a | |
---|
class Charset:
    """Map character sets to their email properties.

    This class provides information about the requirements imposed on email
    for a specific character set.  It also provides convenience routines for
    converting between character sets, given the availability of the
    applicable codecs.  Given a character set, it will do its best to provide
    information on how to use that character set in an email in an
    RFC-compliant way.

    Certain character sets must be encoded with quoted-printable or base64
    when used in email headers or bodies.  Certain character sets must be
    converted outright, and are not allowed in email.  Instances of this
    class expose the following information about a character set:

    input_charset: The initial character set specified, lower-cased.  Common
                   aliases are converted to their `official' email names
                   (e.g. latin_1 is converted to iso-8859-1).  Defaults to
                   7-bit us-ascii.

    header_encoding: If the character set must be encoded before it can be
                     used in an email header, this attribute is set to the
                     module constant QP (for quoted-printable), BASE64 (for
                     base64 encoding), or SHORTEST (for the shorter of the
                     QP and BASE64 encodings).  Otherwise, it is None.

    body_encoding: Same as header_encoding, but describes the encoding for
                   the mail message's body, which indeed may be different
                   than the header encoding.  SHORTEST is not allowed for
                   body_encoding.

    output_charset: Some character sets must be converted before they can be
                    used in email headers or bodies.  If the input_charset is
                    one of them, this attribute contains the name of the
                    charset output will be converted to.  Otherwise, it is
                    the same as input_charset.

    input_codec: The name of the Python codec used to convert the
                 input_charset to Unicode.  If no conversion codec is
                 necessary, this attribute is None.

    output_codec: The name of the Python codec used to convert Unicode
                  to the output_charset.  If no conversion codec is
                  necessary, this attribute has the same value as the
                  input_codec.

    Instances compare equal to other Charset objects and to strings by
    case-insensitive charset name.  Because __eq__ is defined without
    __hash__, instances are not hashable.
    """

    def __init__(self, input_charset=DEFAULT_CHARSET):
        """Look up the email properties of input_charset.

        input_charset may be a str or an ASCII-decodable bytes-like object.
        Raises errors.CharsetError if the name is not pure ASCII.
        """
        # RFC 2046, $4.1.2 says charsets are not case sensitive.  A str is
        # only checked for being pure ASCII; anything else is decoded as
        # ASCII first.  Either failure is reported as a CharsetError.
        try:
            if isinstance(input_charset, str):
                input_charset.encode('ascii')
            else:
                input_charset = str(input_charset, 'ascii')
        except UnicodeError:
            raise errors.CharsetError(input_charset)
        input_charset = input_charset.lower()
        # Set the input charset after filtering through the aliases.
        self.input_charset = ALIASES.get(input_charset, input_charset)
        # Look the charset up in the registry.  Unknown charsets get the
        # shortest header encoding, base64 bodies and no conversion.
        henc, benc, conv = CHARSETS.get(self.input_charset,
                                        (SHORTEST, BASE64, None))
        if not conv:
            # No explicit output charset: output in the input charset.
            conv = self.input_charset
        self.header_encoding = henc
        self.body_encoding = benc
        self.output_charset = ALIASES.get(conv, conv)
        # Now set the codecs.  If none is registered for a charset, guess
        # that a Python codec of the same name exists.
        self.input_codec = CODEC_MAP.get(self.input_charset,
                                         self.input_charset)
        self.output_codec = CODEC_MAP.get(self.output_charset,
                                          self.output_charset)

    def __str__(self):
        """Return the lower-cased input charset name."""
        return self.input_charset.lower()

    __repr__ = __str__

    def __eq__(self, other):
        """Compare by charset name, ignoring the case of other's str()."""
        return str(self) == str(other).lower()

    def get_body_encoding(self):
        """Return the content-transfer-encoding used for body encoding.

        This is either the string `quoted-printable' or `base64' depending on
        the encoding used, or it is a function in which case you should call
        the function with a single argument, the Message object being
        encoded.  The function should then set the Content-Transfer-Encoding
        header itself to whatever is appropriate.

        Returns "quoted-printable" if self.body_encoding is QP.
        Returns "base64" if self.body_encoding is BASE64.
        Returns the encode_7or8bit conversion function otherwise.
        """
        assert self.body_encoding != SHORTEST
        if self.body_encoding == QP:
            return 'quoted-printable'
        elif self.body_encoding == BASE64:
            return 'base64'
        else:
            return encode_7or8bit

    def get_output_charset(self):
        """Return the output character set.

        This is self.output_charset if that is set (truthy), otherwise it is
        self.input_charset.
        """
        return self.output_charset or self.input_charset

    def header_encode(self, string):
        """Header-encode a string by converting it first to bytes.

        The type of encoding (base64 or quoted-printable) will be based on
        this charset's `header_encoding`.

        :param string: A unicode string for the header.  It must be possible
            to encode this string to bytes using the character set's
            output codec.
        :return: The encoded string, with RFC 2047 chrome, or the string
            unchanged when this charset's headers are not encoded.
        """
        codec = self.output_codec or 'us-ascii'
        header_bytes = _encode(string, codec)
        # 7bit/8bit encodings return the string unchanged (modulo conversions)
        encoder_module = self._get_encoder(header_bytes)
        if encoder_module is None:
            return string
        return encoder_module.header_encode(header_bytes, codec)

    def header_encode_lines(self, string, maxlengths):
        """Header-encode a string by converting it first to bytes.

        This is similar to `header_encode()` except that the string is fit
        into maximum line lengths as given by the argument.

        :param string: A unicode string for the header.  It must be possible
            to encode this string to bytes using the character set's
            output codec.
        :param maxlengths: Maximum line length iterator.  Each element
            returned from this iterator will provide the next maximum line
            length.  This parameter is used as an argument to built-in next()
            and should never be exhausted.  The maximum line lengths should
            not count the RFC 2047 chrome.  These line lengths are only a
            hint; the splitter does the best it can.
        :return: Lines of encoded strings, each with RFC 2047 chrome.  The
            first element is None when not even one character fits on the
            first line.  When this charset's headers are not encoded at all,
            a single-element list holding the string unchanged is returned.
        """
        # See which encoding we should use.
        codec = self.output_codec or 'us-ascii'
        header_bytes = _encode(string, codec)
        encoder_module = self._get_encoder(header_bytes)
        if encoder_module is None:
            # No header encoding for this charset: mirror header_encode() and
            # hand the text back as-is instead of dereferencing None below.
            return [string]
        encoder = partial(encoder_module.header_encode, charset=codec)
        # Calculate the number of characters that the RFC 2047 chrome will
        # contribute to each line.
        charset = self.get_output_charset()
        extra = len(charset) + RFC2047_CHROME_LEN
        # Now comes the hard part.  We must encode bytes but we can't split on
        # bytes because some character sets are variable length and each
        # encoded word must stand on its own.  So the problem is you have to
        # encode to bytes to figure out this word's length, but you must split
        # on characters.  This causes two problems: first, we don't know how
        # many octets a specific substring of unicode characters will get
        # encoded to, and second, we don't know how many ASCII characters
        # those octets will get encoded to.  Unless we try it.  Which seems
        # inefficient.  In the interest of being correct rather than fast (and
        # in the hope that there will be few encoded headers in any such
        # message), brute force it. :(
        lines = []
        current_line = []
        maxlen = next(maxlengths) - extra
        for character in string:
            current_line.append(character)
            this_line = EMPTYSTRING.join(current_line)
            # Measure with the codec (not the charset name): the charset may
            # only be known to Python through its CODEC_MAP entry.
            length = encoder_module.header_length(_encode(this_line, codec))
            if length > maxlen:
                # This last character doesn't fit so pop it off.
                current_line.pop()
                # Does nothing fit on the first line?
                if not lines and not current_line:
                    lines.append(None)
                else:
                    joined_line = EMPTYSTRING.join(current_line)
                    header_bytes = _encode(joined_line, codec)
                    lines.append(encoder(header_bytes))
                # Start the next line with the character that did not fit.
                current_line = [character]
                maxlen = next(maxlengths) - extra
        joined_line = EMPTYSTRING.join(current_line)
        header_bytes = _encode(joined_line, codec)
        lines.append(encoder(header_bytes))
        return lines

    def _get_encoder(self, header_bytes):
        """Return the module that header-encodes header_bytes, or None.

        For SHORTEST the base64 and quoted-printable encoded lengths of
        header_bytes are compared; quoted-printable wins ties.
        """
        if self.header_encoding == BASE64:
            return email.base64mime
        elif self.header_encoding == QP:
            return email.quoprimime
        elif self.header_encoding == SHORTEST:
            len64 = email.base64mime.header_length(header_bytes)
            lenqp = email.quoprimime.header_length(header_bytes)
            if len64 < lenqp:
                return email.base64mime
            else:
                return email.quoprimime
        else:
            return None

    def body_encode(self, string):
        """Body-encode a string by converting it first to bytes.

        The type of encoding (base64 or quoted-printable) will be based on
        self.body_encoding.  If body_encoding is None, we assume the
        output charset is a 7bit encoding, so re-encoding the decoded
        string using the ascii codec produces the correct string version
        of the content.

        An empty (falsy) string is returned unchanged.
        """
        if not string:
            return string
        # The encoding flags are plain ints, so compare by value, not
        # identity.
        if self.body_encoding == BASE64:
            if isinstance(string, str):
                string = string.encode(self.output_charset)
            return email.base64mime.body_encode(string)
        elif self.body_encoding == QP:
            # quopromime.body_encode takes a string, but operates on it as if
            # it were a list of byte codes.  For a (minimal) history on why
            # this is so, see changeset 0cf700464177.  To correctly encode a
            # character set, then, we must turn it into pseudo bytes via the
            # latin1 charset, which will encode any byte as a single code
            # point between 0 and 255, which is what body_encode is expecting.
            if isinstance(string, str):
                string = string.encode(self.output_charset)
            string = string.decode('latin1')
            return email.quoprimime.body_encode(string)
        else:
            if isinstance(string, str):
                string = string.encode(self.output_charset).decode('ascii')
            return string
---|