1 | n/a | # Copyright (C) 2001-2010 Python Software Foundation |
---|
2 | n/a | # Author: Barry Warsaw |
---|
3 | n/a | # Contact: email-sig@python.org |
---|
4 | n/a | |
---|
5 | n/a | """Miscellaneous utilities.""" |
---|
6 | n/a | |
---|
# Names exported by a star-import of this module.
# NOTE(review): localtime() is defined further down in this file but is not
# listed here -- confirm whether that omission is intentional.
__all__ = [
    'collapse_rfc2231_value',
    'decode_params',
    'decode_rfc2231',
    'encode_rfc2231',
    'formataddr',
    'formatdate',
    'format_datetime',
    'getaddresses',
    'make_msgid',
    'mktime_tz',
    'parseaddr',
    'parsedate',
    'parsedate_tz',
    'parsedate_to_datetime',
    'unquote',
    ]
---|
24 | n/a | |
---|
25 | n/a | import os |
---|
26 | n/a | import re |
---|
27 | n/a | import time |
---|
28 | n/a | import random |
---|
29 | n/a | import socket |
---|
30 | n/a | import datetime |
---|
31 | n/a | import urllib.parse |
---|
32 | n/a | |
---|
33 | n/a | from email._parseaddr import quote |
---|
34 | n/a | from email._parseaddr import AddressList as _AddressList |
---|
35 | n/a | from email._parseaddr import mktime_tz |
---|
36 | n/a | |
---|
37 | n/a | from email._parseaddr import parsedate, parsedate_tz, _parsedate_tz |
---|
38 | n/a | |
---|
39 | n/a | # Intrapackage imports |
---|
40 | n/a | from email.charset import Charset |
---|
41 | n/a | |
---|
# String constants shared by the helpers in this module.
COMMASPACE = ', '       # separator used when joining address field values
EMPTYSTRING = ''        # joiner for RFC 2231 continuation segments
UEMPTYSTRING = ''
CRLF = '\r\n'
TICK = "'"              # delimiter between charset, language and value

# Characters whose presence makes formataddr() wrap the display name in
# double quotes.
specialsre = re.compile(r'[][\\()<>@,:;".]')
# Characters formataddr() backslash-escapes inside a display name.
escapesre = re.compile(r'[\\"]')
---|
50 | n/a | |
---|
def _has_surrogates(s):
    """Tell whether *s* carries surrogate-escaped binary data.

    Returns True when *s* cannot be encoded with the default codec, and
    False when it can.
    """
    # utf8 (Python's default encoding) can encode every string that has no
    # surrogates, so a failed encode is the signal we are looking for.  This
    # is the fastest way to check; see issue 11454 for timings.
    try:
        s.encode()
    except UnicodeEncodeError:
        return True
    else:
        return False
---|
61 | n/a | |
---|
62 | n/a | # How to deal with a string containing bytes before handing it to the |
---|
63 | n/a | # application through the 'normal' interface. |
---|
def _sanitize(string):
    """Return *string* with surrogate-escaped bytes made safe to display."""
    # Round-trip through bytes: 'surrogateescape' restores the escaped
    # octets, then a lenient utf-8 decode maps anything undecodable to the
    # unicode 'unknown' character.  Escaped bytes that happen to form valid
    # utf-8 get decoded instead, even if they were invalid in the charset the
    # source claimed; that is acceptable because a defect was still
    # registered.
    return string.encode('utf-8', 'surrogateescape').decode('utf-8', 'replace')
---|
71 | n/a | |
---|
72 | n/a | |
---|
73 | n/a | |
---|
74 | n/a | # Helpers |
---|
75 | n/a | |
---|
def formataddr(pair, charset='utf-8'):
    """Build an address header value from a (realname, email_address) pair.

    This is the inverse of parseaddr(): the result is a string suitable for
    an RFC 2822 From, To or Cc header.

    When realname is false, email_address is returned unmodified.

    charset selects the character set used to encode realname when it is not
    ASCII safe.  It may be a str or a Charset-like object providing a
    header_encode method, and defaults to 'utf-8'.

    A UnicodeError is raised when email_address is not ASCII.
    """
    name, address = pair
    # The address MUST (per RFC) be ascii; let the encode error propagate.
    address.encode('ascii')
    if not name:
        return address
    try:
        name.encode('ascii')
    except UnicodeEncodeError:
        # Non-ASCII display name: emit it as an encoded word.
        if isinstance(charset, str):
            charset = Charset(charset)
        return "%s <%s>" % (charset.header_encode(name), address)
    # ASCII display name: quote it only when it contains specials, but always
    # backslash-escape backslashes and double quotes.
    quotes = '"' if specialsre.search(name) else ''
    name = escapesre.sub(r'\\\g<0>', name)
    return '%s%s%s <%s>' % (quotes, name, quotes, address)
---|
107 | n/a | |
---|
108 | n/a | |
---|
109 | n/a | |
---|
def getaddresses(fieldvalues):
    """Return a list of (REALNAME, EMAIL) for each fieldvalue.

    fieldvalues is an iterable of header values.  Each value is converted
    with str() before parsing, so objects that are not plain strings but
    render as header text are accepted as well.
    """
    # Join every field into one comma-separated list and parse it in a single
    # pass; str() keeps join() from raising TypeError on non-str values.
    joined = COMMASPACE.join(str(value) for value in fieldvalues)
    return _AddressList(joined).addresslist
---|
115 | n/a | |
---|
116 | n/a | |
---|
117 | n/a | |
---|
# Matches an encoded word of the form =?charset?encoding?atom?= and exposes
# the three parts as the named groups 'charset', 'encoding' and 'atom'.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<atom>.*?)         # non-greedy up to the next ?= is the atom
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)
---|
127 | n/a | |
---|
128 | n/a | |
---|
def _format_timetuple_and_zone(timetuple, zone):
    """Render *timetuple* and *zone* as 'Day, DD Mon YYYY HH:MM:SS zone'.

    timetuple is indexed like a time.struct_time (year, month, day, hour,
    minute, second, weekday); zone is appended verbatim.
    """
    # The English abbreviations are spelled out here rather than taken from
    # the locale.
    day_names = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
    month_names = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    weekday = day_names[timetuple[6]]
    day = timetuple[2]
    month = month_names[timetuple[1] - 1]
    year = timetuple[0]
    hour, minute, second = timetuple[3], timetuple[4], timetuple[5]
    return '%s, %02d %s %04d %02d:%02d:%02d %s' % (
        weekday, day, month, year, hour, minute, second, zone)
---|
137 | n/a | |
---|
def formatdate(timeval=None, localtime=False, usegmt=False):
    """Returns a date string as specified by RFC 2822, e.g.:

    Fri, 09 Nov 2001 01:08:47 -0000

    Optional timeval if given is a floating point time value as accepted by
    gmtime() and localtime(), otherwise the current time is used.

    Optional localtime is a flag that when True, interprets timeval, and
    returns a date relative to the local timezone instead of UTC, properly
    taking daylight savings time into account.

    Optional argument usegmt means that the timezone is written out as
    an ascii string, not numeric one (so "GMT" instead of "+0000"). This
    is needed for HTTP, and is only used when localtime==False.
    """
    # Note: we cannot use strftime() because that honors the locale and RFC
    # 2822 requires that day and month names be the English abbreviations.
    if timeval is None:
        timeval = time.time()
    # Always start from an aware UTC datetime; this avoids the deprecated
    # datetime.utcfromtimestamp().
    dt = datetime.datetime.fromtimestamp(timeval, datetime.timezone.utc)
    if localtime:
        # Local time wins over usegmt: the zone is rendered numerically.
        dt = dt.astimezone()
        usegmt = False
    elif not usegmt:
        # A naive datetime makes format_datetime() emit the '-0000' zone.
        dt = dt.replace(tzinfo=None)
    return format_datetime(dt, usegmt)
---|
166 | n/a | |
---|
def format_datetime(dt, usegmt=False):
    """Format the datetime *dt* as an RFC 2822 date string.

    When usegmt is true, dt must be an aware datetime whose tzinfo equals
    datetime.timezone.utc; the zone is then written as 'GMT' rather than the
    +0000 that RFC2822 would normally call for.  This supports HTTP headers
    that carry date stamps.  A ValueError is raised when dt does not qualify.

    Without usegmt, a naive dt gets the zone '-0000' and an aware dt gets its
    numeric UTC offset.
    """
    timetuple = dt.timetuple()
    if not usegmt:
        zone = '-0000' if dt.tzinfo is None else dt.strftime("%z")
    elif dt.tzinfo is None or dt.tzinfo != datetime.timezone.utc:
        raise ValueError("usegmt option requires a UTC datetime")
    else:
        zone = 'GMT'
    return _format_timetuple_and_zone(timetuple, zone)
---|
184 | n/a | |
---|
185 | n/a | |
---|
def make_msgid(idstring=None, domain=None):
    """Build a string usable as an RFC 2822 compliant Message-ID, e.g:

    <142480216486.20800.16526388040877946887@nightshade.la.mastaler.com>

    The id is made of the current time in hundredths of a second, the
    process id and 64 random bits.  idstring, when given, is appended after
    a '.' to strengthen uniqueness.  domain, when given, supplies the part
    after the '@'; otherwise the locally defined hostname is used.
    """
    timeval = int(time.time()*100)
    pid = os.getpid()
    randint = random.getrandbits(64)
    suffix = '' if idstring is None else '.' + idstring
    if domain is None:
        domain = socket.getfqdn()
    return '<%d.%d.%d%s@%s>' % (timeval, pid, randint, suffix, domain)
---|
207 | n/a | |
---|
208 | n/a | |
---|
def parsedate_to_datetime(data):
    """Convert the RFC 2822 style date string *data* into a datetime.

    When the parsed value carries no usable UTC offset (the parser reports
    the zone as None), a naive datetime is returned.  Otherwise the result is
    an aware datetime whose tzinfo is a fixed-offset datetime.timezone.

    Raises ValueError if *data* cannot be parsed as a date.
    """
    parsed = _parsedate_tz(data)
    if parsed is None:
        # Fail with a clear message instead of the TypeError that unpacking
        # None would produce.
        raise ValueError('Invalid date value or format "%s"' % str(data))
    *dtuple, tz = parsed
    if tz is None:
        return datetime.datetime(*dtuple[:6])
    return datetime.datetime(*dtuple[:6],
            tzinfo=datetime.timezone(datetime.timedelta(seconds=tz)))
---|
215 | n/a | |
---|
216 | n/a | |
---|
def parseaddr(addr):
    """Parse *addr* into a (realname, email_address) pair.

    Only the first address found in *addr* is returned.  When nothing can be
    parsed, the pair ('', '') is returned.
    """
    parsed = _AddressList(addr).addresslist
    return parsed[0] if parsed else ('', '')
---|
222 | n/a | |
---|
223 | n/a | |
---|
224 | n/a | # rfc822.unquote() doesn't properly de-backslash-ify in Python pre-2.3. |
---|
def unquote(str):
    """Strip one layer of surrounding quotes from *str*.

    A value wrapped in double quotes loses them and has its backslash
    escapes for backslash and double quote undone.  A value wrapped in angle
    brackets just loses the brackets.  Anything else, including strings
    shorter than two characters, is returned unchanged.
    """
    if len(str) < 2:
        return str
    if str.startswith('"') and str.endswith('"'):
        inner = str[1:-1]
        # Undo escaped backslashes first, then escaped double quotes.
        inner = inner.replace('\\\\', '\\')
        return inner.replace('\\"', '"')
    if str.startswith('<') and str.endswith('>'):
        return str[1:-1]
    return str
---|
233 | n/a | |
---|
234 | n/a | |
---|
235 | n/a | |
---|
236 | n/a | # RFC2231-related functions - parameter encoding and decoding |
---|
def decode_rfc2231(s):
    """Split *s* into its RFC 2231 charset, language and value parts.

    The string is split on the first two tick characters.  When both are
    present the three pieces are returned as a list; otherwise the result is
    the tuple (None, None, s).
    """
    pieces = s.split(TICK, 2)
    return pieces if len(pieces) > 2 else (None, None, s)
---|
243 | n/a | |
---|
244 | n/a | |
---|
def encode_rfc2231(s, charset=None, language=None):
    """Encode string according to RFC 2231.

    s is always percent-encoded (no characters are treated as safe), using
    charset for the byte encoding, or 'ascii' when charset is not given.

    If neither charset nor language is given, the percent-encoded string is
    returned on its own.  Otherwise the result has the form
    charset'language'encoded, with the empty string standing in for a
    language of None.
    """
    quoted = urllib.parse.quote(s, safe='', encoding=charset or 'ascii')
    if charset is None and language is None:
        return quoted
    lang = '' if language is None else language
    return "%s'%s'%s" % (charset, lang, quoted)
---|
258 | n/a | |
---|
259 | n/a | |
---|
# Matches a parameter name that ends in '*', optionally followed by a
# decimal section number and a further optional '*' (name*, name*0, name*0*).
# The groups 'name' and 'num' capture the base name and the section number.
rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$',
    re.ASCII)
---|
262 | n/a | |
---|
def decode_params(params):
    """Decode parameters list according to RFC 2231.

    params is a non-empty sequence of 2-tuples containing (param name,
    string value).  It is not modified; a new list is returned.

    The first entry is copied through unchanged.  Every later value is
    unquoted.  Parameters whose names are not RFC 2231 continuations are
    returned as (name, '"value"').  Continuation segments are grouped by base
    name, ordered by section number and concatenated.  If any segment name
    ended in '*', the entry is (name, (charset, language, '"value"'));
    otherwise it is (name, '"value"').

    Raises IndexError if params is empty.
    """
    # The first entry is passed through as-is.
    name, value = params[0]
    new_params = [(name, value)]
    # Map parameter's name to a list of continuations.  The values are a
    # 3-tuple of the continuation number, the string value, and a flag
    # specifying whether a particular segment is %-encoded.
    rfc2231_params = {}
    # Iterate over a slice rather than popping from the front of a copy: this
    # works for any sequence (tuples included) and avoids quadratic pops.
    for name, value in params[1:]:
        encoded = name.endswith('*')
        value = unquote(value)
        mo = rfc2231_continuation.match(name)
        if mo:
            name, num = mo.group('name', 'num')
            if num is not None:
                num = int(num)
            rfc2231_params.setdefault(name, []).append((num, value, encoded))
        else:
            new_params.append((name, '"%s"' % quote(value)))
    for name, continuations in rfc2231_params.items():
        segments = []
        extended = False
        # Sort by number
        continuations.sort()
        # And now append all values in numerical order, converting
        # %-encodings for the encoded segments.  If any of the
        # continuation names ends in a *, then the entire string, after
        # decoding segments and concatenating, must have the charset and
        # language specifiers at the beginning of the string.
        for num, s, encoded in continuations:
            if encoded:
                # Decode as "latin-1", so the characters in s directly
                # represent the percent-encoded octet values.
                # collapse_rfc2231_value treats this as an octet sequence.
                s = urllib.parse.unquote(s, encoding="latin-1")
                extended = True
            segments.append(s)
        value = quote(EMPTYSTRING.join(segments))
        if extended:
            charset, language, value = decode_rfc2231(value)
            new_params.append((name, (charset, language, '"%s"' % value)))
        else:
            new_params.append((name, '"%s"' % value))
    return new_params
---|
318 | n/a | |
---|
def collapse_rfc2231_value(value, errors='replace',
                           fallback_charset='us-ascii'):
    """Collapse an RFC 2231 parameter value into a plain string.

    value is either a (charset, language, text) 3-tuple or anything else.
    Anything that is not a 3-tuple is simply unquoted and returned.

    For a 3-tuple, text is turned back into bytes and decoded with charset,
    using errors as the error handler.  fallback_charset is used when charset
    is None.  If charset names an unknown codec, the unquoted text is
    returned instead.
    """
    is_triple = isinstance(value, tuple) and len(value) == 3
    if not is_triple:
        return unquote(value)
    charset, _language, text = value
    if charset is None:
        # Issue 17369: if charset/lang is None, decode_rfc2231 couldn't parse
        # the value, so use the fallback_charset.
        charset = fallback_charset
    # While text comes to us as a unicode string, we need it to be a bytes
    # object.  We do not want bytes() normal utf-8 decoder, we want a straight
    # interpretation of the string as character bytes.
    rawbytes = bytes(text, 'raw-unicode-escape')
    try:
        return str(rawbytes, charset, errors)
    except LookupError:
        # charset is not a known codec.
        return unquote(text)
---|
337 | n/a | |
---|
338 | n/a | |
---|
339 | n/a | # |
---|
340 | n/a | # datetime doesn't provide a localtime function yet, so provide one. Code |
---|
341 | n/a | # adapted from the patch in issue 9527. This may not be perfect, but it is |
---|
342 | n/a | # better than not having it. |
---|
343 | n/a | # |
---|
344 | n/a | |
---|
def localtime(dt=None, isdst=-1):
    """Return local time as an aware datetime object.

    With no arguments the current time is returned.  Otherwise *dt* should
    be a datetime instance.  An aware *dt* is converted to the local time
    zone according to the system time zone database.  A naive *dt* (that is,
    dt.tzinfo is None) is assumed to already be in local time and only gains
    a tzinfo.

    *isdst* matters for naive input only.  A positive or zero value makes
    localtime presume initially that summer time (for example, Daylight
    Saving Time) is or is not (respectively) in effect for the given time.
    A negative value lets the system work out whether summer time is in
    effect.

    """
    if dt is None:
        utc_now = datetime.datetime.now(datetime.timezone.utc)
        return utc_now.astimezone()
    if dt.tzinfo is not None:
        return dt.astimezone()
    # Naive datetime: build a (localtime) timetuple carrying the isdst hint
    # and let the system mktime turn it into seconds since the epoch.
    fields = dt.timetuple()[:-1] + (isdst,)
    epoch_seconds = time.mktime(fields)
    local_struct = time.localtime(epoch_seconds)
    try:
        offset = datetime.timedelta(seconds=local_struct.tm_gmtoff)
        zone = datetime.timezone(offset, local_struct.tm_zone)
    except AttributeError:
        # No tm_gmtoff/tm_zone available.  Compute the UTC offset and compare
        # it with the value implied by tm_isdst; if they match, use the zone
        # name implied by tm_isdst, otherwise leave the zone unnamed.
        utc_naive = datetime.datetime(*time.gmtime(epoch_seconds)[:6])
        offset = dt - utc_naive
        in_dst = time.daylight and local_struct.tm_isdst > 0
        if in_dst:
            expected_offset = -time.altzone
        else:
            expected_offset = -time.timezone
        if offset == datetime.timedelta(seconds=expected_offset):
            zone = datetime.timezone(offset, time.tzname[in_dst])
        else:
            zone = datetime.timezone(offset)
    return dt.replace(tzinfo=zone)
---|