Python code coverage for Lib/email/utils.py

#	count	content
1	n/a	# Copyright (C) 2001-2010 Python Software Foundation
2	n/a	# Author: Barry Warsaw
3	n/a	# Contact: email-sig@python.org
4	n/a
5	n/a	"""Miscellaneous utilities."""
6	n/a
7	n/a	__all__ = [
8	n/a	'collapse_rfc2231_value',
9	n/a	'decode_params',
10	n/a	'decode_rfc2231',
11	n/a	'encode_rfc2231',
12	n/a	'formataddr',
13	n/a	'formatdate',
14	n/a	'format_datetime',
15	n/a	'getaddresses',
16	n/a	'make_msgid',
17	n/a	'mktime_tz',
18	n/a	'parseaddr',
19	n/a	'parsedate',
20	n/a	'parsedate_tz',
21	n/a	'parsedate_to_datetime',
22	n/a	'unquote',
23	n/a	]
24	n/a
25	n/a	import os
26	n/a	import re
27	n/a	import time
28	n/a	import random
29	n/a	import socket
30	n/a	import datetime
31	n/a	import urllib.parse
32	n/a
33	n/a	from email._parseaddr import quote
34	n/a	from email._parseaddr import AddressList as _AddressList
35	n/a	from email._parseaddr import mktime_tz
36	n/a
37	n/a	from email._parseaddr import parsedate, parsedate_tz, _parsedate_tz
38	n/a
39	n/a	# Intrapackage imports
40	n/a	from email.charset import Charset
41	n/a
# Small string constants shared by the helpers in this module.
COMMASPACE = ', '
EMPTYSTRING = UEMPTYSTRING = ''
CRLF = '\r\n'
TICK = "'"

# specialsre finds characters that force a display name to be quoted;
# escapesre finds the characters that must be backslash-escaped inside
# such a quoted name (see formataddr()).
specialsre = re.compile(r'[][\\()<>@,:;".]')
escapesre = re.compile(r'[\\"]')
50	n/a
def _has_surrogates(s):
    """Tell whether s holds surrogate-escaped binary data.

    Returns True when encoding s with the default codec fails with
    UnicodeEncodeError, and False when the encode succeeds.
    """
    # Attempting the encode is the quickest surrogate test: without
    # surrogates, utf8 (Python's default encoding) can encode any string.
    # See issue 11454 for timings.
    try:
        s.encode()
    except UnicodeEncodeError:
        return True
    return False
61	n/a
62	n/a	# How to deal with a string containing bytes before handing it to the
63	n/a	# application through the 'normal' interface.
def _sanitize(string):
    """Return string with surrogate-escaped bytes made presentable.

    The string is encoded back to bytes with 'surrogateescape' and then
    decoded as utf-8 with 'replace'.
    """
    # Escaped bytes become the unicode 'unknown' character, except when
    # they happen to form valid utf-8, in which case they are decoded --
    # even if they were invalid in the charset the source claimed to use.
    # That seems acceptable; a defect was still registered.
    return string.encode('utf-8', 'surrogateescape').decode('utf-8', 'replace')
71	n/a
72	n/a
73	n/a
74	n/a	# Helpers
75	n/a
def formataddr(pair, charset='utf-8'):
    """Build an address header value from a (realname, email_address) pair.

    This is the inverse of parseaddr(): the returned string is suitable
    for an RFC 2822 From, To or Cc header.

    When the realname element of pair is false, the email address is
    returned unmodified.

    charset is only consulted when realname cannot be encoded as ASCII.
    It may be a str naming a character set or a Charset-like object that
    offers a header_encode method.  The default is 'utf-8'.

    A UnicodeError is raised if the address itself is not ASCII.
    """
    realname, address = pair
    # Per the RFC the address MUST be ascii; let the encode error propagate.
    address.encode('ascii')
    if not realname:
        return address
    try:
        realname.encode('ascii')
    except UnicodeEncodeError:
        # Non-ASCII display name: emit it as an encoded word.
        codec = Charset(charset) if isinstance(charset, str) else charset
        return "%s <%s>" % (codec.header_encode(realname), address)
    # ASCII display name: quote it only when it contains specials, but
    # always backslash-escape backslashes and double quotes.
    wrapper = '"' if specialsre.search(realname) else ''
    escaped = escapesre.sub(r'\\\g<0>', realname)
    return '%s%s%s <%s>' % (wrapper, escaped, wrapper, address)
107	n/a
108	n/a
109	n/a
def getaddresses(fieldvalues):
    """Return a list of (REALNAME, EMAIL) for each fieldvalue.

    All field values are joined with ', ' and parsed as a single address
    list.
    """
    joined = COMMASPACE.join(fieldvalues)
    return _AddressList(joined).addresslist
115	n/a
116	n/a
117	n/a
# Matches an encoded word of the form =?charset?encoding?atom?= and
# exposes the three parts as the named groups charset, encoding and atom.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<atom>.*?)         # non-greedy up to the next ?= is the atom
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)
127	n/a
128	n/a
def _format_timetuple_and_zone(timetuple, zone):
    """Render timetuple plus a zone string as 'Day, DD Mon YYYY HH:MM:SS zone'.

    timetuple is indexed like a time.struct_time: year, month, day, hour,
    minute, second at positions 0-5 and the weekday (Monday == 0) at
    position 6.  Day and month names are always the English abbreviations.
    """
    weekdays = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
    months = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    day_name = weekdays[timetuple[6]]
    month_name = months[timetuple[1] - 1]
    return '%s, %02d %s %04d %02d:%02d:%02d %s' % (
        day_name, timetuple[2], month_name, timetuple[0],
        timetuple[3], timetuple[4], timetuple[5], zone)
137	n/a
def formatdate(timeval=None, localtime=False, usegmt=False):
    """Returns a date string as specified by RFC 2822, e.g.:

    Fri, 09 Nov 2001 01:08:47 -0000

    Optional timeval if given is a floating point time value as accepted by
    gmtime() and localtime(), otherwise the current time is used.

    Optional localtime is a flag that when True, interprets timeval, and
    returns a date relative to the local timezone instead of UTC, properly
    taking daylight savings time into account.

    Optional argument usegmt means that the timezone is written out as
    an ascii string, not numeric one (so "GMT" instead of "+0000"). This
    is needed for HTTP, and is only used when localtime==False.
    """
    # Note: we cannot use strftime() because that honors the locale and RFC
    # 2822 requires that day and month names be the English abbreviations.
    if timeval is None:
        timeval = time.time()
    # Always start from an aware UTC datetime.  This avoids the deprecated
    # datetime.utcfromtimestamp() while producing the same wall-clock fields.
    dt = datetime.datetime.fromtimestamp(timeval, datetime.timezone.utc)
    if localtime:
        dt = dt.astimezone()
        # "GMT" only makes sense for UTC output, so localtime wins.
        usegmt = False
    elif not usegmt:
        # A naive datetime is what makes format_datetime() emit "-0000".
        dt = dt.replace(tzinfo=None)
    return format_datetime(dt, usegmt)
166	n/a
def format_datetime(dt, usegmt=False):
    """Render the datetime dt as an RFC 2822 date string.

    A naive dt is written with the zone '-0000'; an aware dt is written
    with its numeric offset as produced by strftime("%z").

    When usegmt is True, dt must be an aware datetime whose tzinfo equals
    datetime.timezone.utc, and 'GMT' is written in place of the numeric
    zone.  This supports HTTP headers involving date stamps.  A ValueError
    is raised if usegmt is True and dt does not satisfy that requirement.
    """
    fields = dt.timetuple()
    tzinfo = dt.tzinfo
    if usegmt:
        if tzinfo is None or tzinfo != datetime.timezone.utc:
            raise ValueError("usegmt option requires a UTC datetime")
        return _format_timetuple_and_zone(fields, 'GMT')
    zone_text = '-0000' if tzinfo is None else dt.strftime("%z")
    return _format_timetuple_and_zone(fields, zone_text)
184	n/a
185	n/a
def make_msgid(idstring=None, domain=None):
    """Build a string usable as an RFC 2822 compliant Message-ID, e.g:

    <142480216486.20800.16526388040877946887@nightshade.la.mastaler.com>

    The id is assembled from the current time in hundredths of a second,
    the process id and 64 random bits.  If idstring is given it is
    appended, after a dot, to strengthen uniqueness.  If domain is given it
    forms the part after the '@'; otherwise socket.getfqdn() supplies it.
    """
    stamp = int(time.time()*100)
    process_id = os.getpid()
    entropy = random.getrandbits(64)
    suffix = '' if idstring is None else '.' + idstring
    if domain is None:
        domain = socket.getfqdn()
    return '<%d.%d.%d%s@%s>' % (stamp, process_id, entropy, suffix, domain)
207	n/a
208	n/a
def parsedate_to_datetime(data):
    """Convert a date string into a datetime object.

    data is parsed with _parsedate_tz().  When the parsed result carries no
    UTC offset (its last element is None) a naive datetime is returned;
    otherwise the result is an aware datetime whose tzinfo is a fixed
    offset datetime.timezone built from that offset in seconds.

    Raises ValueError if _parsedate_tz() cannot parse data.
    """
    parsed = _parsedate_tz(data)
    if parsed is None:
        # Without this check the unpacking below would fail with an
        # unhelpful TypeError about unpacking None.
        raise ValueError('Invalid date value or format "%s"' % str(data))
    *dtuple, tz = parsed
    if tz is None:
        return datetime.datetime(*dtuple[:6])
    return datetime.datetime(*dtuple[:6],
            tzinfo=datetime.timezone(datetime.timedelta(seconds=tz)))
215	n/a
216	n/a
def parseaddr(addr):
    """Parse addr and return its first address as a 2-tuple.

    addr is parsed as an address list and the first entry of the result is
    returned.  If parsing yields no addresses, the 2-tuple ('', '') is
    returned instead.
    """
    parsed = _AddressList(addr).addresslist
    return parsed[0] if parsed else ('', '')
222	n/a
223	n/a
224	n/a	# rfc822.unquote() doesn't properly de-backslash-ify in Python pre-2.3.
def unquote(str):
    """Remove quotes from a string.

    A string wrapped in double quotes loses them, and the backslash
    escapes for backslash and double quote inside it are undone.  A string
    wrapped in angle brackets just loses the brackets.  Anything else,
    including strings shorter than two characters, is returned unchanged.
    """
    if len(str) <= 1:
        return str
    if str.startswith('"') and str.endswith('"'):
        inner = str[1:-1]
        # Order matters: collapse doubled backslashes before unescaping
        # quotes.
        inner = inner.replace('\\\\', '\\')
        return inner.replace('\\"', '"')
    if str.startswith('<') and str.endswith('>'):
        return str[1:-1]
    return str
233	n/a
234	n/a
235	n/a
236	n/a	# RFC2231-related functions - parameter encoding and decoding
def decode_rfc2231(s):
    """Decode string according to RFC 2231.

    s is split on the first two apostrophes.  When that yields three
    pieces they are returned as a list; otherwise (None, None, s) is
    returned.
    """
    pieces = s.split(TICK, 2)
    return pieces if len(pieces) > 2 else (None, None, s)
243	n/a
244	n/a
def encode_rfc2231(s, charset=None, language=None):
    """Encode string according to RFC 2231.

    s is always percent-quoted (with no safe characters), using charset,
    or 'ascii' when charset is not given, to turn characters into bytes.
    When neither charset nor language is given the quoted string is
    returned on its own.  Otherwise the result has the form
    charset'language'quoted, with an empty language if none was given.
    """
    quoted = urllib.parse.quote(s, safe='', encoding=charset or 'ascii')
    if charset is None and language is None:
        return quoted
    lang = '' if language is None else language
    return "%s'%s'%s" % (charset, lang, quoted)
258	n/a
259	n/a
# Matches an RFC 2231 parameter name: the base name followed by '*', then
# optionally a continuation number which may itself be followed by '*'
# (e.g. "title*", "title*0", "title*1*").  A name without any '*' does
# not match.
rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$',
    re.ASCII)
262	n/a
def decode_params(params):
    """Decode parameters list according to RFC 2231.

    params is a sequence of 2-tuples containing (param name, string value).
    It is not modified.

    Returns a new list of 2-tuples.  The first entry of params is passed
    through as-is.  Every other ordinary parameter has its value unquoted
    and re-wrapped in double quotes.  Parameters whose names use the RFC
    2231 '*' syntax are collected per base name, their segments are joined
    in sorted order, and they are appended after the ordinary parameters:
    as (name, (charset, language, '"value"')) when any segment name ended
    in '*', or as (name, '"value"') otherwise.  An empty params yields an
    empty list.
    """
    if not params:
        return []
    name, value = params[0]
    new_params = [(name, value)]
    # Map parameter's name to a list of continuations.  The values are a
    # 3-tuple of the continuation number, the string value, and a flag
    # specifying whether a particular segment is %-encoded.
    rfc2231_params = {}
    # Iterating over a slice leaves the caller's sequence untouched and
    # avoids repeatedly popping from the front of a list.
    for name, value in params[1:]:
        encoded = name.endswith('*')
        value = unquote(value)
        mo = rfc2231_continuation.match(name)
        if mo:
            name, num = mo.group('name', 'num')
            if num is not None:
                num = int(num)
            rfc2231_params.setdefault(name, []).append((num, value, encoded))
        else:
            new_params.append((name, '"%s"' % quote(value)))
    for name, continuations in rfc2231_params.items():
        segments = []
        extended = False
        # Append all values in sorted (continuation number first) order,
        # converting %-encodings for the encoded segments.  If any of the
        # continuation names ends in a *, then the entire string, after
        # decoding segments and concatenating, must have the charset and
        # language specifiers at the beginning of the string.
        for num, s, encoded in sorted(continuations):
            if encoded:
                # Decode as "latin-1", so the characters in s directly
                # represent the percent-encoded octet values.
                # collapse_rfc2231_value treats this as an octet sequence.
                s = urllib.parse.unquote(s, encoding="latin-1")
                extended = True
            segments.append(s)
        value = quote(EMPTYSTRING.join(segments))
        if extended:
            charset, language, value = decode_rfc2231(value)
            new_params.append((name, (charset, language, '"%s"' % value)))
        else:
            new_params.append((name, '"%s"' % value))
    return new_params
318	n/a
def collapse_rfc2231_value(value, errors='replace',
                           fallback_charset='us-ascii'):
    """Collapse a parameter value into a plain string.

    If value is not a 3-tuple it is simply unquoted and returned.
    Otherwise it is taken to be (charset, language, text): text is turned
    into bytes and decoded with charset, using the errors handler given.
    fallback_charset is used when charset is None.  If the charset is not
    a known codec, the unquoted text is returned instead.
    """
    if not (isinstance(value, tuple) and len(value) == 3):
        return unquote(value)
    charset, _language, text = value
    # Issue 17369: if charset/lang is None, decode_rfc2231 couldn't parse
    # the value, so use the fallback_charset.
    codec = fallback_charset if charset is None else charset
    # The text arrives as a unicode string but stands for raw octets, so
    # take a straight character-to-byte interpretation rather than a utf-8
    # encode.
    octets = bytes(text, 'raw-unicode-escape')
    try:
        return str(octets, codec, errors)
    except LookupError:
        # charset is not a known codec.
        return unquote(text)
337	n/a
338	n/a
339	n/a	#
340	n/a	# datetime doesn't provide a localtime function yet, so provide one. Code
341	n/a	# adapted from the patch in issue 9527. This may not be perfect, but it is
342	n/a	# better than not having it.
343	n/a	#
344	n/a
def localtime(dt=None, isdst=-1):
    """Return local time as an aware datetime object.

    With no arguments the current time is returned.  Otherwise dt should
    be a datetime instance.  An aware dt is converted to the local time
    zone according to the system time zone database.  A naive dt (that is,
    dt.tzinfo is None) is assumed to already be in local time and only has
    a fixed-offset tzinfo attached.

    For a naive dt, isdst is passed to the system mktime as a hint: a
    positive or zero value makes it presume initially that summer time
    (for example, Daylight Saving Time) is or is not (respectively) in
    effect for the specified time, while a negative value makes it attempt
    to divine whether summer time is in effect.

    """
    if dt is None:
        return datetime.datetime.now(datetime.timezone.utc).astimezone()
    if dt.tzinfo is not None:
        return dt.astimezone()
    # Naive datetime: build a (localtime) timetuple carrying the isdst hint
    # and let the system mktime turn it into seconds since the epoch.
    fields = dt.timetuple()[:-1] + (isdst,)
    epoch_seconds = time.mktime(fields)
    local_struct = time.localtime(epoch_seconds)
    try:
        offset = datetime.timedelta(seconds=local_struct.tm_gmtoff)
        zone = datetime.timezone(offset, local_struct.tm_zone)
    except AttributeError:
        # No tm_gmtoff/tm_zone available.  Compute the UTC offset and
        # compare it with the value implied by tm_isdst; if they match,
        # use the zone name implied by tm_isdst.
        utc_naive = datetime.datetime(*time.gmtime(epoch_seconds)[:6])
        offset = dt - utc_naive
        in_dst = time.daylight and local_struct.tm_isdst > 0
        implied = -(time.altzone if in_dst else time.timezone)
        if offset == datetime.timedelta(seconds=implied):
            zone = datetime.timezone(offset, time.tzname[in_dst])
        else:
            zone = datetime.timezone(offset)
    return dt.replace(tzinfo=zone)
380	n/a	else:
381	n/a	tz = datetime.timezone(delta)
382	n/a	return dt.replace(tzinfo=tz)