Python code coverage for Lib/email/_parseaddr.py

#	count	content
1	n/a	# Copyright (C) 2002-2007 Python Software Foundation
2	n/a	# Contact: email-sig@python.org
3	n/a
4	n/a	"""Email address parsing code.
5	n/a
6	n/a	Lifted directly from rfc822.py. This should eventually be rewritten.
7	n/a	"""
8	n/a
# Names exported by ``from ... import *``; everything else in this module
# is an implementation detail of the email package.
__all__ = ['mktime_tz', 'parsedate', 'parsedate_tz', 'quote']
15	n/a
16	n/a	import time, calendar
17	n/a
# Small string constants shared by the joins in this module.
SPACE, EMPTYSTRING, COMMASPACE = ' ', '', ', '

# Date-field parsing tables.
#
# Month names: the twelve abbreviations first, then the twelve full names,
# so that (index % 12) + 1 is the month number for either spelling.
_monthnames = (
    'jan feb mar apr may jun jul aug sep oct nov dec '
    'january february march april may june july '
    'august september october november december'
).split()

# Abbreviated day-of-week names, Monday first.
_daynames = 'mon tue wed thu fri sat sun'.split()

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.
#
# Values are +/-HHMM offsets written as plain integers (e.g. -500 is -05:00).
_timezones = dict(
    UT=0, UTC=0, GMT=0, Z=0,
    AST=-400, ADT=-300,     # Atlantic (used in Canada)
    EST=-500, EDT=-400,     # Eastern
    CST=-600, CDT=-500,     # Central
    MST=-700, MDT=-600,     # Mountain
    PST=-800, PDT=-700,     # Pacific
)
43	n/a
44	n/a
def parsedate_tz(data):
    """Parse a date string into a 10-element time tuple.

    The first nine elements are laid out like a time tuple; the tenth is
    the timezone offset in seconds.  When the underlying parser reports an
    unknown offset (None), 0 is substituted.  Returns None if the string
    cannot be parsed.
    """
    fields = _parsedate_tz(data)
    if not fields:
        return None
    # Replace an unknown (None) offset with 0; any other value is kept.
    return tuple(fields[:9]) + (fields[9] or 0,)
56	n/a
def _parsedate_tz(data):
    """Convert date to extended time tuple.

    The last (additional) element is the time zone offset in seconds, except if
    the timezone was specified as -0000.  In that case the last element is
    None.  This indicates a UTC timestamp that explicitly declaims knowledge of
    the source timezone, as opposed to a +0000 timestamp that indicates the
    source timezone really was UTC.

    Returns a 10-element list
    [year, month, day, hour, minute, second, 0, 1, -1, tzoffset],
    or None when `data' is empty, whitespace-only, or cannot be parsed.
    Malformed input never raises; it yields None.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input splits to nothing.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3:  # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued to the time ("10:00+0100"); split it off.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('')  # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    mm = mm.lower()
    if mm not in _monthnames:
        # Accept "Jan 1" as well as "1 Jan".
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Full month names occupy slots 13..24 of the table.
        mm -= 12
    # endswith() instead of [-1] so that empty fields cannot raise IndexError.
    if dd.endswith(','):
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # Time and year are in the opposite order.
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
    if not yy:
        return None
    if not yy[0].isdigit():
        # Zone and year are in the opposite order.
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    parts = tm.split(':')
    if len(parts) == 1 and '.' in parts[0]:
        # Some non-compliant MUAs use '.' to separate time elements.
        parts = parts[0].split('.')
    if len(parts) == 2:
        [thh, tmm] = parts
        tss = '0'
    elif len(parts) == 3:
        [thh, tmm, tss] = parts
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        # The year is between 1969 and 1999 (inclusive).
        if yy > 68:
            yy += 1900
        # The year is between 2000 and 2068 (inclusive).
        else:
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
        if tzoffset == 0 and tz.startswith('-'):
            # "-0000" means "UTC, but the source zone is unknown".
            tzoffset = None
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]
172	n/a
173	n/a
def parsedate(data):
    """Parse a date string into a 9-element time tuple.

    Delegates to parsedate_tz() and drops the trailing timezone offset.
    Any non-tuple result (None on failure) is passed through unchanged.
    """
    parsed = parsedate_tz(data)
    return parsed[:9] if isinstance(parsed, tuple) else parsed
181	n/a
182	n/a
def mktime_tz(data):
    """Convert a 10-tuple from parsedate_tz() into a POSIX timestamp.

    When the tenth element (the zone offset in seconds) is None the fields
    are interpreted as local time; otherwise they are treated as UTC and
    the offset is subtracted.
    """
    offset = data[9]
    if offset is not None:
        return calendar.timegm(data) - offset
    # No zone info, so localtime is better assumption than GMT
    return time.mktime(data[:8] + (-1,))
191	n/a
192	n/a
def quote(str):
    """Escape a string for use inside an RFC 2822 quoted-string.

    Each backslash and each double quote is prefixed with a backslash;
    no other character is changed.  The enclosing double quotes are not
    added.
    """
    # A single pass: 92 is the backslash, 34 is the double quote.
    return str.translate({92: '\\\\', 34: '\\"'})
201	n/a
202	n/a
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser is a hand-written scanner over `self.field'; `self.pos' is the
    index of the next unread character and `self.commentlist' collects the
    text of the comments seen while parsing the current address.

    Note: this class interface is deprecated and may be removed in the future.
    Use email.utils.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Skip white space and extract comments.

        Returns the skipped spaces and tabs as a string (CR and LF are
        skipped but not returned).  Comments are appended to
        self.commentlist.
        """
        wslist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                if self.field[self.pos] not in '\n\r':
                    wslist.append(self.field[self.pos])
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break
        return ''.join(wslist)

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list of (name, address) pairs.  An address that could not
        be parsed contributes an ('', '') placeholder.
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (name, address) pairs: one pair for a single
        mailbox, several for a group, and an empty list when nothing
        usable was found.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Snapshot (not alias) the comments seen so far: the addr-spec branch
        # below rewinds and re-parses, and would otherwise record every
        # comment met by getphraselist() a second time.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the caller's loop makes progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the parser is not positioned on a `<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # The route domain is consumed and discarded.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns `local-part@domain', or just the local part when no `@'
        follows it.  Returns the empty string when an `@' is present but
        the domain is missing or invalid.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            preserve_ws = True
            if self.field[self.pos] == '.':
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                aslist.append('.')
                self.pos += 1
                preserve_ws = False
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                break
            else:
                aslist.append(self.getatom())
            ws = self.gotonext()
            if preserve_ws and ws:
                aslist.append(ws)

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain, return an empty address instead of returning a
            # local part to denote failed parsing.
            return ''
        return ''.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the empty string if a second `@' is found inside the
        domain, so that `a@malicious.org@important.com' is rejected instead
        of being read as `a@malicious.org'.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # bpo-34155: Don't parse domains with two `@' like
                # `a@malicious.org@important.com'.
                return ''
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        Returns the fragment without its delimiters and with backslash
        escapes resolved.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        # True when the previous character was a backslash, i.e. the current
        # character is taken literally.
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Whitespace between words is dropped and
        comments are moved to self.commentlist.  Returns the list of words.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
494	n/a
class AddressList(AddrlistClass):
    """A parsed list of RFC 2822 addresses with set-like operators.

    The parsed (name, address) pairs are kept in `addresslist'.  `+' and
    `-' (and their in-place forms) act as union and difference.
    """

    def __init__(self, field):
        AddrlistClass.__init__(self, field)
        # A false `field' (None or '') gives an empty list without parsing.
        self.addresslist = self.getaddrlist() if field else []

    def __len__(self):
        return len(self.addresslist)

    def __add__(self, other):
        # Set union: own entries, then the other's entries that are not
        # among our own.
        union = AddressList(None)
        mine = self.addresslist
        union.addresslist = mine + [x for x in other.addresslist
                                    if x not in mine]
        return union

    def __iadd__(self, other):
        # Set union, in-place
        for entry in other.addresslist:
            if entry not in self.addresslist:
                self.addresslist.append(entry)
        return self

    def __sub__(self, other):
        # Set difference
        diff = AddressList(None)
        diff.addresslist = [x for x in self.addresslist
                            if x not in other.addresslist]
        return diff

    def __isub__(self, other):
        # Set difference, in-place
        for entry in other.addresslist:
            try:
                self.addresslist.remove(entry)
            except ValueError:
                # Not present; nothing to remove.
                pass
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]