»Core Development>Code coverage>Lib/email/_header_value_parser.py

Python code coverage for Lib/email/_header_value_parser.py

#countcontent
1n/a"""Header value parser implementing various email-related RFC parsing rules.
2n/a
3n/aThe parsing methods defined in this module implement various email related
4n/aparsing rules. Principal among them is RFC 5322, which is the follow-on
5n/ato RFC 2822 and primarily a clarification of the former. It also implements
6n/aRFC 2047 encoded word decoding.
7n/a
8n/aRFC 5322 goes to considerable trouble to maintain backward compatibility with
9n/aRFC 822 in the parse phase, while cleaning up the structure on the generation
10n/aphase. This parser supports correct RFC 5322 generation by tagging white space
11n/aas folding white space only when folding is allowed in the non-obsolete rule
12n/asets. Actually, the parser is even more generous when accepting input than RFC
13n/a5322 mandates, following the spirit of Postel's Law, which RFC 5322 encourages.
14n/aWhere possible deviations from the standard are annotated on the 'defects'
15n/aattribute of tokens that deviate.
16n/a
17n/aThe general structure of the parser follows RFC 5322, and uses its terminology
18n/awhere there is a direct correspondence. Where the implementation requires a
19n/asomewhat different structure than that used by the formal grammar, new terms
20n/athat mimic the closest existing terms are used. Thus, it really helps to have
21n/aa copy of RFC 5322 handy when studying this code.
22n/a
23n/aInput to the parser is a string that has already been unfolded according to
24n/aRFC 5322 rules. According to the RFC this unfolding is the very first step, and
25n/athis parser leaves the unfolding step to a higher level message parser, which
26n/awill have already detected the line breaks that need unfolding while
27n/adetermining the beginning and end of each header.
28n/a
29n/aThe output of the parser is a TokenList object, which is a list subclass. A
30n/aTokenList is a recursive data structure. The terminal nodes of the structure
31n/aare Terminal objects, which are subclasses of str. These do not correspond
32n/adirectly to terminal objects in the formal grammar, but are instead more
33n/apractical higher level combinations of true terminals.
34n/a
35n/aAll TokenList and Terminal objects have a 'value' attribute, which produces the
36n/asemantically meaningful value of that part of the parse subtree. The value of
37n/aall whitespace tokens (no matter how many sub-tokens they may contain) is a
38n/asingle space, as per the RFC rules. This includes 'CFWS', which is herein
39n/aincluded in the general class of whitespace tokens. There is one exception to
40n/athe rule that whitespace tokens are collapsed into single spaces in values: in
41n/athe value of a 'bare-quoted-string' (a quoted-string with no leading or
42n/atrailing whitespace), any whitespace that appeared between the quotation marks
43n/ais preserved in the returned value. Note that in all Terminal strings quoted
44n/apairs are turned into their unquoted values.
45n/a
46n/aAll TokenList and Terminal objects also have a string value, which attempts to
47n/abe a "canonical" representation of the RFC-compliant form of the substring that
48n/aproduced the parsed subtree, including minimal use of quoted pair quoting.
49n/aWhitespace runs are not collapsed.
50n/a
51n/aComment tokens also have a 'content' attribute providing the string found
52n/abetween the parens (including any nested comments) with whitespace preserved.
53n/a
54n/aAll TokenList and Terminal objects have a 'defects' attribute which is a
55n/apossibly empty list of all of the defects found while creating the token. Defects
56n/amay appear on any token in the tree, and a composite list of all defects in the
57n/asubtree is available through the 'all_defects' attribute of any node. (For
58n/aTerminal nodes x.defects == x.all_defects.)
59n/a
60n/aEach object in a parse tree is called a 'token', and each has a 'token_type'
61n/aattribute that gives the name from the RFC 5322 grammar that it represents.
62n/aNot all RFC 5322 nodes are produced, and there is one non-RFC 5322 node that
63n/amay be produced: 'ptext'. A 'ptext' is a string of printable ascii characters.
64n/aIt is returned in place of lists of (ctext/quoted-pair) and
65n/a(qtext/quoted-pair).
66n/a
67n/aXXX: provide complete list of token types.
68n/a"""
69n/a
70n/aimport re
71n/aimport urllib # For urllib.parse.unquote
72n/afrom string import hexdigits
73n/afrom collections import OrderedDict
74n/afrom operator import itemgetter
75n/afrom email import _encoded_words as _ew
76n/afrom email import errors
77n/afrom email import utils
78n/a
79n/a#
80n/a# Useful constants and functions
81n/a#
82n/a
83n/aWSP = set(' \t')
84n/aCFWS_LEADER = WSP | set('(')
85n/aSPECIALS = set(r'()<>@,:;.\"[]')
86n/aATOM_ENDS = SPECIALS | WSP
87n/aDOT_ATOM_ENDS = ATOM_ENDS - set('.')
88n/a# '.', '"', and '(' do not end phrases in order to support obs-phrase
89n/aPHRASE_ENDS = SPECIALS - set('."(')
90n/aTSPECIALS = (SPECIALS | set('/?=')) - set('.')
91n/aTOKEN_ENDS = TSPECIALS | WSP
92n/aASPECIALS = TSPECIALS | set("*'%")
93n/aATTRIBUTE_ENDS = ASPECIALS | WSP
94n/aEXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
95n/a
96n/adef quote_string(value):
97n/a return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
98n/a
99n/a#
100n/a# Accumulator for header folding
101n/a#
102n/a
103n/aclass _Folded:
104n/a
105n/a def __init__(self, maxlen, policy):
106n/a self.maxlen = maxlen
107n/a self.policy = policy
108n/a self.lastlen = 0
109n/a self.stickyspace = None
110n/a self.firstline = True
111n/a self.done = []
112n/a self.current = []
113n/a
114n/a def newline(self):
115n/a self.done.extend(self.current)
116n/a self.done.append(self.policy.linesep)
117n/a self.current.clear()
118n/a self.lastlen = 0
119n/a
120n/a def finalize(self):
121n/a if self.current:
122n/a self.newline()
123n/a
124n/a def __str__(self):
125n/a return ''.join(self.done)
126n/a
127n/a def append(self, stoken):
128n/a self.current.append(stoken)
129n/a
130n/a def append_if_fits(self, token, stoken=None):
131n/a if stoken is None:
132n/a stoken = str(token)
133n/a l = len(stoken)
134n/a if self.stickyspace is not None:
135n/a stickyspace_len = len(self.stickyspace)
136n/a if self.lastlen + stickyspace_len + l <= self.maxlen:
137n/a self.current.append(self.stickyspace)
138n/a self.lastlen += stickyspace_len
139n/a self.current.append(stoken)
140n/a self.lastlen += l
141n/a self.stickyspace = None
142n/a self.firstline = False
143n/a return True
144n/a if token.has_fws:
145n/a ws = token.pop_leading_fws()
146n/a if ws is not None:
147n/a self.stickyspace += str(ws)
148n/a stickyspace_len += len(ws)
149n/a token._fold(self)
150n/a return True
151n/a if stickyspace_len and l + 1 <= self.maxlen:
152n/a margin = self.maxlen - l
153n/a if 0 < margin < stickyspace_len:
154n/a trim = stickyspace_len - margin
155n/a self.current.append(self.stickyspace[:trim])
156n/a self.stickyspace = self.stickyspace[trim:]
157n/a stickyspace_len = trim
158n/a self.newline()
159n/a self.current.append(self.stickyspace)
160n/a self.current.append(stoken)
161n/a self.lastlen = l + stickyspace_len
162n/a self.stickyspace = None
163n/a self.firstline = False
164n/a return True
165n/a if not self.firstline:
166n/a self.newline()
167n/a self.current.append(self.stickyspace)
168n/a self.current.append(stoken)
169n/a self.stickyspace = None
170n/a self.firstline = False
171n/a return True
172n/a if self.lastlen + l <= self.maxlen:
173n/a self.current.append(stoken)
174n/a self.lastlen += l
175n/a return True
176n/a if l < self.maxlen:
177n/a self.newline()
178n/a self.current.append(stoken)
179n/a self.lastlen = l
180n/a return True
181n/a return False
182n/a
183n/a#
184n/a# TokenList and its subclasses
185n/a#
186n/a
187n/aclass TokenList(list):
188n/a
189n/a token_type = None
190n/a
191n/a def __init__(self, *args, **kw):
192n/a super().__init__(*args, **kw)
193n/a self.defects = []
194n/a
195n/a def __str__(self):
196n/a return ''.join(str(x) for x in self)
197n/a
198n/a def __repr__(self):
199n/a return '{}({})'.format(self.__class__.__name__,
200n/a super().__repr__())
201n/a
202n/a @property
203n/a def value(self):
204n/a return ''.join(x.value for x in self if x.value)
205n/a
206n/a @property
207n/a def all_defects(self):
208n/a return sum((x.all_defects for x in self), self.defects)
209n/a
210n/a #
211n/a # Folding API
212n/a #
213n/a # parts():
214n/a #
215n/a # return a list of objects that constitute the "higher level syntactic
216n/a # objects" specified by the RFC as the best places to fold a header line.
217n/a # The returned objects must include leading folding white space, even if
218n/a # this means mutating the underlying parse tree of the object. Each object
219n/a # is only responsible for returning *its* parts, and should not drill down
220n/a # to any lower level except as required to meet the leading folding white
221n/a # space constraint.
222n/a #
223n/a # _fold(folded):
224n/a #
225n/a # folded: the result accumulator. This is an instance of _Folded.
226n/a # (XXX: I haven't finished factoring this out yet, the folding code
227n/a # pretty much uses this as a state object.) When the folded.current
228n/a # contains as much text as will fit, the _fold method should call
229n/a # folded.newline.
230n/a # folded.lastlen: the current length of the test stored in folded.current.
231n/a # folded.maxlen: The maximum number of characters that may appear on a
232n/a # folded line. Differs from the policy setting in that "no limit" is
233n/a # represented by +inf, which means it can be used in the trivially
234n/a # logical fashion in comparisons.
235n/a #
236n/a # Currently no subclasses implement parts, and I think this will remain
237n/a # true. A subclass only needs to implement _fold when the generic version
238n/a # isn't sufficient. _fold will need to be implemented primarily when it is
239n/a # possible for encoded words to appear in the specialized token-list, since
240n/a # there is no generic algorithm that can know where exactly the encoded
241n/a # words are allowed. A _fold implementation is responsible for filling
242n/a # lines in the same general way that the top level _fold does. It may, and
243n/a # should, call the _fold method of sub-objects in a similar fashion to that
244n/a # of the top level _fold.
245n/a #
246n/a # XXX: I'm hoping it will be possible to factor the existing code further
247n/a # to reduce redundancy and make the logic clearer.
248n/a
249n/a @property
250n/a def parts(self):
251n/a klass = self.__class__
252n/a this = []
253n/a for token in self:
254n/a if token.startswith_fws():
255n/a if this:
256n/a yield this[0] if len(this)==1 else klass(this)
257n/a this.clear()
258n/a end_ws = token.pop_trailing_ws()
259n/a this.append(token)
260n/a if end_ws:
261n/a yield klass(this)
262n/a this = [end_ws]
263n/a if this:
264n/a yield this[0] if len(this)==1 else klass(this)
265n/a
266n/a def startswith_fws(self):
267n/a return self[0].startswith_fws()
268n/a
269n/a def pop_leading_fws(self):
270n/a if self[0].token_type == 'fws':
271n/a return self.pop(0)
272n/a return self[0].pop_leading_fws()
273n/a
274n/a def pop_trailing_ws(self):
275n/a if self[-1].token_type == 'cfws':
276n/a return self.pop(-1)
277n/a return self[-1].pop_trailing_ws()
278n/a
279n/a @property
280n/a def has_fws(self):
281n/a for part in self:
282n/a if part.has_fws:
283n/a return True
284n/a return False
285n/a
286n/a def has_leading_comment(self):
287n/a return self[0].has_leading_comment()
288n/a
289n/a @property
290n/a def comments(self):
291n/a comments = []
292n/a for token in self:
293n/a comments.extend(token.comments)
294n/a return comments
295n/a
296n/a def fold(self, *, policy):
297n/a # max_line_length 0/None means no limit, ie: infinitely long.
298n/a maxlen = policy.max_line_length or float("+inf")
299n/a folded = _Folded(maxlen, policy)
300n/a self._fold(folded)
301n/a folded.finalize()
302n/a return str(folded)
303n/a
304n/a def as_encoded_word(self, charset):
305n/a # This works only for things returned by 'parts', which include
306n/a # the leading fws, if any, that should be used.
307n/a res = []
308n/a ws = self.pop_leading_fws()
309n/a if ws:
310n/a res.append(ws)
311n/a trailer = self.pop(-1) if self[-1].token_type=='fws' else ''
312n/a res.append(_ew.encode(str(self), charset))
313n/a res.append(trailer)
314n/a return ''.join(res)
315n/a
316n/a def cte_encode(self, charset, policy):
317n/a res = []
318n/a for part in self:
319n/a res.append(part.cte_encode(charset, policy))
320n/a return ''.join(res)
321n/a
322n/a def _fold(self, folded):
323n/a encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
324n/a for part in self.parts:
325n/a tstr = str(part)
326n/a tlen = len(tstr)
327n/a try:
328n/a str(part).encode(encoding)
329n/a except UnicodeEncodeError:
330n/a if any(isinstance(x, errors.UndecodableBytesDefect)
331n/a for x in part.all_defects):
332n/a charset = 'unknown-8bit'
333n/a else:
334n/a # XXX: this should be a policy setting when utf8 is False.
335n/a charset = 'utf-8'
336n/a tstr = part.cte_encode(charset, folded.policy)
337n/a tlen = len(tstr)
338n/a if folded.append_if_fits(part, tstr):
339n/a continue
340n/a # Peel off the leading whitespace if any and make it sticky, to
341n/a # avoid infinite recursion.
342n/a ws = part.pop_leading_fws()
343n/a if ws is not None:
344n/a # Peel off the leading whitespace and make it sticky, to
345n/a # avoid infinite recursion.
346n/a folded.stickyspace = str(part.pop(0))
347n/a if folded.append_if_fits(part):
348n/a continue
349n/a if part.has_fws:
350n/a part._fold(folded)
351n/a continue
352n/a # There are no fold points in this one; it is too long for a single
353n/a # line and can't be split...we just have to put it on its own line.
354n/a folded.append(tstr)
355n/a folded.newline()
356n/a
357n/a def pprint(self, indent=''):
358n/a print('\n'.join(self._pp(indent='')))
359n/a
360n/a def ppstr(self, indent=''):
361n/a return '\n'.join(self._pp(indent=''))
362n/a
363n/a def _pp(self, indent=''):
364n/a yield '{}{}/{}('.format(
365n/a indent,
366n/a self.__class__.__name__,
367n/a self.token_type)
368n/a for token in self:
369n/a if not hasattr(token, '_pp'):
370n/a yield (indent + ' !! invalid element in token '
371n/a 'list: {!r}'.format(token))
372n/a else:
373n/a yield from token._pp(indent+' ')
374n/a if self.defects:
375n/a extra = ' Defects: {}'.format(self.defects)
376n/a else:
377n/a extra = ''
378n/a yield '{}){}'.format(indent, extra)
379n/a
380n/a
381n/aclass WhiteSpaceTokenList(TokenList):
382n/a
383n/a @property
384n/a def value(self):
385n/a return ' '
386n/a
387n/a @property
388n/a def comments(self):
389n/a return [x.content for x in self if x.token_type=='comment']
390n/a
391n/a
392n/aclass UnstructuredTokenList(TokenList):
393n/a
394n/a token_type = 'unstructured'
395n/a
396n/a def _fold(self, folded):
397n/a last_ew = None
398n/a encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
399n/a for part in self.parts:
400n/a tstr = str(part)
401n/a is_ew = False
402n/a try:
403n/a str(part).encode(encoding)
404n/a except UnicodeEncodeError:
405n/a if any(isinstance(x, errors.UndecodableBytesDefect)
406n/a for x in part.all_defects):
407n/a charset = 'unknown-8bit'
408n/a else:
409n/a charset = 'utf-8'
410n/a if last_ew is not None:
411n/a # We've already done an EW, combine this one with it
412n/a # if there's room.
413n/a chunk = get_unstructured(
414n/a ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
415n/a oldlastlen = sum(len(x) for x in folded.current[:last_ew])
416n/a schunk = str(chunk)
417n/a lchunk = len(schunk)
418n/a if oldlastlen + lchunk <= folded.maxlen:
419n/a del folded.current[last_ew:]
420n/a folded.append(schunk)
421n/a folded.lastlen = oldlastlen + lchunk
422n/a continue
423n/a tstr = part.as_encoded_word(charset)
424n/a is_ew = True
425n/a if folded.append_if_fits(part, tstr):
426n/a if is_ew:
427n/a last_ew = len(folded.current) - 1
428n/a continue
429n/a if is_ew or last_ew:
430n/a # It's too big to fit on the line, but since we've
431n/a # got encoded words we can use encoded word folding.
432n/a part._fold_as_ew(folded)
433n/a continue
434n/a # Peel off the leading whitespace if any and make it sticky, to
435n/a # avoid infinite recursion.
436n/a ws = part.pop_leading_fws()
437n/a if ws is not None:
438n/a folded.stickyspace = str(ws)
439n/a if folded.append_if_fits(part):
440n/a continue
441n/a if part.has_fws:
442n/a part._fold(folded)
443n/a continue
444n/a # It can't be split...we just have to put it on its own line.
445n/a folded.append(tstr)
446n/a folded.newline()
447n/a last_ew = None
448n/a
449n/a def cte_encode(self, charset, policy):
450n/a res = []
451n/a last_ew = None
452n/a for part in self:
453n/a spart = str(part)
454n/a try:
455n/a spart.encode('us-ascii')
456n/a res.append(spart)
457n/a except UnicodeEncodeError:
458n/a if last_ew is None:
459n/a res.append(part.cte_encode(charset, policy))
460n/a last_ew = len(res)
461n/a else:
462n/a tl = get_unstructured(''.join(res[last_ew:] + [spart]))
463n/a res.append(tl.as_encoded_word(charset))
464n/a return ''.join(res)
465n/a
466n/a
467n/aclass Phrase(TokenList):
468n/a
469n/a token_type = 'phrase'
470n/a
471n/a def _fold(self, folded):
472n/a # As with Unstructured, we can have pure ASCII with or without
473n/a # surrogateescape encoded bytes, or we could have unicode. But this
474n/a # case is more complicated, since we have to deal with the various
475n/a # sub-token types and how they can be composed in the face of
476n/a # unicode-that-needs-CTE-encoding, and the fact that if a token a
477n/a # comment that becomes a barrier across which we can't compose encoded
478n/a # words.
479n/a last_ew = None
480n/a encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
481n/a for part in self.parts:
482n/a tstr = str(part)
483n/a tlen = len(tstr)
484n/a has_ew = False
485n/a try:
486n/a str(part).encode(encoding)
487n/a except UnicodeEncodeError:
488n/a if any(isinstance(x, errors.UndecodableBytesDefect)
489n/a for x in part.all_defects):
490n/a charset = 'unknown-8bit'
491n/a else:
492n/a charset = 'utf-8'
493n/a if last_ew is not None and not part.has_leading_comment():
494n/a # We've already done an EW, let's see if we can combine
495n/a # this one with it. The last_ew logic ensures that all we
496n/a # have at this point is atoms, no comments or quoted
497n/a # strings. So we can treat the text between the last
498n/a # encoded word and the content of this token as
499n/a # unstructured text, and things will work correctly. But
500n/a # we have to strip off any trailing comment on this token
501n/a # first, and if it is a quoted string we have to pull out
502n/a # the content (we're encoding it, so it no longer needs to
503n/a # be quoted).
504n/a if part[-1].token_type == 'cfws' and part.comments:
505n/a remainder = part.pop(-1)
506n/a else:
507n/a remainder = ''
508n/a for i, token in enumerate(part):
509n/a if token.token_type == 'bare-quoted-string':
510n/a part[i] = UnstructuredTokenList(token[:])
511n/a chunk = get_unstructured(
512n/a ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
513n/a schunk = str(chunk)
514n/a lchunk = len(schunk)
515n/a if last_ew + lchunk <= folded.maxlen:
516n/a del folded.current[last_ew:]
517n/a folded.append(schunk)
518n/a folded.lastlen = sum(len(x) for x in folded.current)
519n/a continue
520n/a tstr = part.as_encoded_word(charset)
521n/a tlen = len(tstr)
522n/a has_ew = True
523n/a if folded.append_if_fits(part, tstr):
524n/a if has_ew and not part.comments:
525n/a last_ew = len(folded.current) - 1
526n/a elif part.comments or part.token_type == 'quoted-string':
527n/a # If a comment is involved we can't combine EWs. And if a
528n/a # quoted string is involved, it's not worth the effort to
529n/a # try to combine them.
530n/a last_ew = None
531n/a continue
532n/a part._fold(folded)
533n/a
534n/a def cte_encode(self, charset, policy):
535n/a res = []
536n/a last_ew = None
537n/a is_ew = False
538n/a for part in self:
539n/a spart = str(part)
540n/a try:
541n/a spart.encode('us-ascii')
542n/a res.append(spart)
543n/a except UnicodeEncodeError:
544n/a is_ew = True
545n/a if last_ew is None:
546n/a if not part.comments:
547n/a last_ew = len(res)
548n/a res.append(part.cte_encode(charset, policy))
549n/a elif not part.has_leading_comment():
550n/a if part[-1].token_type == 'cfws' and part.comments:
551n/a remainder = part.pop(-1)
552n/a else:
553n/a remainder = ''
554n/a for i, token in enumerate(part):
555n/a if token.token_type == 'bare-quoted-string':
556n/a part[i] = UnstructuredTokenList(token[:])
557n/a tl = get_unstructured(''.join(res[last_ew:] + [spart]))
558n/a res[last_ew:] = [tl.as_encoded_word(charset)]
559n/a if part.comments or (not is_ew and part.token_type == 'quoted-string'):
560n/a last_ew = None
561n/a return ''.join(res)
562n/a
563n/aclass Word(TokenList):
564n/a
565n/a token_type = 'word'
566n/a
567n/a
568n/aclass CFWSList(WhiteSpaceTokenList):
569n/a
570n/a token_type = 'cfws'
571n/a
572n/a def has_leading_comment(self):
573n/a return bool(self.comments)
574n/a
575n/a
576n/aclass Atom(TokenList):
577n/a
578n/a token_type = 'atom'
579n/a
580n/a
581n/aclass Token(TokenList):
582n/a
583n/a token_type = 'token'
584n/a
585n/a
586n/aclass EncodedWord(TokenList):
587n/a
588n/a token_type = 'encoded-word'
589n/a cte = None
590n/a charset = None
591n/a lang = None
592n/a
593n/a @property
594n/a def encoded(self):
595n/a if self.cte is not None:
596n/a return self.cte
597n/a _ew.encode(str(self), self.charset)
598n/a
599n/a
600n/a
601n/aclass QuotedString(TokenList):
602n/a
603n/a token_type = 'quoted-string'
604n/a
605n/a @property
606n/a def content(self):
607n/a for x in self:
608n/a if x.token_type == 'bare-quoted-string':
609n/a return x.value
610n/a
611n/a @property
612n/a def quoted_value(self):
613n/a res = []
614n/a for x in self:
615n/a if x.token_type == 'bare-quoted-string':
616n/a res.append(str(x))
617n/a else:
618n/a res.append(x.value)
619n/a return ''.join(res)
620n/a
621n/a @property
622n/a def stripped_value(self):
623n/a for token in self:
624n/a if token.token_type == 'bare-quoted-string':
625n/a return token.value
626n/a
627n/a
628n/aclass BareQuotedString(QuotedString):
629n/a
630n/a token_type = 'bare-quoted-string'
631n/a
632n/a def __str__(self):
633n/a return quote_string(''.join(str(x) for x in self))
634n/a
635n/a @property
636n/a def value(self):
637n/a return ''.join(str(x) for x in self)
638n/a
639n/a
640n/aclass Comment(WhiteSpaceTokenList):
641n/a
642n/a token_type = 'comment'
643n/a
644n/a def __str__(self):
645n/a return ''.join(sum([
646n/a ["("],
647n/a [self.quote(x) for x in self],
648n/a [")"],
649n/a ], []))
650n/a
651n/a def quote(self, value):
652n/a if value.token_type == 'comment':
653n/a return str(value)
654n/a return str(value).replace('\\', '\\\\').replace(
655n/a '(', r'\(').replace(
656n/a ')', r'\)')
657n/a
658n/a @property
659n/a def content(self):
660n/a return ''.join(str(x) for x in self)
661n/a
662n/a @property
663n/a def comments(self):
664n/a return [self.content]
665n/a
666n/aclass AddressList(TokenList):
667n/a
668n/a token_type = 'address-list'
669n/a
670n/a @property
671n/a def addresses(self):
672n/a return [x for x in self if x.token_type=='address']
673n/a
674n/a @property
675n/a def mailboxes(self):
676n/a return sum((x.mailboxes
677n/a for x in self if x.token_type=='address'), [])
678n/a
679n/a @property
680n/a def all_mailboxes(self):
681n/a return sum((x.all_mailboxes
682n/a for x in self if x.token_type=='address'), [])
683n/a
684n/a
685n/aclass Address(TokenList):
686n/a
687n/a token_type = 'address'
688n/a
689n/a @property
690n/a def display_name(self):
691n/a if self[0].token_type == 'group':
692n/a return self[0].display_name
693n/a
694n/a @property
695n/a def mailboxes(self):
696n/a if self[0].token_type == 'mailbox':
697n/a return [self[0]]
698n/a elif self[0].token_type == 'invalid-mailbox':
699n/a return []
700n/a return self[0].mailboxes
701n/a
702n/a @property
703n/a def all_mailboxes(self):
704n/a if self[0].token_type == 'mailbox':
705n/a return [self[0]]
706n/a elif self[0].token_type == 'invalid-mailbox':
707n/a return [self[0]]
708n/a return self[0].all_mailboxes
709n/a
710n/aclass MailboxList(TokenList):
711n/a
712n/a token_type = 'mailbox-list'
713n/a
714n/a @property
715n/a def mailboxes(self):
716n/a return [x for x in self if x.token_type=='mailbox']
717n/a
718n/a @property
719n/a def all_mailboxes(self):
720n/a return [x for x in self
721n/a if x.token_type in ('mailbox', 'invalid-mailbox')]
722n/a
723n/a
724n/aclass GroupList(TokenList):
725n/a
726n/a token_type = 'group-list'
727n/a
728n/a @property
729n/a def mailboxes(self):
730n/a if not self or self[0].token_type != 'mailbox-list':
731n/a return []
732n/a return self[0].mailboxes
733n/a
734n/a @property
735n/a def all_mailboxes(self):
736n/a if not self or self[0].token_type != 'mailbox-list':
737n/a return []
738n/a return self[0].all_mailboxes
739n/a
740n/a
741n/aclass Group(TokenList):
742n/a
743n/a token_type = "group"
744n/a
745n/a @property
746n/a def mailboxes(self):
747n/a if self[2].token_type != 'group-list':
748n/a return []
749n/a return self[2].mailboxes
750n/a
751n/a @property
752n/a def all_mailboxes(self):
753n/a if self[2].token_type != 'group-list':
754n/a return []
755n/a return self[2].all_mailboxes
756n/a
757n/a @property
758n/a def display_name(self):
759n/a return self[0].display_name
760n/a
761n/a
762n/aclass NameAddr(TokenList):
763n/a
764n/a token_type = 'name-addr'
765n/a
766n/a @property
767n/a def display_name(self):
768n/a if len(self) == 1:
769n/a return None
770n/a return self[0].display_name
771n/a
772n/a @property
773n/a def local_part(self):
774n/a return self[-1].local_part
775n/a
776n/a @property
777n/a def domain(self):
778n/a return self[-1].domain
779n/a
780n/a @property
781n/a def route(self):
782n/a return self[-1].route
783n/a
784n/a @property
785n/a def addr_spec(self):
786n/a return self[-1].addr_spec
787n/a
788n/a
789n/aclass AngleAddr(TokenList):
790n/a
791n/a token_type = 'angle-addr'
792n/a
793n/a @property
794n/a def local_part(self):
795n/a for x in self:
796n/a if x.token_type == 'addr-spec':
797n/a return x.local_part
798n/a
799n/a @property
800n/a def domain(self):
801n/a for x in self:
802n/a if x.token_type == 'addr-spec':
803n/a return x.domain
804n/a
805n/a @property
806n/a def route(self):
807n/a for x in self:
808n/a if x.token_type == 'obs-route':
809n/a return x.domains
810n/a
811n/a @property
812n/a def addr_spec(self):
813n/a for x in self:
814n/a if x.token_type == 'addr-spec':
815n/a return x.addr_spec
816n/a else:
817n/a return '<>'
818n/a
819n/a
820n/aclass ObsRoute(TokenList):
821n/a
822n/a token_type = 'obs-route'
823n/a
824n/a @property
825n/a def domains(self):
826n/a return [x.domain for x in self if x.token_type == 'domain']
827n/a
828n/a
829n/aclass Mailbox(TokenList):
830n/a
831n/a token_type = 'mailbox'
832n/a
833n/a @property
834n/a def display_name(self):
835n/a if self[0].token_type == 'name-addr':
836n/a return self[0].display_name
837n/a
838n/a @property
839n/a def local_part(self):
840n/a return self[0].local_part
841n/a
842n/a @property
843n/a def domain(self):
844n/a return self[0].domain
845n/a
846n/a @property
847n/a def route(self):
848n/a if self[0].token_type == 'name-addr':
849n/a return self[0].route
850n/a
851n/a @property
852n/a def addr_spec(self):
853n/a return self[0].addr_spec
854n/a
855n/a
856n/aclass InvalidMailbox(TokenList):
857n/a
858n/a token_type = 'invalid-mailbox'
859n/a
860n/a @property
861n/a def display_name(self):
862n/a return None
863n/a
864n/a local_part = domain = route = addr_spec = display_name
865n/a
866n/a
867n/aclass Domain(TokenList):
868n/a
869n/a token_type = 'domain'
870n/a
871n/a @property
872n/a def domain(self):
873n/a return ''.join(super().value.split())
874n/a
875n/a
876n/aclass DotAtom(TokenList):
877n/a
878n/a token_type = 'dot-atom'
879n/a
880n/a
881n/aclass DotAtomText(TokenList):
882n/a
883n/a token_type = 'dot-atom-text'
884n/a
885n/a
886n/aclass AddrSpec(TokenList):
887n/a
888n/a token_type = 'addr-spec'
889n/a
890n/a @property
891n/a def local_part(self):
892n/a return self[0].local_part
893n/a
894n/a @property
895n/a def domain(self):
896n/a if len(self) < 3:
897n/a return None
898n/a return self[-1].domain
899n/a
900n/a @property
901n/a def value(self):
902n/a if len(self) < 3:
903n/a return self[0].value
904n/a return self[0].value.rstrip()+self[1].value+self[2].value.lstrip()
905n/a
906n/a @property
907n/a def addr_spec(self):
908n/a nameset = set(self.local_part)
909n/a if len(nameset) > len(nameset-DOT_ATOM_ENDS):
910n/a lp = quote_string(self.local_part)
911n/a else:
912n/a lp = self.local_part
913n/a if self.domain is not None:
914n/a return lp + '@' + self.domain
915n/a return lp
916n/a
917n/a
918n/aclass ObsLocalPart(TokenList):
919n/a
920n/a token_type = 'obs-local-part'
921n/a
922n/a
923n/aclass DisplayName(Phrase):
924n/a
925n/a token_type = 'display-name'
926n/a
927n/a @property
928n/a def display_name(self):
929n/a res = TokenList(self)
930n/a if res[0].token_type == 'cfws':
931n/a res.pop(0)
932n/a else:
933n/a if res[0][0].token_type == 'cfws':
934n/a res[0] = TokenList(res[0][1:])
935n/a if res[-1].token_type == 'cfws':
936n/a res.pop()
937n/a else:
938n/a if res[-1][-1].token_type == 'cfws':
939n/a res[-1] = TokenList(res[-1][:-1])
940n/a return res.value
941n/a
942n/a @property
943n/a def value(self):
944n/a quote = False
945n/a if self.defects:
946n/a quote = True
947n/a else:
948n/a for x in self:
949n/a if x.token_type == 'quoted-string':
950n/a quote = True
951n/a if quote:
952n/a pre = post = ''
953n/a if self[0].token_type=='cfws' or self[0][0].token_type=='cfws':
954n/a pre = ' '
955n/a if self[-1].token_type=='cfws' or self[-1][-1].token_type=='cfws':
956n/a post = ' '
957n/a return pre+quote_string(self.display_name)+post
958n/a else:
959n/a return super().value
960n/a
961n/a
962n/aclass LocalPart(TokenList):
963n/a
964n/a token_type = 'local-part'
965n/a
966n/a @property
967n/a def value(self):
968n/a if self[0].token_type == "quoted-string":
969n/a return self[0].quoted_value
970n/a else:
971n/a return self[0].value
972n/a
973n/a @property
974n/a def local_part(self):
975n/a # Strip whitespace from front, back, and around dots.
976n/a res = [DOT]
977n/a last = DOT
978n/a last_is_tl = False
979n/a for tok in self[0] + [DOT]:
980n/a if tok.token_type == 'cfws':
981n/a continue
982n/a if (last_is_tl and tok.token_type == 'dot' and
983n/a last[-1].token_type == 'cfws'):
984n/a res[-1] = TokenList(last[:-1])
985n/a is_tl = isinstance(tok, TokenList)
986n/a if (is_tl and last.token_type == 'dot' and
987n/a tok[0].token_type == 'cfws'):
988n/a res.append(TokenList(tok[1:]))
989n/a else:
990n/a res.append(tok)
991n/a last = res[-1]
992n/a last_is_tl = is_tl
993n/a res = TokenList(res[1:-1])
994n/a return res.value
995n/a
996n/a
997n/aclass DomainLiteral(TokenList):
998n/a
999n/a token_type = 'domain-literal'
1000n/a
1001n/a @property
1002n/a def domain(self):
1003n/a return ''.join(super().value.split())
1004n/a
1005n/a @property
1006n/a def ip(self):
1007n/a for x in self:
1008n/a if x.token_type == 'ptext':
1009n/a return x.value
1010n/a
1011n/a
1012n/aclass MIMEVersion(TokenList):
1013n/a
1014n/a token_type = 'mime-version'
1015n/a major = None
1016n/a minor = None
1017n/a
1018n/a
class Parameter(TokenList):
    """A single MIME parameter (attribute=value, possibly RFC 2231 style)."""

    token_type = 'parameter'
    sectioned = False
    extended = False
    charset = 'us-ascii'

    @property
    def section_number(self):
        # Because the first token, the attribute (name) eats CFWS, the second
        # token is always the section if there is one.
        return self[1].number if self.sectioned else 0

    @property
    def param_value(self):
        # This is part of the "handle quoted extended parameters" hack:
        # the value may be a direct child, or buried inside a
        # quoted-string / bare-quoted-string pair.
        for child in self:
            if child.token_type == 'value':
                return child.stripped_value
            if child.token_type == 'quoted-string':
                for grandchild in child:
                    if grandchild.token_type == 'bare-quoted-string':
                        for leaf in grandchild:
                            if leaf.token_type == 'value':
                                return leaf.stripped_value
        return ''
1045n/a
1046n/a
class InvalidParameter(Parameter):
    """A parameter that could not be parsed as a valid parameter."""

    token_type = 'invalid-parameter'
1050n/a
1051n/a
class Attribute(TokenList):
    """A parameter attribute (name)."""

    token_type = 'attribute'

    @property
    def stripped_value(self):
        # Value of the first child whose type ends in 'attrtext'
        # (plain or extended); None when there is no such child.
        return next(
            (child.value for child in self
                if child.token_type.endswith('attrtext')),
            None)
1061n/a
class Section(TokenList):
    """An RFC 2231 section marker (*N) within a parameter name."""

    token_type = 'section'
    # Read by Parameter.section_number; presumably set to the integer
    # section number by the parser (not visible here) -- TODO confirm.
    number = None
1066n/a
1067n/a
class Value(TokenList):
    """A parameter value."""

    token_type = 'value'

    @property
    def stripped_value(self):
        # Skip a leading CFWS child, then delegate to the payload token's
        # own stripped_value when it has one; otherwise fall back to the
        # full rendered value.
        payload = self[1] if self[0].token_type == 'cfws' else self[0]
        strippable = ('quoted-string', 'attribute', 'extended-attribute')
        if payload.token_type.endswith(strippable):
            return payload.stripped_value
        return self.value
1081n/a
1082n/a
class MimeParameters(TokenList):
    """The list of MIME parameters attached to a parameterized header.

    Handles RFC 2231 sectioned and extended (charset-encoded) parameters,
    including reassembly of multi-section values.
    """

    token_type = 'mime-parameters'

    @property
    def params(self):
        # Yields (name, decoded-value) pairs, one per distinct name.
        #
        # The RFC specifically states that the ordering of parameters is not
        # guaranteed and may be reordered by the transport layer.  So we have
        # to assume the RFC 2231 pieces can come in any order.  However, we
        # output them in the order that we first see a given name, which gives
        # us a stable __str__.
        params = OrderedDict()
        for token in self:
            if not token.token_type.endswith('parameter'):
                continue
            if token[0].token_type != 'attribute':
                continue
            name = token[0].value.strip()
            if name not in params:
                params[name] = []
            params[name].append((token.section_number, token))
        for name, parts in params.items():
            # Reassemble sectioned values in section-number order.
            parts = sorted(parts, key=itemgetter(0))
            first_param = parts[0][1]
            charset = first_param.charset
            # Our arbitrary error recovery is to ignore duplicate parameters,
            # to use appearance order if there are duplicate rfc 2231 parts,
            # and to ignore gaps.  This mimics the error recovery of get_param.
            if not first_param.extended and len(parts) > 1:
                if parts[1][0] == 0:
                    parts[1][1].defects.append(errors.InvalidHeaderDefect(
                        'duplicate parameter name; duplicate(s) ignored'))
                    parts = parts[:1]
                # Else assume the *0* was missing...note that this is different
                # from get_param, but we registered a defect for this earlier.
            value_parts = []
            i = 0
            for section_number, param in parts:
                if section_number != i:
                    # We could get fancier here and look for a complete
                    # duplicate extended parameter and ignore the second one
                    # seen.  But we're not doing that.  The old code didn't.
                    if not param.extended:
                        param.defects.append(errors.InvalidHeaderDefect(
                            'duplicate parameter name; duplicate ignored'))
                        continue
                    else:
                        param.defects.append(errors.InvalidHeaderDefect(
                            "inconsistent RFC2231 parameter numbering"))
                i += 1
                value = param.param_value
                if param.extended:
                    # Extended values are %-encoded octets in the charset
                    # declared on the *first* section.
                    try:
                        value = urllib.parse.unquote_to_bytes(value)
                    except UnicodeEncodeError:
                        # source had surrogate escaped bytes.  What we do now
                        # is a bit of an open question.  I'm not sure this is
                        # the best choice, but it is what the old algorithm did
                        value = urllib.parse.unquote(value, encoding='latin-1')
                    else:
                        try:
                            value = value.decode(charset, 'surrogateescape')
                        except LookupError:
                            # XXX: there should really be a custom defect for
                            # unknown character set to make it easy to find,
                            # because otherwise unknown charset is a silent
                            # failure.
                            value = value.decode('us-ascii', 'surrogateescape')
                        if utils._has_surrogates(value):
                            param.defects.append(errors.UndecodableBytesDefect())
                value_parts.append(value)
            value = ''.join(value_parts)
            yield name, value

    def __str__(self):
        # Parameters with empty values render as bare names.
        params = []
        for name, value in self.params:
            if value:
                params.append('{}={}'.format(name, quote_string(value)))
            else:
                params.append(name)
        params = '; '.join(params)
        return ' ' + params if params else ''
1166n/a
1167n/a
class ParameterizedHeaderValue(TokenList):
    """Base class for header values that may carry MIME parameters."""

    @property
    def params(self):
        # Search from the end: the mime-parameters child, when present,
        # is the last such token.
        for child in reversed(self):
            if child.token_type == 'mime-parameters':
                return child.params
        return {}

    @property
    def parts(self):
        # We don't want to start a new line if all of the params don't fit
        # after the value, so unwrap the parameter list.
        has_params = bool(self) and self[-1].token_type == 'mime-parameters'
        if has_params:
            flattened = self[:-1] + self[-1]
            return TokenList(flattened)
        return TokenList(self).parts
1184n/a
1185n/a
class ContentType(ParameterizedHeaderValue):
    """Parsed Content-Type header value."""

    token_type = 'content-type'
    # RFC 2045 default type; presumably overwritten by the content-type
    # parser when a valid type is present (parser not visible here).
    maintype = 'text'
    subtype = 'plain'
1191n/a
1192n/a
class ContentDisposition(ParameterizedHeaderValue):
    """Parsed Content-Disposition header value."""

    token_type = 'content-disposition'
    # None indicates no (valid) disposition was found.
    content_disposition = None
1197n/a
1198n/a
class ContentTransferEncoding(TokenList):
    """Parsed Content-Transfer-Encoding header value."""

    token_type = 'content-transfer-encoding'
    # Default per RFC 2045 section 6.1.
    cte = '7bit'
1203n/a
1204n/a
class HeaderLabel(TokenList):
    """The header name and colon preceding a header value."""

    token_type = 'header-label'
1208n/a
1209n/a
class Header(TokenList):
    """A complete header: the header-label followed by the value token.

    Note: _fold consumes (pops) the children as it emits them.
    """

    token_type = 'header'

    def _fold(self, folded):
        # Emit the label ("Name:") first; it always stays on line one.
        folded.append(str(self.pop(0)))
        folded.lastlen = len(folded.current[0])
        # The first line of the header is different from all others: we don't
        # want to start a new object on a new line if it has any fold points in
        # it that would allow part of it to be on the first header line.
        # Further, if the first fold point would fit on the new line, we want
        # to do that, but if it doesn't we want to put it on the first line.
        # Folded supports this via the stickyspace attribute.  If this
        # attribute is not None, it does the special handling.
        folded.stickyspace = str(self.pop(0)) if self[0].token_type == 'cfws' else ''
        rest = self.pop(0)
        # After label, optional cfws, and the value, nothing may remain.
        if self:
            raise ValueError("Malformed Header token list")
        rest._fold(folded)
1229n/a
1230n/a
1231n/a#
1232n/a# Terminal classes and instances
1233n/a#
1234n/a
class Terminal(str):
    """Base class for leaf tokens: a string with token metadata attached."""

    def __new__(cls, value, token_type):
        inst = super().__new__(cls, value)
        inst.token_type = token_type
        inst.defects = []
        return inst

    def __repr__(self):
        return "{}({})".format(type(self).__name__, super().__repr__())

    @property
    def all_defects(self):
        # Terminals have no children, so only their own defects count.
        return list(self.defects)

    def _pp(self, indent=''):
        # Single-line pretty-print representation used by the token tree.
        defect_note = '' if not self.defects else ' {}'.format(self.defects)
        return ["{}{}/{}({}){}".format(
            indent,
            type(self).__name__,
            self.token_type,
            super().__repr__(),
            defect_note,
            )]

    def cte_encode(self, charset, policy):
        # Encode as an RFC 2047 encoded word only if the value is not
        # pure ASCII; otherwise return the plain string.
        value = str(self)
        try:
            value.encode('us-ascii')
        except UnicodeEncodeError:
            return _ew.encode(value, charset)
        return value

    def pop_trailing_ws(self):
        # This terminates the recursion.
        return None

    def pop_leading_fws(self):
        # This terminates the recursion.
        return None

    @property
    def comments(self):
        return []

    def has_leading_comment(self):
        return False

    def __getnewargs__(self):
        return (str(self), self.token_type)
1284n/a
1285n/a
class WhiteSpaceTerminal(Terminal):
    """A run of (foldable) white space; renders as a single space."""

    has_fws = True

    @property
    def value(self):
        # Normalized to one space regardless of the source text.
        return ' '

    def startswith_fws(self):
        return True
1296n/a
1297n/a
class ValueTerminal(Terminal):
    """A terminal carrying ordinary (non-whitespace) content."""

    has_fws = False

    @property
    def value(self):
        # The token is its own value.
        return self

    def startswith_fws(self):
        return False

    def as_encoded_word(self, charset):
        # Render the whole terminal as an RFC 2047 encoded word.
        return _ew.encode(str(self), charset)
1311n/a
1312n/a
class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
    """White space between two adjacent encoded words.

    Such white space is semantically invisible, so it renders as the
    empty string.
    """

    has_fws = True

    @property
    def value(self):
        return ''

    @property
    def encoded(self):
        # The literal source text of the white space.
        return self[:]

    def __str__(self):
        return ''
1327n/a
1328n/a
# Shared singleton terminals used as children throughout the parse trees.
# XXX these need to become classes and used as instances so
# that a program can't change them in a parse tree and screw
# up other parse trees.  Maybe should have tests for that, too.
DOT = ValueTerminal('.', 'dot')
ListSeparator = ValueTerminal(',', 'list-separator')
RouteComponentMarker = ValueTerminal('@', 'route-component-marker')
1335n/a
1336n/a#
1337n/a# Parser
1338n/a#
1339n/a
1340n/a# Parse strings according to RFC822/2047/2822/5322 rules.
1341n/a#
1342n/a# This is a stateless parser. Each get_XXX function accepts a string and
1343n/a# returns either a Terminal or a TokenList representing the RFC object named
1344n/a# by the method and a string containing the remaining unparsed characters
1345n/a# from the input. Thus a parser method consumes the next syntactic construct
1346n/a# of a given type and returns a token representing the construct plus the
1347n/a# unparsed remainder of the input string.
1348n/a#
1349n/a# For example, if the first element of a structured header is a 'phrase',
1350n/a# then:
1351n/a#
1352n/a# phrase, value = get_phrase(value)
1353n/a#
1354n/a# returns the complete phrase from the start of the string value, plus any
1355n/a# characters left in the string after the phrase is removed.
1356n/a
# Pre-compiled matchers for character runs, used by the get_XXX parsers
# below.  '\' and ']' must be escaped before interpolation into a regex
# character class.
_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
_non_atom_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(ATOM_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
_non_token_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(TOKEN_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
_non_attribute_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(ATTRIBUTE_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(EXTENDED_ATTRIBUTE_ENDS).replace(
        '\\','\\\\').replace(']',r'\]'))).match
1368n/a
def _validate_xtext(xtext):
    """Register defects on *xtext* for non-printables or surrogates.

    Appends a NonPrintableDefect if the token contains ASCII control
    characters, and an UndecodableBytesDefect if it contains surrogate
    escaped bytes.  The token is modified in place.
    """
    bad_chars = _non_printable_finder(xtext)
    if bad_chars:
        xtext.defects.append(errors.NonPrintableDefect(bad_chars))
    if utils._has_surrogates(xtext):
        xtext.defects.append(errors.UndecodableBytesDefect(
            "Non-ASCII characters found in header token"))
1378n/a
def _get_ptext_to_endchars(value, endchars):
    """Scan printables/quoted-pairs until endchars and return unquoted ptext.

    This function turns a run of qcontent, ccontent-without-comments, or
    dtext-with-quoted-printables into a single string by unquoting any
    quoted printables.  It returns the string, the remaining value, and
    a flag that is True iff there were any quoted printables decoded.

    """
    fragment, *remainder = _wsp_splitter(value, 1)
    vchars = []
    escape = False
    had_qp = False
    # pos ends up as the index of the endchar that stopped the scan, or
    # len(fragment) if the whole fragment was consumed.  Initializing it
    # here guards against an UnboundLocalError when fragment is empty
    # (value was empty or started with whitespace); previously the
    # for/else clause read pos without it ever being bound.
    pos = 0
    for pos, char in enumerate(fragment):
        if char == '\\':
            if escape:
                # Second backslash of a quoted pair: emit it literally.
                escape = False
                had_qp = True
            else:
                escape = True
                continue
        if escape:
            # The character after a backslash is taken literally.
            escape = False
        elif char in endchars:
            break
        vchars.append(char)
    else:
        # Loop finished without hitting an endchar: the whole fragment
        # was consumed.
        pos = len(fragment)
    return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp
1408n/a
def get_fws(value):
    """FWS = 1*WSP

    This isn't the RFC definition.  We're using fws to represent tokens
    where folding can be done, but when we are parsing the *un*folding has
    already been done so we don't need to watch out for CRLF.

    """
    rest = value.lstrip()
    ws_len = len(value) - len(rest)
    return WhiteSpaceTerminal(value[:ws_len], 'fws'), rest
1420n/a
def get_encoded_word(value):
    """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?="

    Returns an EncodedWord token and the remaining value.  Raises
    HeaderParseError if value does not start with a decodable encoded word.
    """
    ew = EncodedWord()
    if not value.startswith('=?'):
        raise errors.HeaderParseError(
            "expected encoded word but found {}".format(value))
    tok, *remainder = value[2:].split('?=', 1)
    if tok == value[2:]:
        # No '?=' terminator anywhere in the input.
        raise errors.HeaderParseError(
            "expected encoded word but found {}".format(value))
    remstr = ''.join(remainder)
    if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits:
        # The ? after the CTE was followed by an encoded word escape (=XX).
        rest, *remainder = remstr.split('?=', 1)
        tok = tok + '?=' + rest
    if len(tok.split()) > 1:
        ew.defects.append(errors.InvalidHeaderDefect(
            "whitespace inside encoded word"))
    ew.cte = value
    value = ''.join(remainder)
    try:
        text, charset, lang, defects = _ew.decode('=?' + tok + '?=')
    except ValueError:
        raise errors.HeaderParseError(
            "encoded word format invalid: '{}'".format(ew.cte))
    ew.charset = charset
    ew.lang = lang
    ew.defects.extend(defects)
    # Re-tokenize the decoded text into fws/vtext children.
    while text:
        if text[0] in WSP:
            token, text = get_fws(text)
            ew.append(token)
            continue
        chars, *remainder = _wsp_splitter(text, 1)
        vtext = ValueTerminal(chars, 'vtext')
        _validate_xtext(vtext)
        ew.append(vtext)
        text = ''.join(remainder)
    return ew, value
1462n/a
def get_unstructured(value):
    """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct
       obs-unstruct = *((*LF *CR *(obs-utext) *LF *CR)) / FWS)
       obs-utext = %d0 / obs-NO-WS-CTL / LF / CR

    obs-NO-WS-CTL is control characters except WSP/CR/LF.

    So, basically, we have printable runs, plus control characters or nulls in
    the obsolete syntax, separated by whitespace.  Since RFC 2047 uses the
    obsolete syntax in its specification, but requires whitespace on either
    side of the encoded words, I can see no reason to need to separate the
    non-printable-non-whitespace from the printable runs if they occur, so we
    parse this into xtext tokens separated by WSP tokens.

    Because an 'unstructured' value must by definition constitute the entire
    value, this 'get' routine does not return a remaining value, only the
    parsed TokenList.

    """
    # XXX: but what about bare CR and LF?  They might signal the start or
    # end of an encoded word.  YAGNI for now, since our current parsers
    # will never send us strings with bare CR or LF.

    unstructured = UnstructuredTokenList()
    while value:
        if value[0] in WSP:
            token, value = get_fws(value)
            unstructured.append(token)
            continue
        if value.startswith('=?'):
            try:
                token, value = get_encoded_word(value)
            except errors.HeaderParseError:
                # XXX: Need to figure out how to register defects when
                # appropriate here.
                pass
            else:
                # RFC 2047 requires whitespace around encoded words: note
                # a defect when it is missing, and hide the whitespace
                # between two adjacent encoded words (it is semantically
                # invisible, so it renders as '').
                have_ws = True
                if len(unstructured) > 0:
                    if unstructured[-1].token_type != 'fws':
                        unstructured.defects.append(errors.InvalidHeaderDefect(
                            "missing whitespace before encoded word"))
                        have_ws = False
                if have_ws and len(unstructured) > 1:
                    if unstructured[-2].token_type == 'encoded-word':
                        unstructured[-1] = EWWhiteSpaceTerminal(
                            unstructured[-1], 'fws')
                unstructured.append(token)
                continue
        # Plain printable run up to the next whitespace.
        tok, *remainder = _wsp_splitter(value, 1)
        vtext = ValueTerminal(tok, 'vtext')
        _validate_xtext(vtext)
        unstructured.append(vtext)
        value = ''.join(remainder)
    return unstructured
1518n/a
def get_qp_ctext(value):
    r"""ctext = <printable ascii except \ ( )>

    This is not the RFC ctext, since we are handling nested comments in
    comment and unquoting quoted-pairs here.  We allow anything except the
    '()' characters, but if we find any ASCII other than the RFC defined
    printable ASCII, a NonPrintableDefect is added to the token's defects
    list.  Since quoted pairs are converted to their unquoted values, what
    is returned is a 'ptext' token.  In this case it is a
    WhiteSpaceTerminal, so it's value is ' '.

    """
    text, rest, _ = _get_ptext_to_endchars(value, '()')
    token = WhiteSpaceTerminal(text, 'ptext')
    _validate_xtext(token)
    return token, rest
1535n/a
def get_qcontent(value):
    """qcontent = qtext / quoted-pair

    We allow anything except the DQUOTE character, but if we find any ASCII
    other than the RFC defined printable ASCII, a NonPrintableDefect is
    added to the token's defects list.  Any quoted pairs are converted to
    their unquoted values, so what is returned is a 'ptext' token.  In this
    case it is a ValueTerminal.

    """
    text, rest, _ = _get_ptext_to_endchars(value, '"')
    token = ValueTerminal(text, 'ptext')
    _validate_xtext(token)
    return token, rest
1550n/a
def get_atext(value):
    """atext = <matches _atext_matcher>

    We allow any non-ATOM_ENDS in atext, but add an InvalidATextDefect to
    the token's defects list if we find non-atext characters.
    """
    match = _non_atom_end_matcher(value)
    if not match:
        raise errors.HeaderParseError(
            "expected atext but found '{}'".format(value))
    run = match.group()
    token = ValueTerminal(run, 'atext')
    _validate_xtext(token)
    return token, value[len(run):]
1566n/a
def get_bare_quoted_string(value):
    """bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE

    A quoted-string without the leading or trailing white space.  Its
    value is the text between the quote marks, with whitespace
    preserved and quoted pairs decoded.
    """
    if value[0] != '"':
        raise errors.HeaderParseError(
            "expected '\"' but found '{}'".format(value))
    bare_quoted_string = BareQuotedString()
    value = value[1:]
    while value and value[0] != '"':
        if value[0] in WSP:
            token, value = get_fws(value)
        elif value[:2] == '=?':
            # An encoded word inside quotes is not valid RFC 5322; accept
            # it but record a defect.
            try:
                token, value = get_encoded_word(value)
                bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
                    "encoded word inside quoted string"))
            except errors.HeaderParseError:
                token, value = get_qcontent(value)
        else:
            token, value = get_qcontent(value)
        bare_quoted_string.append(token)
    if not value:
        # Input ended before the closing quote.
        bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
            "end of header inside quoted string"))
        return bare_quoted_string, value
    return bare_quoted_string, value[1:]
1597n/a
def get_comment(value):
    """comment = "(" *([FWS] ccontent) [FWS] ")"
       ccontent = ctext / quoted-pair / comment

    We handle nested comments here, and quoted-pair in our qp-ctext routine.
    """
    if value and value[0] != '(':
        raise errors.HeaderParseError(
            "expected '(' but found '{}'".format(value))
    comment = Comment()
    value = value[1:]
    # Consume ccontent until the matching ')' or end of input.
    while value and value[0] != ')':
        first = value[0]
        if first in WSP:
            child, value = get_fws(value)
        elif first == '(':
            # Nested comment: recurse.
            child, value = get_comment(value)
        else:
            child, value = get_qp_ctext(value)
        comment.append(child)
    if not value:
        # Ran off the end of the header without a closing paren.
        comment.defects.append(errors.InvalidHeaderDefect(
            "end of header inside comment"))
        return comment, value
    return comment, value[1:]
1622n/a
def get_cfws(value):
    """CFWS = (1*([FWS] comment) [FWS]) / FWS

    Collects any mix of folding white space and comments into a CFWSList.
    """
    cfws = CFWSList()
    while value and value[0] in CFWS_LEADER:
        getter = get_fws if value[0] in WSP else get_comment
        piece, value = getter(value)
        cfws.append(piece)
    return cfws, value
1635n/a
def get_quoted_string(value):
    """quoted-string = [CFWS] <bare-quoted-string> [CFWS]

    'bare-quoted-string' is an intermediate class defined by this
    parser and not by the RFC grammar.  It is the quoted string
    without any attached CFWS.
    """
    quoted_string = QuotedString()
    # Leading CFWS, if any.
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        quoted_string.append(cfws)
    # The quoted string proper.
    bare, value = get_bare_quoted_string(value)
    quoted_string.append(bare)
    # Trailing CFWS, if any.
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        quoted_string.append(cfws)
    return quoted_string, value
1653n/a
def get_atom(value):
    """atom = [CFWS] 1*atext [CFWS]

    An atom could be an rfc2047 encoded word.
    """
    atom = Atom()
    if value and value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
        atom.append(leader)
    if value and value[0] in ATOM_ENDS:
        raise errors.HeaderParseError(
            "expected atom but found '{}'".format(value))
    if value.startswith('=?'):
        try:
            core, value = get_encoded_word(value)
        except errors.HeaderParseError:
            # XXX: need to figure out how to register defects when
            # appropriate here.
            core, value = get_atext(value)
    else:
        core, value = get_atext(value)
    atom.append(core)
    if value and value[0] in CFWS_LEADER:
        trailer, value = get_cfws(value)
        atom.append(trailer)
    return atom, value
1680n/a
def get_dot_atom_text(value):
    """ dot-text = 1*atext *("." 1*atext)

    """
    dot_atom_text = DotAtomText()
    if not value or value[0] in ATOM_ENDS:
        raise errors.HeaderParseError("expected atom at a start of "
            "dot-atom-text but found '{}'".format(value))
    # Alternate runs of atext with literal dots.
    while value and value[0] not in ATOM_ENDS:
        run, value = get_atext(value)
        dot_atom_text.append(run)
        if value.startswith('.'):
            dot_atom_text.append(DOT)
            value = value[1:]
    # A trailing dot means the final atext run was missing.
    if dot_atom_text[-1] is DOT:
        raise errors.HeaderParseError("expected atom at end of dot-atom-text "
            "but found '{}'".format('.'+value))
    return dot_atom_text, value
1699n/a
def get_dot_atom(value):
    """ dot-atom = [CFWS] dot-atom-text [CFWS]

    Any place we can have a dot atom, we could instead have an rfc2047
    encoded word.
    """
    dot_atom = DotAtom()
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
        dot_atom.append(leader)
    if value.startswith('=?'):
        try:
            core, value = get_encoded_word(value)
        except errors.HeaderParseError:
            # XXX: need to figure out how to register defects when
            # appropriate here.
            core, value = get_dot_atom_text(value)
    else:
        core, value = get_dot_atom_text(value)
    dot_atom.append(core)
    if value and value[0] in CFWS_LEADER:
        trailer, value = get_cfws(value)
        dot_atom.append(trailer)
    return dot_atom, value
1724n/a
def get_word(value):
    """word = atom / quoted-string

    Either atom or quoted-string may start with CFWS.  We have to peel off
    this CFWS first to determine which type of word to parse.  Afterward we
    splice the leading CFWS, if any, into the parsed sub-token.

    If neither an atom or a quoted-string is found before the next special,
    a HeaderParseError is raised.

    The token returned is either an Atom or a QuotedString, as appropriate.
    This means the 'word' level of the formal grammar is not represented in
    the parse tree; this is because having that extra layer when
    manipulating the parse tree is more confusing than it is helpful.

    """
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    # Dispatch on the first post-CFWS character.
    if value[0] == '"':
        token, value = get_quoted_string(value)
    elif value[0] in SPECIALS:
        raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' "
                                      "but found '{}'".format(value))
    else:
        token, value = get_atom(value)
    if leader is not None:
        # Splice the peeled-off CFWS back onto the front of the token.
        token[:0] = [leader]
    return token, value
1755n/a
def get_phrase(value):
    """ phrase = 1*word / obs-phrase
        obs-phrase = word *(word / "." / CFWS)

    This means a phrase can be a sequence of words, periods, and CFWS in any
    order as long as it starts with at least one word.  If anything other than
    words is detected, an ObsoleteHeaderDefect is added to the token's defect
    list.  We also accept a phrase that starts with CFWS followed by a dot;
    this is registered as an InvalidHeaderDefect, since it is not supported by
    even the obsolete grammar.

    """
    phrase = Phrase()
    try:
        token, value = get_word(value)
        phrase.append(token)
    except errors.HeaderParseError:
        phrase.defects.append(errors.InvalidHeaderDefect(
            "phrase does not start with word"))
    while value and value[0] not in PHRASE_ENDS:
        if value[0]=='.':
            # Bare dots are only allowed by the obsolete grammar.
            phrase.append(DOT)
            phrase.defects.append(errors.ObsoleteHeaderDefect(
                "period in 'phrase'"))
            value = value[1:]
        else:
            try:
                token, value = get_word(value)
            except errors.HeaderParseError:
                if value[0] in CFWS_LEADER:
                    # CFWS without a following word is obsolete syntax.
                    token, value = get_cfws(value)
                    phrase.defects.append(errors.ObsoleteHeaderDefect(
                        "comment found without atom"))
                else:
                    raise
            phrase.append(token)
    return phrase, value
1793n/a
def get_local_part(value):
    """ local-part = dot-atom / quoted-string / obs-local-part

    """
    local_part = LocalPart()
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if not value:
        raise errors.HeaderParseError(
            "expected local-part but found '{}'".format(value))
    # Try the modern grammar first: dot-atom, then quoted-string/atom.
    try:
        token, value = get_dot_atom(value)
    except errors.HeaderParseError:
        try:
            token, value = get_word(value)
        except errors.HeaderParseError:
            if value[0] != '\\' and value[0] in PHRASE_ENDS:
                raise
            token = TokenList()
    if leader is not None:
        token[:0] = [leader]
    local_part.append(token)
    if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
        # There is more content than a strict local-part allows; reparse
        # everything seen so far plus the rest as an obsolete local part.
        obs_local_part, value = get_obs_local_part(str(local_part) + value)
        if obs_local_part.token_type == 'invalid-obs-local-part':
            local_part.defects.append(errors.InvalidHeaderDefect(
                "local-part is not dot-atom, quoted-string, or obs-local-part"))
        else:
            local_part.defects.append(errors.ObsoleteHeaderDefect(
                "local-part is not a dot-atom (contains CFWS)"))
        local_part[0] = obs_local_part
    try:
        local_part.value.encode('ascii')
    except UnicodeEncodeError:
        local_part.defects.append(errors.NonASCIILocalPartDefect(
                "local-part contains non-ASCII characters)"))
    return local_part, value
1832n/a
def get_obs_local_part(value):
    """ obs-local-part = word *("." word)

    Accumulates words, dots, and stray CFWS/backslashes, registering a
    defect for each deviation; if any defect is found the token type is
    changed to 'invalid-obs-local-part'.
    """
    obs_local_part = ObsLocalPart()
    last_non_ws_was_dot = False
    while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
        if value[0] == '.':
            if last_non_ws_was_dot:
                obs_local_part.defects.append(errors.InvalidHeaderDefect(
                    "invalid repeated '.'"))
            obs_local_part.append(DOT)
            last_non_ws_was_dot = True
            value = value[1:]
            continue
        elif value[0]=='\\':
            obs_local_part.append(ValueTerminal(value[0],
                                                'misplaced-special'))
            value = value[1:]
            obs_local_part.defects.append(errors.InvalidHeaderDefect(
                "'\\' character outside of quoted-string/ccontent"))
            last_non_ws_was_dot = False
            continue
        if obs_local_part and obs_local_part[-1].token_type != 'dot':
            obs_local_part.defects.append(errors.InvalidHeaderDefect(
                "missing '.' between words"))
        try:
            token, value = get_word(value)
            last_non_ws_was_dot = False
        except errors.HeaderParseError:
            if value[0] not in CFWS_LEADER:
                raise
            token, value = get_cfws(value)
        obs_local_part.append(token)
    # Note: 'and' binds tighter than 'or' in the two checks below, so the
    # cfws case only applies when the first (last) token is cfws.
    if (obs_local_part[0].token_type == 'dot' or
            obs_local_part[0].token_type=='cfws' and
            obs_local_part[1].token_type=='dot'):
        obs_local_part.defects.append(errors.InvalidHeaderDefect(
            "Invalid leading '.' in local part"))
    if (obs_local_part[-1].token_type == 'dot' or
            obs_local_part[-1].token_type=='cfws' and
            obs_local_part[-2].token_type=='dot'):
        obs_local_part.defects.append(errors.InvalidHeaderDefect(
            "Invalid trailing '.' in local part"))
    if obs_local_part.defects:
        obs_local_part.token_type = 'invalid-obs-local-part'
    return obs_local_part, value
1879n/a
def get_dtext(value):
    r""" dtext = <printable ascii except \ [ ]> / obs-dtext
        obs-dtext = obs-NO-WS-CTL / quoted-pair

    We allow anything except the excluded characters, but if we find any
    ASCII other than the RFC defined printable ASCII, a NonPrintableDefect
    is added to the token's defects list.  Quoted pairs are converted to
    their unquoted values, so what is returned is a ptext token, in this
    case a ValueTerminal.  If there were quoted-printables, an
    ObsoleteHeaderDefect is added to the returned token's defect list.

    """
    text, rest, had_qp = _get_ptext_to_endchars(value, '[]')
    token = ValueTerminal(text, 'ptext')
    if had_qp:
        # Quoted pairs in dtext are only valid under the obsolete syntax.
        token.defects.append(errors.ObsoleteHeaderDefect(
            "quoted printable found in domain-literal"))
    _validate_xtext(token)
    return token, rest
1899n/a
def _check_for_early_dl_end(value, domain_literal):
    """Return True if *value* is exhausted, patching up *domain_literal*.

    When the input ends inside a domain-literal we record an
    InvalidHeaderDefect and synthesize the missing closing bracket so the
    caller can still return a complete-looking token.
    """
    if value:
        return False
    # Record the truncation on the defects list; appending the defect object
    # as a child token (as this code previously did) corrupts the token
    # tree, since defects are not tokens and break all_defects traversal.
    domain_literal.defects.append(errors.InvalidHeaderDefect(
        "end of input inside domain-literal"))
    domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
    return True
1907n/a
def get_domain_literal(value):
    """ domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS]

    Returns a DomainLiteral token.  Raises HeaderParseError if the input
    does not start (after optional CFWS) with '['.  If the input runs out
    before the closing ']', a defect is recorded and the bracket is
    synthesized (see _check_for_early_dl_end).

    """
    domain_literal = DomainLiteral()
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        domain_literal.append(token)
    if not value:
        raise errors.HeaderParseError("expected domain-literal")
    if value[0] != '[':
        raise errors.HeaderParseError("expected '[' at start of domain-literal "
            "but found '{}'".format(value))
    value = value[1:]
    # Each early-end check appends the closing bracket itself when the
    # input is exhausted, so we can return immediately.
    if _check_for_early_dl_end(value, domain_literal):
        return domain_literal, value
    domain_literal.append(ValueTerminal('[', 'domain-literal-start'))
    if value[0] in WSP:
        token, value = get_fws(value)
        domain_literal.append(token)
    token, value = get_dtext(value)
    domain_literal.append(token)
    if _check_for_early_dl_end(value, domain_literal):
        return domain_literal, value
    if value[0] in WSP:
        token, value = get_fws(value)
        domain_literal.append(token)
    if _check_for_early_dl_end(value, domain_literal):
        return domain_literal, value
    if value[0] != ']':
        raise errors.HeaderParseError("expected ']' at end of domain-literal "
            "but found '{}'".format(value))
    domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
    value = value[1:]
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        domain_literal.append(token)
    return domain_literal, value
1946n/a
def get_domain(value):
    """ domain = dot-atom / domain-literal / obs-domain
        obs-domain = atom *("." atom))

    Returns a Domain token.  Any CFWS that precedes the domain proper is
    spliced into the first child token rather than kept as a sibling.

    """
    domain = Domain()
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if not value:
        raise errors.HeaderParseError(
            "expected domain but found '{}'".format(value))
    if value[0] == '[':
        token, value = get_domain_literal(value)
        if leader is not None:
            token[:0] = [leader]
        domain.append(token)
        return domain, value
    try:
        token, value = get_dot_atom(value)
    except errors.HeaderParseError:
        token, value = get_atom(value)
    if leader is not None:
        token[:0] = [leader]
    domain.append(token)
    # A '.' left over after a dot-atom/atom means CFWS appeared between
    # the atoms, which is only valid under the obsolete grammar.
    if value and value[0] == '.':
        domain.defects.append(errors.ObsoleteHeaderDefect(
            "domain is not a dot-atom (contains CFWS)"))
        if domain[0].token_type == 'dot-atom':
            # Flatten the dot-atom's children so the obs-domain dots and
            # atoms collected below join them as siblings.
            domain[:] = domain[0]
        while value and value[0] == '.':
            domain.append(DOT)
            token, value = get_atom(value[1:])
            domain.append(token)
    return domain, value
1982n/a
def get_addr_spec(value):
    """ addr-spec = local-part "@" domain

    Returns an AddrSpec token.  When no '@' follows the local part, the
    addr-spec consists of just the local part and an InvalidHeaderDefect
    is recorded.

    """
    addr_spec = AddrSpec()
    token, value = get_local_part(value)
    addr_spec.append(token)
    if not value or value[0] != '@':
        # Defect message previously misspelled "addr-spec" as "add-spec".
        addr_spec.defects.append(errors.InvalidHeaderDefect(
            "addr-spec local part with no domain"))
        return addr_spec, value
    addr_spec.append(ValueTerminal('@', 'address-at-symbol'))
    token, value = get_domain(value[1:])
    addr_spec.append(token)
    return addr_spec, value
1998n/a
def get_obs_route(value):
    """ obs-route = obs-domain-list ":"
        obs-domain-list = *(CFWS / ",") "@" domain *("," [CFWS] ["@" domain])

    Returns an obs-route token with the appropriate sub-tokens (that is,
    there is no obs-domain-list in the parse tree).
    """
    obs_route = ObsRoute()
    # Skip any leading CFWS and empty (null) list elements.
    while value and (value[0]==',' or value[0] in CFWS_LEADER):
        if value[0] in CFWS_LEADER:
            token, value = get_cfws(value)
            obs_route.append(token)
        elif value[0] == ',':
            obs_route.append(ListSeparator)
            value = value[1:]
    if not value or value[0] != '@':
        raise errors.HeaderParseError(
            "expected obs-route domain but found '{}'".format(value))
    obs_route.append(RouteComponentMarker)
    token, value = get_domain(value[1:])
    obs_route.append(token)
    # Remaining route components; each element after a ',' may be empty,
    # CFWS only, or a full "@" domain.
    while value and value[0]==',':
        obs_route.append(ListSeparator)
        value = value[1:]
        if not value:
            break
        if value[0] in CFWS_LEADER:
            token, value = get_cfws(value)
            obs_route.append(token)
        if value[0] == '@':
            obs_route.append(RouteComponentMarker)
            token, value = get_domain(value[1:])
            obs_route.append(token)
    if not value:
        raise errors.HeaderParseError("end of header while parsing obs-route")
    if value[0] != ':':
        raise errors.HeaderParseError( "expected ':' marking end of "
            "obs-route but found '{}'".format(value))
    obs_route.append(ValueTerminal(':', 'end-of-obs-route-marker'))
    return obs_route, value[1:]
2039n/a
def get_angle_addr(value):
    """ angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
        obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS]

    Raises HeaderParseError when no '<' is found; a missing trailing '>'
    is recorded as a defect and the terminal is synthesized instead.

    """
    angle_addr = AngleAddr()
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        angle_addr.append(token)
    if not value or value[0] != '<':
        raise errors.HeaderParseError(
            "expected angle-addr but found '{}'".format(value))
    angle_addr.append(ValueTerminal('<', 'angle-addr-start'))
    value = value[1:]
    # Although it is not legal per RFC5322, SMTP uses '<>' in certain
    # circumstances.
    if value[0] == '>':
        angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
        angle_addr.defects.append(errors.InvalidHeaderDefect(
            "null addr-spec in angle-addr"))
        value = value[1:]
        return angle_addr, value
    try:
        token, value = get_addr_spec(value)
    except errors.HeaderParseError:
        # Not a plain addr-spec; try the obsolete route form before
        # giving up entirely.
        try:
            token, value = get_obs_route(value)
            angle_addr.defects.append(errors.ObsoleteHeaderDefect(
                "obsolete route specification in angle-addr"))
        except errors.HeaderParseError:
            raise errors.HeaderParseError(
                "expected addr-spec or obs-route but found '{}'".format(value))
        angle_addr.append(token)
        token, value = get_addr_spec(value)
    angle_addr.append(token)
    if value and value[0] == '>':
        value = value[1:]
    else:
        angle_addr.defects.append(errors.InvalidHeaderDefect(
            "missing trailing '>' on angle-addr"))
    # Appended unconditionally: synthesized when missing (defect above).
    angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        angle_addr.append(token)
    return angle_addr, value
2085n/a
def get_display_name(value):
    """ display-name = phrase

    Because this is simply a name-rule, we don't return a display-name
    token containing a phrase, but rather a display-name token whose
    children are the phrase's children, with the phrase's defects copied
    across.

    """
    display_name = DisplayName()
    phrase, value = get_phrase(value)
    display_name.extend(phrase)
    display_name.defects = phrase.defects[:]
    return display_name, value
2099n/a
2100n/a
def get_name_addr(value):
    """ name-addr = [display-name] angle-addr

    Returns a NameAddr token.  Leading CFWS is captured separately and
    then spliced into whichever sub-token follows it.

    """
    name_addr = NameAddr()
    # Both the optional display name and the angle-addr can start with cfws.
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
        if not value:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(leader))
    if value[0] != '<':
        if value[0] in PHRASE_ENDS:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(value))
        token, value = get_display_name(value)
        if not value:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(token))
        if leader is not None:
            # Prepend the CFWS to the display name's first child token so
            # the leading whitespace is preserved in the parse tree.
            token[0][:0] = [leader]
            leader = None
        name_addr.append(token)
    token, value = get_angle_addr(value)
    if leader is not None:
        # No display name consumed the leader, so it belongs to the
        # angle-addr.
        token[:0] = [leader]
    name_addr.append(token)
    return name_addr, value
2130n/a
def get_mailbox(value):
    """ mailbox = name-addr / addr-spec

    """
    # Only by attempting each alternative in turn can we tell which of the
    # two forms we are looking at.
    mailbox = Mailbox()
    try:
        token, value = get_name_addr(value)
    except errors.HeaderParseError:
        try:
            token, value = get_addr_spec(value)
        except errors.HeaderParseError:
            raise errors.HeaderParseError(
                "expected mailbox but found '{}'".format(value))
    has_invalid = any(isinstance(defect, errors.InvalidHeaderDefect)
                      for defect in token.all_defects)
    if has_invalid:
        mailbox.token_type = 'invalid-mailbox'
    mailbox.append(token)
    return mailbox, value
2151n/a
def get_invalid_mailbox(value, endchars):
    """ Read everything up to one of the chars in endchars.

    This is outside the formal grammar.  The InvalidMailbox TokenList that is
    returned acts like a Mailbox, but the data attributes are None.

    """
    invalid_mailbox = InvalidMailbox()
    while value:
        if value[0] in endchars:
            break
        if value[0] in PHRASE_ENDS:
            # Specials that a phrase cannot contain are kept one character
            # at a time.
            invalid_mailbox.append(
                ValueTerminal(value[0], 'misplaced-special'))
            value = value[1:]
        else:
            token, value = get_phrase(value)
            invalid_mailbox.append(token)
    return invalid_mailbox, value
2169n/a
def get_mailbox_list(value):
    """ mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
        obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS])

    For this routine we go outside the formal grammar in order to improve error
    handling.  We recognize the end of the mailbox list only at the end of the
    value or at a ';' (the group terminator).  This is so that we can turn
    invalid mailboxes into InvalidMailbox tokens and continue parsing any
    remaining valid mailboxes.  We also allow all mailbox entries to be null,
    and this condition is handled appropriately at a higher level.

    """
    mailbox_list = MailboxList()
    while value and value[0] != ';':
        try:
            token, value = get_mailbox(value)
            mailbox_list.append(token)
        except errors.HeaderParseError:
            # Recovery: the element is either empty (obsolete but legal)
            # or garbage that we collect as an InvalidMailbox.
            leader = None
            if value[0] in CFWS_LEADER:
                leader, value = get_cfws(value)
                if not value or value[0] in ',;':
                    # CFWS-only element.
                    mailbox_list.append(leader)
                    mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
                        "empty element in mailbox-list"))
                else:
                    token, value = get_invalid_mailbox(value, ',;')
                    if leader is not None:
                        token[:0] = [leader]
                    mailbox_list.append(token)
                    mailbox_list.defects.append(errors.InvalidHeaderDefect(
                        "invalid mailbox in mailbox-list"))
            elif value[0] == ',':
                mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
                    "empty element in mailbox-list"))
            else:
                token, value = get_invalid_mailbox(value, ',;')
                if leader is not None:
                    token[:0] = [leader]
                mailbox_list.append(token)
                mailbox_list.defects.append(errors.InvalidHeaderDefect(
                    "invalid mailbox in mailbox-list"))
        if value and value[0] not in ',;':
            # Crap after mailbox; treat it as an invalid mailbox.
            # The mailbox info will still be available.
            mailbox = mailbox_list[-1]
            mailbox.token_type = 'invalid-mailbox'
            token, value = get_invalid_mailbox(value, ',;')
            mailbox.extend(token)
            mailbox_list.defects.append(errors.InvalidHeaderDefect(
                "invalid mailbox in mailbox-list"))
        if value and value[0] == ',':
            mailbox_list.append(ListSeparator)
            value = value[1:]
    return mailbox_list, value
2225n/a
2226n/a
def get_group_list(value):
    """ group-list = mailbox-list / CFWS / obs-group-list
        obs-group-list = 1*([CFWS] ",") [CFWS]

    Returns a GroupList token.  An exhausted or empty value never raises;
    problems are recorded as defects instead, since group-list is optional
    inside a group.

    """
    group_list = GroupList()
    if not value:
        group_list.defects.append(errors.InvalidHeaderDefect(
            "end of header before group-list"))
        return group_list, value
    leader = None
    if value and value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
        if not value:
            # This should never happen in email parsing, since CFWS-only is a
            # legal alternative to group-list in a group, which is the only
            # place group-list appears.
            group_list.defects.append(errors.InvalidHeaderDefect(
                "end of header in group-list"))
            group_list.append(leader)
            return group_list, value
        if value[0] == ';':
            # CFWS-only group-list: legal.
            group_list.append(leader)
            return group_list, value
    token, value = get_mailbox_list(value)
    if len(token.all_mailboxes)==0:
        # Only null elements were found (obs-group-list); keep the tokens
        # flattened into the group-list itself.
        if leader is not None:
            group_list.append(leader)
        group_list.extend(token)
        group_list.defects.append(errors.ObsoleteHeaderDefect(
            "group-list with empty entries"))
        return group_list, value
    if leader is not None:
        token[:0] = [leader]
    group_list.append(token)
    return group_list, value
2263n/a
def get_group(value):
    """ group = display-name ":" [group-list] ";" [CFWS]

    Returns a Group token.  A missing terminating ';' at end of input is
    recorded as a defect and the terminator is synthesized.

    """
    group = Group()
    token, value = get_display_name(value)
    if not value or value[0] != ':':
        raise errors.HeaderParseError("expected ':' at end of group "
            "display name but found '{}'".format(value))
    group.append(token)
    group.append(ValueTerminal(':', 'group-display-name-terminator'))
    value = value[1:]
    if value and value[0] == ';':
        # Empty group.
        group.append(ValueTerminal(';', 'group-terminator'))
        return group, value[1:]
    token, value = get_group_list(value)
    group.append(token)
    if not value:
        group.defects.append(errors.InvalidHeaderDefect(
            "end of header in group"))
    # Must be elif: when value is empty we record the defect above and
    # synthesize the terminator below; indexing value[0] here raised
    # IndexError in the previous version.
    elif value[0] != ';':
        raise errors.HeaderParseError(
            "expected ';' at end of group but found {}".format(value))
    group.append(ValueTerminal(';', 'group-terminator'))
    value = value[1:]
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        group.append(token)
    return group, value
2293n/a
def get_address(value):
    """ address = mailbox / group

    Note that counter-intuitively, an address can be either a single address or
    a list of addresses (a group).  This is why the returned Address object has
    a 'mailboxes' attribute which treats a single address as a list of length
    one.  When you need to differentiate between to two cases, extract the single
    element, which is either a mailbox or a group token.

    """
    # The formal grammar isn't very helpful when parsing an address.  mailbox
    # and group, especially when allowing for obsolete forms, start off very
    # similarly.  It is only when you reach one of @, <, or : that you know
    # what you've got.  So, we try each one in turn, starting with the more
    # likely of the two.  We could perhaps make this more efficient by looking
    # for a phrase and then branching based on the next character, but that
    # would be a premature optimization.
    address = Address()
    for getter in (get_group, get_mailbox):
        try:
            token, value = getter(value)
            break
        except errors.HeaderParseError:
            continue
    else:
        raise errors.HeaderParseError(
            "expected address but found '{}'".format(value))
    address.append(token)
    return address, value
2322n/a
def get_address_list(value):
    """ address_list = (address *("," address)) / obs-addr-list
        obs-addr-list = *([CFWS] ",") address *("," [address / CFWS])

    We depart from the formal grammar here by continuing to parse until the end
    of the input, assuming the input to be entirely composed of an
    address-list.  This is always true in email parsing, and allows us
    to skip invalid addresses to parse additional valid ones.

    """
    address_list = AddressList()
    while value:
        try:
            token, value = get_address(value)
            address_list.append(token)
        except errors.HeaderParseError:  # was "as err"; binding was unused
            # Recovery: the element is either empty (obsolete but legal)
            # or garbage collected into an invalid mailbox.
            leader = None
            if value[0] in CFWS_LEADER:
                leader, value = get_cfws(value)
                if not value or value[0] == ',':
                    # CFWS-only element.
                    address_list.append(leader)
                    address_list.defects.append(errors.ObsoleteHeaderDefect(
                        "address-list entry with no content"))
                else:
                    token, value = get_invalid_mailbox(value, ',')
                    if leader is not None:
                        token[:0] = [leader]
                    address_list.append(Address([token]))
                    address_list.defects.append(errors.InvalidHeaderDefect(
                        "invalid address in address-list"))
            elif value[0] == ',':
                address_list.defects.append(errors.ObsoleteHeaderDefect(
                    "empty element in address-list"))
            else:
                token, value = get_invalid_mailbox(value, ',')
                if leader is not None:
                    token[:0] = [leader]
                address_list.append(Address([token]))
                address_list.defects.append(errors.InvalidHeaderDefect(
                    "invalid address in address-list"))
        if value and value[0] != ',':
            # Crap after address; treat it as an invalid mailbox.
            # The mailbox info will still be available.
            mailbox = address_list[-1][0]
            mailbox.token_type = 'invalid-mailbox'
            token, value = get_invalid_mailbox(value, ',')
            mailbox.extend(token)
            address_list.defects.append(errors.InvalidHeaderDefect(
                "invalid address in address-list"))
        if value:  # Must be a , at this point.
            address_list.append(ValueTerminal(',', 'list-separator'))
            value = value[1:]
    return address_list, value
2376n/a
2377n/a#
2378n/a# XXX: As I begin to add additional header parsers, I'm realizing we probably
2379n/a# have two level of parser routines: the get_XXX methods that get a token in
2380n/a# the grammar, and parse_XXX methods that parse an entire field value. So
2381n/a# get_address_list above should really be a parse_ method, as probably should
2382n/a# be get_unstructured.
2383n/a#
2384n/a
def parse_mime_version(value):
    """ mime-version = [CFWS] 1*digit [CFWS] "." [CFWS] 1*digit [CFWS]

    Parse an entire MIME-Version header value and return a MIMEVersion
    token.  Unlike the get_XXX routines, this consumes the whole value;
    anything unparseable is recorded as a defect rather than raised.

    """
    # The [CFWS] is implicit in the RFC 2045 BNF.
    # XXX: This routine is a bit verbose, should factor out a get_int method.
    mime_version = MIMEVersion()
    if not value:
        mime_version.defects.append(errors.HeaderMissingRequiredValue(
            "Missing MIME version number (eg: 1.0)"))
        return mime_version
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
        if not value:
            mime_version.defects.append(errors.HeaderMissingRequiredValue(
                "Expected MIME version number but found only CFWS"))
    # Collect the major version.  A non-numeric run is kept as an xtext
    # terminal and flagged as a defect.
    digits = ''
    while value and value[0] != '.' and value[0] not in CFWS_LEADER:
        digits += value[0]
        value = value[1:]
    if not digits.isdigit():
        mime_version.defects.append(errors.InvalidHeaderDefect(
            "Expected MIME major version number but found {!r}".format(digits)))
        mime_version.append(ValueTerminal(digits, 'xtext'))
    else:
        mime_version.major = int(digits)
        mime_version.append(ValueTerminal(digits, 'digits'))
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if not value or value[0] != '.':
        if mime_version.major is not None:
            mime_version.defects.append(errors.InvalidHeaderDefect(
                "Incomplete MIME version; found only major number"))
        if value:
            mime_version.append(ValueTerminal(value, 'xtext'))
        return mime_version
    mime_version.append(ValueTerminal('.', 'version-separator'))
    value = value[1:]
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if not value:
        if mime_version.major is not None:
            mime_version.defects.append(errors.InvalidHeaderDefect(
                "Incomplete MIME version; found only major number"))
        return mime_version
    # Collect the minor version, same rules as the major version above.
    digits = ''
    while value and value[0] not in CFWS_LEADER:
        digits += value[0]
        value = value[1:]
    if not digits.isdigit():
        mime_version.defects.append(errors.InvalidHeaderDefect(
            "Expected MIME minor version number but found {!r}".format(digits)))
        mime_version.append(ValueTerminal(digits, 'xtext'))
    else:
        mime_version.minor = int(digits)
        mime_version.append(ValueTerminal(digits, 'digits'))
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if value:
        mime_version.defects.append(errors.InvalidHeaderDefect(
            "Excess non-CFWS text after MIME version"))
        mime_version.append(ValueTerminal(value, 'xtext'))
    return mime_version
2452n/a
def get_invalid_parameter(value):
    """ Read everything up to the next ';'.

    This is outside the formal grammar.  The InvalidParameter TokenList that is
    returned acts like a Parameter, but the data attributes are None.

    """
    invalid_parameter = InvalidParameter()
    while value and not value.startswith(';'):
        if value[0] in PHRASE_ENDS:
            # Specials that a phrase cannot contain are kept one character
            # at a time.
            invalid_parameter.append(
                ValueTerminal(value[0], 'misplaced-special'))
            value = value[1:]
        else:
            token, value = get_phrase(value)
            invalid_parameter.append(token)
    return invalid_parameter, value
2470n/a
def get_ttext(value):
    """ttext = <matches _ttext_matcher>

    We allow any non-TOKEN_ENDS in ttext, but add defects to the token's
    defects list if we find non-ttext characters.  We also register defects for
    *any* non-printables even though the RFC doesn't exclude all of them,
    because we follow the spirit of RFC 5322.

    """
    match = _non_token_end_matcher(value)
    if not match:
        raise errors.HeaderParseError(
            "expected ttext but found '{}'".format(value))
    text = match.group()
    token = ValueTerminal(text, 'ttext')
    _validate_xtext(token)
    return token, value[len(text):]
2489n/a
def get_token(value):
    """token = [CFWS] 1*ttext [CFWS]

    The RFC equivalent of ttext is any US-ASCII chars except space, ctls, or
    tspecials.  We also exclude tabs even though the RFC doesn't.

    The RFC implies the CFWS but is not explicit about it in the BNF.

    """
    mtoken = Token()
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        mtoken.append(cfws)
    if value and value[0] in TOKEN_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    ttext, value = get_ttext(value)
    mtoken.append(ttext)
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        mtoken.append(cfws)
    return mtoken, value
2512n/a
def get_attrtext(value):
    """attrtext = 1*(any non-ATTRIBUTE_ENDS character)

    We allow any non-ATTRIBUTE_ENDS in attrtext, but add defects to the
    token's defects list if we find non-attrtext characters.  We also register
    defects for *any* non-printables even though the RFC doesn't exclude all of
    them, because we follow the spirit of RFC 5322.

    """
    match = _non_attribute_end_matcher(value)
    if not match:
        raise errors.HeaderParseError(
            "expected attrtext but found {!r}".format(value))
    text = match.group()
    token = ValueTerminal(text, 'attrtext')
    _validate_xtext(token)
    return token, value[len(text):]
2531n/a
def get_attribute(value):
    """ [CFWS] 1*attrtext [CFWS]

    This version of the BNF makes the CFWS explicit, and as usual we use a
    value terminal for the actual run of characters.  The RFC equivalent of
    attrtext is the token characters, with the subtraction of '*', "'", and '%'.
    We include tab in the excluded set just as we do for token.

    """
    attribute = Attribute()
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        attribute.append(cfws)
    if value and value[0] in ATTRIBUTE_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    word, value = get_attrtext(value)
    attribute.append(word)
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        attribute.append(cfws)
    return attribute, value
2554n/a
def get_extended_attrtext(value):
    """attrtext = 1*(any non-ATTRIBUTE_ENDS character plus '%')

    This is a special parsing routine so that we get a value that
    includes % escapes as a single string (which we decode as a single
    string later).

    """
    match = _non_extended_attribute_end_matcher(value)
    if not match:
        raise errors.HeaderParseError(
            "expected extended attrtext but found {!r}".format(value))
    text = match.group()
    token = ValueTerminal(text, 'extended-attrtext')
    _validate_xtext(token)
    return token, value[len(text):]
2572n/a
def get_extended_attribute(value):
    """ [CFWS] 1*extended_attrtext [CFWS]

    This is like the non-extended version except we allow % characters, so that
    we can pick up an encoded value as a single string.

    """
    # XXX: should we have an ExtendedAttribute TokenList?
    attribute = Attribute()
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        attribute.append(cfws)
    if value and value[0] in EXTENDED_ATTRIBUTE_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    word, value = get_extended_attrtext(value)
    attribute.append(word)
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        attribute.append(cfws)
    return attribute, value
2594n/a
def get_section(value):
    """ '*' digits

    The formal BNF is more complicated because leading 0s are not allowed.  We
    check for that and add a defect.  We also assume no CFWS is allowed between
    the '*' and the digits, though the RFC is not crystal clear on that.
    The caller should already have dealt with leading CFWS.

    """
    section = Section()
    if not value or value[0] != '*':
        raise errors.HeaderParseError("Expected section but found {}".format(
            value))
    section.append(ValueTerminal('*', 'section-marker'))
    value = value[1:]
    if not value or not value[0].isdigit():
        raise errors.HeaderParseError("Expected section number but "
                                      "found {}".format(value))
    digits = ''
    while value and value[0].isdigit():
        digits += value[0]
        value = value[1:]
    if digits[0] == '0' and digits != '0':
        # Fixed: previously referenced the nonexistent
        # errors.InvalidHeaderError (AttributeError at runtime) and the
        # concatenated message lacked a space ("numberhas").
        section.defects.append(errors.InvalidHeaderDefect(
            "section number has an invalid leading 0"))
    section.number = int(digits)
    section.append(ValueTerminal(digits, 'digits'))
    return section, value
2623n/a
2624n/a
def get_value(value):
    """ quoted-string / attribute

    """
    v = Value()
    if not value:
        raise errors.HeaderParseError("Expected value but found end of string")
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if not value:
        raise errors.HeaderParseError("Expected value but found "
                                      "only {}".format(leader))
    # A leading '"' means quoted-string; anything else is parsed as an
    # (extended) attribute.
    getter = get_quoted_string if value[0] == '"' else get_extended_attribute
    token, value = getter(value)
    if leader is not None:
        token[:0] = [leader]
    v.append(token)
    return v, value
2646n/a
2647n/adef get_parameter(value):
2648n/a """ attribute [section] ["*"] [CFWS] "=" value
2649n/a
2650n/a The CFWS is implied by the RFC but not made explicit in the BNF. This
2651n/a simplified form of the BNF from the RFC is made to conform with the RFC BNF
2652n/a through some extra checks. We do it this way because it makes both error
2653n/a recovery and working with the resulting parse tree easier.
2654n/a """
2655n/a # It is possible CFWS would also be implicitly allowed between the section
2656n/a # and the 'extended-attribute' marker (the '*') , but we've never seen that
2657n/a # in the wild and we will therefore ignore the possibility.
2658n/a param = Parameter()
2659n/a token, value = get_attribute(value)
2660n/a param.append(token)
2661n/a if not value or value[0] == ';':
2662n/a param.defects.append(errors.InvalidHeaderDefect("Parameter contains "
2663n/a "name ({}) but no value".format(token)))
2664n/a return param, value
2665n/a if value[0] == '*':
2666n/a try:
2667n/a token, value = get_section(value)
2668n/a param.sectioned = True
2669n/a param.append(token)
2670n/a except errors.HeaderParseError:
2671n/a pass
2672n/a if not value:
2673n/a raise errors.HeaderParseError("Incomplete parameter")
2674n/a if value[0] == '*':
2675n/a param.append(ValueTerminal('*', 'extended-parameter-marker'))
2676n/a value = value[1:]
2677n/a param.extended = True
2678n/a if value[0] != '=':
2679n/a raise errors.HeaderParseError("Parameter not followed by '='")
2680n/a param.append(ValueTerminal('=', 'parameter-separator'))
2681n/a value = value[1:]
2682n/a leader = None
2683n/a if value and value[0] in CFWS_LEADER:
2684n/a token, value = get_cfws(value)
2685n/a param.append(token)
2686n/a remainder = None
2687n/a appendto = param
2688n/a if param.extended and value and value[0] == '"':
2689n/a # Now for some serious hackery to handle the common invalid case of
2690n/a # double quotes around an extended value. We also accept (with defect)
2691n/a # a value marked as encoded that isn't really.
2692n/a qstring, remainder = get_quoted_string(value)
2693n/a inner_value = qstring.stripped_value
2694n/a semi_valid = False
2695n/a if param.section_number == 0:
2696n/a if inner_value and inner_value[0] == "'":
2697n/a semi_valid = True
2698n/a else:
2699n/a token, rest = get_attrtext(inner_value)
2700n/a if rest and rest[0] == "'":
2701n/a semi_valid = True
2702n/a else:
2703n/a try:
2704n/a token, rest = get_extended_attrtext(inner_value)
2705n/a except:
2706n/a pass
2707n/a else:
2708n/a if not rest:
2709n/a semi_valid = True
2710n/a if semi_valid:
2711n/a param.defects.append(errors.InvalidHeaderDefect(
2712n/a "Quoted string value for extended parameter is invalid"))
2713n/a param.append(qstring)
2714n/a for t in qstring:
2715n/a if t.token_type == 'bare-quoted-string':
2716n/a t[:] = []
2717n/a appendto = t
2718n/a break
2719n/a value = inner_value
2720n/a else:
2721n/a remainder = None
2722n/a param.defects.append(errors.InvalidHeaderDefect(
2723n/a "Parameter marked as extended but appears to have a "
2724n/a "quoted string value that is non-encoded"))
2725n/a if value and value[0] == "'":
2726n/a token = None
2727n/a else:
2728n/a token, value = get_value(value)
2729n/a if not param.extended or param.section_number > 0:
2730n/a if not value or value[0] != "'":
2731n/a appendto.append(token)
2732n/a if remainder is not None:
2733n/a assert not value, value
2734n/a value = remainder
2735n/a return param, value
2736n/a param.defects.append(errors.InvalidHeaderDefect(
2737n/a "Apparent initial-extended-value but attribute "
2738n/a "was not marked as extended or was not initial section"))
2739n/a if not value:
2740n/a # Assume the charset/lang is missing and the token is the value.
2741n/a param.defects.append(errors.InvalidHeaderDefect(
2742n/a "Missing required charset/lang delimiters"))
2743n/a appendto.append(token)
2744n/a if remainder is None:
2745n/a return param, value
2746n/a else:
2747n/a if token is not None:
2748n/a for t in token:
2749n/a if t.token_type == 'extended-attrtext':
2750n/a break
2751n/a t.token_type == 'attrtext'
2752n/a appendto.append(t)
2753n/a param.charset = t.value
2754n/a if value[0] != "'":
2755n/a raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
2756n/a "delimiter, but found {!r}".format(value))
2757n/a appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
2758n/a value = value[1:]
2759n/a if value and value[0] != "'":
2760n/a token, value = get_attrtext(value)
2761n/a appendto.append(token)
2762n/a param.lang = token.value
2763n/a if not value or value[0] != "'":
2764n/a raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
2765n/a "delimiter, but found {}".format(value))
2766n/a appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
2767n/a value = value[1:]
2768n/a if remainder is not None:
2769n/a # Treat the rest of value as bare quoted string content.
2770n/a v = Value()
2771n/a while value:
2772n/a if value[0] in WSP:
2773n/a token, value = get_fws(value)
2774n/a else:
2775n/a token, value = get_qcontent(value)
2776n/a v.append(token)
2777n/a token = v
2778n/a else:
2779n/a token, value = get_value(value)
2780n/a appendto.append(token)
2781n/a if remainder is not None:
2782n/a assert not value, value
2783n/a value = remainder
2784n/a return param, value
2785n/a
2786n/adef parse_mime_parameters(value):
2787n/a """ parameter *( ";" parameter )
2788n/a
2789n/a That BNF is meant to indicate this routine should only be called after
2790n/a finding and handling the leading ';'. There is no corresponding rule in
2791n/a the formal RFC grammar, but it is more convenient for us for the set of
2792n/a parameters to be treated as its own TokenList.
2793n/a
2794n/a This is 'parse' routine because it consumes the reminaing value, but it
2795n/a would never be called to parse a full header. Instead it is called to
2796n/a parse everything after the non-parameter value of a specific MIME header.
2797n/a
2798n/a """
2799n/a mime_parameters = MimeParameters()
2800n/a while value:
2801n/a try:
2802n/a token, value = get_parameter(value)
2803n/a mime_parameters.append(token)
2804n/a except errors.HeaderParseError as err:
2805n/a leader = None
2806n/a if value[0] in CFWS_LEADER:
2807n/a leader, value = get_cfws(value)
2808n/a if not value:
2809n/a mime_parameters.append(leader)
2810n/a return mime_parameters
2811n/a if value[0] == ';':
2812n/a if leader is not None:
2813n/a mime_parameters.append(leader)
2814n/a mime_parameters.defects.append(errors.InvalidHeaderDefect(
2815n/a "parameter entry with no content"))
2816n/a else:
2817n/a token, value = get_invalid_parameter(value)
2818n/a if leader:
2819n/a token[:0] = [leader]
2820n/a mime_parameters.append(token)
2821n/a mime_parameters.defects.append(errors.InvalidHeaderDefect(
2822n/a "invalid parameter {!r}".format(token)))
2823n/a if value and value[0] != ';':
2824n/a # Junk after the otherwise valid parameter. Mark it as
2825n/a # invalid, but it will have a value.
2826n/a param = mime_parameters[-1]
2827n/a param.token_type = 'invalid-parameter'
2828n/a token, value = get_invalid_parameter(value)
2829n/a param.extend(token)
2830n/a mime_parameters.defects.append(errors.InvalidHeaderDefect(
2831n/a "parameter with invalid trailing text {!r}".format(token)))
2832n/a if value:
2833n/a # Must be a ';' at this point.
2834n/a mime_parameters.append(ValueTerminal(';', 'parameter-separator'))
2835n/a value = value[1:]
2836n/a return mime_parameters
2837n/a
2838n/adef _find_mime_parameters(tokenlist, value):
2839n/a """Do our best to find the parameters in an invalid MIME header
2840n/a
2841n/a """
2842n/a while value and value[0] != ';':
2843n/a if value[0] in PHRASE_ENDS:
2844n/a tokenlist.append(ValueTerminal(value[0], 'misplaced-special'))
2845n/a value = value[1:]
2846n/a else:
2847n/a token, value = get_phrase(value)
2848n/a tokenlist.append(token)
2849n/a if not value:
2850n/a return
2851n/a tokenlist.append(ValueTerminal(';', 'parameter-separator'))
2852n/a tokenlist.append(parse_mime_parameters(value[1:]))
2853n/a
2854n/adef parse_content_type_header(value):
2855n/a """ maintype "/" subtype *( ";" parameter )
2856n/a
2857n/a The maintype and substype are tokens. Theoretically they could
2858n/a be checked against the official IANA list + x-token, but we
2859n/a don't do that.
2860n/a """
2861n/a ctype = ContentType()
2862n/a recover = False
2863n/a if not value:
2864n/a ctype.defects.append(errors.HeaderMissingRequiredValue(
2865n/a "Missing content type specification"))
2866n/a return ctype
2867n/a try:
2868n/a token, value = get_token(value)
2869n/a except errors.HeaderParseError:
2870n/a ctype.defects.append(errors.InvalidHeaderDefect(
2871n/a "Expected content maintype but found {!r}".format(value)))
2872n/a _find_mime_parameters(ctype, value)
2873n/a return ctype
2874n/a ctype.append(token)
2875n/a # XXX: If we really want to follow the formal grammar we should make
2876n/a # mantype and subtype specialized TokenLists here. Probably not worth it.
2877n/a if not value or value[0] != '/':
2878n/a ctype.defects.append(errors.InvalidHeaderDefect(
2879n/a "Invalid content type"))
2880n/a if value:
2881n/a _find_mime_parameters(ctype, value)
2882n/a return ctype
2883n/a ctype.maintype = token.value.strip().lower()
2884n/a ctype.append(ValueTerminal('/', 'content-type-separator'))
2885n/a value = value[1:]
2886n/a try:
2887n/a token, value = get_token(value)
2888n/a except errors.HeaderParseError:
2889n/a ctype.defects.append(errors.InvalidHeaderDefect(
2890n/a "Expected content subtype but found {!r}".format(value)))
2891n/a _find_mime_parameters(ctype, value)
2892n/a return ctype
2893n/a ctype.append(token)
2894n/a ctype.subtype = token.value.strip().lower()
2895n/a if not value:
2896n/a return ctype
2897n/a if value[0] != ';':
2898n/a ctype.defects.append(errors.InvalidHeaderDefect(
2899n/a "Only parameters are valid after content type, but "
2900n/a "found {!r}".format(value)))
2901n/a # The RFC requires that a syntactically invalid content-type be treated
2902n/a # as text/plain. Perhaps we should postel this, but we should probably
2903n/a # only do that if we were checking the subtype value against IANA.
2904n/a del ctype.maintype, ctype.subtype
2905n/a _find_mime_parameters(ctype, value)
2906n/a return ctype
2907n/a ctype.append(ValueTerminal(';', 'parameter-separator'))
2908n/a ctype.append(parse_mime_parameters(value[1:]))
2909n/a return ctype
2910n/a
2911n/adef parse_content_disposition_header(value):
2912n/a """ disposition-type *( ";" parameter )
2913n/a
2914n/a """
2915n/a disp_header = ContentDisposition()
2916n/a if not value:
2917n/a disp_header.defects.append(errors.HeaderMissingRequiredValue(
2918n/a "Missing content disposition"))
2919n/a return disp_header
2920n/a try:
2921n/a token, value = get_token(value)
2922n/a except errors.HeaderParseError:
2923n/a disp_header.defects.append(errors.InvalidHeaderDefect(
2924n/a "Expected content disposition but found {!r}".format(value)))
2925n/a _find_mime_parameters(disp_header, value)
2926n/a return disp_header
2927n/a disp_header.append(token)
2928n/a disp_header.content_disposition = token.value.strip().lower()
2929n/a if not value:
2930n/a return disp_header
2931n/a if value[0] != ';':
2932n/a disp_header.defects.append(errors.InvalidHeaderDefect(
2933n/a "Only parameters are valid after content disposition, but "
2934n/a "found {!r}".format(value)))
2935n/a _find_mime_parameters(disp_header, value)
2936n/a return disp_header
2937n/a disp_header.append(ValueTerminal(';', 'parameter-separator'))
2938n/a disp_header.append(parse_mime_parameters(value[1:]))
2939n/a return disp_header
2940n/a
2941n/adef parse_content_transfer_encoding_header(value):
2942n/a """ mechanism
2943n/a
2944n/a """
2945n/a # We should probably validate the values, since the list is fixed.
2946n/a cte_header = ContentTransferEncoding()
2947n/a if not value:
2948n/a cte_header.defects.append(errors.HeaderMissingRequiredValue(
2949n/a "Missing content transfer encoding"))
2950n/a return cte_header
2951n/a try:
2952n/a token, value = get_token(value)
2953n/a except errors.HeaderParseError:
2954n/a cte_header.defects.append(errors.InvalidHeaderDefect(
2955n/a "Expected content transfer encoding but found {!r}".format(value)))
2956n/a else:
2957n/a cte_header.append(token)
2958n/a cte_header.cte = token.value.strip().lower()
2959n/a if not value:
2960n/a return cte_header
2961n/a while value:
2962n/a cte_header.defects.append(errors.InvalidHeaderDefect(
2963n/a "Extra text after content transfer encoding"))
2964n/a if value[0] in PHRASE_ENDS:
2965n/a cte_header.append(ValueTerminal(value[0], 'misplaced-special'))
2966n/a value = value[1:]
2967n/a else:
2968n/a token, value = get_phrase(value)
2969n/a cte_header.append(token)
2970n/a return cte_header