Python code coverage for Lib/email/feedparser.py

#	count	content
1	n/a	# Copyright (C) 2004-2006 Python Software Foundation
2	n/a	# Authors: Baxter, Wouters and Warsaw
3	n/a	# Contact: email-sig@python.org
4	n/a
5	n/a	"""FeedParser - An email feed parser.
6	n/a
7	n/a	The feed parser implements an interface for incrementally parsing an email
8	n/a	message, line by line. This has advantages for certain applications, such as
9	n/a	those reading email messages off a socket.
10	n/a
11	n/a	FeedParser.feed() is the primary interface for pushing new data into the
12	n/a	parser. It returns when there's nothing more it can do with the available
13	n/a	data. When you have no more data to push into the parser, call .close().
14	n/a	This completes the parsing and returns the root message object.
15	n/a
16	n/a	The other advantage of this parser is that it will never raise a parsing
17	n/a	exception. Instead, when it finds something unexpected, it adds a 'defect' to
18	n/a	the current message. Defects are just instances that live on the message
19	n/a	object's .defects attribute.
20	n/a	"""
21	n/a
22	n/a	__all__ = ['FeedParser', 'BytesFeedParser']
23	n/a
24	n/a	import re
25	n/a
26	n/a	from email import errors
27	n/a	from email._policybase import compat32
28	n/a	from collections import deque
29	n/a	from io import StringIO
30	n/a
# Line-ending patterns.  Each one recognises CRLF, bare CR, or bare LF; the
# alternation must be a plain '|' -- an escaped '\|' would only match a
# literal pipe character and none of these would ever match a real newline.
NLCRE = re.compile(r'\r\n|\r|\n')
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 $3.6.8 Optional fields. ftext is %d33-57 / %d59-126, Any character
# except controls, SP, and ":".
# headerRE matches a line that starts with a unix-from marker ("From "), a
# field name followed by a colon, or leading whitespace (a continuation).
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel; compared by identity ("is") to signal that the input
# buffer has no complete line available yet.
NeedMoreData = object()
42	n/a
43	n/a
44	n/a
class BufferedSubFile(object):
    """Line buffer that can be topped up with new data at any time.

    Callers may stack line-matching predicates.  Whenever any stacked
    predicate accepts the next line, readline() reports a false EOF (the
    empty string) and leaves that line in the buffer, so a consumer can
    simply "read until EOF" to reach the end of the current message.
    """

    def __init__(self):
        # Holds the trailing incomplete line.  A text stream is used rather
        # than a list; see issue 22233 for the rationale.
        self._partial = StringIO(newline='')
        # Complete lines waiting to be read, oldest first.
        self._lines = deque()
        # Stack of false-EOF predicates; the innermost one is last.
        self._eofstack = []
        # Becomes True once close() has been called.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Add *pred* as the innermost false-EOF predicate."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the innermost false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Flush any pending partial line and mark the buffer closed."""
        pending = self._partial
        pending.seek(0)
        # Whatever is left over counts as a final line.
        self.pushlines(pending.readlines())
        pending.seek(0)
        pending.truncate()
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData."""
        if not self._lines:
            return '' if self._closed else NeedMoreData
        candidate = self._lines.popleft()
        # RFC 2046, section 5.1.2: outer boundaries must be recognised at
        # any nesting depth, so consult every predicate, innermost first.
        if any(matches(candidate) for matches in reversed(self._eofstack)):
            # False EOF: put the line back so it can be read again later.
            self._lines.appendleft(candidate)
            return ''
        return candidate

    def unreadline(self, line):
        """Put *line* back at the front of the buffer."""
        assert line is not NeedMoreData
        self._lines.appendleft(line)

    def push(self, data):
        """Push some new data into this object."""
        stream = self._partial
        stream.write(data)
        if not ('\n' in data or '\r' in data):
            # Still no line terminator; nothing new to split off.
            return

        # Re-read everything buffered so far as lines, keeping the line
        # endings, and start over with an empty partial buffer.
        stream.seek(0)
        pieces = stream.readlines()
        stream.seek(0)
        stream.truncate()

        # A last piece without '\n' is kept back as the new partial line.
        # Only '\n' is tested: a trailing '\r' may be the first half of a
        # '\r\n' split across two pushes (see bugs 1555570 and 1721862).
        if not pieces[-1].endswith('\n'):
            stream.write(pieces.pop())
        self.pushlines(pieces)

    def pushlines(self, lines):
        """Append already-split *lines* to the queue of complete lines."""
        self._lines.extend(lines)

    def __iter__(self):
        return self

    def __next__(self):
        # Only '' ends iteration; NeedMoreData is handed to the caller.
        item = self.readline()
        if item == '':
            raise StopIteration
        return item
133	n/a
134	n/a
135	n/a
class FeedParser:
    """A feed-style parser of email.

    Data is supplied incrementally through feed(); close() finishes parsing
    and returns the root message object.  Problems found while parsing are
    recorded as defects on the affected message.
    """

    def __init__(self, _factory=None, *, policy=compat32):
        """Create a parser.

        _factory is a callable used to create each new message object.  It
        is called with a ``policy`` keyword argument when it accepts one and
        with no arguments otherwise.  When _factory is None, the policy's
        message_factory is used, falling back to email.message.Message.

        The policy keyword specifies a policy object that controls a number of
        aspects of the parser's operation. The default policy maintains
        backward compatibility.

        """
        self.policy = policy
        self._old_style_factory = False
        if _factory is None:
            if policy.message_factory is None:
                from email.message import Message
                self._factory = Message
            else:
                self._factory = policy.message_factory
        else:
            self._factory = _factory
            # Probe the factory once to learn whether it accepts ``policy``;
            # the object created by the probe is discarded.
            try:
                _factory(policy=self.policy)
            except TypeError:
                # Assume this is an old-style factory
                self._old_style_factory = True
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed, outermost first.
        self._msgstack = []
        # Advancing the generator one step runs the parser until it either
        # needs more input (yields) or finishes (raises StopIteration).
        self._parse = self._parsegen().__next__
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Advance the parse generator, ignoring its normal completion."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if (root.get_content_maintype() == 'multipart'
                and not root.is_multipart()):
            defect = errors.MultipartInvariantViolationDefect()
            self.policy.handle_defect(root, defect)
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        if self._old_style_factory:
            msg = self._factory()
        else:
            msg = self._factory(policy=self.policy)
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (recursing for subparts).

        Yields NeedMoreData whenever the input buffer runs dry and returns
        when the message has been completely parsed.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    defect = errors.MissingHeaderBodySeparatorDefect()
                    self.policy.handle_defect(self._cur, defect)
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors. All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line. We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages. A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                msg = self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF. We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary. That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                defect = errors.NoBoundaryInMultipartDefect()
                self.policy.handle_defect(self._cur, defect)
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Make sure a valid content type was specified per RFC 2045:6.4.
            if (self._cur.get('content-transfer-encoding', '8bit').lower()
                    not in ('7bit', '8bit', 'binary')):
                defect = errors.InvalidMultipartContentTransferEncodingDefect()
                self.policy.handle_defect(self._cur, defect)
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary. Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            # The <linesep> group is an alternation of CRLF, CR and LF; the
            # '|' must be unescaped or CRLF-terminated boundary lines are
            # never recognised and <linesep> is never captured.
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            close_boundary_seen = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart. If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        close_boundary_seen = True
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary. Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts. Consume any
                    # multiple boundary lines that may be following. Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last._payload
                        if isinstance(payload, str):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last._payload = payload
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary. If we're still
            # capturing the preamble, we never saw the start boundary. Note
            # that as a defect and store the captured text as the payload.
            if capturing_preamble:
                defect = errors.StartBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                # The remaining input is drained but not collected, so the
                # epilogue stored below is always the empty string.
                epilogue = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If we're not processing the preamble, then we might have seen
            # EOF without seeing that end boundary...that is also a defect.
            if not close_boundary_seen:
                defect = errors.CloseBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                return
            # Everything from here to the EOF is epilogue. If the end boundary
            # ended in a newline, we'll need to make sure the epilogue isn't
            # None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue. Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header block *lines* on the current message.

        Continuation lines are folded into the preceding header.  A leading
        unix-from line is recorded via set_unixfrom(); one appearing as the
        last line is pushed back to the input as the start of the body.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation. This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line. Note this as a defect
                    # and ignore it.  (Recorded directly on the message
                    # rather than through policy.handle_defect.)
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            # There will always be a colon, because if there wasn't the part of
            # the parser that calls us would have started parsing the body.
            i = line.find(':')

            # If the colon is on the start of the line the header is clearly
            # malformed, but we might be able to salvage the rest of the
            # message. Track the error but keep going.
            if i == 0:
                defect = errors.InvalidHeaderDefect("Missing header name.")
                self._cur.defects.append(defect)
                continue

            assert i>0, "_parse_headers fed line with no : and no leading WS"
            lastheader = line[:i]
            lastvalue = [line]
        # Done with all the lines, so handle the last header.
        if lastheader:
            self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
530	n/a
531	n/a
class BytesFeedParser(FeedParser):
    """FeedParser variant whose feed() takes bytes instead of str."""

    def feed(self, data):
        """Decode *data* and push the resulting text into the parser.

        The bytes are decoded as ASCII with the 'surrogateescape' error
        handler, so non-ASCII bytes do not raise a decoding error.
        """
        text = data.decode('ascii', 'surrogateescape')
        super().feed(text)