»Core Development>Code coverage>Lib/email/_header_value_parser.py

Python code coverage for Lib/email/_header_value_parser.py

#countcontent
1n/a"""Header value parser implementing various email-related RFC parsing rules.
2n/a
3n/aThe parsing methods defined in this module implement various email related
4n/aparsing rules. Principal among them is RFC 5322, which is the follow-on
5n/ato RFC 2822 and primarily a clarification of the former. It also implements
6n/aRFC 2047 encoded word decoding.
7n/a
8n/aRFC 5322 goes to considerable trouble to maintain backward compatibility with
9n/aRFC 822 in the parse phase, while cleaning up the structure on the generation
10n/aphase. This parser supports correct RFC 5322 generation by tagging white space
11n/aas folding white space only when folding is allowed in the non-obsolete rule
12n/asets. Actually, the parser is even more generous when accepting input than RFC
13n/a5322 mandates, following the spirit of Postel's Law, which RFC 5322 encourages.
14n/aWhere possible deviations from the standard are annotated on the 'defects'
15n/aattribute of tokens that deviate.
16n/a
17n/aThe general structure of the parser follows RFC 5322, and uses its terminology
18n/awhere there is a direct correspondence. Where the implementation requires a
19n/asomewhat different structure than that used by the formal grammar, new terms
20n/athat mimic the closest existing terms are used. Thus, it really helps to have
21n/aa copy of RFC 5322 handy when studying this code.
22n/a
23n/aInput to the parser is a string that has already been unfolded according to
24n/aRFC 5322 rules. According to the RFC this unfolding is the very first step, and
25n/athis parser leaves the unfolding step to a higher level message parser, which
26n/awill have already detected the line breaks that need unfolding while
27n/adetermining the beginning and end of each header.
28n/a
29n/aThe output of the parser is a TokenList object, which is a list subclass. A
30n/aTokenList is a recursive data structure. The terminal nodes of the structure
31n/aare Terminal objects, which are subclasses of str. These do not correspond
32n/adirectly to terminal objects in the formal grammar, but are instead more
33n/apractical higher level combinations of true terminals.
34n/a
35n/aAll TokenList and Terminal objects have a 'value' attribute, which produces the
36n/asemantically meaningful value of that part of the parse subtree. The value of
37n/aall whitespace tokens (no matter how many sub-tokens they may contain) is a
38n/asingle space, as per the RFC rules. This includes 'CFWS', which is herein
39n/aincluded in the general class of whitespace tokens. There is one exception to
40n/athe rule that whitespace tokens are collapsed into single spaces in values: in
41n/athe value of a 'bare-quoted-string' (a quoted-string with no leading or
42n/atrailing whitespace), any whitespace that appeared between the quotation marks
43n/ais preserved in the returned value. Note that in all Terminal strings quoted
44n/apairs are turned into their unquoted values.
45n/a
46n/aAll TokenList and Terminal objects also have a string value, which attempts to
47n/abe a "canonical" representation of the RFC-compliant form of the substring that
48n/aproduced the parsed subtree, including minimal use of quoted pair quoting.
49n/aWhitespace runs are not collapsed.
50n/a
51n/aComment tokens also have a 'content' attribute providing the string found
52n/abetween the parens (including any nested comments) with whitespace preserved.
53n/a
54n/aAll TokenList and Terminal objects have a 'defects' attribute which is a
55n/apossibly empty list of all of the defects found while creating the token. Defects
56n/amay appear on any token in the tree, and a composite list of all defects in the
57n/asubtree is available through the 'all_defects' attribute of any node. (For
58n/aTerminal nodes x.defects == x.all_defects.)
59n/a
60n/aEach object in a parse tree is called a 'token', and each has a 'token_type'
61n/aattribute that gives the name from the RFC 5322 grammar that it represents.
62n/aNot all RFC 5322 nodes are produced, and there is one non-RFC 5322 node that
63n/amay be produced: 'ptext'. A 'ptext' is a string of printable ascii characters.
64n/aIt is returned in place of lists of (ctext/quoted-pair) and
65n/a(qtext/quoted-pair).
66n/a
67n/aXXX: provide complete list of token types.
68n/a"""
69n/a
70n/aimport re
71n/aimport urllib # For urllib.parse.unquote
72n/afrom string import hexdigits
73n/afrom collections import OrderedDict
74n/afrom operator import itemgetter
75n/afrom email import _encoded_words as _ew
76n/afrom email import errors
77n/afrom email import utils
78n/a
79n/a#
80n/a# Useful constants and functions
81n/a#
82n/a
83n/aWSP = set(' \t')
84n/aCFWS_LEADER = WSP | set('(')
85n/aSPECIALS = set(r'()<>@,:;.\"[]')
86n/aATOM_ENDS = SPECIALS | WSP
87n/aDOT_ATOM_ENDS = ATOM_ENDS - set('.')
88n/a# '.', '"', and '(' do not end phrases in order to support obs-phrase
89n/aPHRASE_ENDS = SPECIALS - set('."(')
90n/aTSPECIALS = (SPECIALS | set('/?=')) - set('.')
91n/aTOKEN_ENDS = TSPECIALS | WSP
92n/aASPECIALS = TSPECIALS | set("*'%")
93n/aATTRIBUTE_ENDS = ASPECIALS | WSP
94n/aEXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
95n/a
96n/adef quote_string(value):
97n/a return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
98n/a
99n/a#
100n/a# Accumulator for header folding
101n/a#
102n/a
103n/aclass _Folded:
104n/a
105n/a def __init__(self, maxlen, policy):
106n/a self.maxlen = maxlen
107n/a self.policy = policy
108n/a self.lastlen = 0
109n/a self.stickyspace = None
110n/a self.firstline = True
111n/a self.done = []
112n/a self.current = []
113n/a
114n/a def newline(self):
115n/a self.done.extend(self.current)
116n/a self.done.append(self.policy.linesep)
117n/a self.current.clear()
118n/a self.lastlen = 0
119n/a
120n/a def finalize(self):
121n/a if self.current:
122n/a self.newline()
123n/a
124n/a def __str__(self):
125n/a return ''.join(self.done)
126n/a
127n/a def append(self, stoken):
128n/a self.current.append(stoken)
129n/a
130n/a def append_if_fits(self, token, stoken=None):
131n/a if stoken is None:
132n/a stoken = str(token)
133n/a l = len(stoken)
134n/a if self.stickyspace is not None:
135n/a stickyspace_len = len(self.stickyspace)
136n/a if self.lastlen + stickyspace_len + l <= self.maxlen:
137n/a self.current.append(self.stickyspace)
138n/a self.lastlen += stickyspace_len
139n/a self.current.append(stoken)
140n/a self.lastlen += l
141n/a self.stickyspace = None
142n/a self.firstline = False
143n/a return True
144n/a if token.has_fws:
145n/a ws = token.pop_leading_fws()
146n/a if ws is not None:
147n/a self.stickyspace += str(ws)
148n/a stickyspace_len += len(ws)
149n/a token._fold(self)
150n/a return True
151n/a if stickyspace_len and l + 1 <= self.maxlen:
152n/a margin = self.maxlen - l
153n/a if 0 < margin < stickyspace_len:
154n/a trim = stickyspace_len - margin
155n/a self.current.append(self.stickyspace[:trim])
156n/a self.stickyspace = self.stickyspace[trim:]
157n/a stickyspace_len = trim
158n/a self.newline()
159n/a self.current.append(self.stickyspace)
160n/a self.current.append(stoken)
161n/a self.lastlen = l + stickyspace_len
162n/a self.stickyspace = None
163n/a self.firstline = False
164n/a return True
165n/a if not self.firstline:
166n/a self.newline()
167n/a self.current.append(self.stickyspace)
168n/a self.current.append(stoken)
169n/a self.stickyspace = None
170n/a self.firstline = False
171n/a return True
172n/a if self.lastlen + l <= self.maxlen:
173n/a self.current.append(stoken)
174n/a self.lastlen += l
175n/a return True
176n/a if l < self.maxlen:
177n/a self.newline()
178n/a self.current.append(stoken)
179n/a self.lastlen = l
180n/a return True
181n/a return False
182n/a
183n/a#
184n/a# TokenList and its subclasses
185n/a#
186n/a
187n/aclass TokenList(list):
188n/a
189n/a token_type = None
190n/a
191n/a def __init__(self, *args, **kw):
192n/a super().__init__(*args, **kw)
193n/a self.defects = []
194n/a
195n/a def __str__(self):
196n/a return ''.join(str(x) for x in self)
197n/a
198n/a def __repr__(self):
199n/a return '{}({})'.format(self.__class__.__name__,
200n/a super().__repr__())
201n/a
202n/a @property
203n/a def value(self):
204n/a return ''.join(x.value for x in self if x.value)
205n/a
206n/a @property
207n/a def all_defects(self):
208n/a return sum((x.all_defects for x in self), self.defects)
209n/a
210n/a #
211n/a # Folding API
212n/a #
213n/a # parts():
214n/a #
215n/a # return a list of objects that constitute the "higher level syntactic
216n/a # objects" specified by the RFC as the best places to fold a header line.
217n/a # The returned objects must include leading folding white space, even if
218n/a # this means mutating the underlying parse tree of the object. Each object
219n/a # is only responsible for returning *its* parts, and should not drill down
220n/a # to any lower level except as required to meet the leading folding white
221n/a # space constraint.
222n/a #
223n/a # _fold(folded):
224n/a #
225n/a # folded: the result accumulator. This is an instance of _Folded.
226n/a # (XXX: I haven't finished factoring this out yet, the folding code
227n/a # pretty much uses this as a state object.) When the folded.current
228n/a # contains as much text as will fit, the _fold method should call
229n/a # folded.newline.
230n/a # folded.lastlen: the current length of the test stored in folded.current.
231n/a # folded.maxlen: The maximum number of characters that may appear on a
232n/a # folded line. Differs from the policy setting in that "no limit" is
233n/a # represented by +inf, which means it can be used in the trivially
234n/a # logical fashion in comparisons.
235n/a #
236n/a # Currently no subclasses implement parts, and I think this will remain
237n/a # true. A subclass only needs to implement _fold when the generic version
238n/a # isn't sufficient. _fold will need to be implemented primarily when it is
239n/a # possible for encoded words to appear in the specialized token-list, since
240n/a # there is no generic algorithm that can know where exactly the encoded
241n/a # words are allowed. A _fold implementation is responsible for filling
242n/a # lines in the same general way that the top level _fold does. It may, and
243n/a # should, call the _fold method of sub-objects in a similar fashion to that
244n/a # of the top level _fold.
245n/a #
246n/a # XXX: I'm hoping it will be possible to factor the existing code further
247n/a # to reduce redundancy and make the logic clearer.
248n/a
249n/a @property
250n/a def parts(self):
251n/a klass = self.__class__
252n/a this = []
253n/a for token in self:
254n/a if token.startswith_fws():
255n/a if this:
256n/a yield this[0] if len(this)==1 else klass(this)
257n/a this.clear()
258n/a end_ws = token.pop_trailing_ws()
259n/a this.append(token)
260n/a if end_ws:
261n/a yield klass(this)
262n/a this = [end_ws]
263n/a if this:
264n/a yield this[0] if len(this)==1 else klass(this)
265n/a
266n/a def startswith_fws(self):
267n/a return self[0].startswith_fws()
268n/a
269n/a def pop_leading_fws(self):
270n/a if self[0].token_type == 'fws':
271n/a return self.pop(0)
272n/a return self[0].pop_leading_fws()
273n/a
274n/a def pop_trailing_ws(self):
275n/a if self[-1].token_type == 'cfws':
276n/a return self.pop(-1)
277n/a return self[-1].pop_trailing_ws()
278n/a
279n/a @property
280n/a def has_fws(self):
281n/a for part in self:
282n/a if part.has_fws:
283n/a return True
284n/a return False
285n/a
286n/a def has_leading_comment(self):
287n/a return self[0].has_leading_comment()
288n/a
289n/a @property
290n/a def comments(self):
291n/a comments = []
292n/a for token in self:
293n/a comments.extend(token.comments)
294n/a return comments
295n/a
296n/a def fold(self, *, policy):
297n/a # max_line_length 0/None means no limit, ie: infinitely long.
298n/a maxlen = policy.max_line_length or float("+inf")
299n/a folded = _Folded(maxlen, policy)
300n/a self._fold(folded)
301n/a folded.finalize()
302n/a return str(folded)
303n/a
304n/a def as_encoded_word(self, charset):
305n/a # This works only for things returned by 'parts', which include
306n/a # the leading fws, if any, that should be used.
307n/a res = []
308n/a ws = self.pop_leading_fws()
309n/a if ws:
310n/a res.append(ws)
311n/a trailer = self.pop(-1) if self[-1].token_type=='fws' else ''
312n/a res.append(_ew.encode(str(self), charset))
313n/a res.append(trailer)
314n/a return ''.join(res)
315n/a
316n/a def cte_encode(self, charset, policy):
317n/a res = []
318n/a for part in self:
319n/a res.append(part.cte_encode(charset, policy))
320n/a return ''.join(res)
321n/a
322n/a def _fold(self, folded):
323n/a encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
324n/a for part in self.parts:
325n/a tstr = str(part)
326n/a tlen = len(tstr)
327n/a try:
328n/a str(part).encode(encoding)
329n/a except UnicodeEncodeError:
330n/a if any(isinstance(x, errors.UndecodableBytesDefect)
331n/a for x in part.all_defects):
332n/a charset = 'unknown-8bit'
333n/a else:
334n/a # XXX: this should be a policy setting when utf8 is False.
335n/a charset = 'utf-8'
336n/a tstr = part.cte_encode(charset, folded.policy)
337n/a tlen = len(tstr)
338n/a if folded.append_if_fits(part, tstr):
339n/a continue
340n/a # Peel off the leading whitespace if any and make it sticky, to
341n/a # avoid infinite recursion.
342n/a ws = part.pop_leading_fws()
343n/a if ws is not None:
344n/a # Peel off the leading whitespace and make it sticky, to
345n/a # avoid infinite recursion.
346n/a folded.stickyspace = str(part.pop(0))
347n/a if folded.append_if_fits(part):
348n/a continue
349n/a if part.has_fws:
350n/a part._fold(folded)
351n/a continue
352n/a # There are no fold points in this one; it is too long for a single
353n/a # line and can't be split...we just have to put it on its own line.
354n/a folded.append(tstr)
355n/a folded.newline()
356n/a
357n/a def pprint(self, indent=''):
358n/a print('\n'.join(self._pp(indent='')))
359n/a
360n/a def ppstr(self, indent=''):
361n/a return '\n'.join(self._pp(indent=''))
362n/a
363n/a def _pp(self, indent=''):
364n/a yield '{}{}/{}('.format(
365n/a indent,
366n/a self.__class__.__name__,
367n/a self.token_type)
368n/a for token in self:
369n/a if not hasattr(token, '_pp'):
370n/a yield (indent + ' !! invalid element in token '
371n/a 'list: {!r}'.format(token))
372n/a else:
373n/a yield from token._pp(indent+' ')
374n/a if self.defects:
375n/a extra = ' Defects: {}'.format(self.defects)
376n/a else:
377n/a extra = ''
378n/a yield '{}){}'.format(indent, extra)
379n/a
380n/a
381n/aclass WhiteSpaceTokenList(TokenList):
382n/a
383n/a @property
384n/a def value(self):
385n/a return ' '
386n/a
387n/a @property
388n/a def comments(self):
389n/a return [x.content for x in self if x.token_type=='comment']
390n/a
391n/a
392n/aclass UnstructuredTokenList(TokenList):
393n/a
394n/a token_type = 'unstructured'
395n/a
396n/a def _fold(self, folded):
397n/a last_ew = None
398n/a encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
399n/a for part in self.parts:
400n/a tstr = str(part)
401n/a is_ew = False
402n/a try:
403n/a str(part).encode(encoding)
404n/a except UnicodeEncodeError:
405n/a if any(isinstance(x, errors.UndecodableBytesDefect)
406n/a for x in part.all_defects):
407n/a charset = 'unknown-8bit'
408n/a else:
409n/a charset = 'utf-8'
410n/a if last_ew is not None:
411n/a # We've already done an EW, combine this one with it
412n/a # if there's room.
413n/a chunk = get_unstructured(
414n/a ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
415n/a oldlastlen = sum(len(x) for x in folded.current[:last_ew])
416n/a schunk = str(chunk)
417n/a lchunk = len(schunk)
418n/a if oldlastlen + lchunk <= folded.maxlen:
419n/a del folded.current[last_ew:]
420n/a folded.append(schunk)
421n/a folded.lastlen = oldlastlen + lchunk
422n/a continue
423n/a tstr = part.as_encoded_word(charset)
424n/a is_ew = True
425n/a if folded.append_if_fits(part, tstr):
426n/a if is_ew:
427n/a last_ew = len(folded.current) - 1
428n/a continue
429n/a if is_ew or last_ew:
430n/a # It's too big to fit on the line, but since we've
431n/a # got encoded words we can use encoded word folding.
432n/a part._fold_as_ew(folded)
433n/a continue
434n/a # Peel off the leading whitespace if any and make it sticky, to
435n/a # avoid infinite recursion.
436n/a ws = part.pop_leading_fws()
437n/a if ws is not None:
438n/a folded.stickyspace = str(ws)
439n/a if folded.append_if_fits(part):
440n/a continue
441n/a if part.has_fws:
442n/a part._fold(folded)
443n/a continue
444n/a # It can't be split...we just have to put it on its own line.
445n/a folded.append(tstr)
446n/a folded.newline()
447n/a last_ew = None
448n/a
449n/a def cte_encode(self, charset, policy):
450n/a res = []
451n/a last_ew = None
452n/a for part in self:
453n/a spart = str(part)
454n/a try:
455n/a spart.encode('us-ascii')
456n/a res.append(spart)
457n/a except UnicodeEncodeError:
458n/a if last_ew is None:
459n/a res.append(part.cte_encode(charset, policy))
460n/a last_ew = len(res)
461n/a else:
462n/a tl = get_unstructured(''.join(res[last_ew:] + [spart]))
463n/a res.append(tl.as_encoded_word(charset))
464n/a return ''.join(res)
465n/a
466n/a
467n/aclass Phrase(TokenList):
468n/a
469n/a token_type = 'phrase'
470n/a
471n/a def _fold(self, folded):
472n/a # As with Unstructured, we can have pure ASCII with or without
473n/a # surrogateescape encoded bytes, or we could have unicode. But this
474n/a # case is more complicated, since we have to deal with the various
475n/a # sub-token types and how they can be composed in the face of
476n/a # unicode-that-needs-CTE-encoding, and the fact that if a token a
477n/a # comment that becomes a barrier across which we can't compose encoded
478n/a # words.
479n/a last_ew = None
480n/a encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
481n/a for part in self.parts:
482n/a tstr = str(part)
483n/a tlen = len(tstr)
484n/a has_ew = False
485n/a try:
486n/a str(part).encode(encoding)
487n/a except UnicodeEncodeError:
488n/a if any(isinstance(x, errors.UndecodableBytesDefect)
489n/a for x in part.all_defects):
490n/a charset = 'unknown-8bit'
491n/a else:
492n/a charset = 'utf-8'
493n/a if last_ew is not None and not part.has_leading_comment():
494n/a # We've already done an EW, let's see if we can combine
495n/a # this one with it. The last_ew logic ensures that all we
496n/a # have at this point is atoms, no comments or quoted
497n/a # strings. So we can treat the text between the last
498n/a # encoded word and the content of this token as
499n/a # unstructured text, and things will work correctly. But
500n/a # we have to strip off any trailing comment on this token
501n/a # first, and if it is a quoted string we have to pull out
502n/a # the content (we're encoding it, so it no longer needs to
503n/a # be quoted).
504n/a if part[-1].token_type == 'cfws' and part.comments:
505n/a remainder = part.pop(-1)
506n/a else:
507n/a remainder = ''
508n/a for i, token in enumerate(part):
509n/a if token.token_type == 'bare-quoted-string':
510n/a part[i] = UnstructuredTokenList(token[:])
511n/a chunk = get_unstructured(
512n/a ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
513n/a schunk = str(chunk)
514n/a lchunk = len(schunk)
515n/a if last_ew + lchunk <= folded.maxlen:
516n/a del folded.current[last_ew:]
517n/a folded.append(schunk)
518n/a folded.lastlen = sum(len(x) for x in folded.current)
519n/a continue
520n/a tstr = part.as_encoded_word(charset)
521n/a tlen = len(tstr)
522n/a has_ew = True
523n/a if folded.append_if_fits(part, tstr):
524n/a if has_ew and not part.comments:
525n/a last_ew = len(folded.current) - 1
526n/a elif part.comments or part.token_type == 'quoted-string':
527n/a # If a comment is involved we can't combine EWs. And if a
528n/a # quoted string is involved, it's not worth the effort to
529n/a # try to combine them.
530n/a last_ew = None
531n/a continue
532n/a part._fold(folded)
533n/a
534n/a def cte_encode(self, charset, policy):
535n/a res = []
536n/a last_ew = None
537n/a is_ew = False
538n/a for part in self:
539n/a spart = str(part)
540n/a try:
541n/a spart.encode('us-ascii')
542n/a res.append(spart)
543n/a except UnicodeEncodeError:
544n/a is_ew = True
545n/a if last_ew is None:
546n/a if not part.comments:
547n/a last_ew = len(res)
548n/a res.append(part.cte_encode(charset, policy))
549n/a elif not part.has_leading_comment():
550n/a if part[-1].token_type == 'cfws' and part.comments:
551n/a remainder = part.pop(-1)
552n/a else:
553n/a remainder = ''
554n/a for i, token in enumerate(part):
555n/a if token.token_type == 'bare-quoted-string':
556n/a part[i] = UnstructuredTokenList(token[:])
557n/a tl = get_unstructured(''.join(res[last_ew:] + [spart]))
558n/a res[last_ew:] = [tl.as_encoded_word(charset)]
559n/a if part.comments or (not is_ew and part.token_type == 'quoted-string'):
560n/a last_ew = None
561n/a return ''.join(res)
562n/a
563n/aclass Word(TokenList):
564n/a
565n/a token_type = 'word'
566n/a
567n/a
568n/aclass CFWSList(WhiteSpaceTokenList):
569n/a
570n/a token_type = 'cfws'
571n/a
572n/a def has_leading_comment(self):
573n/a return bool(self.comments)
574n/a
575n/a
576n/aclass Atom(TokenList):
577n/a
578n/a token_type = 'atom'
579n/a
580n/a
581n/aclass Token(TokenList):
582n/a
583n/a token_type = 'token'
584n/a
585n/a
586n/aclass EncodedWord(TokenList):
587n/a
588n/a token_type = 'encoded-word'
589n/a cte = None
590n/a charset = None
591n/a lang = None
592n/a
593n/a @property
594n/a def encoded(self):
595n/a if self.cte is not None:
596n/a return self.cte
597n/a _ew.encode(str(self), self.charset)
598n/a
599n/a
600n/a
601n/aclass QuotedString(TokenList):
602n/a
603n/a token_type = 'quoted-string'
604n/a
605n/a @property
606n/a def content(self):
607n/a for x in self:
608n/a if x.token_type == 'bare-quoted-string':
609n/a return x.value
610n/a
611n/a @property
612n/a def quoted_value(self):
613n/a res = []
614n/a for x in self:
615n/a if x.token_type == 'bare-quoted-string':
616n/a res.append(str(x))
617n/a else:
618n/a res.append(x.value)
619n/a return ''.join(res)
620n/a
621n/a @property
622n/a def stripped_value(self):
623n/a for token in self:
624n/a if token.token_type == 'bare-quoted-string':
625n/a return token.value
626n/a
627n/a
628n/aclass BareQuotedString(QuotedString):
629n/a
630n/a token_type = 'bare-quoted-string'
631n/a
632n/a def __str__(self):
633n/a return quote_string(''.join(str(x) for x in self))
634n/a
635n/a @property
636n/a def value(self):
637n/a return ''.join(str(x) for x in self)
638n/a
639n/a
640n/aclass Comment(WhiteSpaceTokenList):
641n/a
642n/a token_type = 'comment'
643n/a
644n/a def __str__(self):
645n/a return ''.join(sum([
646n/a ["("],
647n/a [self.quote(x) for x in self],
648n/a [")"],
649n/a ], []))
650n/a
651n/a def quote(self, value):
652n/a if value.token_type == 'comment':
653n/a return str(value)
654n/a return str(value).replace('\\', '\\\\').replace(
655n/a '(', r'\(').replace(
656n/a ')', r'\)')
657n/a
658n/a @property
659n/a def content(self):
660n/a return ''.join(str(x) for x in self)
661n/a
662n/a @property
663n/a def comments(self):
664n/a return [self.content]
665n/a
666n/aclass AddressList(TokenList):
667n/a
668n/a token_type = 'address-list'
669n/a
670n/a @property
671n/a def addresses(self):
672n/a return [x for x in self if x.token_type=='address']
673n/a
674n/a @property
675n/a def mailboxes(self):
676n/a return sum((x.mailboxes
677n/a for x in self if x.token_type=='address'), [])
678n/a
679n/a @property
680n/a def all_mailboxes(self):
681n/a return sum((x.all_mailboxes
682n/a for x in self if x.token_type=='address'), [])
683n/a
684n/a
685n/aclass Address(TokenList):
686n/a
687n/a token_type = 'address'
688n/a
689n/a @property
690n/a def display_name(self):
691n/a if self[0].token_type == 'group':
692n/a return self[0].display_name
693n/a
694n/a @property
695n/a def mailboxes(self):
696n/a if self[0].token_type == 'mailbox':
697n/a return [self[0]]
698n/a elif self[0].token_type == 'invalid-mailbox':
699n/a return []
700n/a return self[0].mailboxes
701n/a
702n/a @property
703n/a def all_mailboxes(self):
704n/a if self[0].token_type == 'mailbox':
705n/a return [self[0]]
706n/a elif self[0].token_type == 'invalid-mailbox':
707n/a return [self[0]]
708n/a return self[0].all_mailboxes
709n/a
710n/aclass MailboxList(TokenList):
711n/a
712n/a token_type = 'mailbox-list'
713n/a
714n/a @property
715n/a def mailboxes(self):
716n/a return [x for x in self if x.token_type=='mailbox']
717n/a
718n/a @property
719n/a def all_mailboxes(self):
720n/a return [x for x in self
721n/a if x.token_type in ('mailbox', 'invalid-mailbox')]
722n/a
723n/a
724n/aclass GroupList(TokenList):
725n/a
726n/a token_type = 'group-list'
727n/a
728n/a @property
729n/a def mailboxes(self):
730n/a if not self or self[0].token_type != 'mailbox-list':
731n/a return []
732n/a return self[0].mailboxes
733n/a
734n/a @property
735n/a def all_mailboxes(self):
736n/a if not self or self[0].token_type != 'mailbox-list':
737n/a return []
738n/a return self[0].all_mailboxes
739n/a
740n/a
741n/aclass Group(TokenList):
742n/a
743n/a token_type = "group"
744n/a
745n/a @property
746n/a def mailboxes(self):
747n/a if self[2].token_type != 'group-list':
748n/a return []
749n/a return self[2].mailboxes
750n/a
751n/a @property
752n/a def all_mailboxes(self):
753n/a if self[2].token_type != 'group-list':
754n/a return []
755n/a return self[2].all_mailboxes
756n/a
757n/a @property
758n/a def display_name(self):
759n/a return self[0].display_name
760n/a
761n/a
762n/aclass NameAddr(TokenList):
763n/a
764n/a token_type = 'name-addr'
765n/a
766n/a @property
767n/a def display_name(self):
768n/a if len(self) == 1:
769n/a return None
770n/a return self[0].display_name
771n/a
772n/a @property
773n/a def local_part(self):
774n/a return self[-1].local_part
775n/a
776n/a @property
777n/a def domain(self):
778n/a return self[-1].domain
779n/a
780n/a @property
781n/a def route(self):
782n/a return self[-1].route
783n/a
784n/a @property
785n/a def addr_spec(self):
786n/a return self[-1].addr_spec
787n/a
788n/a
789n/aclass AngleAddr(TokenList):
790n/a
791n/a token_type = 'angle-addr'
792n/a
793n/a @property
794n/a def local_part(self):
795n/a for x in self:
796n/a if x.token_type == 'addr-spec':
797n/a return x.local_part
798n/a
799n/a @property
800n/a def domain(self):
801n/a for x in self:
802n/a if x.token_type == 'addr-spec':
803n/a return x.domain
804n/a
805n/a @property
806n/a def route(self):
807n/a for x in self:
808n/a if x.token_type == 'obs-route':
809n/a return x.domains
810n/a
811n/a @property
812n/a def addr_spec(self):
813n/a for x in self:
814n/a if x.token_type == 'addr-spec':
815n/a return x.addr_spec
816n/a else:
817n/a return '<>'
818n/a
819n/a
820n/aclass ObsRoute(TokenList):
821n/a
822n/a token_type = 'obs-route'
823n/a
824n/a @property
825n/a def domains(self):
826n/a return [x.domain for x in self if x.token_type == 'domain']
827n/a
828n/a
829n/aclass Mailbox(TokenList):
830n/a
831n/a token_type = 'mailbox'
832n/a
833n/a @property
834n/a def display_name(self):
835n/a if self[0].token_type == 'name-addr':
836n/a return self[0].display_name
837n/a
838n/a @property
839n/a def local_part(self):
840n/a return self[0].local_part
841n/a
842n/a @property
843n/a def domain(self):
844n/a return self[0].domain
845n/a
846n/a @property
847n/a def route(self):
848n/a if self[0].token_type == 'name-addr':
849n/a return self[0].route
850n/a
851n/a @property
852n/a def addr_spec(self):
853n/a return self[0].addr_spec
854n/a
855n/a
856n/aclass InvalidMailbox(TokenList):
857n/a
858n/a token_type = 'invalid-mailbox'
859n/a
860n/a @property
861n/a def display_name(self):
862n/a return None
863n/a
864n/a local_part = domain = route = addr_spec = display_name
865n/a
866n/a
867n/aclass Domain(TokenList):
868n/a
869n/a token_type = 'domain'
870n/a
871n/a @property
872n/a def domain(self):
873n/a return ''.join(super().value.split())
874n/a
875n/a
876n/aclass DotAtom(TokenList):
877n/a
878n/a token_type = 'dot-atom'
879n/a
880n/a
881n/aclass DotAtomText(TokenList):
882n/a
883n/a token_type = 'dot-atom-text'
884n/a
885n/a
886n/aclass AddrSpec(TokenList):
887n/a
888n/a token_type = 'addr-spec'
889n/a
890n/a @property
891n/a def local_part(self):
892n/a return self[0].local_part
893n/a
894n/a @property
895n/a def domain(self):
896n/a if len(self) < 3:
897n/a return None
898n/a return self[-1].domain
899n/a
900n/a @property
901n/a def value(self):
902n/a if len(self) < 3:
903n/a return self[0].value
904n/a return self[0].value.rstrip()+self[1].value+self[2].value.lstrip()
905n/a
906n/a @property
907n/a def addr_spec(self):
908n/a nameset = set(self.local_part)
909n/a if len(nameset) > len(nameset-DOT_ATOM_ENDS):
910n/a lp = quote_string(self.local_part)
911n/a else:
912n/a lp = self.local_part
913n/a if self.domain is not None:
914n/a return lp + '@' + self.domain
915n/a return lp
916n/a
917n/a
918n/aclass ObsLocalPart(TokenList):
919n/a
920n/a token_type = 'obs-local-part'
921n/a
922n/a
923n/aclass DisplayName(Phrase):
924n/a
925n/a token_type = 'display-name'
926n/a
927n/a @property
928n/a def display_name(self):
929n/a res = TokenList(self)
930n/a if res[0].token_type == 'cfws':
931n/a res.pop(0)
932n/a else:
933n/a if res[0][0].token_type == 'cfws':
934n/a res[0] = TokenList(res[0][1:])
935n/a if res[-1].token_type == 'cfws':
936n/a res.pop()
937n/a else:
938n/a if res[-1][-1].token_type == 'cfws':
939n/a res[-1] = TokenList(res[-1][:-1])
940n/a return res.value
941n/a
942n/a @property
943n/a def value(self):
944n/a quote = False
945n/a if self.defects:
946n/a quote = True
947n/a else:
948n/a for x in self:
949n/a if x.token_type == 'quoted-string':
950n/a quote = True
951n/a if quote:
952n/a pre = post = ''
953n/a if self[0].token_type=='cfws' or self[0][0].token_type=='cfws':
954n/a pre = ' '
955n/a if self[-1].token_type=='cfws' or self[-1][-1].token_type=='cfws':
956n/a post = ' '
957n/a return pre+quote_string(self.display_name)+post
958n/a else:
959n/a return super().value
960n/a
961n/a
962n/aclass LocalPart(TokenList):
963n/a
964n/a token_type = 'local-part'
965n/a
966n/a @property
967n/a def value(self):
968n/a if self[0].token_type == "quoted-string":
969n/a return self[0].quoted_value
970n/a else:
971n/a return self[0].value
972n/a
973n/a @property
974n/a def local_part(self):
975n/a # Strip whitespace from front, back, and around dots.
976n/a res = [DOT]
977n/a last = DOT
978n/a last_is_tl = False
979n/a for tok in self[0] + [DOT]:
980n/a if tok.token_type == 'cfws':
981n/a continue
982n/a if (last_is_tl and tok.token_type == 'dot' and
983n/a last[-1].token_type == 'cfws'):
984n/a res[-1] = TokenList(last[:-1])
985n/a is_tl = isinstance(tok, TokenList)
986n/a if (is_tl and last.token_type == 'dot' and
987n/a tok[0].token_type == 'cfws'):
988n/a res.append(TokenList(tok[1:]))
989n/a else:
990n/a res.append(tok)
991n/a last = res[-1]
992n/a last_is_tl = is_tl
993n/a res = TokenList(res[1:-1])
994n/a return res.value
995n/a
996n/a
997n/aclass DomainLiteral(TokenList):
998n/a
999n/a token_type = 'domain-literal'
1000n/a
1001n/a @property
1002n/a def domain(self):
1003n/a return ''.join(super().value.split())
1004n/a
1005n/a @property
1006n/a def ip(self):
1007n/a for x in self:
1008n/a if x.token_type == 'ptext':
1009n/a return x.value
1010n/a
1011n/a
1012n/aclass MIMEVersion(TokenList):
1013n/a
1014n/a token_type = 'mime-version'
1015n/a major = None
1016n/a minor = None
1017n/a
1018n/a
class Parameter(TokenList):
    """A single MIME parameter (attribute=value, possibly RFC 2231 style)."""

    token_type = 'parameter'
    sectioned = False
    extended = False
    charset = 'us-ascii'

    @property
    def section_number(self):
        # Because the first token, the attribute (name) eats CFWS, the second
        # token is always the section if there is one.
        return self[1].number if self.sectioned else 0

    @property
    def param_value(self):
        # This is part of the "handle quoted extended parameters" hack:
        # the value may be a direct child, or buried inside a
        # quoted-string / bare-quoted-string pair.
        for child in self:
            if child.token_type == 'value':
                return child.stripped_value
            if child.token_type == 'quoted-string':
                for grandchild in child:
                    if grandchild.token_type == 'bare-quoted-string':
                        for leaf in grandchild:
                            if leaf.token_type == 'value':
                                return leaf.stripped_value
        return ''
1045n/a
1046n/a
class InvalidParameter(Parameter):
    """A parameter that could not be parsed as a valid parameter."""

    token_type = 'invalid-parameter'
1050n/a
1051n/a
class Attribute(TokenList):
    """A parameter attribute (name)."""

    token_type = 'attribute'

    @property
    def stripped_value(self):
        # Value of the first child whose type ends in 'attrtext'
        # (plain or extended); None when there is no such child.
        return next(
            (child.value for child in self
                if child.token_type.endswith('attrtext')),
            None)
1061n/a
class Section(TokenList):
    """An RFC 2231 section marker (*N) within a parameter name."""

    token_type = 'section'
    # Read by Parameter.section_number; presumably set to the integer
    # section number by the parser (not visible here) -- TODO confirm.
    number = None
1066n/a
1067n/a
class Value(TokenList):
    """A parameter value."""

    token_type = 'value'

    @property
    def stripped_value(self):
        # Skip a leading CFWS child, then delegate to the payload token's
        # own stripped_value when it has one; otherwise fall back to the
        # full rendered value.
        payload = self[1] if self[0].token_type == 'cfws' else self[0]
        strippable = ('quoted-string', 'attribute', 'extended-attribute')
        if payload.token_type.endswith(strippable):
            return payload.stripped_value
        return self.value
1081n/a
1082n/a
class MimeParameters(TokenList):
    """The list of MIME parameters attached to a parameterized header.

    Handles RFC 2231 sectioned and extended (charset-encoded) parameters,
    including reassembly of multi-section values.
    """

    token_type = 'mime-parameters'

    @property
    def params(self):
        # Yields (name, decoded-value) pairs, one per distinct name.
        #
        # The RFC specifically states that the ordering of parameters is not
        # guaranteed and may be reordered by the transport layer.  So we have
        # to assume the RFC 2231 pieces can come in any order.  However, we
        # output them in the order that we first see a given name, which gives
        # us a stable __str__.
        params = OrderedDict()
        for token in self:
            if not token.token_type.endswith('parameter'):
                continue
            if token[0].token_type != 'attribute':
                continue
            name = token[0].value.strip()
            if name not in params:
                params[name] = []
            params[name].append((token.section_number, token))
        for name, parts in params.items():
            # Reassemble sectioned values in section-number order.
            parts = sorted(parts, key=itemgetter(0))
            first_param = parts[0][1]
            charset = first_param.charset
            # Our arbitrary error recovery is to ignore duplicate parameters,
            # to use appearance order if there are duplicate rfc 2231 parts,
            # and to ignore gaps.  This mimics the error recovery of get_param.
            if not first_param.extended and len(parts) > 1:
                if parts[1][0] == 0:
                    parts[1][1].defects.append(errors.InvalidHeaderDefect(
                        'duplicate parameter name; duplicate(s) ignored'))
                    parts = parts[:1]
                # Else assume the *0* was missing...note that this is different
                # from get_param, but we registered a defect for this earlier.
            value_parts = []
            i = 0
            for section_number, param in parts:
                if section_number != i:
                    # We could get fancier here and look for a complete
                    # duplicate extended parameter and ignore the second one
                    # seen.  But we're not doing that.  The old code didn't.
                    if not param.extended:
                        param.defects.append(errors.InvalidHeaderDefect(
                            'duplicate parameter name; duplicate ignored'))
                        continue
                    else:
                        param.defects.append(errors.InvalidHeaderDefect(
                            "inconsistent RFC2231 parameter numbering"))
                i += 1
                value = param.param_value
                if param.extended:
                    # Extended values are %-encoded octets in the charset
                    # declared on the *first* section.
                    try:
                        value = urllib.parse.unquote_to_bytes(value)
                    except UnicodeEncodeError:
                        # source had surrogate escaped bytes.  What we do now
                        # is a bit of an open question.  I'm not sure this is
                        # the best choice, but it is what the old algorithm did
                        value = urllib.parse.unquote(value, encoding='latin-1')
                    else:
                        try:
                            value = value.decode(charset, 'surrogateescape')
                        except LookupError:
                            # XXX: there should really be a custom defect for
                            # unknown character set to make it easy to find,
                            # because otherwise unknown charset is a silent
                            # failure.
                            value = value.decode('us-ascii', 'surrogateescape')
                        if utils._has_surrogates(value):
                            param.defects.append(errors.UndecodableBytesDefect())
                value_parts.append(value)
            value = ''.join(value_parts)
            yield name, value

    def __str__(self):
        # Parameters with empty values render as bare names.
        params = []
        for name, value in self.params:
            if value:
                params.append('{}={}'.format(name, quote_string(value)))
            else:
                params.append(name)
        params = '; '.join(params)
        return ' ' + params if params else ''
1166n/a
1167n/a
class ParameterizedHeaderValue(TokenList):
    """Base class for header values that may carry MIME parameters."""

    @property
    def params(self):
        # Search from the end: the mime-parameters child, when present,
        # is the last such token.
        for child in reversed(self):
            if child.token_type == 'mime-parameters':
                return child.params
        return {}

    @property
    def parts(self):
        # We don't want to start a new line if all of the params don't fit
        # after the value, so unwrap the parameter list.
        has_params = bool(self) and self[-1].token_type == 'mime-parameters'
        if has_params:
            flattened = self[:-1] + self[-1]
            return TokenList(flattened)
        return TokenList(self).parts
1184n/a
1185n/a
class ContentType(ParameterizedHeaderValue):
    """Parsed Content-Type header value."""

    token_type = 'content-type'
    # RFC 2045 default type; presumably overwritten by the content-type
    # parser when a valid type is present (parser not visible here).
    maintype = 'text'
    subtype = 'plain'
1191n/a
1192n/a
class ContentDisposition(ParameterizedHeaderValue):
    """Parsed Content-Disposition header value."""

    token_type = 'content-disposition'
    # None indicates no (valid) disposition was found.
    content_disposition = None
1197n/a
1198n/a
class ContentTransferEncoding(TokenList):
    """Parsed Content-Transfer-Encoding header value."""

    token_type = 'content-transfer-encoding'
    # Default per RFC 2045 section 6.1.
    cte = '7bit'
1203n/a
1204n/a
class HeaderLabel(TokenList):
    """The header name and colon preceding a header value."""

    token_type = 'header-label'
1208n/a
1209n/a
class Header(TokenList):
    """A complete header: the header-label followed by the value token.

    Note: _fold consumes (pops) the children as it emits them.
    """

    token_type = 'header'

    def _fold(self, folded):
        # Emit the label ("Name:") first; it always stays on line one.
        folded.append(str(self.pop(0)))
        folded.lastlen = len(folded.current[0])
        # The first line of the header is different from all others: we don't
        # want to start a new object on a new line if it has any fold points in
        # it that would allow part of it to be on the first header line.
        # Further, if the first fold point would fit on the new line, we want
        # to do that, but if it doesn't we want to put it on the first line.
        # Folded supports this via the stickyspace attribute.  If this
        # attribute is not None, it does the special handling.
        folded.stickyspace = str(self.pop(0)) if self[0].token_type == 'cfws' else ''
        rest = self.pop(0)
        # After label, optional cfws, and the value, nothing may remain.
        if self:
            raise ValueError("Malformed Header token list")
        rest._fold(folded)
1229n/a
1230n/a
1231n/a#
1232n/a# Terminal classes and instances
1233n/a#
1234n/a
class Terminal(str):
    """Base class for leaf tokens: a string with token metadata attached."""

    def __new__(cls, value, token_type):
        inst = super().__new__(cls, value)
        inst.token_type = token_type
        inst.defects = []
        return inst

    def __repr__(self):
        return "{}({})".format(type(self).__name__, super().__repr__())

    @property
    def all_defects(self):
        # Terminals have no children, so only their own defects count.
        return list(self.defects)

    def _pp(self, indent=''):
        # Single-line pretty-print representation used by the token tree.
        defect_note = '' if not self.defects else ' {}'.format(self.defects)
        return ["{}{}/{}({}){}".format(
            indent,
            type(self).__name__,
            self.token_type,
            super().__repr__(),
            defect_note,
            )]

    def cte_encode(self, charset, policy):
        # Encode as an RFC 2047 encoded word only if the value is not
        # pure ASCII; otherwise return the plain string.
        value = str(self)
        try:
            value.encode('us-ascii')
        except UnicodeEncodeError:
            return _ew.encode(value, charset)
        return value

    def pop_trailing_ws(self):
        # This terminates the recursion.
        return None

    def pop_leading_fws(self):
        # This terminates the recursion.
        return None

    @property
    def comments(self):
        return []

    def has_leading_comment(self):
        return False

    def __getnewargs__(self):
        return (str(self), self.token_type)
1284n/a
1285n/a
class WhiteSpaceTerminal(Terminal):
    """A run of (foldable) white space; renders as a single space."""

    has_fws = True

    @property
    def value(self):
        # Normalized to one space regardless of the source text.
        return ' '

    def startswith_fws(self):
        return True
1296n/a
1297n/a
class ValueTerminal(Terminal):
    """A terminal carrying ordinary (non-whitespace) content."""

    has_fws = False

    @property
    def value(self):
        # The token is its own value.
        return self

    def startswith_fws(self):
        return False

    def as_encoded_word(self, charset):
        # Render the whole terminal as an RFC 2047 encoded word.
        return _ew.encode(str(self), charset)
1311n/a
1312n/a
class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
    """White space between two adjacent encoded words.

    Such white space is semantically invisible, so it renders as the
    empty string.
    """

    has_fws = True

    @property
    def value(self):
        return ''

    @property
    def encoded(self):
        # The literal source text of the white space.
        return self[:]

    def __str__(self):
        return ''
1327n/a
1328n/a
# Shared singleton terminals used as children throughout the parse trees.
# XXX these need to become classes and used as instances so
# that a program can't change them in a parse tree and screw
# up other parse trees.  Maybe should have tests for that, too.
DOT = ValueTerminal('.', 'dot')
ListSeparator = ValueTerminal(',', 'list-separator')
RouteComponentMarker = ValueTerminal('@', 'route-component-marker')
1335n/a
1336n/a#
1337n/a# Parser
1338n/a#
1339n/a
1340n/a# Parse strings according to RFC822/2047/2822/5322 rules.
1341n/a#
1342n/a# This is a stateless parser. Each get_XXX function accepts a string and
1343n/a# returns either a Terminal or a TokenList representing the RFC object named
1344n/a# by the method and a string containing the remaining unparsed characters
1345n/a# from the input. Thus a parser method consumes the next syntactic construct
1346n/a# of a given type and returns a token representing the construct plus the
1347n/a# unparsed remainder of the input string.
1348n/a#
1349n/a# For example, if the first element of a structured header is a 'phrase',
1350n/a# then:
1351n/a#
1352n/a# phrase, value = get_phrase(value)
1353n/a#
1354n/a# returns the complete phrase from the start of the string value, plus any
1355n/a# characters left in the string after the phrase is removed.
1356n/a
# Pre-compiled matchers for character runs, used by the get_XXX parsers
# below.  '\' and ']' must be escaped before interpolation into a regex
# character class.
_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
_non_atom_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(ATOM_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
_non_token_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(TOKEN_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
_non_attribute_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(ATTRIBUTE_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(EXTENDED_ATTRIBUTE_ENDS).replace(
        '\\','\\\\').replace(']',r'\]'))).match
1368n/a
def _validate_xtext(xtext):
    """Register defects on *xtext* for non-printables or surrogates.

    Appends a NonPrintableDefect if the token contains ASCII control
    characters, and an UndecodableBytesDefect if it contains surrogate
    escaped bytes.  The token is modified in place.
    """
    bad_chars = _non_printable_finder(xtext)
    if bad_chars:
        xtext.defects.append(errors.NonPrintableDefect(bad_chars))
    if utils._has_surrogates(xtext):
        xtext.defects.append(errors.UndecodableBytesDefect(
            "Non-ASCII characters found in header token"))
1378n/a
def _get_ptext_to_endchars(value, endchars):
    """Scan printables/quoted-pairs until endchars and return unquoted ptext.

    This function turns a run of qcontent, ccontent-without-comments, or
    dtext-with-quoted-printables into a single string by unquoting any
    quoted printables.  It returns the string, the remaining value, and
    a flag that is True iff there were any quoted printables decoded.

    """
    fragment, *remainder = _wsp_splitter(value, 1)
    vchars = []
    escape = False
    had_qp = False
    # pos ends up as the index of the endchar that stopped the scan, or
    # len(fragment) if the whole fragment was consumed.  Initializing it
    # here guards against an UnboundLocalError when fragment is empty
    # (value was empty or started with whitespace); previously the
    # for/else clause read pos without it ever being bound.
    pos = 0
    for pos, char in enumerate(fragment):
        if char == '\\':
            if escape:
                # Second backslash of a quoted pair: emit it literally.
                escape = False
                had_qp = True
            else:
                escape = True
                continue
        if escape:
            # The character after a backslash is taken literally.
            escape = False
        elif char in endchars:
            break
        vchars.append(char)
    else:
        # Loop finished without hitting an endchar: the whole fragment
        # was consumed.
        pos = len(fragment)
    return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp
1408n/a
def get_fws(value):
    """FWS = 1*WSP

    This isn't the RFC definition.  We're using fws to represent tokens
    where folding can be done, but when we are parsing the *un*folding has
    already been done so we don't need to watch out for CRLF.

    """
    rest = value.lstrip()
    ws_len = len(value) - len(rest)
    return WhiteSpaceTerminal(value[:ws_len], 'fws'), rest
1420n/a
def get_encoded_word(value):
    """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?="

    Returns an EncodedWord token and the remaining value.  Raises
    HeaderParseError if value does not start with a decodable encoded word.
    """
    ew = EncodedWord()
    if not value.startswith('=?'):
        raise errors.HeaderParseError(
            "expected encoded word but found {}".format(value))
    tok, *remainder = value[2:].split('?=', 1)
    if tok == value[2:]:
        # No '?=' terminator anywhere in the input.
        raise errors.HeaderParseError(
            "expected encoded word but found {}".format(value))
    remstr = ''.join(remainder)
    if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits:
        # The ? after the CTE was followed by an encoded word escape (=XX).
        rest, *remainder = remstr.split('?=', 1)
        tok = tok + '?=' + rest
    if len(tok.split()) > 1:
        ew.defects.append(errors.InvalidHeaderDefect(
            "whitespace inside encoded word"))
    ew.cte = value
    value = ''.join(remainder)
    try:
        text, charset, lang, defects = _ew.decode('=?' + tok + '?=')
    except ValueError:
        raise errors.HeaderParseError(
            "encoded word format invalid: '{}'".format(ew.cte))
    ew.charset = charset
    ew.lang = lang
    ew.defects.extend(defects)
    # Re-tokenize the decoded text into fws/vtext children.
    while text:
        if text[0] in WSP:
            token, text = get_fws(text)
            ew.append(token)
            continue
        chars, *remainder = _wsp_splitter(text, 1)
        vtext = ValueTerminal(chars, 'vtext')
        _validate_xtext(vtext)
        ew.append(vtext)
        text = ''.join(remainder)
    return ew, value
1462n/a
def get_unstructured(value):
    """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct
       obs-unstruct = *((*LF *CR *(obs-utext) *LF *CR)) / FWS)
       obs-utext = %d0 / obs-NO-WS-CTL / LF / CR

    obs-NO-WS-CTL is control characters except WSP/CR/LF.

    So, basically, we have printable runs, plus control characters or nulls in
    the obsolete syntax, separated by whitespace.  Since RFC 2047 uses the
    obsolete syntax in its specification, but requires whitespace on either
    side of the encoded words, I can see no reason to need to separate the
    non-printable-non-whitespace from the printable runs if they occur, so we
    parse this into xtext tokens separated by WSP tokens.

    Because an 'unstructured' value must by definition constitute the entire
    value, this 'get' routine does not return a remaining value, only the
    parsed TokenList.

    """
    # XXX: but what about bare CR and LF?  They might signal the start or
    # end of an encoded word.  YAGNI for now, since our current parsers
    # will never send us strings with bare CR or LF.

    unstructured = UnstructuredTokenList()
    while value:
        if value[0] in WSP:
            token, value = get_fws(value)
            unstructured.append(token)
            continue
        if value.startswith('=?'):
            try:
                token, value = get_encoded_word(value)
            except errors.HeaderParseError:
                # XXX: Need to figure out how to register defects when
                # appropriate here.
                pass
            else:
                # RFC 2047 requires whitespace around encoded words: note
                # a defect when it is missing, and hide the whitespace
                # between two adjacent encoded words (it is semantically
                # invisible, so it renders as '').
                have_ws = True
                if len(unstructured) > 0:
                    if unstructured[-1].token_type != 'fws':
                        unstructured.defects.append(errors.InvalidHeaderDefect(
                            "missing whitespace before encoded word"))
                        have_ws = False
                if have_ws and len(unstructured) > 1:
                    if unstructured[-2].token_type == 'encoded-word':
                        unstructured[-1] = EWWhiteSpaceTerminal(
                            unstructured[-1], 'fws')
                unstructured.append(token)
                continue
        # Plain printable run up to the next whitespace.
        tok, *remainder = _wsp_splitter(value, 1)
        vtext = ValueTerminal(tok, 'vtext')
        _validate_xtext(vtext)
        unstructured.append(vtext)
        value = ''.join(remainder)
    return unstructured
1518n/a
def get_qp_ctext(value):
    r"""ctext = <printable ascii except \ ( )>

    This is not the RFC ctext, since we are handling nested comments in
    comment and unquoting quoted-pairs here.  We allow anything except the
    '()' characters, but if we find any ASCII other than the RFC defined
    printable ASCII, a NonPrintableDefect is added to the token's defects
    list.  Since quoted pairs are converted to their unquoted values, what
    is returned is a 'ptext' token.  In this case it is a
    WhiteSpaceTerminal, so it's value is ' '.

    """
    text, rest, _ = _get_ptext_to_endchars(value, '()')
    token = WhiteSpaceTerminal(text, 'ptext')
    _validate_xtext(token)
    return token, rest
1535n/a
def get_qcontent(value):
    """qcontent = qtext / quoted-pair

    We allow anything except the DQUOTE character, but if we find any ASCII
    other than the RFC defined printable ASCII, a NonPrintableDefect is
    added to the token's defects list.  Any quoted pairs are converted to
    their unquoted values, so what is returned is a 'ptext' token.  In this
    case it is a ValueTerminal.

    """
    text, rest, _ = _get_ptext_to_endchars(value, '"')
    token = ValueTerminal(text, 'ptext')
    _validate_xtext(token)
    return token, rest
1550n/a
def get_atext(value):
    """atext = <matches _atext_matcher>

    We allow any non-ATOM_ENDS in atext, but add an InvalidATextDefect to
    the token's defects list if we find non-atext characters.
    """
    match = _non_atom_end_matcher(value)
    if not match:
        raise errors.HeaderParseError(
            "expected atext but found '{}'".format(value))
    run = match.group()
    token = ValueTerminal(run, 'atext')
    _validate_xtext(token)
    return token, value[len(run):]
1566n/a
def get_bare_quoted_string(value):
    """bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE

    A quoted-string without the leading or trailing white space.  Its
    value is the text between the quote marks, with whitespace
    preserved and quoted pairs decoded.
    """
    if value[0] != '"':
        raise errors.HeaderParseError(
            "expected '\"' but found '{}'".format(value))
    bare_quoted_string = BareQuotedString()
    value = value[1:]
    while value and value[0] != '"':
        if value[0] in WSP:
            token, value = get_fws(value)
        elif value[:2] == '=?':
            # An encoded word inside quotes is not valid RFC 5322; accept
            # it but record a defect.
            try:
                token, value = get_encoded_word(value)
                bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
                    "encoded word inside quoted string"))
            except errors.HeaderParseError:
                token, value = get_qcontent(value)
        else:
            token, value = get_qcontent(value)
        bare_quoted_string.append(token)
    if not value:
        # Input ended before the closing quote.
        bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
            "end of header inside quoted string"))
        return bare_quoted_string, value
    return bare_quoted_string, value[1:]
1597n/a
def get_comment(value):
    """comment = "(" *([FWS] ccontent) [FWS] ")"
       ccontent = ctext / quoted-pair / comment

    We handle nested comments here, and quoted-pair in our qp-ctext routine.
    """
    if value and value[0] != '(':
        raise errors.HeaderParseError(
            "expected '(' but found '{}'".format(value))
    comment = Comment()
    value = value[1:]
    # Consume ccontent until the matching ')' or end of input.
    while value and value[0] != ')':
        first = value[0]
        if first in WSP:
            child, value = get_fws(value)
        elif first == '(':
            # Nested comment: recurse.
            child, value = get_comment(value)
        else:
            child, value = get_qp_ctext(value)
        comment.append(child)
    if not value:
        # Ran off the end of the header without a closing paren.
        comment.defects.append(errors.InvalidHeaderDefect(
            "end of header inside comment"))
        return comment, value
    return comment, value[1:]
1622n/a
def get_cfws(value):
    """CFWS = (1*([FWS] comment) [FWS]) / FWS

    Collects any mix of folding white space and comments into a CFWSList.
    """
    cfws = CFWSList()
    while value and value[0] in CFWS_LEADER:
        getter = get_fws if value[0] in WSP else get_comment
        piece, value = getter(value)
        cfws.append(piece)
    return cfws, value
1635n/a
def get_quoted_string(value):
    """quoted-string = [CFWS] <bare-quoted-string> [CFWS]

    'bare-quoted-string' is an intermediate class defined by this
    parser and not by the RFC grammar.  It is the quoted string
    without any attached CFWS.
    """
    quoted_string = QuotedString()
    # Leading CFWS, if any.
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        quoted_string.append(cfws)
    # The quoted string proper.
    bare, value = get_bare_quoted_string(value)
    quoted_string.append(bare)
    # Trailing CFWS, if any.
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        quoted_string.append(cfws)
    return quoted_string, value
1653n/a
def get_atom(value):
    """atom = [CFWS] 1*atext [CFWS]

    An atom could be an rfc2047 encoded word.
    """
    atom = Atom()
    if value and value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
        atom.append(leader)
    if value and value[0] in ATOM_ENDS:
        raise errors.HeaderParseError(
            "expected atom but found '{}'".format(value))
    if value.startswith('=?'):
        try:
            core, value = get_encoded_word(value)
        except errors.HeaderParseError:
            # XXX: need to figure out how to register defects when
            # appropriate here.
            core, value = get_atext(value)
    else:
        core, value = get_atext(value)
    atom.append(core)
    if value and value[0] in CFWS_LEADER:
        trailer, value = get_cfws(value)
        atom.append(trailer)
    return atom, value
1680n/a
def get_dot_atom_text(value):
    """ dot-text = 1*atext *("." 1*atext)

    """
    dot_atom_text = DotAtomText()
    if not value or value[0] in ATOM_ENDS:
        raise errors.HeaderParseError("expected atom at a start of "
            "dot-atom-text but found '{}'".format(value))
    # Alternate runs of atext with literal dots.
    while value and value[0] not in ATOM_ENDS:
        run, value = get_atext(value)
        dot_atom_text.append(run)
        if value.startswith('.'):
            dot_atom_text.append(DOT)
            value = value[1:]
    # A trailing dot means the final atext run was missing.
    if dot_atom_text[-1] is DOT:
        raise errors.HeaderParseError("expected atom at end of dot-atom-text "
            "but found '{}'".format('.'+value))
    return dot_atom_text, value
1699n/a
def get_dot_atom(value):
    """ dot-atom = [CFWS] dot-atom-text [CFWS]

    Any place we can have a dot atom, we could instead have an rfc2047
    encoded word.
    """
    dot_atom = DotAtom()
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
        dot_atom.append(leader)
    if value.startswith('=?'):
        try:
            core, value = get_encoded_word(value)
        except errors.HeaderParseError:
            # XXX: need to figure out how to register defects when
            # appropriate here.
            core, value = get_dot_atom_text(value)
    else:
        core, value = get_dot_atom_text(value)
    dot_atom.append(core)
    if value and value[0] in CFWS_LEADER:
        trailer, value = get_cfws(value)
        dot_atom.append(trailer)
    return dot_atom, value
1724n/a
def get_word(value):
    """word = atom / quoted-string

    Either atom or quoted-string may start with CFWS.  We have to peel off
    this CFWS first to determine which type of word to parse.  Afterward we
    splice the leading CFWS, if any, into the parsed sub-token.

    If neither an atom or a quoted-string is found before the next special,
    a HeaderParseError is raised.

    The token returned is either an Atom or a QuotedString, as appropriate.
    This means the 'word' level of the formal grammar is not represented in
    the parse tree; this is because having that extra layer when
    manipulating the parse tree is more confusing than it is helpful.

    """
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    # Dispatch on the first post-CFWS character.
    if value[0] == '"':
        token, value = get_quoted_string(value)
    elif value[0] in SPECIALS:
        raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' "
                                      "but found '{}'".format(value))
    else:
        token, value = get_atom(value)
    if leader is not None:
        # Splice the peeled-off CFWS back onto the front of the token.
        token[:0] = [leader]
    return token, value
1755n/a
def get_phrase(value):
    """ phrase = 1*word / obs-phrase
        obs-phrase = word *(word / "." / CFWS)

    This means a phrase can be a sequence of words, periods, and CFWS in any
    order as long as it starts with at least one word.  If anything other than
    words is detected, an ObsoleteHeaderDefect is added to the token's defect
    list.  We also accept a phrase that starts with CFWS followed by a dot;
    this is registered as an InvalidHeaderDefect, since it is not supported by
    even the obsolete grammar.

    """
    phrase = Phrase()
    try:
        token, value = get_word(value)
        phrase.append(token)
    except errors.HeaderParseError:
        phrase.defects.append(errors.InvalidHeaderDefect(
            "phrase does not start with word"))
    while value and value[0] not in PHRASE_ENDS:
        if value[0]=='.':
            # Bare dots are only allowed by the obsolete grammar.
            phrase.append(DOT)
            phrase.defects.append(errors.ObsoleteHeaderDefect(
                "period in 'phrase'"))
            value = value[1:]
        else:
            try:
                token, value = get_word(value)
            except errors.HeaderParseError:
                if value[0] in CFWS_LEADER:
                    # CFWS without a following word is obsolete syntax.
                    token, value = get_cfws(value)
                    phrase.defects.append(errors.ObsoleteHeaderDefect(
                        "comment found without atom"))
                else:
                    raise
            phrase.append(token)
    return phrase, value
1793n/a
def get_local_part(value):
    """ local-part = dot-atom / quoted-string / obs-local-part

    """
    local_part = LocalPart()
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if not value:
        raise errors.HeaderParseError(
            "expected local-part but found '{}'".format(value))
    # Try the modern grammar first: dot-atom, then quoted-string/atom.
    try:
        token, value = get_dot_atom(value)
    except errors.HeaderParseError:
        try:
            token, value = get_word(value)
        except errors.HeaderParseError:
            if value[0] != '\\' and value[0] in PHRASE_ENDS:
                raise
            token = TokenList()
    if leader is not None:
        token[:0] = [leader]
    local_part.append(token)
    if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
        # There is more content than a strict local-part allows; reparse
        # everything seen so far plus the rest as an obsolete local part.
        obs_local_part, value = get_obs_local_part(str(local_part) + value)
        if obs_local_part.token_type == 'invalid-obs-local-part':
            local_part.defects.append(errors.InvalidHeaderDefect(
                "local-part is not dot-atom, quoted-string, or obs-local-part"))
        else:
            local_part.defects.append(errors.ObsoleteHeaderDefect(
                "local-part is not a dot-atom (contains CFWS)"))
        local_part[0] = obs_local_part
    try:
        local_part.value.encode('ascii')
    except UnicodeEncodeError:
        local_part.defects.append(errors.NonASCIILocalPartDefect(
                "local-part contains non-ASCII characters)"))
    return local_part, value
1832n/a
def get_obs_local_part(value):
    """ obs-local-part = word *("." word)

    Accumulates words, dots, and stray CFWS/backslashes, registering a
    defect for each deviation; if any defect is found the token type is
    changed to 'invalid-obs-local-part'.
    """
    obs_local_part = ObsLocalPart()
    last_non_ws_was_dot = False
    while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
        if value[0] == '.':
            if last_non_ws_was_dot:
                obs_local_part.defects.append(errors.InvalidHeaderDefect(
                    "invalid repeated '.'"))
            obs_local_part.append(DOT)
            last_non_ws_was_dot = True
            value = value[1:]
            continue
        elif value[0]=='\\':
            obs_local_part.append(ValueTerminal(value[0],
                                                'misplaced-special'))
            value = value[1:]
            obs_local_part.defects.append(errors.InvalidHeaderDefect(
                "'\\' character outside of quoted-string/ccontent"))
            last_non_ws_was_dot = False
            continue
        if obs_local_part and obs_local_part[-1].token_type != 'dot':
            obs_local_part.defects.append(errors.InvalidHeaderDefect(
                "missing '.' between words"))
        try:
            token, value = get_word(value)
            last_non_ws_was_dot = False
        except errors.HeaderParseError:
            if value[0] not in CFWS_LEADER:
                raise
            token, value = get_cfws(value)
        obs_local_part.append(token)
    # Note: 'and' binds tighter than 'or' in the two checks below, so the
    # cfws case only applies when the first (last) token is cfws.
    if (obs_local_part[0].token_type == 'dot' or
            obs_local_part[0].token_type=='cfws' and
            obs_local_part[1].token_type=='dot'):
        obs_local_part.defects.append(errors.InvalidHeaderDefect(
            "Invalid leading '.' in local part"))
    if (obs_local_part[-1].token_type == 'dot' or
            obs_local_part[-1].token_type=='cfws' and
            obs_local_part[-2].token_type=='dot'):
        obs_local_part.defects.append(errors.InvalidHeaderDefect(
            "Invalid trailing '.' in local part"))
    if obs_local_part.defects:
        obs_local_part.token_type = 'invalid-obs-local-part'
    return obs_local_part, value
1879n/a
def get_dtext(value):
    r""" dtext = <printable ascii except \ [ ]> / obs-dtext
        obs-dtext = obs-NO-WS-CTL / quoted-pair

    We allow anything except the excluded characters, but if we find any
    ASCII other than the RFC defined printable ASCII, a NonPrintableDefect
    is added to the token's defects list.  Quoted pairs are converted to
    their unquoted values, so what is returned is a ptext token, in this
    case a ValueTerminal.  If there were quoted-printables, an
    ObsoleteHeaderDefect is added to the returned token's defect list.

    """
    text, rest, had_qp = _get_ptext_to_endchars(value, '[]')
    token = ValueTerminal(text, 'ptext')
    if had_qp:
        # Quoted pairs in dtext are only valid under the obsolete syntax.
        token.defects.append(errors.ObsoleteHeaderDefect(
            "quoted printable found in domain-literal"))
    _validate_xtext(token)
    return token, rest
1899n/a
def _check_for_early_dl_end(value, domain_literal):
    """Return True if *value* is exhausted, patching up *domain_literal*.

    When the input ends inside a domain-literal we record an
    InvalidHeaderDefect and synthesize the missing closing bracket so the
    caller can still return a complete-looking token.
    """
    if value:
        return False
    # Record the truncation on the defects list; appending the defect object
    # as a child token (as this code previously did) corrupts the token
    # tree, since defects are not tokens and break all_defects traversal.
    domain_literal.defects.append(errors.InvalidHeaderDefect(
        "end of input inside domain-literal"))
    domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
    return True
1907n/a
def get_domain_literal(value):
    """ domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS]

    Returns a DomainLiteral token.  Raises HeaderParseError if the input
    does not start (after optional CFWS) with '['.  If the input runs out
    before the closing ']', a defect is recorded and the bracket is
    synthesized (see _check_for_early_dl_end).

    """
    domain_literal = DomainLiteral()
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        domain_literal.append(token)
    if not value:
        raise errors.HeaderParseError("expected domain-literal")
    if value[0] != '[':
        raise errors.HeaderParseError("expected '[' at start of domain-literal "
            "but found '{}'".format(value))
    value = value[1:]
    # Each early-end check appends the closing bracket itself when the
    # input is exhausted, so we can return immediately.
    if _check_for_early_dl_end(value, domain_literal):
        return domain_literal, value
    domain_literal.append(ValueTerminal('[', 'domain-literal-start'))
    if value[0] in WSP:
        token, value = get_fws(value)
        domain_literal.append(token)
    token, value = get_dtext(value)
    domain_literal.append(token)
    if _check_for_early_dl_end(value, domain_literal):
        return domain_literal, value
    if value[0] in WSP:
        token, value = get_fws(value)
        domain_literal.append(token)
    if _check_for_early_dl_end(value, domain_literal):
        return domain_literal, value
    if value[0] != ']':
        raise errors.HeaderParseError("expected ']' at end of domain-literal "
            "but found '{}'".format(value))
    domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
    value = value[1:]
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        domain_literal.append(token)
    return domain_literal, value
1946n/a
def get_domain(value):
    """ domain = dot-atom / domain-literal / obs-domain
        obs-domain = atom *("." atom))

    Returns a Domain token.  Any CFWS that precedes the domain proper is
    spliced into the first child token rather than kept as a sibling.

    """
    domain = Domain()
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if not value:
        raise errors.HeaderParseError(
            "expected domain but found '{}'".format(value))
    if value[0] == '[':
        token, value = get_domain_literal(value)
        if leader is not None:
            token[:0] = [leader]
        domain.append(token)
        return domain, value
    try:
        token, value = get_dot_atom(value)
    except errors.HeaderParseError:
        token, value = get_atom(value)
    if leader is not None:
        token[:0] = [leader]
    domain.append(token)
    # A '.' left over after a dot-atom/atom means CFWS appeared between
    # the atoms, which is only valid under the obsolete grammar.
    if value and value[0] == '.':
        domain.defects.append(errors.ObsoleteHeaderDefect(
            "domain is not a dot-atom (contains CFWS)"))
        if domain[0].token_type == 'dot-atom':
            # Flatten the dot-atom's children so the obs-domain dots and
            # atoms collected below join them as siblings.
            domain[:] = domain[0]
        while value and value[0] == '.':
            domain.append(DOT)
            token, value = get_atom(value[1:])
            domain.append(token)
    return domain, value
1982n/a
def get_addr_spec(value):
    """ addr-spec = local-part "@" domain

    Returns an AddrSpec token.  When no '@' follows the local part, the
    addr-spec consists of just the local part and an InvalidHeaderDefect
    is recorded.

    """
    addr_spec = AddrSpec()
    token, value = get_local_part(value)
    addr_spec.append(token)
    if not value or value[0] != '@':
        # Defect message previously misspelled "addr-spec" as "add-spec".
        addr_spec.defects.append(errors.InvalidHeaderDefect(
            "addr-spec local part with no domain"))
        return addr_spec, value
    addr_spec.append(ValueTerminal('@', 'address-at-symbol'))
    token, value = get_domain(value[1:])
    addr_spec.append(token)
    return addr_spec, value
1998n/a
def get_obs_route(value):
    """ obs-route = obs-domain-list ":"
        obs-domain-list = *(CFWS / ",") "@" domain *("," [CFWS] ["@" domain])

    Returns an obs-route token with the appropriate sub-tokens (that is,
    there is no obs-domain-list in the parse tree).
    """
    obs_route = ObsRoute()
    # Skip any leading CFWS and empty (null) list elements.
    while value and (value[0]==',' or value[0] in CFWS_LEADER):
        if value[0] in CFWS_LEADER:
            token, value = get_cfws(value)
            obs_route.append(token)
        elif value[0] == ',':
            obs_route.append(ListSeparator)
            value = value[1:]
    if not value or value[0] != '@':
        raise errors.HeaderParseError(
            "expected obs-route domain but found '{}'".format(value))
    obs_route.append(RouteComponentMarker)
    token, value = get_domain(value[1:])
    obs_route.append(token)
    # Remaining route components; each element after a ',' may be empty,
    # CFWS only, or a full "@" domain.
    while value and value[0]==',':
        obs_route.append(ListSeparator)
        value = value[1:]
        if not value:
            break
        if value[0] in CFWS_LEADER:
            token, value = get_cfws(value)
            obs_route.append(token)
        if value[0] == '@':
            obs_route.append(RouteComponentMarker)
            token, value = get_domain(value[1:])
            obs_route.append(token)
    if not value:
        raise errors.HeaderParseError("end of header while parsing obs-route")
    if value[0] != ':':
        raise errors.HeaderParseError( "expected ':' marking end of "
            "obs-route but found '{}'".format(value))
    obs_route.append(ValueTerminal(':', 'end-of-obs-route-marker'))
    return obs_route, value[1:]
2039n/a
def get_angle_addr(value):
    """ angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
        obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS]

    Raises HeaderParseError when no '<' is found; a missing trailing '>'
    is recorded as a defect and the terminal is synthesized instead.

    """
    angle_addr = AngleAddr()
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        angle_addr.append(token)
    if not value or value[0] != '<':
        raise errors.HeaderParseError(
            "expected angle-addr but found '{}'".format(value))
    angle_addr.append(ValueTerminal('<', 'angle-addr-start'))
    value = value[1:]
    # Although it is not legal per RFC5322, SMTP uses '<>' in certain
    # circumstances.
    if value[0] == '>':
        angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
        angle_addr.defects.append(errors.InvalidHeaderDefect(
            "null addr-spec in angle-addr"))
        value = value[1:]
        return angle_addr, value
    try:
        token, value = get_addr_spec(value)
    except errors.HeaderParseError:
        # Not a plain addr-spec; try the obsolete route form before
        # giving up entirely.
        try:
            token, value = get_obs_route(value)
            angle_addr.defects.append(errors.ObsoleteHeaderDefect(
                "obsolete route specification in angle-addr"))
        except errors.HeaderParseError:
            raise errors.HeaderParseError(
                "expected addr-spec or obs-route but found '{}'".format(value))
        angle_addr.append(token)
        token, value = get_addr_spec(value)
    angle_addr.append(token)
    if value and value[0] == '>':
        value = value[1:]
    else:
        angle_addr.defects.append(errors.InvalidHeaderDefect(
            "missing trailing '>' on angle-addr"))
    # Appended unconditionally: synthesized when missing (defect above).
    angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        angle_addr.append(token)
    return angle_addr, value
2085n/a
def get_display_name(value):
    """ display-name = phrase

    Because this is simply a name-rule, we don't return a display-name
    token containing a phrase, but rather a display-name token whose
    children are the phrase's children, with the phrase's defects copied
    across.

    """
    display_name = DisplayName()
    phrase, value = get_phrase(value)
    display_name.extend(phrase)
    display_name.defects = phrase.defects[:]
    return display_name, value
2099n/a
2100n/a
def get_name_addr(value):
    """ name-addr = [display-name] angle-addr

    Returns a NameAddr token.  Leading CFWS is captured separately and
    then spliced into whichever sub-token follows it.

    """
    name_addr = NameAddr()
    # Both the optional display name and the angle-addr can start with cfws.
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
        if not value:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(leader))
    if value[0] != '<':
        if value[0] in PHRASE_ENDS:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(value))
        token, value = get_display_name(value)
        if not value:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(token))
        if leader is not None:
            # Prepend the CFWS to the display name's first child token so
            # the leading whitespace is preserved in the parse tree.
            token[0][:0] = [leader]
            leader = None
        name_addr.append(token)
    token, value = get_angle_addr(value)
    if leader is not None:
        # No display name consumed the leader, so it belongs to the
        # angle-addr.
        token[:0] = [leader]
    name_addr.append(token)
    return name_addr, value
2130n/a
def get_mailbox(value):
    """ mailbox = name-addr / addr-spec

    """
    # Only by attempting each alternative in turn can we tell which of the
    # two forms we are looking at.
    mailbox = Mailbox()
    try:
        token, value = get_name_addr(value)
    except errors.HeaderParseError:
        try:
            token, value = get_addr_spec(value)
        except errors.HeaderParseError:
            raise errors.HeaderParseError(
                "expected mailbox but found '{}'".format(value))
    has_invalid = any(isinstance(defect, errors.InvalidHeaderDefect)
                      for defect in token.all_defects)
    if has_invalid:
        mailbox.token_type = 'invalid-mailbox'
    mailbox.append(token)
    return mailbox, value
2151n/a
def get_invalid_mailbox(value, endchars):
    """ Read everything up to one of the chars in endchars.

    This is outside the formal grammar.  The InvalidMailbox TokenList that is
    returned acts like a Mailbox, but the data attributes are None.

    """
    invalid_mailbox = InvalidMailbox()
    while value:
        if value[0] in endchars:
            break
        if value[0] in PHRASE_ENDS:
            # Specials that a phrase cannot contain are kept one character
            # at a time.
            invalid_mailbox.append(
                ValueTerminal(value[0], 'misplaced-special'))
            value = value[1:]
        else:
            token, value = get_phrase(value)
            invalid_mailbox.append(token)
    return invalid_mailbox, value
2169n/a
def get_mailbox_list(value):
    """ mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
        obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS])

    For this routine we go outside the formal grammar in order to improve error
    handling.  We recognize the end of the mailbox list only at the end of the
    value or at a ';' (the group terminator).  This is so that we can turn
    invalid mailboxes into InvalidMailbox tokens and continue parsing any
    remaining valid mailboxes.  We also allow all mailbox entries to be null,
    and this condition is handled appropriately at a higher level.

    """
    mailbox_list = MailboxList()
    while value and value[0] != ';':
        try:
            token, value = get_mailbox(value)
            mailbox_list.append(token)
        except errors.HeaderParseError:
            # Recovery: the element is either empty (obsolete but legal)
            # or garbage that we collect as an InvalidMailbox.
            leader = None
            if value[0] in CFWS_LEADER:
                leader, value = get_cfws(value)
                if not value or value[0] in ',;':
                    # CFWS-only element.
                    mailbox_list.append(leader)
                    mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
                        "empty element in mailbox-list"))
                else:
                    token, value = get_invalid_mailbox(value, ',;')
                    if leader is not None:
                        token[:0] = [leader]
                    mailbox_list.append(token)
                    mailbox_list.defects.append(errors.InvalidHeaderDefect(
                        "invalid mailbox in mailbox-list"))
            elif value[0] == ',':
                mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
                    "empty element in mailbox-list"))
            else:
                token, value = get_invalid_mailbox(value, ',;')
                if leader is not None:
                    token[:0] = [leader]
                mailbox_list.append(token)
                mailbox_list.defects.append(errors.InvalidHeaderDefect(
                    "invalid mailbox in mailbox-list"))
        if value and value[0] not in ',;':
            # Crap after mailbox; treat it as an invalid mailbox.
            # The mailbox info will still be available.
            mailbox = mailbox_list[-1]
            mailbox.token_type = 'invalid-mailbox'
            token, value = get_invalid_mailbox(value, ',;')
            mailbox.extend(token)
            mailbox_list.defects.append(errors.InvalidHeaderDefect(
                "invalid mailbox in mailbox-list"))
        if value and value[0] == ',':
            mailbox_list.append(ListSeparator)
            value = value[1:]
    return mailbox_list, value
2225n/a
2226n/a
def get_group_list(value):
    """ group-list = mailbox-list / CFWS / obs-group-list
        obs-group-list = 1*([CFWS] ",") [CFWS]

    Returns a GroupList token.  An exhausted or empty value never raises;
    problems are recorded as defects instead, since group-list is optional
    inside a group.

    """
    group_list = GroupList()
    if not value:
        group_list.defects.append(errors.InvalidHeaderDefect(
            "end of header before group-list"))
        return group_list, value
    leader = None
    if value and value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
        if not value:
            # This should never happen in email parsing, since CFWS-only is a
            # legal alternative to group-list in a group, which is the only
            # place group-list appears.
            group_list.defects.append(errors.InvalidHeaderDefect(
                "end of header in group-list"))
            group_list.append(leader)
            return group_list, value
        if value[0] == ';':
            # CFWS-only group-list: legal.
            group_list.append(leader)
            return group_list, value
    token, value = get_mailbox_list(value)
    if len(token.all_mailboxes)==0:
        # Only null elements were found (obs-group-list); keep the tokens
        # flattened into the group-list itself.
        if leader is not None:
            group_list.append(leader)
        group_list.extend(token)
        group_list.defects.append(errors.ObsoleteHeaderDefect(
            "group-list with empty entries"))
        return group_list, value
    if leader is not None:
        token[:0] = [leader]
    group_list.append(token)
    return group_list, value
2263n/a
def get_group(value):
    """ group = display-name ":" [group-list] ";" [CFWS]

    Returns a Group token.  A missing terminating ';' at end of input is
    recorded as a defect and the terminator is synthesized.

    """
    group = Group()
    token, value = get_display_name(value)
    if not value or value[0] != ':':
        raise errors.HeaderParseError("expected ':' at end of group "
            "display name but found '{}'".format(value))
    group.append(token)
    group.append(ValueTerminal(':', 'group-display-name-terminator'))
    value = value[1:]
    if value and value[0] == ';':
        # Empty group.
        group.append(ValueTerminal(';', 'group-terminator'))
        return group, value[1:]
    token, value = get_group_list(value)
    group.append(token)
    if not value:
        group.defects.append(errors.InvalidHeaderDefect(
            "end of header in group"))
    # Must be elif: when value is empty we record the defect above and
    # synthesize the terminator below; indexing value[0] here raised
    # IndexError in the previous version.
    elif value[0] != ';':
        raise errors.HeaderParseError(
            "expected ';' at end of group but found {}".format(value))
    group.append(ValueTerminal(';', 'group-terminator'))
    value = value[1:]
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        group.append(token)
    return group, value
2293n/a
def get_address(value):
    """ address = mailbox / group

    Note that counter-intuitively, an address can be either a single address or
    a list of addresses (a group).  This is why the returned Address object has
    a 'mailboxes' attribute which treats a single address as a list of length
    one.  When you need to differentiate between to two cases, extract the single
    element, which is either a mailbox or a group token.

    """
    # The formal grammar isn't very helpful when parsing an address.  mailbox
    # and group, especially when allowing for obsolete forms, start off very
    # similarly.  It is only when you reach one of @, <, or : that you know
    # what you've got.  So, we try each one in turn, starting with the more
    # likely of the two.  We could perhaps make this more efficient by looking
    # for a phrase and then branching based on the next character, but that
    # would be a premature optimization.
    address = Address()
    for getter in (get_group, get_mailbox):
        try:
            token, value = getter(value)
            break
        except errors.HeaderParseError:
            continue
    else:
        raise errors.HeaderParseError(
            "expected address but found '{}'".format(value))
    address.append(token)
    return address, value
2322n/a
def get_address_list(value):
    """ address_list = (address *("," address)) / obs-addr-list
        obs-addr-list = *([CFWS] ",") address *("," [address / CFWS])

    We depart from the formal grammar here by continuing to parse until the end
    of the input, assuming the input to be entirely composed of an
    address-list.  This is always true in email parsing, and allows us
    to skip invalid addresses to parse additional valid ones.

    """
    address_list = AddressList()
    while value:
        try:
            token, value = get_address(value)
            address_list.append(token)
        except errors.HeaderParseError:  # was "as err"; binding was unused
            # Recovery: the element is either empty (obsolete but legal)
            # or garbage collected into an invalid mailbox.
            leader = None
            if value[0] in CFWS_LEADER:
                leader, value = get_cfws(value)
                if not value or value[0] == ',':
                    # CFWS-only element.
                    address_list.append(leader)
                    address_list.defects.append(errors.ObsoleteHeaderDefect(
                        "address-list entry with no content"))
                else:
                    token, value = get_invalid_mailbox(value, ',')
                    if leader is not None:
                        token[:0] = [leader]
                    address_list.append(Address([token]))
                    address_list.defects.append(errors.InvalidHeaderDefect(
                        "invalid address in address-list"))
            elif value[0] == ',':
                address_list.defects.append(errors.ObsoleteHeaderDefect(
                    "empty element in address-list"))
            else:
                token, value = get_invalid_mailbox(value, ',')
                if leader is not None:
                    token[:0] = [leader]
                address_list.append(Address([token]))
                address_list.defects.append(errors.InvalidHeaderDefect(
                    "invalid address in address-list"))
        if value and value[0] != ',':
            # Crap after address; treat it as an invalid mailbox.
            # The mailbox info will still be available.
            mailbox = address_list[-1][0]
            mailbox.token_type = 'invalid-mailbox'
            token, value = get_invalid_mailbox(value, ',')
            mailbox.extend(token)
            address_list.defects.append(errors.InvalidHeaderDefect(
                "invalid address in address-list"))
        if value:  # Must be a , at this point.
            address_list.append(ValueTerminal(',', 'list-separator'))
            value = value[1:]
    return address_list, value
2376n/a
2377n/a#
2378n/a# XXX: As I begin to add additional header parsers, I'm realizing we probably
2379n/a# have two level of parser routines: the get_XXX methods that get a token in
2380n/a# the grammar, and parse_XXX methods that parse an entire field value. So
2381n/a# get_address_list above should really be a parse_ method, as probably should
2382n/a# be get_unstructured.
2383n/a#
2384n/a
def parse_mime_version(value):
    """ mime-version = [CFWS] 1*digit [CFWS] "." [CFWS] 1*digit [CFWS]

    Parse an entire MIME-Version header value and return a MIMEVersion
    token.  Unlike the get_XXX routines, this consumes the whole value;
    anything unparseable is recorded as a defect rather than raised.

    """
    # The [CFWS] is implicit in the RFC 2045 BNF.
    # XXX: This routine is a bit verbose, should factor out a get_int method.
    mime_version = MIMEVersion()
    if not value:
        mime_version.defects.append(errors.HeaderMissingRequiredValue(
            "Missing MIME version number (eg: 1.0)"))
        return mime_version
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
        if not value:
            mime_version.defects.append(errors.HeaderMissingRequiredValue(
                "Expected MIME version number but found only CFWS"))
    # Collect the major version.  A non-numeric run is kept as an xtext
    # terminal and flagged as a defect.
    digits = ''
    while value and value[0] != '.' and value[0] not in CFWS_LEADER:
        digits += value[0]
        value = value[1:]
    if not digits.isdigit():
        mime_version.defects.append(errors.InvalidHeaderDefect(
            "Expected MIME major version number but found {!r}".format(digits)))
        mime_version.append(ValueTerminal(digits, 'xtext'))
    else:
        mime_version.major = int(digits)
        mime_version.append(ValueTerminal(digits, 'digits'))
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if not value or value[0] != '.':
        if mime_version.major is not None:
            mime_version.defects.append(errors.InvalidHeaderDefect(
                "Incomplete MIME version; found only major number"))
        if value:
            mime_version.append(ValueTerminal(value, 'xtext'))
        return mime_version
    mime_version.append(ValueTerminal('.', 'version-separator'))
    value = value[1:]
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if not value:
        if mime_version.major is not None:
            mime_version.defects.append(errors.InvalidHeaderDefect(
                "Incomplete MIME version; found only major number"))
        return mime_version
    # Collect the minor version, same rules as the major version above.
    digits = ''
    while value and value[0] not in CFWS_LEADER:
        digits += value[0]
        value = value[1:]
    if not digits.isdigit():
        mime_version.defects.append(errors.InvalidHeaderDefect(
            "Expected MIME minor version number but found {!r}".format(digits)))
        mime_version.append(ValueTerminal(digits, 'xtext'))
    else:
        mime_version.minor = int(digits)
        mime_version.append(ValueTerminal(digits, 'digits'))
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if value:
        mime_version.defects.append(errors.InvalidHeaderDefect(
            "Excess non-CFWS text after MIME version"))
        mime_version.append(ValueTerminal(value, 'xtext'))
    return mime_version
2452n/a
def get_invalid_parameter(value):
    """ Read everything up to the next ';'.

    This is outside the formal grammar.  The InvalidParameter TokenList that is
    returned acts like a Parameter, but the data attributes are None.

    """
    invalid_parameter = InvalidParameter()
    while value and not value.startswith(';'):
        if value[0] in PHRASE_ENDS:
            # Specials that a phrase cannot contain are kept one character
            # at a time.
            invalid_parameter.append(
                ValueTerminal(value[0], 'misplaced-special'))
            value = value[1:]
        else:
            token, value = get_phrase(value)
            invalid_parameter.append(token)
    return invalid_parameter, value
2470n/a
def get_ttext(value):
    """ttext = <matches _ttext_matcher>

    We allow any non-TOKEN_ENDS in ttext, but add defects to the token's
    defects list if we find non-ttext characters.  We also register defects for
    *any* non-printables even though the RFC doesn't exclude all of them,
    because we follow the spirit of RFC 5322.

    """
    match = _non_token_end_matcher(value)
    if not match:
        raise errors.HeaderParseError(
            "expected ttext but found '{}'".format(value))
    text = match.group()
    token = ValueTerminal(text, 'ttext')
    _validate_xtext(token)
    return token, value[len(text):]
2489n/a
def get_token(value):
    """token = [CFWS] 1*ttext [CFWS]

    The RFC equivalent of ttext is any US-ASCII chars except space, ctls, or
    tspecials.  We also exclude tabs even though the RFC doesn't.

    The RFC implies the CFWS but is not explicit about it in the BNF.

    """
    mtoken = Token()
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        mtoken.append(cfws)
    if value and value[0] in TOKEN_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    ttext, value = get_ttext(value)
    mtoken.append(ttext)
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        mtoken.append(cfws)
    return mtoken, value
2512n/a
def get_attrtext(value):
    """attrtext = 1*(any non-ATTRIBUTE_ENDS character)

    We allow any non-ATTRIBUTE_ENDS in attrtext, but add defects to the
    token's defects list if we find non-attrtext characters.  We also register
    defects for *any* non-printables even though the RFC doesn't exclude all of
    them, because we follow the spirit of RFC 5322.

    """
    match = _non_attribute_end_matcher(value)
    if not match:
        raise errors.HeaderParseError(
            "expected attrtext but found {!r}".format(value))
    text = match.group()
    token = ValueTerminal(text, 'attrtext')
    _validate_xtext(token)
    return token, value[len(text):]
2531n/a
def get_attribute(value):
    """ [CFWS] 1*attrtext [CFWS]

    This version of the BNF makes the CFWS explicit, and as usual we use a
    value terminal for the actual run of characters.  The RFC equivalent of
    attrtext is the token characters, with the subtraction of '*', "'", and '%'.
    We include tab in the excluded set just as we do for token.

    """
    attribute = Attribute()
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        attribute.append(cfws)
    if value and value[0] in ATTRIBUTE_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    word, value = get_attrtext(value)
    attribute.append(word)
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        attribute.append(cfws)
    return attribute, value
2554n/a
def get_extended_attrtext(value):
    """attrtext = 1*(any non-ATTRIBUTE_ENDS character plus '%')

    This is a special parsing routine so that we get a value that
    includes % escapes as a single string (which we decode as a single
    string later).

    """
    match = _non_extended_attribute_end_matcher(value)
    if not match:
        raise errors.HeaderParseError(
            "expected extended attrtext but found {!r}".format(value))
    text = match.group()
    token = ValueTerminal(text, 'extended-attrtext')
    _validate_xtext(token)
    return token, value[len(text):]
2572n/a
def get_extended_attribute(value):
    """ [CFWS] 1*extended_attrtext [CFWS]

    This is like the non-extended version except we allow % characters, so that
    we can pick up an encoded value as a single string.

    """
    # XXX: should we have an ExtendedAttribute TokenList?
    attribute = Attribute()
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        attribute.append(cfws)
    if value and value[0] in EXTENDED_ATTRIBUTE_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    word, value = get_extended_attrtext(value)
    attribute.append(word)
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        attribute.append(cfws)
    return attribute, value
2594n/a
def get_section(value):
    """ '*' digits

    The formal BNF is more complicated because leading 0s are not allowed.  We
    check for that and add a defect.  We also assume no CFWS is allowed between
    the '*' and the digits, though the RFC is not crystal clear on that.
    The caller should already have dealt with leading CFWS.

    """
    section = Section()
    if not value or value[0] != '*':
        raise errors.HeaderParseError("Expected section but found {}".format(
            value))
    section.append(ValueTerminal('*', 'section-marker'))
    value = value[1:]
    if not value or not value[0].isdigit():
        raise errors.HeaderParseError("Expected section number but "
                                      "found {}".format(value))
    digits = ''
    while value and value[0].isdigit():
        digits += value[0]
        value = value[1:]
    if digits[0] == '0' and digits != '0':
        # Fixed: previously referenced the nonexistent
        # errors.InvalidHeaderError (AttributeError at runtime) and the
        # concatenated message lacked a space ("numberhas").
        section.defects.append(errors.InvalidHeaderDefect(
            "section number has an invalid leading 0"))
    section.number = int(digits)
    section.append(ValueTerminal(digits, 'digits'))
    return section, value
2623n/a
2624n/a
def get_value(value):
    """ quoted-string / attribute

    """
    v = Value()
    if not value:
        raise errors.HeaderParseError("Expected value but found end of string")
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if not value:
        raise errors.HeaderParseError("Expected value but found "
                                      "only {}".format(leader))
    # A leading '"' means quoted-string; anything else is parsed as an
    # (extended) attribute.
    getter = get_quoted_string if value[0] == '"' else get_extended_attribute
    token, value = getter(value)
    if leader is not None:
        token[:0] = [leader]
    v.append(token)
    return v, value
2646n/a
2647n/adef get_parameter(value):
2648n/a """ attribute [section] ["*"] [CFWS] "=" value
2649n/a
2650n/a The CFWS is implied by the RFC but not made explicit in the BNF. This
2651n/a simplified form of the BNF from the RFC is made to conform with the RFC BNF
2652n/a through some extra checks. We do it this way because it makes both error
2653n/a recovery and working with the resulting parse tree easier.
2654n/a """
2655n/a # It is possible CFWS would also be implicitly allowed between the section
2656n/a # and the 'extended-attribute' marker (the '*') , but we've never seen that
2657n/a # in the wild and we will therefore ignore the possibility.
2658n/a param = Parameter()
2659n/a token, value = get_attribute(value)
2660n/a param.append(token)
2661n/a if not value or value[0] == ';':
2662n/a param.defects.append(errors.InvalidHeaderDefect("Parameter contains "
2663n/a "name ({}) but no value".format(token)))
2664n/a return param, value
2665n/a if value[0] == '*':
2666n/a try:
2667n/a token, value = get_section(value)
2668n/a param.sectioned = True
2669n/a param.append(token)
2670n/a except errors.HeaderParseError:
2671n/a pass
2672n/a if not value:
2673n/a raise errors.HeaderParseError("Incomplete parameter")
2674n/a if value[0] == '*':
2675n/a param.append(ValueTerminal('*', 'extended-parameter-marker'))
2676n/a value = value[1:]
2677n/a param.extended = True
2678n/a if value[0] != '=':
2679n/a raise errors.HeaderParseError("Parameter not followed by '='")
2680n/a param.append(ValueTerminal('=', 'parameter-separator'))
2681n/a value = value[1:]
2682n/a leader = None
2683n/a if value and value[0] in CFWS_LEADER:
2684n/a token, value = get_cfws(value)
2685n/a param.append(token)
2686n/a remainder = None
2687n/a appendto = param
2688n/a if param.extended and value and value[0] == '"':
2689n/a # Now for some serious hackery to handle the common invalid case of
2690n/a # double quotes around an extended value. We also accept (with defect)
2691n/a # a value marked as encoded that isn't really.
2692n/a qstring, remainder = get_quoted_string(value)
2693n/a inner_value = qstring.stripped_value
2694n/a semi_valid = False
2695n/a if param.section_number == 0:
2696n/a if inner_value and inner_value[0] == "'":
2697n/a semi_valid = True
2698n/a else:
2699n/a token, rest = get_attrtext(inner_value)
2700n/a if rest and rest[0] == "'":
2701n/a semi_valid = True
2702n/a else:
2703n/a try:
2704n/a token, rest = get_extended_attrtext(inner_value)
2705n/a except:
2706n/a pass
2707n/a else:
2708n/a if not rest:
2709n/a semi_valid = True
2710n/a if semi_valid:
2711n/a param.defects.append(errors.InvalidHeaderDefect(
2712n/a "Quoted string value for extended parameter is invalid"))
2713n/a param.append(qstring)
2714n/a for t in qstring:
2715n/a if t.token_type == 'bare-quoted-string':
2716n/a t[:] = []
2717n/a appendto = t
2718n/a break
2719n/a value = inner_value
2720n/a else:
2721n/a remainder = None
2722n/a param.defects.append(errors.InvalidHeaderDefect(
2723n/a "Parameter marked as extended but appears to have a "
2724n/a "quoted string value that is non-encoded"))
2725n/a if value and value[0] == "'":
2726n/a token = None
2727n/a else:
2728n/a token, value = get_value(value)
2729n/a if not param.extended or param.section_number > 0:
2730n/a if not value or value[0] != "'":
2731n/a appendto.append(token)
2732n/a if remainder is not None:
2733n/a assert not value, value
2734n/a value = remainder
2735n/a return param, value
2736n/a param.defects.append(errors.InvalidHeaderDefect(
2737n/a "Apparent initial-extended-value but attribute "
2738n/a "was not marked as extended or was not initial section"))
2739n/a if not value:
2740n/a # Assume the charset/lang is missing and the token is the value.
2741n/a param.defects.append(errors.InvalidHeaderDefect(
2742n/a "Missing required charset/lang delimiters"))
2743n/a appendto.append(token)
2744n/a if remainder is None:
2745n/a return param, value
2746n/a else:
2747n/a if token is not None:
2748n/a for t in token:
2749n/a if t.token_type == 'extended-attrtext':
2750n/a break
2751n/a t.token_type == 'attrtext'
2752n/a appendto.append(t)
2753n/a param.charset = t.value
2754n/a if value[0] != "'":
2755n/a raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
2756n/a "delimiter, but found {!r}".format(value))
2757n/a appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
2758n/a value = value[1:]
2759n/a if value and value[0] != "'":
2760n/a token, value = get_attrtext(value)
2761n/a appendto.append(token)
2762n/a param.lang = token.value
2763n/a if not value or value[0] != "'":
2764n/a raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
2765n/a "delimiter, but found {}".format(value))
2766n/a appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
2767n/a value = value[1:]
2768n/a if remainder is not None:
2769n/a # Treat the rest of value as bare quoted string content.
2770n/a v = Value()
2771n/a while value:
2772n/a if value[0] in WSP:
2773n/a token, value = get_fws(value)
2774n/a else:
2775n/a token, value = get_qcontent(value)
2776n/a v.append(token)
2777n/a token = v
2778n/a else:
2779n/a token, value = get_value(value)
2780n/a appendto.append(token)
2781n/a if remainder is not None:
2782n/a assert not value, value
2783n/a value = remainder
2784n/a return param, value
2785n/a
2786n/adef parse_mime_parameters(value):
2787n/a """ parameter *( ";" parameter )
2788n/a
2789n/a That BNF is meant to indicate this routine should only be called after
2790n/a finding and handling the leading ';'. There is no corresponding rule in
2791n/a the formal RFC grammar, but it is more convenient for us for the set of
2792n/a parameters to be treated as its own TokenList.
2793n/a
2794n/a This is 'parse' routine because it consumes the reminaing value, but it
2795n/a would never be called to parse a full header. Instead it is called to
2796n/a parse everything after the non-parameter value of a specific MIME header.
2797n/a
2798n/a """
2799n/a mime_parameters = MimeParameters()
2800n/a while value:
2801n/a try:
2802n/a token, value = get_parameter(value)
2803n/a mime_parameters.append(token)
2804n/a except errors.HeaderParseError as err:
2805n/a leader = None
2806n/a if value[0] in CFWS_LEADER:
2807n/a leader, value = get_cfws(value)
2808n/a if not value:
2809n/a mime_parameters.append(leader)
2810n/a return mime_parameters
2811n/a if value[0] == ';':
2812n/a if leader is not None:
2813n/a mime_parameters.append(leader)
2814n/a mime_parameters.defects.append(errors.InvalidHeaderDefect(
2815n/a "parameter entry with no content"))
2816n/a else:
2817n/a token, value = get_invalid_parameter(value)
2818n/a if leader:
2819n/a token[:0] = [leader]
2820n/a mime_parameters.append(token)
2821n/a mime_parameters.defects.append(errors.InvalidHeaderDefect(
2822n/a "invalid parameter {!r}".format(token)))
2823n/a if value and value[0] != ';':
2824n/a # Junk after the otherwise valid parameter. Mark it as
2825n/a # invalid, but it will have a value.
2826n/a param = mime_parameters[-1]
2827n/a param.token_type = 'invalid-parameter'
2828n/a token, value = get_invalid_parameter(value)
2829n/a param.extend(token)
2830n/a mime_parameters.defects.append(errors.InvalidHeaderDefect(
2831n/a "parameter with invalid trailing text {!r}".format(token)))
2832n/a if value:
2833n/a # Must be a ';' at this point.
2834n/a mime_parameters.append(ValueTerminal(';', 'parameter-separator'))
2835n/a value = value[1:]
2836n/a return mime_parameters
2837n/a
2838n/adef _find_mime_parameters(tokenlist, value):
2839n/a """Do our best to find the parameters in an invalid MIME header
2840n/a
2841n/a """
2842n/a while value and value[0] != ';':
2843n/a if value[0] in PHRASE_ENDS:
2844n/a tokenlist.append(ValueTerminal(value[0], 'misplaced-special'))
2845n/a value = value[1:]
2846n/a else:
2847n/a token, value = get_phrase(value)
2848n/a tokenlist.append(token)
2849n/a if not value:
2850n/a return
2851n/a tokenlist.append(ValueTerminal(';', 'parameter-separator'))
2852n/a tokenlist.append(parse_mime_parameters(value[1:]))
2853n/a
2854n/adef parse_content_type_header(value):
2855n/a """ maintype "/" subtype *( ";" parameter )
2856n/a
2857n/a The maintype and substype are tokens. Theoretically they could
2858n/a be checked against the official IANA list + x-token, but we
2859n/a don't do that.
2860n/a """
2861n/a ctype = ContentType()
2862n/a recover = False
2863n/a if not value:
2864n/a ctype.defects.append(errors.HeaderMissingRequiredValue(
2865n/a "Missing content type specification"))
2866n/a return ctype
2867n/a try:
2868n/a token, value = get_token(value)
2869n/a except errors.HeaderParseError:
2870n/a ctype.defects.append(errors.InvalidHeaderDefect(
2871n/a "Expected content maintype but found {!r}".format(value)))
2872n/a _find_mime_parameters(ctype, value)
2873n/a return ctype
2874n/a ctype.append(token)
2875n/a # XXX: If we really want to follow the formal grammar we should make
2876n/a # mantype and subtype specialized TokenLists here. Probably not worth it.
2877n/a if not value or value[0] != '/':
2878n/a ctype.defects.append(errors.InvalidHeaderDefect(
2879n/a "Invalid content type"))
2880n/a if value:
2881n/a _find_mime_parameters(ctype, value)
2882n/a return ctype
2883n/a ctype.maintype = token.value.strip().lower()
2884n/a ctype.append(ValueTerminal('/', 'content-type-separator'))
2885n/a value = value[1:]
2886n/a try:
2887n/a token, value = get_token(value)
2888n/a except errors.HeaderParseError:
2889n/a ctype.defects.append(errors.InvalidHeaderDefect(
2890n/a "Expected content subtype but found {!r}".format(value)))
2891n/a _find_mime_parameters(ctype, value)
2892n/a return ctype
2893n/a ctype.append(token)
2894n/a ctype.subtype = token.value.strip().lower()
2895n/a if not value:
2896n/a return ctype
2897n/a if value[0] != ';':
2898n/a ctype.defects.append(errors.InvalidHeaderDefect(
2899n/a "Only parameters are valid after content type, but "
2900n/a "found {!r}".format(value)))
2901n/a # The RFC requires that a syntactically invalid content-type be treated
2902n/a # as text/plain. Perhaps we should postel this, but we should probably
2903n/a # only do that if we were checking the subtype value against IANA.
2904n/a del ctype.maintype, ctype.subtype
2905n/a _find_mime_parameters(ctype, value)
2906n/a return ctype
2907n/a ctype.append(ValueTerminal(';', 'parameter-separator'))
2908n/a ctype.append(parse_mime_parameters(value[1:]))
2909n/a return ctype
2910n/a
2911n/adef parse_content_disposition_header(value):
2912n/a """ disposition-type *( ";" parameter )
2913n/a
2914n/a """
2915n/a disp_header = ContentDisposition()
2916n/a if not value:
2917n/a disp_header.defects.append(errors.HeaderMissingRequiredValue(
2918n/a "Missing content disposition"))
2919n/a return disp_header
2920n/a try:
2921n/a token, value = get_token(value)
2922n/a except errors.HeaderParseError:
2923n/a disp_header.defects.append(errors.InvalidHeaderDefect(
2924n/a "Expected content disposition but found {!r}".format(value)))
2925n/a _find_mime_parameters(disp_header, value)
2926n/a return disp_header
2927n/a disp_header.append(token)
2928n/a disp_header.content_disposition = token.value.strip().lower()
2929n/a if not value:
2930n/a return disp_header
2931n/a if value[0] != ';':
2932n/a disp_header.defects.append(errors.InvalidHeaderDefect(
2933n/a "Only parameters are valid after content disposition, but "
2934n/a "found {!r}".format(value)))
2935n/a _find_mime_parameters(disp_header, value)
2936n/a return disp_header
2937n/a disp_header.append(ValueTerminal(';', 'parameter-separator'))
2938n/a disp_header.append(parse_mime_parameters(value[1:]))
2939n/a return disp_header
2940n/a
2941n/adef parse_content_transfer_encoding_header(value):
2942n/a """ mechanism
2943n/a
2944n/a """
2945n/a # We should probably validate the values, since the list is fixed.
2946n/a cte_header = ContentTransferEncoding()
2947n/a if not value:
2948n/a cte_header.defects.append(errors.HeaderMissingRequiredValue(
2949n/a "Missing content transfer encoding"))
2950n/a return cte_header
2951n/a try:
2952n/a token, value = get_token(value)
2953n/a except errors.HeaderParseError:
2954n/a cte_header.defects.append(errors.InvalidHeaderDefect(
2955n/a "Expected content transfer encoding but found {!r}".format(value)))
2956n/a else:
2957n/a cte_header.append(token)
2958n/a cte_header.cte = token.value.strip().lower()
2959n/a if not value:
2960n/a return cte_header
2961n/a while value:
2962n/a cte_header.defects.append(errors.InvalidHeaderDefect(
2963n/a "Extra text after content transfer encoding"))
2964n/a if value[0] in PHRASE_ENDS:
2965n/a cte_header.append(ValueTerminal(value[0], 'misplaced-special'))
2966n/a value = value[1:]
2967n/a else:
2968n/a token, value = get_phrase(value)
2969n/a cte_header.append(token)
2970n/a return cte_header