» Core Development > Code coverage > Lib/urllib/parse.py

Python code coverage for Lib/urllib/parse.py

#countcontent
1n/a"""Parse (absolute and relative) URLs.
2n/a
3n/aurlparse module is based upon the following RFC specifications.
4n/a
5n/aRFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6n/aand L. Masinter, January 2005.
7n/a
8n/aRFC 2732 : "Format for Literal IPv6 Addresses in URL's" by R.Hinden, B.Carpenter
9n/aand L.Masinter, December 1999.
10n/a
11n/aRFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
12n/aBerners-Lee, R. Fielding, and L. Masinter, August 1998.
13n/a
14n/aRFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
15n/a
16n/aRFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
17n/a1995.
18n/a
19n/aRFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
20n/aMcCahill, December 1994
21n/a
22n/aRFC 3986 is considered the current standard and any future changes to
23n/aurlparse module should conform with it. The urlparse module is
24n/acurrently not entirely compliant with this RFC due to de facto
25n/ascenarios for parsing, and for backward compatibility purposes, some
26n/aparsing quirks from older RFCs are retained. The test cases in
27n/atest_urlparse.py provide a good indicator of parsing behavior.
28n/a"""
29n/a
30n/aimport re
31n/aimport sys
32n/aimport collections
33n/a
34n/a__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
35n/a "urlsplit", "urlunsplit", "urlencode", "parse_qs",
36n/a "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
37n/a "unquote", "unquote_plus", "unquote_to_bytes",
38n/a "DefragResult", "ParseResult", "SplitResult",
39n/a "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]
40n/a
41n/a# A classification of schemes ('' means apply by default)
42n/auses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
43n/a 'wais', 'file', 'https', 'shttp', 'mms',
44n/a 'prospero', 'rtsp', 'rtspu', '', 'sftp',
45n/a 'svn', 'svn+ssh', 'ws', 'wss']
46n/auses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
47n/a 'imap', 'wais', 'file', 'mms', 'https', 'shttp',
48n/a 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
49n/a 'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
50n/a 'ws', 'wss']
51n/auses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
52n/a 'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
53n/a 'mms', '', 'sftp', 'tel']
54n/a
55n/a# These are not actually used anymore, but should stay for backwards
56n/a# compatibility. (They are undocumented, but have a public-looking name.)
57n/anon_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
58n/a 'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
59n/auses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
60n/a 'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
61n/auses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
62n/a 'nntp', 'wais', 'https', 'shttp', 'snews',
63n/a 'file', 'prospero', '']
64n/a
65n/a# Characters valid in scheme names
66n/ascheme_chars = ('abcdefghijklmnopqrstuvwxyz'
67n/a 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
68n/a '0123456789'
69n/a '+-.')
70n/a
71n/a# XXX: Consider replacing with functools.lru_cache
72n/aMAX_CACHE_SIZE = 20
73n/a_parse_cache = {}
74n/a
def clear_cache():
    """Clear the parse cache and the quoters cache."""
    # Both are module-level dicts; clear them in place so existing
    # references to the dict objects stay valid.
    _parse_cache.clear()
    _safe_quoters.clear()
79n/a
80n/a
81n/a# Helpers for bytes handling
82n/a# For 3.2, we deliberately require applications that
83n/a# handle improperly quoted URLs to do their own
84n/a# decoding and encoding. If valid use cases are
85n/a# presented, we may relax this by using latin-1
86n/a# decoding internally for 3.3
87n/a_implicit_encoding = 'ascii'
88n/a_implicit_errors = 'strict'
89n/a
90n/adef _noop(obj):
91n/a return obj
92n/a
def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Return obj.encode(encoding, errors).

    Used as the result coercion function when the caller passed
    non-str arguments.
    """
    return obj.encode(encoding, errors)
96n/a
def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Return a tuple with every item of *args* decoded to str.

    Falsy items (empty bytes, '', None, ...) are replaced by '' without
    calling decode() on them.
    """
    return tuple(x.decode(encoding, errors) if x else '' for x in args)
100n/a
def _coerce_args(*args):
    """Coerce the arguments to str and pick a matching result converter.

    Returns the (possibly decoded) arguments followed by one extra item,
    a result coercion function:
      - _noop when the first argument is a str
      - _encode_result otherwise (the arguments are decoded first)

    Raises TypeError when a non-empty later argument disagrees with the
    first argument about being a str.
    """
    str_input = isinstance(args[0], str)
    for arg in args[1:]:
        # We special-case the empty string to support the
        # "scheme=''" default argument to some functions
        if arg and isinstance(arg, str) != str_input:
            raise TypeError("Cannot mix str and non-str arguments")
    if str_input:
        return args + (_noop,)
    return _decode_args(args) + (_encode_result,)
116n/a
117n/a# Result objects are more helpful than simple tuples
118n/aclass _ResultMixinStr(object):
119n/a """Standard approach to encoding parsed results from str to bytes"""
120n/a __slots__ = ()
121n/a
122n/a def encode(self, encoding='ascii', errors='strict'):
123n/a return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))
124n/a
125n/a
126n/aclass _ResultMixinBytes(object):
127n/a """Standard approach to decoding parsed results from bytes to str"""
128n/a __slots__ = ()
129n/a
130n/a def decode(self, encoding='ascii', errors='strict'):
131n/a return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))
132n/a
133n/a
134n/aclass _NetlocResultMixinBase(object):
135n/a """Shared methods for the parsed result objects containing a netloc element"""
136n/a __slots__ = ()
137n/a
138n/a @property
139n/a def username(self):
140n/a return self._userinfo[0]
141n/a
142n/a @property
143n/a def password(self):
144n/a return self._userinfo[1]
145n/a
146n/a @property
147n/a def hostname(self):
148n/a hostname = self._hostinfo[0]
149n/a if not hostname:
150n/a hostname = None
151n/a elif hostname is not None:
152n/a hostname = hostname.lower()
153n/a return hostname
154n/a
155n/a @property
156n/a def port(self):
157n/a port = self._hostinfo[1]
158n/a if port is not None:
159n/a port = int(port, 10)
160n/a if not ( 0 <= port <= 65535):
161n/a raise ValueError("Port out of range 0-65535")
162n/a return port
163n/a
164n/a
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """Netloc accessors for str results, computed from self.netloc."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last '@'.

        Both are None without an '@'; password is None without a ':'.
        """
        userinfo, have_info, _ = self.netloc.rpartition('@')
        if have_info:
            username, have_password, password = userinfo.partition(':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last '@'.

        A bracketed host ("[...]") is returned without its brackets.
        port is None when absent or empty.
        """
        _, _, hostinfo = self.netloc.rpartition('@')
        _, have_open_br, bracketed = hostinfo.partition('[')
        if have_open_br:
            # The host may itself contain ':', so only look for the port
            # separator after the closing bracket.
            hostname, _, port = bracketed.partition(']')
            _, _, port = port.partition(':')
        else:
            hostname, _, port = hostinfo.partition(':')
        if not port:
            port = None
        return hostname, port
193n/a
194n/a
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """Netloc accessors for bytes results, computed from self.netloc."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last b'@'.

        Both are None without a b'@'; password is None without a b':'.
        """
        userinfo, have_info, _ = self.netloc.rpartition(b'@')
        if have_info:
            username, have_password, password = userinfo.partition(b':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last b'@'.

        A bracketed host (b"[...]") is returned without its brackets.
        port is None when absent or empty.
        """
        _, _, hostinfo = self.netloc.rpartition(b'@')
        _, have_open_br, bracketed = hostinfo.partition(b'[')
        if have_open_br:
            # The host may itself contain b':', so only look for the port
            # separator after the closing bracket.
            hostname, _, port = bracketed.partition(b']')
            _, _, port = port.partition(b':')
        else:
            hostname, _, port = hostinfo.partition(b':')
        if not port:
            port = None
        return hostname, port
223n/a
224n/a
225n/afrom collections import namedtuple
226n/a
227n/a_DefragResultBase = namedtuple('DefragResult', 'url fragment')
228n/a_SplitResultBase = namedtuple(
229n/a 'SplitResult', 'scheme netloc path query fragment')
230n/a_ParseResultBase = namedtuple(
231n/a 'ParseResult', 'scheme netloc path params query fragment')
232n/a
233n/a_DefragResultBase.__doc__ = """
234n/aDefragResult(url, fragment)
235n/a
236n/aA 2-tuple that contains the url without fragment identifier and the fragment
237n/aidentifier as a separate argument.
238n/a"""
239n/a
240n/a_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""
241n/a
242n/a_DefragResultBase.fragment.__doc__ = """
243n/aFragment identifier separated from URL, that allows indirect identification of a
244n/asecondary resource by reference to a primary resource and additional identifying
245n/ainformation.
246n/a"""
247n/a
248n/a_SplitResultBase.__doc__ = """
249n/aSplitResult(scheme, netloc, path, query, fragment)
250n/a
251n/aA 5-tuple that contains the different components of a URL. Similar to
252n/aParseResult, but does not split params.
253n/a"""
254n/a
255n/a_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""
256n/a
257n/a_SplitResultBase.netloc.__doc__ = """
258n/aNetwork location where the request is made to.
259n/a"""
260n/a
261n/a_SplitResultBase.path.__doc__ = """
262n/aThe hierarchical path, such as the path to a file to download.
263n/a"""
264n/a
265n/a_SplitResultBase.query.__doc__ = """
266n/aThe query component, that contains non-hierarchical data, that along with data
267n/ain path component, identifies a resource in the scope of URI's scheme and
268n/anetwork location.
269n/a"""
270n/a
271n/a_SplitResultBase.fragment.__doc__ = """
272n/aFragment identifier, that allows indirect identification of a secondary resource
273n/aby reference to a primary resource and additional identifying information.
274n/a"""
275n/a
276n/a_ParseResultBase.__doc__ = """
277n/aParseResult(scheme, netloc, path, params, query, fragment)
278n/a
279n/aA 6-tuple that contains components of a parsed URL.
280n/a"""
281n/a
282n/a_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
283n/a_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
284n/a_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
285n/a_ParseResultBase.params.__doc__ = """
286n/aParameters for last path element used to dereference the URI in order to provide
287n/aaccess to perform some operation on the resource.
288n/a"""
289n/a
290n/a_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
291n/a_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__
292n/a
293n/a
# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr
298n/a
# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    """str (url, fragment) pair."""
    __slots__ = ()

    def geturl(self):
        """Return url, followed by '#' + fragment when fragment is non-empty."""
        if self.fragment:
            return self.url + '#' + self.fragment
        return self.url
307n/a
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """str 5-tuple (scheme, netloc, path, query, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components with urlunsplit()."""
        return urlunsplit(self)
312n/a
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """str 6-tuple (scheme, netloc, path, params, query, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components with urlunparse()."""
        return urlunparse(self)
317n/a
# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """bytes (url, fragment) pair."""
    __slots__ = ()

    def geturl(self):
        """Return url, followed by b'#' + fragment when fragment is non-empty."""
        if self.fragment:
            return self.url + b'#' + self.fragment
        return self.url
326n/a
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """bytes 5-tuple (scheme, netloc, path, query, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components with urlunsplit()."""
        return urlunsplit(self)
331n/a
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """bytes 6-tuple (scheme, netloc, path, params, query, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components with urlunparse()."""
        return urlunparse(self)
336n/a
# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Link each str result class with its bytes counterpart, both ways.

    These attributes are what the encode()/decode() mixin methods look up.
    """
    _result_pairs = (
        (DefragResult, DefragResultBytes),
        (SplitResult, SplitResultBytes),
        (ParseResult, ParseResultBytes),
    )
    for _decoded, _encoded in _result_pairs:
        _decoded._encoded_counterpart = _encoded
        _encoded._decoded_counterpart = _decoded

# Run once at import time, then drop the helper from the namespace.
_fix_result_transcoding()
del _fix_result_transcoding
350n/a
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    splitresult = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = splitresult
    # Params are only split off for schemes listed in uses_params.
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    result = ParseResult(scheme, netloc, url, params, query, fragment)
    return _coerce_result(result)
366n/a
367n/adef _splitparams(url):
368n/a if '/' in url:
369n/a i = url.find(';', url.rfind('/'))
370n/a if i < 0:
371n/a return url, ''
372n/a else:
373n/a i = url.find(';')
374n/a return url[:i], url[i+1:]
375n/a
376n/adef _splitnetloc(url, start=0):
377n/a delim = len(url) # position of end of domain part of url, default is end
378n/a for c in '/?#': # look for delimiters; the order is NOT important
379n/a wdelim = url.find(c, start) # find first of this delim
380n/a if wdelim >= 0: # if found
381n/a delim = min(delim, wdelim) # use earliest delim position
382n/a return url[start:delim], url[delim:] # return (domain, rest)
383n/a
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Raises ValueError("Invalid IPv6 URL") when the netloc contains only
    one of '[' and ']'."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        candidate = url[:i]
        if candidate == 'http': # optimize the common case
            # The exact-match fast path skips both the character scan
            # and the port-number check below.
            scheme, url = candidate, url[i+1:]
        elif all(c in scheme_chars for c in candidate):
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            rest = url[i+1:]
            if not rest or any(c not in '0123456789' for c in rest):
                # not a port number
                scheme, url = candidate.lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Exactly one of the two brackets present means a broken literal.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)
439n/a
def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment, _coerce_result = (
                                                  _coerce_args(*components))
    if params:
        # Fold the params back into the path before delegating.
        url = "%s;%s" % (url, params)
    return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))
450n/a
def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment, _coerce_result = (
                                          _coerce_args(*components))
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        # A non-empty path following a netloc must start with '/'.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)
469n/a
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty *base* returns *url* unchanged, and an empty *url* returns
    *base* unchanged."""
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)

    # A different scheme, or one that is not in uses_relative, means
    # *url* is returned as given.
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not path and not params:
        # Nothing but a query and/or fragment: keep the base path.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1] != '':
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        del base_parts[-1]

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = filter(None, segments[1:-1])

    resolved_path = []

    for seg in segments:
        if seg == '..':
            try:
                resolved_path.pop()
            except IndexError:
                # ignore any .. segments that would otherwise cause an IndexError
                # when popped from resolved_path if resolving for rfc3986
                pass
        elif seg == '.':
            continue
        else:
            resolved_path.append(seg)

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved_path.append('')

    return _coerce_result(urlunparse((scheme, netloc, '/'.join(
        resolved_path) or '/', params, query, fragment)))
537n/a
538n/a
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' in url:
        # Round-trip through urlparse/urlunparse with an empty fragment.
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
    else:
        frag = ''
        defrag = url
    return _coerce_result(DefragResult(defrag, frag))
554n/a
_hexdig = '0123456789ABCDEFabcdef'
# Maps each two-character hex pair (as bytes) to its byte value; built
# lazily on the first unquote_to_bytes() call that needs it.
_hextobyte = None

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    Accepts str or bytes. A '%' that is not followed by two hex digits
    is kept literally.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    bits = string.split(b'%')
    if len(bits) == 1:
        return string
    res = [bits[0]]
    append = res.append
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    global _hextobyte
    if _hextobyte is None:
        _hextobyte = {(a + b).encode(): bytes.fromhex(a + b)
                      for a in _hexdig for b in _hexdig}
    for item in bits[1:]:
        try:
            append(_hextobyte[item[:2]])
            append(item[2:])
        except KeyError:
            # Not a valid escape: put the '%' back untouched.
            append(b'%')
            append(item)
    return b''.join(res)
587n/a
# Captures runs of ASCII characters, so that split() alternates between
# non-ASCII text (even indexes) and ASCII runs (odd indexes).
_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    unquote('abc%20def') -> 'abc def'.
    """
    if '%' not in string:
        # Attribute access only: fails early on non-string-like input.
        string.split
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    bits = _asciire.split(string)
    res = [bits[0]]
    append = res.append
    # Only the ASCII runs are unquoted; the text between them is copied.
    for i in range(1, len(bits), 2):
        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
        append(bits[i + 1])
    return ''.join(res)
614n/a
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        Returns a dict mapping each name to the list of its values, in
        the order parse_qsl() produced them.
    """
    parsed_result = {}
    pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
                      encoding=encoding, errors=errors)
    for name, value in pairs:
        parsed_result.setdefault(name, []).append(value)
    return parsed_result
646n/a
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were  not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    Returns a list of (name, value) pairs.
    """
    qs, _coerce_result = _coerce_args(qs)
    # Both '&' and ';' separate fields.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = nv[0].replace('+', ' ')
            name = unquote(name, encoding=encoding, errors=errors)
            name = _coerce_result(name)
            value = nv[1].replace('+', ' ')
            value = unquote(value, encoding=encoding, errors=errors)
            value = _coerce_result(value)
            r.append((name, value))
    return r
694n/a
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # Replace first, so that an escaped plus (%2B) survives as '+'.
    string = string.replace('+', ' ')
    return unquote(string, encoding, errors)
703n/a
704n/a_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
705n/a b'abcdefghijklmnopqrstuvwxyz'
706n/a b'0123456789'
707n/a b'_.-')
708n/a_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
709n/a_safe_quoters = {}
710n/a
class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # Keeps a cache internally, using defaultdict, for efficiency (lookups
    # of cached keys don't call Python code at all).
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<%s %r>" % (self.__class__.__name__, dict(self))

    def __missing__(self, b):
        # Handle a cache miss. Store quoted string in cache and return.
        res = chr(b) if b in self.safe else '%{:02X}'.format(b)
        self[b] = res
        return res
732n/a
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects. encoding and errors
    must not be specified if string is a bytes object.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).

    Raises TypeError when encoding or errors is given for a non-str string.
    """
    if isinstance(string, str):
        if not string:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'strict'
        string = string.encode(encoding, errors)
    else:
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    return quote_from_bytes(string, safe)
776n/a
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.
    """
    # Check if ' ' in string, where string may either be a str or bytes.  If
    # there are no spaces, the regular quote will produce the right answer.
    if ((isinstance(string, str) and ' ' not in string) or
        (isinstance(string, bytes) and b' ' not in string)):
        return quote(string, safe, encoding, errors)
    # Let spaces through quote() untouched, then turn them into '+'.
    if isinstance(safe, str):
        space = ' '
    else:
        space = b' '
    string = quote(string, safe + space, encoding, errors)
    return string.replace(' ', '+')
793n/a
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3f'

    Raises TypeError when bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes([c for c in safe if c < 128])
    # Fast path: nothing is left after stripping safe bytes, so no byte
    # needs quoting.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    try:
        quoter = _safe_quoters[safe]
    except KeyError:
        _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
    return ''.join([quoter(char) for char in bs])
815n/a
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is a str).

    Raises TypeError when query is neither a mapping nor a sequence whose
    first item is a tuple.
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as exc:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                exc.__traceback__)

    def _quote(component):
        # bytes are quoted as-is; anything else goes through str() first.
        if isinstance(component, bytes):
            return quote_via(component, safe)
        return quote_via(str(component), safe, encoding, errors)

    l = []
    for k, v in query:
        k = _quote(k)
        if not doseq or isinstance(v, bytes):
            l.append(k + '=' + _quote(v))
        elif isinstance(v, str):
            l.append(k + '=' + quote_via(v, safe, encoding, errors))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                l.append(k + '=' + quote_via(str(v), safe, encoding, errors))
            else:
                # loop over the sequence
                for elt in v:
                    l.append(k + '=' + _quote(elt))
    return '&'.join(l)
895n/a
def to_bytes(url):
    """Return url unchanged after checking that a str url is pure ASCII.

    to_bytes(u"URL") --> 'URL'.

    Non-str arguments are returned untouched.  A str containing any
    non-ASCII character raises UnicodeError.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        encoded = url.encode("ASCII")
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
    return encoded.decode()
908n/a
def unwrap(url):
    """Strip an optional '<...>' wrapper and 'URL:' prefix from url.

    unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() first; surrounding whitespace is
    removed at every step.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        # Drop the angle brackets and any padding inside them.
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
916n/a
_typeprog = None
def splittype(url):
    """Split a leading scheme off url.

    splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The scheme is returned lower-cased.  When url has no scheme, the result
    is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # Compiled on first use and kept in the module-level cache.
        _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = _typeprog.match(url)
    if not found:
        return None, url
    scheme, rest = found.groups()
    return scheme.lower(), rest
929n/a
_hostprog = None
def splithost(url):
    """Split the network location off the front of url.

    splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    A non-empty remainder that does not start with '/' gets one prepended.
    When url does not start with '//', the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        # Compiled on first use and kept in the module-level cache.
        _hostprog = re.compile('//([^/?]*)(.*)', re.DOTALL)

    found = _hostprog.match(url)
    if found is None:
        return None, url
    host_port, path = found.groups()
    if path and not path.startswith('/'):
        path = '/' + path
    return host_port, path
944n/a
def splituser(host):
    """Separate the user-info part from the host part.

    splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Without an '@' the user part is None.
    """
    credentials, at_sign, hostport = host.rpartition('@')
    if not at_sign:
        return None, hostport
    return credentials, hostport
949n/a
def splitpasswd(user):
    """Separate the password from the user name.

    splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Without a ':' the password is None.
    """
    name, colon, secret = user.partition(':')
    if not colon:
        return name, None
    return name, secret
954n/a
955n/a# splittag('/path#tag') --> '/path', 'tag'
_portprog = None
def splitport(host):
    """Separate a trailing numeric port from host.

    splitport('host:port') --> 'host', 'port'.

    The port is returned as a string.  It is None when host has no ':' +
    digits suffix, or when nothing follows the final ':'; in the latter
    case the trailing ':' is still removed from the returned host.
    """
    global _portprog
    if _portprog is None:
        # Compiled on first use and kept in the module-level cache.
        _portprog = re.compile('(.*):([0-9]*)$', re.DOTALL)

    found = _portprog.match(host)
    if found is None:
        return host, None
    name, port = found.groups()
    # An empty port string is reported as None.
    return name, (port or None)
969n/a
def splitnport(host, defport=-1):
    """Split host and port, returning the port as a number.

    The split happens at the last ':'.
    Return the given default port (-1 unless overridden) if there is no
    ':' or if nothing follows it.
    Return the numeric port if a valid number is found after the ':'.
    Return None as the port if the text after the ':' is not a valid number.
    """
    name, colon, port_text = host.rpartition(':')
    if not colon:
        # Without a separator, rpartition leaves the whole input in the tail.
        return port_text, defport
    if not port_text:
        return name, defport
    try:
        return name, int(port_text)
    except ValueError:
        return name, None
985n/a
def splitquery(url):
    """Separate the query string from the rest of url.

    splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Without a '?' the result is
    (url, None).
    """
    head, question_mark, tail = url.rpartition('?')
    if not question_mark:
        return url, None
    return head, tail
992n/a
def splittag(url):
    """Separate the fragment identifier from the rest of url.

    splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Without a '#' the result is
    (url, None).
    """
    head, hash_mark, fragment = url.rpartition('#')
    if not hash_mark:
        return url, None
    return head, fragment
999n/a
def splitattr(url):
    """Separate the ';'-delimited attributes from the path.

    splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attributes = url.split(';')
    return path, attributes
1005n/a
def splitvalue(attr):
    """Separate an attribute name from its value.

    splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Without an '=' the value is None.
    """
    name, equals, value = attr.partition('=')
    if not equals:
        return name, None
    return name, value