1 | n/a | """Parse (absolute and relative) URLs. |
---|
2 | n/a | |
---|
3 | n/a | urlparse module is based upon the following RFC specifications. |
---|
4 | n/a | |
---|
5 | n/a | RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding |
---|
6 | n/a | and L. Masinter, January 2005. |
---|
7 | n/a | |
---|
8 | n/a | RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden, B. Carpenter |
---|
9 | n/a | and L.Masinter, December 1999. |
---|
10 | n/a | |
---|
11 | n/a | RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T. |
---|
12 | n/a | Berners-Lee, R. Fielding, and L. Masinter, August 1998. |
---|
13 | n/a | |
---|
14 | n/a | RFC 2368: "The mailto URL scheme", by P. Hoffman, L. Masinter, J. Zawinski, July 1998. |
---|
15 | n/a | |
---|
16 | n/a | RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June |
---|
17 | n/a | 1995. |
---|
18 | n/a | |
---|
19 | n/a | RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M. |
---|
20 | n/a | McCahill, December 1994 |
---|
21 | n/a | |
---|
22 | n/a | RFC 3986 is considered the current standard and any future changes to |
---|
23 | n/a | urlparse module should conform with it. The urlparse module is |
---|
24 | n/a | currently not entirely compliant with this RFC due to de facto |
---|
25 | n/a | scenarios for parsing, and for backward compatibility purposes, some |
---|
26 | n/a | parsing quirks from older RFCs are retained. The testcases in |
---|
27 | n/a | test_urlparse.py provide a good indicator of parsing behavior. |
---|
28 | n/a | """ |
---|
29 | n/a | |
---|
30 | n/a | import re |
---|
31 | n/a | import sys |
---|
32 | n/a | import collections |
---|
33 | n/a | |
---|
# Names exported by "from <this module> import *".
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]
---|
40 | n/a | |
---|
# A classification of schemes ('' means apply by default)
# urljoin() only resolves relative references for schemes listed here.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']
# Consulted by urlunsplit() and urljoin() when deciding how to treat the
# '//netloc' part of a URL.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']
# urlparse() only splits ';params' off the path for schemes listed here.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
# (urlsplit() checks every character before the first ':' against this.)
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
---|
70 | n/a | |
---|
# XXX: Consider replacing with functools.lru_cache
# urlsplit() memoises its results in _parse_cache; once the dict holds
# MAX_CACHE_SIZE entries it calls clear_cache() before adding another.
MAX_CACHE_SIZE = 20
_parse_cache = {}
---|
74 | n/a | |
---|
def clear_cache():
    """Empty the URL parse cache and the safe-quoters cache."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
---|
79 | n/a | |
---|
80 | n/a | |
---|
# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
# These two values are the default codec and error handler used by
# _encode_result() and _decode_args().
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'
---|
89 | n/a | |
---|
90 | n/a | def _noop(obj): |
---|
91 | n/a | return obj |
---|
92 | n/a | |
---|
def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Result coercion for bytes callers: return obj.encode(encoding, errors)."""
    encoded = obj.encode(encoding, errors)
    return encoded
---|
96 | n/a | |
---|
def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Decode each item of *args* to str and return them as a tuple.

    Falsy items (for example b'' or None) become the empty string.
    """
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)
---|
100 | n/a | |
---|
def _coerce_args(*args):
    """Normalise arguments to str and pick the matching result coercer.

    Returns the (possibly decoded) arguments followed by one extra item:
    _noop when the first argument is a str, _encode_result otherwise.
    Raises TypeError when a non-empty later argument disagrees with the
    first one about being a str.
    """
    wants_str = isinstance(args[0], str)
    # Empty values are exempt from the check so that the "scheme=''"
    # default argument of some functions works with bytes input too.
    mixed = any(other and isinstance(other, str) != wants_str
                for other in args[1:])
    if mixed:
        raise TypeError("Cannot mix str and non-str arguments")
    if not wants_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
---|
116 | n/a | |
---|
117 | n/a | # Result objects are more helpful than simple tuples |
---|
118 | n/a | class _ResultMixinStr(object): |
---|
119 | n/a | """Standard approach to encoding parsed results from str to bytes""" |
---|
120 | n/a | __slots__ = () |
---|
121 | n/a | |
---|
122 | n/a | def encode(self, encoding='ascii', errors='strict'): |
---|
123 | n/a | return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self)) |
---|
124 | n/a | |
---|
125 | n/a | |
---|
126 | n/a | class _ResultMixinBytes(object): |
---|
127 | n/a | """Standard approach to decoding parsed results from bytes to str""" |
---|
128 | n/a | __slots__ = () |
---|
129 | n/a | |
---|
130 | n/a | def decode(self, encoding='ascii', errors='strict'): |
---|
131 | n/a | return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self)) |
---|
132 | n/a | |
---|
133 | n/a | |
---|
134 | n/a | class _NetlocResultMixinBase(object): |
---|
135 | n/a | """Shared methods for the parsed result objects containing a netloc element""" |
---|
136 | n/a | __slots__ = () |
---|
137 | n/a | |
---|
138 | n/a | @property |
---|
139 | n/a | def username(self): |
---|
140 | n/a | return self._userinfo[0] |
---|
141 | n/a | |
---|
142 | n/a | @property |
---|
143 | n/a | def password(self): |
---|
144 | n/a | return self._userinfo[1] |
---|
145 | n/a | |
---|
146 | n/a | @property |
---|
147 | n/a | def hostname(self): |
---|
148 | n/a | hostname = self._hostinfo[0] |
---|
149 | n/a | if not hostname: |
---|
150 | n/a | hostname = None |
---|
151 | n/a | elif hostname is not None: |
---|
152 | n/a | hostname = hostname.lower() |
---|
153 | n/a | return hostname |
---|
154 | n/a | |
---|
155 | n/a | @property |
---|
156 | n/a | def port(self): |
---|
157 | n/a | port = self._hostinfo[1] |
---|
158 | n/a | if port is not None: |
---|
159 | n/a | port = int(port, 10) |
---|
160 | n/a | if not ( 0 <= port <= 65535): |
---|
161 | n/a | raise ValueError("Port out of range 0-65535") |
---|
162 | n/a | return port |
---|
163 | n/a | |
---|
164 | n/a | |
---|
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """str implementation of the netloc splitting used by the base mixin."""
    __slots__ = ()

    @property
    def _userinfo(self):
        # Everything before the last '@' is "user[:password]".
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        host_port = self.netloc.rpartition('@')[2]
        _, open_bracket, after_bracket = host_port.partition('[')
        if open_bracket:
            # Bracketed host: the port, if any, follows the closing ']'.
            host, _, trailer = after_bracket.partition(']')
            port = trailer.partition(':')[2]
        else:
            host, _, port = host_port.partition(':')
        return host, (port or None)
---|
193 | n/a | |
---|
194 | n/a | |
---|
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """bytes implementation of the netloc splitting used by the base mixin."""
    __slots__ = ()

    @property
    def _userinfo(self):
        # Everything before the last b'@' is b"user[:password]".
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(b':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        host_port = self.netloc.rpartition(b'@')[2]
        _, open_bracket, after_bracket = host_port.partition(b'[')
        if open_bracket:
            # Bracketed host: the port, if any, follows the closing b']'.
            host, _, trailer = after_bracket.partition(b']')
            port = trailer.partition(b':')[2]
        else:
            host, _, port = host_port.partition(b':')
        return host, (port or None)
---|
223 | n/a | |
---|
224 | n/a | |
---|
225 | n/a | from collections import namedtuple |
---|
226 | n/a | |
---|
# Plain namedtuple bases; the public result classes below combine each of
# these with a str or bytes mixin.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

# The assignments below attach docstrings to the generated classes and to
# their per-field descriptors.
_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

# Fields shared with SplitResult reuse its field docstrings.
_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__
---|
292 | n/a | |
---|
293 | n/a | |
---|
# For backwards compatibility, keep ResultBase as an alias of
# _NetlocResultMixinStr.  ResultBase is no longer part of the documented
# API, but it is retained since deprecating it isn't worth the hassle.
ResultBase = _NetlocResultMixinStr
---|
298 | n/a | |
---|
299 | n/a | # Structured result objects for string data |
---|
class DefragResult(_DefragResultBase, _ResultMixinStr):
    __slots__ = ()

    def geturl(self):
        """Return the URL text, with '#fragment' re-attached when non-empty."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment
---|
307 | n/a | |
---|
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    # str 5-field result; the netloc mixin adds username/password/hostname/port.
    __slots__ = ()
    def geturl(self):
        """Reassemble this tuple into a URL string via urlunsplit()."""
        return urlunsplit(self)
---|
312 | n/a | |
---|
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    # str 6-field result; the netloc mixin adds username/password/hostname/port.
    __slots__ = ()
    def geturl(self):
        """Reassemble this tuple into a URL string via urlunparse()."""
        return urlunparse(self)
---|
317 | n/a | |
---|
318 | n/a | # Structured result objects for bytes data |
---|
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    __slots__ = ()

    def geturl(self):
        """Return the URL bytes, with b'#fragment' re-attached when non-empty."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment
---|
326 | n/a | |
---|
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    # bytes 5-field result; the netloc mixin adds username/password/hostname/port.
    __slots__ = ()
    def geturl(self):
        """Reassemble this tuple into a URL via urlunsplit()."""
        return urlunsplit(self)
---|
331 | n/a | |
---|
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    # bytes 6-field result; the netloc mixin adds username/password/hostname/port.
    __slots__ = ()
    def geturl(self):
        """Reassemble this tuple into a URL via urlunparse()."""
        return urlunparse(self)
---|
336 | n/a | |
---|
337 | n/a | # Set up the encode/decode result pairs |
---|
def _fix_result_transcoding():
    """Link every str result class with its bytes twin, in both directions."""
    str_classes = (DefragResult, SplitResult, ParseResult)
    bytes_classes = (DefragResultBytes, SplitResultBytes, ParseResultBytes)
    for text_cls, binary_cls in zip(str_classes, bytes_classes):
        text_cls._encoded_counterpart = binary_cls
        binary_cls._decoded_counterpart = text_cls

# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding
---|
350 | n/a | |
---|
def urlparse(url, scheme='', allow_fragments=True):
    """Split a URL of the form
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    into the 6-tuple (scheme, netloc, path, params, query, fragment).

    The components are not broken down any further (netloc stays one
    string) and % escapes are left as they are."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Only schemes listed in uses_params get ';params' split off the path.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return _coerce_result(
        ParseResult(scheme, netloc, path, params, query, fragment))
---|
366 | n/a | |
---|
367 | n/a | def _splitparams(url): |
---|
368 | n/a | if '/' in url: |
---|
369 | n/a | i = url.find(';', url.rfind('/')) |
---|
370 | n/a | if i < 0: |
---|
371 | n/a | return url, '' |
---|
372 | n/a | else: |
---|
373 | n/a | i = url.find(';') |
---|
374 | n/a | return url[:i], url[i+1:] |
---|
375 | n/a | |
---|
376 | n/a | def _splitnetloc(url, start=0): |
---|
377 | n/a | delim = len(url) # position of end of domain part of url, default is end |
---|
378 | n/a | for c in '/?#': # look for delimiters; the order is NOT important |
---|
379 | n/a | wdelim = url.find(c, start) # find first of this delim |
---|
380 | n/a | if wdelim >= 0: # if found |
---|
381 | n/a | delim = min(delim, wdelim) # use earliest delim position |
---|
382 | n/a | return url[start:delim], url[delim:] # return (domain, rest) |
---|
383 | n/a | |
---|
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    *scheme* is the value reported when the URL itself carries no scheme.
    When *allow_fragments* is false, '#' is not treated as a fragment
    delimiter and stays in the path or query.  Results are memoised in
    _parse_cache.  Raises ValueError("Invalid IPv6 URL") when the netloc
    contains '[' without ']' or vice versa."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key, alongside the values.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return _coerce_result(v)
        # General case: the text before ':' is a scheme only if every
        # character is a valid scheme character (the for/else below).
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            rest = url[i+1:]
            if not rest or any(c not in '0123456789' for c in rest):
                # not a port number
                scheme, url = url[:i].lower(), rest

    # Same netloc / fragment / query splitting as the 'http' fast path above.
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)
---|
439 | n/a | |
---|
def urlunparse(components):
    """Rebuild a URL from a 6-item (scheme, netloc, path, params, query,
    fragment) sequence.

    The output may differ slightly from the URL originally parsed while
    remaining equivalent, e.g. when the original had redundant delimiters
    such as a ? followed by an empty query."""
    (scheme, netloc, path, params, query, fragment,
     _coerce_result) = _coerce_args(*components)
    # Non-empty params are folded back into the path before delegating.
    path_and_params = "%s;%s" % (path, params) if params else path
    pieces = (scheme, netloc, path_and_params, query, fragment)
    return _coerce_result(urlunsplit(pieces))
---|
450 | n/a | |
---|
def urlunsplit(components):
    """Join the five items (scheme, netloc, path, query, fragment), as
    returned by urlsplit(), back into one URL.

    *components* may be any five-item iterable.  The output may differ
    slightly from the URL originally parsed while remaining equivalent,
    e.g. when the original had unnecessary delimiters such as a ? followed
    by an empty query."""
    (scheme, netloc, url, query, fragment,
     _coerce_result) = _coerce_args(*components)
    # Emit '//netloc' when there is a netloc, or when the scheme is a
    # netloc-using one and the path does not already start with '//'.
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if wants_authority:
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url += '?' + query
    if fragment:
        url += '#' + fragment
    return _coerce_result(url)
---|
469 | n/a | |
---|
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty *base* returns *url* unchanged and an empty *url* returns
    *base* unchanged.  *allow_fragments* is passed through to urlparse()
    for both arguments."""
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base scheme is the default for the reference being resolved.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)

    # A different scheme, or one that is not in uses_relative, means the
    # reference is returned as given.
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        # A reference with its own netloc replaces everything after the scheme.
        if netloc:
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    # No path and no params: inherit both from the base, and the base
    # query too unless the reference has its own.
    if not path and not params:
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1] != '':
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        del base_parts[-1]

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = filter(None, segments[1:-1])

    resolved_path = []

    # Apply '.' and '..' segments while rebuilding the path.
    for seg in segments:
        if seg == '..':
            try:
                resolved_path.pop()
            except IndexError:
                # ignore any .. segments that would otherwise cause an IndexError
                # when popped from resolved_path if resolving for rfc3986
                pass
        elif seg == '.':
            continue
        else:
            resolved_path.append(seg)

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved_path.append('')

    # An empty joined path is emitted as '/'.
    return _coerce_result(urlunparse((scheme, netloc, '/'.join(
        resolved_path) or '/', params, query, fragment)))
---|
537 | n/a | |
---|
538 | n/a | |
---|
def urldefrag(url):
    """Strip the fragment, if any, from *url*.

    Returns a DefragResult (url_without_fragment, fragment); the fragment
    is the empty string when the URL contains no '#'.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        return _coerce_result(DefragResult(url, ''))
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return _coerce_result(DefragResult(stripped, fragment))
---|
554 | n/a | |
---|
_hexdig = '0123456789ABCDEFabcdef'
# Two-hex-digit bytes -> decoded single byte; filled in on first use.
_hextobyte = None

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: str input is encoded as UTF-8.  That only matters when it holds
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Attribute probe: a falsy object that is not string-like fails here.
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    head, *escaped = string.split(b'%')
    if not escaped:
        return string
    # Build the lookup table lazily so no memory is spent on it unless the
    # function is actually used.
    global _hextobyte
    if _hextobyte is None:
        _hextobyte = {(a + b).encode(): bytes.fromhex(a + b)
                      for a in _hexdig for b in _hexdig}
    table = _hextobyte
    out = [head]
    for chunk in escaped:
        decoded = table.get(chunk[:2])
        if decoded is None:
            # Not a valid %xx escape: keep the '%' and the text as they were.
            out.append(b'%')
            out.append(chunk)
        else:
            out.append(decoded)
            out.append(chunk[2:])
    return b''.join(out)
---|
587 | n/a | |
---|
# Splits text into alternating non-ASCII and ASCII runs (ASCII runs captured).
_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by the characters they stand for.

    *encoding* and *errors* say how the percent-encoded byte sequences are
    decoded to str, with the meaning they have for bytes.decode().  The
    defaults decode as UTF-8 and substitute a placeholder character for
    invalid sequences; passing None for either selects its default.

    unquote('abc%20def') -> 'abc def'.
    """
    if '%' not in string:
        string.split
        return string
    encoding = 'utf-8' if encoding is None else encoding
    errors = 'replace' if errors is None else errors
    pieces = _asciire.split(string)
    out = [pieces[0]]
    # After the first item, pieces alternate: ASCII run, non-ASCII run, ...
    # Only the ASCII runs can contain escapes.
    for ascii_run, other in zip(pieces[1::2], pieces[2::2]):
        out.append(unquote_to_bytes(ascii_run).decode(encoding, errors))
        out.append(other)
    return ''.join(out)
---|
614 | n/a | |
---|
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query string into a dict mapping each name to a list of values.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag controlling blank values in
            percent-encoded queries.  True keeps them as blank
            strings; the default False drops them, as though they
            had not been included.

        strict_parsing: flag controlling what happens on parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.
    """
    grouped = {}
    name_value_pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
                                 encoding=encoding, errors=errors)
    for name, value in name_value_pairs:
        grouped.setdefault(name, []).append(value)
    return grouped
---|
646 | n/a | |
---|
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: percent-encoded query string to be parsed.

    keep_blank_values: when true, fields with a blank value (including a
        bare name with no '=') are kept with an empty-string value; when
        false (the default) they are dropped.

    strict_parsing: when true, a field without '=' (including an empty
        field) raises ValueError; when false (the default) such fields
        are silently skipped.

    encoding and errors: how percent-encoded sequences are decoded into
        Unicode characters, as accepted by the bytes.decode() method.

    Fields are separated by '&' or ';'.  Pairs are returned in the order
    in which they appear in qs.
    """
    # NOTE(review): _coerce_args is defined elsewhere in this module; it is
    # used here to obtain the working string and a converter that is applied
    # to every name and value before it is returned.
    qs, _coerce_result = _coerce_args(qs)
    result = []
    for amp_chunk in qs.split('&'):
        for field in amp_chunk.split(';'):
            # Empty fields are only reported in strict mode.
            if not (field or strict_parsing):
                continue
            raw_name, equals, raw_value = field.partition('=')
            if not equals:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control name with no equal sign: keep it only when
                # blank values were requested (raw_value is '' here).
                if not keep_blank_values:
                    continue
            if not (raw_value or keep_blank_values):
                continue
            name = unquote(raw_name.replace('+', ' '),
                           encoding=encoding, errors=errors)
            name = _coerce_result(name)
            value = unquote(raw_value.replace('+', ' '),
                            encoding=encoding, errors=errors)
            value = _coerce_result(value)
            result.append((name, value))
    return result
---|
694 | n/a | |
---|
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but first turn every plus sign into a space, as
    needed when unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    spaced = string.replace('+', ' ')
    return unquote(spaced, encoding, errors)
---|
703 | n/a | |
---|
# Byte values that are never percent-encoded: ASCII letters, digits and
# the three punctuation characters '_', '.' and '-'.
_ALWAYS_SAFE = frozenset(
    b'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
    b'0123456789_.-'
)
# The same byte values packed into a bytes object.
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache of quoting callables, filled in lazily and keyed by the 'safe' bytes.
_safe_quoters = {}
---|
710 | n/a | |
---|
class Quoter(collections.defaultdict):
    """Map a byte value (an int in range(0, 256)) to its quoted string.

    A byte that belongs to the safe set (the always-safe set plus whatever
    was passed to the constructor) maps to the corresponding character;
    every other byte maps to its '%XX' percent-encoded form.
    """
    # The defaultdict base acts as a cache: __missing__ runs only on the
    # first lookup of a byte, later lookups are plain dict hits.

    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Override so the object does not display as a plain defaultdict.
        cached = dict(self)
        return "<%s %r>" % (self.__class__.__name__, cached)

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, and return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
---|
732 | n/a | |
---|
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Percent-encode a str or bytes value for use in a URL.

    Different URL components reserve different characters.  RFC 2396
    (Uniform Resource Identifiers (URI): Generic Syntax) lists these
    reserved characters:

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each is reserved in some component of a URL, but not in all of them.
    The default safe='/' targets the path component, where existing
    slashes act as separators and so are left alone.

    string and safe may each be str or bytes.  encoding and errors must
    not be given when string is a bytes object (TypeError is raised).

    For a str input, encoding and errors say how non-ASCII characters are
    turned into bytes, as accepted by str.encode; they default to
    'utf-8' and 'strict' (unsupported characters raise
    UnicodeEncodeError).  An empty str is returned unchanged.
    """
    if not isinstance(string, str):
        # Bytes-like input is quoted as-is; codec arguments make no sense.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)

    if not string:
        return string
    codec = 'utf-8' if encoding is None else encoding
    policy = 'strict' if errors is None else errors
    return quote_from_bytes(string.encode(codec, policy), safe)
---|
776 | n/a | |
---|
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also turn ' ' into '+', as needed when quoting
    HTML form values.  Plus signs already in the input are escaped unless
    they appear in safe.  Unlike quote(), safe does not default to '/'.
    """
    # string may be str or bytes.  When it holds no space, plain quote()
    # already gives the right answer; any other type takes the slow path.
    if isinstance(string, str):
        contains_space = ' ' in string
    elif isinstance(string, bytes):
        contains_space = b' ' in string
    else:
        contains_space = True
    if not contains_space:
        return quote(string, safe, encoding, errors)

    # Mark the space as safe (matching the type of 'safe') so that it
    # survives quoting and can then be swapped for '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
---|
793 | n/a | |
---|
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but takes a bytes (or bytearray) object instead of a
    str and performs no string-to-bytes encoding.  Returns a str.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3F'

    Raises TypeError if bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    # Normalize 'safe' to a bytes object holding ASCII values only.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)
    # Fast path: nothing is left after stripping safe bytes, so no byte
    # needs escaping.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    # One quoting callable per distinct 'safe' value, created on demand.
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = _safe_quoters[safe] = Quoter(safe).__getitem__
    return ''.join(map(quoter, bs))
---|
815 | n/a | |
---|
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    With doseq true, a value that is a sequence (anything supporting
    len() other than str or bytes) produces one 'name=value' parameter
    per element.  With doseq false, every value is encoded as a single
    parameter.

    When query is a sequence of two-element tuples, parameters appear in
    the output in the same order as in the input.

    Names and values may each be str or bytes; anything else is passed
    through str() first.

    safe, encoding and errors are handed to quote_via (encoding and
    errors only for components that are not bytes).

    Raises TypeError if query is neither a mapping nor a sequence whose
    first element is a tuple.
    """

    def _quote_component(item):
        # bytes are quoted directly; everything else goes through str().
        if isinstance(item, bytes):
            return quote_via(item, safe)
        return quote_via(str(item), safe, encoding, errors)

    if hasattr(query, "items"):
        query = query.items()
    else:
        # Strings and string-like objects are sequences too, so they must
        # be rejected explicitly.
        try:
            # len() fails for non-sequences; a non-empty string fails the
            # tuple check on its first element.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Empty sequences of any type pass, which keeps the historical
            # behaviour of accepting empty dicts.
        except TypeError:
            tb = sys.exc_info()[2]
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    single_valued = not doseq
    fields = []
    for key, value in query:
        key = _quote_component(key)

        if single_valued or isinstance(value, bytes):
            encoded = _quote_component(value)
            fields.append(key + '=' + encoded)
        elif isinstance(value, str):
            encoded = quote_via(value, safe, encoding, errors)
            fields.append(key + '=' + encoded)
        else:
            try:
                # Supporting len() is the test for sequence-ness.
                len(value)
            except TypeError:
                # Not a sequence: encode its str() form as one parameter.
                encoded = _quote_component(value)
                fields.append(key + '=' + encoded)
            else:
                # One parameter per element of the sequence.
                for element in value:
                    encoded = _quote_component(element)
                    fields.append(key + '=' + encoded)
    return '&'.join(fields)
---|
895 | n/a | |
---|
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str is checked to be pure ASCII and returned as a str; any other
    object is returned unchanged.  Raises UnicodeError if a str contains
    non-ASCII characters.
    """
    # Most URL schemes require ASCII.  If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
---|
908 | n/a | |
---|
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Surrounding whitespace, one pair of enclosing angle brackets and a
    leading 'URL:' prefix are removed, in that order.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
---|
916 | n/a | |
---|
# Compiled on first use by splittype().
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned lower-cased.  If url has no leading
    'type:' part, (None, url) is returned.
    """
    global _typeprog
    if _typeprog is None:
        _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = _typeprog.match(url)
    if found is None:
        return None, url
    return found.group(1).lower(), found.group(2)
---|
929 | n/a | |
---|
# Compiled on first use by splithost().
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    If url does not start with '//', (None, url) is returned.  A
    non-empty remainder that does not begin with '/' gets one prepended.
    """
    global _hostprog
    if _hostprog is None:
        _hostprog = re.compile('//([^/?]*)(.*)', re.DOTALL)

    found = _hostprog.match(url)
    if found is None:
        return None, url
    host_port = found.group(1)
    rest = found.group(2)
    if rest and not rest.startswith('/'):
        rest = '/' + rest
    return host_port, rest
---|
944 | n/a | |
---|
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split is made at the last '@'.  Without an '@', the user part
    is None.
    """
    userinfo, at_sign, hostport = host.rpartition('@')
    if at_sign:
        return userinfo, hostport
    return None, hostport
---|
949 | n/a | |
---|
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split is made at the first ':'.  Without a ':', the password
    part is None.
    """
    name, colon, secret = user.partition(':')
    if not colon:
        secret = None
    return name, secret
---|
954 | n/a | |
---|
# Compiled on first use by splitport().
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  When the text after the
    last ':' is empty the port is None and the ':' is dropped; when it
    is not all digits, host is returned untouched with a None port.
    """
    global _portprog
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)$', re.DOTALL)

    found = _portprog.match(host)
    if found is None:
        return host, None
    name, port = found.groups()
    # An empty port ('host:') is reported as None.
    return name, (port or None)
---|
969 | n/a | |
---|
def splitnport(host, defport=-1):
    """Split host and port, returning a numeric port.

    Return defport (-1 unless given) if there is no ':' or nothing
    follows the last ':'.
    Return the port as an int if a valid number follows the last ':'.
    Return None as the port if the text after the last ':' is not a
    valid number."""
    name, colon, port_text = host.rpartition(':')
    if not colon:
        # No ':' at all: rpartition leaves the whole input in the last slot.
        return port_text, defport
    if not port_text:
        return name, defport
    try:
        return name, int(port_text)
    except ValueError:
        return name, None
---|
985 | n/a | |
---|
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split is made at the last '?'.  Without a '?', the query part
    is None.
    """
    before, mark, after = url.rpartition('?')
    if not mark:
        return url, None
    return before, after
---|
992 | n/a | |
---|
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split is made at the last '#'.  Without a '#', the tag part
    is None.
    """
    before, hash_mark, after = url.rpartition('#')
    if not hash_mark:
        return url, None
    return before, after
---|
999 | n/a | |
---|
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
    '/path', ['attr1=value1', 'attr2=value2', ...].

    Without a ';', the attribute list is empty."""
    path, *attrs = url.split(';')
    return path, attrs
---|
1005 | n/a | |
---|
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split is made at the first '='.  Without an '=', the value part
    is None.
    """
    name, equals, value = attr.partition('=')
    if not equals:
        value = None
    return name, value
---|