| 1 | n/a | """Parse (absolute and relative) URLs. |
|---|
| 2 | n/a | |
|---|
| 3 | n/a | urlparse module is based upon the following RFC specifications. |
|---|
| 4 | n/a | |
|---|
| 5 | n/a | RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding |
|---|
| 6 | n/a | and L. Masinter, January 2005. |
|---|
| 7 | n/a | |
|---|
| 8 | n/a | RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter |
|---|
| 9 | n/a | and L.Masinter, December 1999. |
|---|
| 10 | n/a | |
|---|
| 11 | n/a | RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T. |
|---|
| 12 | n/a | Berners-Lee, R. Fielding, and L. Masinter, August 1998. |
|---|
| 13 | n/a | |
|---|
| 14 | n/a | RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998. |
|---|
| 15 | n/a | |
|---|
| 16 | n/a | RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June |
|---|
| 17 | n/a | 1995. |
|---|
| 18 | n/a | |
|---|
| 19 | n/a | RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M. |
|---|
| 20 | n/a | McCahill, December 1994 |
|---|
| 21 | n/a | |
|---|
| 22 | n/a | RFC 3986 is considered the current standard and any future changes to |
|---|
| 23 | n/a | urlparse module should conform with it. The urlparse module is |
|---|
| 24 | n/a | currently not entirely compliant with this RFC due to defacto |
|---|
| 25 | n/a | scenarios for parsing, and for backward compatibility purposes, some |
|---|
| 26 | n/a | parsing quirks from older RFCs are retained. The testcases in |
|---|
| 27 | n/a | test_urlparse.py provides a good indicator of parsing behavior. |
|---|
| 28 | n/a | """ |
|---|

import re
import sys
import collections

__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]

# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# XXX: Consider replacing with functools.lru_cache
MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache and the quoters cache."""
    _parse_cache.clear()
    _safe_quoters.clear()


# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'

def _noop(obj):
    return obj

def _encode_result(obj, encoding=_implicit_encoding,
                        errors=_implicit_errors):
    return obj.encode(encoding, errors)

def _decode_args(args, encoding=_implicit_encoding,
                       errors=_implicit_errors):
    return tuple(x.decode(encoding, errors) if x else '' for x in args)

def _coerce_args(*args):
    # Invokes decode if necessary to create str args
    # and returns the coerced inputs along with
    # an appropriate result coercion function
    #  - noop for str inputs
    #  - encoding function otherwise
    str_input = isinstance(args[0], str)
    for arg in args[1:]:
        # We special-case the empty string to support the
        # "scheme=''" default argument to some functions
        if arg and isinstance(arg, str) != str_input:
            raise TypeError("Cannot mix str and non-str arguments")
    if str_input:
        return args + (_noop,)
    return _decode_args(args) + (_encode_result,)

# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Standard approach to encoding parsed results from str to bytes"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))


class _ResultMixinBytes(object):
    """Standard approach to decoding parsed results from bytes to str"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))


class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element"""
    __slots__ = ()

    @property
    def username(self):
        return self._userinfo[0]

    @property
    def password(self):
        return self._userinfo[1]

    @property
    def hostname(self):
        hostname = self._hostinfo[0]
        if not hostname:
            hostname = None
        elif hostname is not None:
            hostname = hostname.lower()
        return hostname

    @property
    def port(self):
        port = self._hostinfo[1]
        if port is not None:
            port = int(port, 10)
            if not (0 <= port <= 65535):
                raise ValueError("Port out of range 0-65535")
        return port


class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    __slots__ = ()

    @property
    def _userinfo(self):
        netloc = self.netloc
        userinfo, have_info, hostinfo = netloc.rpartition('@')
        if have_info:
            username, have_password, password = userinfo.partition(':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        netloc = self.netloc
        _, _, hostinfo = netloc.rpartition('@')
        _, have_open_br, bracketed = hostinfo.partition('[')
        if have_open_br:
            hostname, _, port = bracketed.partition(']')
            _, _, port = port.partition(':')
        else:
            hostname, _, port = hostinfo.partition(':')
        if not port:
            port = None
        return hostname, port


class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    __slots__ = ()

    @property
    def _userinfo(self):
        netloc = self.netloc
        userinfo, have_info, hostinfo = netloc.rpartition(b'@')
        if have_info:
            username, have_password, password = userinfo.partition(b':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        netloc = self.netloc
        _, _, hostinfo = netloc.rpartition(b'@')
        _, have_open_br, bracketed = hostinfo.partition(b'[')
        if have_open_br:
            hostname, _, port = bracketed.partition(b']')
            _, _, port = port.partition(b':')
        else:
            hostname, _, port = hostinfo.partition(b':')
        if not port:
            port = None
        return hostname, port


from collections import namedtuple

_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__


# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr

# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    __slots__ = ()
    def geturl(self):
        if self.fragment:
            return self.url + '#' + self.fragment
        else:
            return self.url

class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    __slots__ = ()
    def geturl(self):
        return urlunsplit(self)

class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    __slots__ = ()
    def geturl(self):
        return urlunparse(self)

# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        if self.fragment:
            return self.url + b'#' + self.fragment
        else:
            return self.url

class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        return urlunsplit(self)

class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        return urlunparse(self)

# Set up the encode/decode result pairs
def _fix_result_transcoding():
    _result_pairs = (
        (DefragResult, DefragResultBytes),
        (SplitResult, SplitResultBytes),
        (ParseResult, ParseResultBytes),
    )
    for _decoded, _encoded in _result_pairs:
        _decoded._encoded_counterpart = _encoded
        _encoded._decoded_counterpart = _decoded

_fix_result_transcoding()
del _fix_result_transcoding

def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    splitresult = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = splitresult
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    result = ParseResult(scheme, netloc, url, params, query, fragment)
    return _coerce_result(result)
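
# Illustrative usage (editor's example, not part of the original module;
# example.com values are placeholders).  urlparse() yields all six
# components, and the result object exposes the netloc convenience
# attributes from _NetlocResultMixinBase:
#
#   >>> p = urlparse('http://user:pwd@example.com:8042/a/b;type=a?name=ferret#nose')
#   >>> p.scheme, p.netloc, p.path, p.params, p.query, p.fragment
#   ('http', 'user:pwd@example.com:8042', '/a/b', 'type=a', 'name=ferret', 'nose')
#   >>> p.hostname, p.port, p.username, p.password
#   ('example.com', 8042, 'user', 'pwd')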

def _splitparams(url):
    if '/' in url:
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]

def _splitnetloc(url, start=0):
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return _coerce_result(v)
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            rest = url[i+1:]
            if not rest or any(c not in '0123456789' for c in rest):
                # not a port number
                scheme, url = url[:i].lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)
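
# Illustrative usage (editor's example, not part of the original module).
# urlsplit() is like urlparse() but keeps any ;params attached to the path,
# and it validates bracketed IPv6 netlocs:
#
#   >>> s = urlsplit('https://example.com/a/b;c?x=1#top')
#   >>> s.path, s.query, s.fragment
#   ('/a/b;c', 'x=1', 'top')
#   >>> urlsplit('//[::1]:8080/').hostname, urlsplit('//[::1]:8080/').port
#   ('::1', 8080)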

def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment, _coerce_result = (
                                                  _coerce_args(*components))
    if params:
        url = "%s;%s" % (url, params)
    return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))

def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment, _coerce_result = (
                                          _coerce_args(*components))
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)
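
# Illustrative usage (editor's example, not part of the original module).
# urlsplit()/urlunsplit() round-trip a URL, up to redundant delimiters such
# as a '?' with an empty query:
#
#   >>> urlunsplit(urlsplit('http://example.com/path?q=1#f'))
#   'http://example.com/path?q=1#f'
#   >>> urlunsplit(urlsplit('http://example.com/path?'))
#   'http://example.com/path'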

def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not path and not params:
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1] != '':
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        del base_parts[-1]

    # per RFC 3986, ignore the base path entirely when the given path is
    # absolute (starts with '/')
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = filter(None, segments[1:-1])

    resolved_path = []

    for seg in segments:
        if seg == '..':
            try:
                resolved_path.pop()
            except IndexError:
                # ignore any '..' segments that would otherwise cause an
                # IndexError when popped from resolved_path, as RFC 3986
                # resolution requires
                pass
        elif seg == '.':
            continue
        else:
            resolved_path.append(seg)

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved_path.append('')

    return _coerce_result(urlunparse((scheme, netloc, '/'.join(
        resolved_path) or '/', params, query, fragment)))
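
# Illustrative usage (editor's example, not part of the original module).
# urljoin() resolves the second argument against the first, following the
# merge/remove-dot-segments rules implemented above:
#
#   >>> urljoin('http://example.com/a/b/c', 'd')
#   'http://example.com/a/b/d'
#   >>> urljoin('http://example.com/a/b/c', '../x')
#   'http://example.com/a/x'
#   >>> urljoin('http://example.com/a/b/c', '/y')
#   'http://example.com/y'
#   >>> urljoin('http://example.com/a', 'https://other.org/z')
#   'https://other.org/z'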


def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
    else:
        frag = ''
        defrag = url
    return _coerce_result(DefragResult(defrag, frag))
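
# Illustrative usage (editor's example, not part of the original module):
#
#   >>> urldefrag('http://example.com/page#section2')
#   DefragResult(url='http://example.com/page', fragment='section2')
#   >>> urldefrag('http://example.com/page').fragment
#   ''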

_hexdig = '0123456789ABCDEFabcdef'
_hextobyte = None

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    bits = string.split(b'%')
    if len(bits) == 1:
        return string
    res = [bits[0]]
    append = res.append
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    global _hextobyte
    if _hextobyte is None:
        _hextobyte = {(a + b).encode(): bytes.fromhex(a + b)
                      for a in _hexdig for b in _hexdig}
    for item in bits[1:]:
        try:
            append(_hextobyte[item[:2]])
            append(item[2:])
        except KeyError:
            append(b'%')
            append(item)
    return b''.join(res)

_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    unquote('abc%20def') -> 'abc def'.
    """
    if '%' not in string:
        string.split
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    bits = _asciire.split(string)
    res = [bits[0]]
    append = res.append
    for i in range(1, len(bits), 2):
        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
        append(bits[i + 1])
    return ''.join(res)
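
# Illustrative usage (editor's example, not part of the original module).
# unquote_to_bytes() yields raw bytes; unquote() decodes them, UTF-8 by
# default, with invalid sequences replaced:
#
#   >>> unquote_to_bytes('abc%20def%C3%A9')
#   b'abc def\xc3\xa9'
#   >>> unquote('abc%20def%C3%A9')
#   'abc defé'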

def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.
    """
    parsed_result = {}
    pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
                      encoding=encoding, errors=errors)
    for name, value in pairs:
        if name in parsed_result:
            parsed_result[name].append(value)
        else:
            parsed_result[name] = [value]
    return parsed_result

def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    Returns a list, as G-d intended.
    """
    qs, _coerce_result = _coerce_args(qs)
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = nv[0].replace('+', ' ')
            name = unquote(name, encoding=encoding, errors=errors)
            name = _coerce_result(name)
            value = nv[1].replace('+', ' ')
            value = unquote(value, encoding=encoding, errors=errors)
            value = _coerce_result(value)
            r.append((name, value))
    return r
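
# Illustrative usage (editor's example, not part of the original module).
# parse_qsl() preserves order and duplicates; parse_qs() groups values by
# field name:
#
#   >>> parse_qsl('a=1&a=2&b=%20x')
#   [('a', '1'), ('a', '2'), ('b', ' x')]
#   >>> parse_qs('a=1&a=2&b=%20x')
#   {'a': ['1', '2'], 'b': [' x']}
#   >>> parse_qsl('a=&b=1', keep_blank_values=True)
#   [('a', ''), ('b', '1')]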

def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    string = string.replace('+', ' ')
    return unquote(string, encoding, errors)

_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
_safe_quoters = {}

class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # Keeps a cache internally, using defaultdict, for efficiency (lookups
    # of cached keys don't call Python code at all).
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<%s %r>" % (self.__class__.__name__, dict(self))

    def __missing__(self, b):
        # Handle a cache miss. Store quoted string in cache and return.
        res = chr(b) if b in self.safe else '%{:02X}'.format(b)
        self[b] = res
        return res

def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects. encoding and errors
    must not be specified if string is a bytes object.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if isinstance(string, str):
        if not string:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'strict'
        string = string.encode(encoding, errors)
    else:
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    return quote_from_bytes(string, safe)

def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values.  Plus signs in the original string are escaped unless
    they are included in safe.  It also does not have safe default to '/'.
    """
    # Check if ' ' in string, where string may either be a str or bytes.  If
    # there are no spaces, the regular quote will produce the right answer.
    if ((isinstance(string, str) and ' ' not in string) or
            (isinstance(string, bytes) and b' ' not in string)):
        return quote(string, safe, encoding, errors)
    if isinstance(safe, str):
        space = ' '
    else:
        space = b' '
    string = quote(string, safe + space, encoding, errors)
    return string.replace(' ', '+')
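
# Illustrative usage (editor's example, not part of the original module).
# quote() leaves '/' alone by default (path quoting); quote_plus() encodes
# '/' as well and turns spaces into '+' for form data:
#
#   >>> quote('/el niño/')
#   '/el%20ni%C3%B1o/'
#   >>> quote_plus('/el niño/')
#   '%2Fel+ni%C3%B1o%2F'
#   >>> quote('abc def', safe='')
#   'abc%20def'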

def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3f'
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes([c for c in safe if c < 128])
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    try:
        quoter = _safe_quoters[safe]
    except KeyError:
        _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
    return ''.join([quoter(char) for char in bs])

def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is a str).
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            ty, va, tb = sys.exc_info()
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    l = []
    if not doseq:
        for k, v in query:
            if isinstance(k, bytes):
                k = quote_via(k, safe)
            else:
                k = quote_via(str(k), safe, encoding, errors)

            if isinstance(v, bytes):
                v = quote_via(v, safe)
            else:
                v = quote_via(str(v), safe, encoding, errors)
            l.append(k + '=' + v)
    else:
        for k, v in query:
            if isinstance(k, bytes):
                k = quote_via(k, safe)
            else:
                k = quote_via(str(k), safe, encoding, errors)

            if isinstance(v, bytes):
                v = quote_via(v, safe)
                l.append(k + '=' + v)
            elif isinstance(v, str):
                v = quote_via(v, safe, encoding, errors)
                l.append(k + '=' + v)
            else:
                try:
                    # Is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence
                    v = quote_via(str(v), safe, encoding, errors)
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        if isinstance(elt, bytes):
                            elt = quote_via(elt, safe)
                        else:
                            elt = quote_via(str(elt), safe, encoding, errors)
                        l.append(k + '=' + elt)
    return '&'.join(l)
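
# Illustrative usage (editor's example, not part of the original module).
# With doseq=False a sequence value is quoted as the str() of the whole
# sequence; with doseq=True each element becomes its own parameter.  The
# quote_via and safe parameters control how values are escaped:
#
#   >>> urlencode({'q': 'python url', 'lang': 'en'})
#   'q=python+url&lang=en'
#   >>> urlencode([('tag', ['a', 'b'])], doseq=True)
#   'tag=a&tag=b'
#   >>> urlencode({'path': '/x y'}, quote_via=quote, safe='/')
#   'path=/x%20y'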

def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if isinstance(url, str):
        try:
            url = url.encode("ASCII").decode()
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    url = str(url).strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:': url = url[4:].strip()
    return url

_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
        _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    match = _typeprog.match(url)
    if match:
        scheme, data = match.groups()
        return scheme.lower(), data
    return None, url

_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
        _hostprog = re.compile('//([^/?]*)(.*)', re.DOTALL)

    match = _hostprog.match(url)
    if match:
        host_port, path = match.groups()
        if path and path[0] != '/':
            path = '/' + path
        return host_port, path
    return None, url

def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    user, delim, host = host.rpartition('@')
    return (user if delim else None), host

def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    user, delim, passwd = user.partition(':')
    return user, (passwd if delim else None)

# splittag('/path#tag') --> '/path', 'tag'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)$', re.DOTALL)

    match = _portprog.match(host)
    if match:
        host, port = match.groups()
        if port:
            return host, port
    return host, None
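
# Illustrative usage (editor's example, not part of the original module).
# The split* helpers are the older, lower-level counterparts of urlsplit()
# and the result-object properties; each peels off one piece of a URL:
#
#   >>> splittype('mailto:user@example.com')
#   ('mailto', 'user@example.com')
#   >>> splithost('//www.example.com:80/index.html')
#   ('www.example.com:80', '/index.html')
#   >>> splitport('www.example.com:80')
#   ('www.example.com', '80')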

def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    host, delim, port = host.rpartition(':')
    if not delim:
        host = port
    elif port:
        try:
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport

def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    path, delim, query = url.rpartition('?')
    if delim:
        return path, query
    return url, None

def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    path, delim, tag = url.rpartition('#')
    if delim:
        return path, tag
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    words = url.split(';')
    return words[0], words[1:]

def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    attr, delim, value = attr.partition('=')
    return attr, (value if delim else None)
|---|