» Core Development > Code coverage > Lib/urlparse.py

Python code coverage for Lib/urlparse.py

# count content
1n/a"""Parse (absolute and relative) URLs.
2n/a
3n/aurlparse module is based upon the following RFC specifications.
4n/a
5n/aRFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6n/aand L. Masinter, January 2005.
7n/a
8n/aRFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden, B. Carpenter
9n/aand L. Masinter, December 1999.
10n/a
11n/aRFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
12n/aBerners-Lee, R. Fielding, and L. Masinter, August 1998.
13n/a
14n/aRFC 2368: "The mailto URL scheme", by P. Hoffman, L. Masinter, J. Zawinski, July 1998.
15n/a
16n/aRFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
17n/a1995.
18n/a
19n/aRFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
20n/aMcCahill, December 1994
21n/a
22n/aRFC 3986 is considered the current standard and any future changes to
23n/aurlparse module should conform with it. The urlparse module is
24n/acurrently not entirely compliant with this RFC due to de facto
25n/ascenarios for parsing, and for backward compatibility purposes, some
26n/aparsing quirks from older RFCs are retained. The testcases in
27n/atest_urlparse.py provide a good indicator of parsing behavior.
28n/a
292"""
30n/a
# Public names exported by ``from urlparse import *``.
__all__ = [
    "urlparse",
    "urlunparse",
    "urljoin",
    "urldefrag",
    "urlsplit",
    "urlunsplit",
    "parse_qs",
    "parse_qsl",
]
33n/a
# Scheme classification tables.  The empty string '' is the entry that
# applies when no scheme is given (i.e. the default behaviour).

# Schemes for which urljoin() resolves a relative reference against the base.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp',
    'imap', 'wais', 'file', 'https',
    'shttp', 'mms', 'prospero', 'rtsp',
    'rtspu', '', 'sftp',
]

# Schemes consulted by urlunsplit() and urljoin() for the '//netloc' part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp',
    'telnet', 'imap', 'wais', 'file',
    'mms', 'https', 'shttp', 'snews',
    'prospero', 'rtsp', 'rtspu', 'rsync',
    '', 'svn', 'svn+ssh', 'sftp',
    'nfs', 'git', 'git+ssh',
]

# Not referenced by the functions in this module.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news',
    'telnet', 'wais', 'imap', 'snews',
    'sip', 'sips',
]

# Schemes for which urlparse() splits ';params' off the path.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http',
    'imap', 'https', 'shttp', 'rtsp',
    'rtspu', 'sip', 'sips', 'mms',
    '', 'sftp',
]

# Schemes for which urlsplit() splits off a '?query'.
uses_query = [
    'http', 'wais', 'imap', 'https',
    'shttp', 'mms', 'gopher', 'rtsp',
    'rtspu', 'sip', 'sips', '',
]

# Schemes for which urlsplit() splits off a '#fragment'.
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher',
    'news', 'nntp', 'wais', 'https',
    'shttp', 'snews', 'file', 'prospero',
    '',
]

# Characters valid in scheme names
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789'
    '+-.'
)

# urlsplit() memoises its results in _parse_cache and empties the whole
# cache once it holds MAX_CACHE_SIZE entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}
61n/a
def clear_cache():
    """Empty the module-level cache of urlsplit() results.

    The cache dictionary is emptied in place (not rebound), so every
    reference to it sees the purge.
    """
    _parse_cache.clear()
65n/a
66n/a
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Every property is derived from ``self.netloc``, which the class mixing
    this in must provide.  The netloc is read as
    ``[user[:password]@]host[:port]``, where ``host`` may be a bracketed
    IPv6 literal such as ``[::1]``.
    """

    @property
    def username(self):
        """User name from the netloc, or None when netloc has no '@'."""
        # Split on the LAST '@' so the userinfo itself may contain '@'.
        userinfo, at_sign, _ = self.netloc.rpartition("@")
        if not at_sign:
            return None
        # Everything before the first ':' of the userinfo is the user name.
        return userinfo.partition(":")[0]

    @property
    def password(self):
        """Password from the netloc, or None without '@' or without a ':'
        inside the userinfo."""
        userinfo, at_sign, _ = self.netloc.rpartition("@")
        _, colon, secret = userinfo.partition(":")
        if at_sign and colon:
            return secret
        return None

    @property
    def hostname(self):
        """Lower-cased host part of the netloc, or None for an empty host."""
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            # Bracketed literal: take the text before the first ']' and
            # drop its first character (the opening bracket).
            return netloc.split(']')[0][1:].lower()
        if ':' in netloc:
            return netloc.split(':')[0].lower()
        if netloc == '':
            return None
        return netloc.lower()

    @property
    def port(self):
        """Port number as an int, or None when no port is given.

        A netloc with a ':' but nothing after it (e.g. ``"host:"``) has no
        port and yields None instead of failing inside int().

        Raises ValueError when the port text is non-empty but is not a
        base-10 integer.
        """
        # Strip the userinfo and anything up to a closing ']' first, so the
        # colons inside an IPv6 literal are not mistaken for the port
        # separator.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            if port:
                return int(port, 10)
        return None
109n/a
1102from collections import namedtuple
111n/a
class SplitResult(
        namedtuple('SplitResult',
                   ['scheme', 'netloc', 'path', 'query', 'fragment']),
        ResultMixin):
    """Five-field result tuple (scheme, netloc, path, query, fragment) that
    also offers the ResultMixin properties."""

    # Empty __slots__: the subclass adds no per-instance attributes.
    __slots__ = ()

    def geturl(self):
        """Return the URL string that urlunsplit() builds from this tuple."""
        return urlunsplit(self)
118n/a
119n/a
class ParseResult(
        namedtuple('ParseResult',
                   ['scheme', 'netloc', 'path', 'params', 'query',
                    'fragment']),
        ResultMixin):
    """Six-field result tuple (scheme, netloc, path, params, query,
    fragment) that also offers the ResultMixin properties."""

    # Empty __slots__: the subclass adds no per-instance attributes.
    __slots__ = ()

    def geturl(self):
        """Return the URL string that urlunparse() builds from this tuple."""
        return urlunparse(self)
126n/a
127n/a
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    The heavy lifting is done by urlsplit(); this function only peels the
    ';params' part off the path for schemes listed in uses_params."""
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
141n/a
1422def _splitparams(url):
143132 if '/' in url:
144123 i = url.find(';', url.rfind('/'))
145123 if i < 0:
1464 return url, ''
147n/a else:
1489 i = url.find(';')
149128 return url[:i], url[i+1:]
150n/a
1512def _splitnetloc(url, start=0):
152260 delim = len(url) # position of end of domain part of url, default is end
1531040 for c in '/?#': # look for delimiters; the order is NOT important
154780 wdelim = url.find(c, start) # find first of this delim
155780 if wdelim >= 0: # if found
156253 delim = min(delim, wdelim) # use earliest delim position
157260 return url[start:delim], url[delim:] # return (domain, rest)
158n/a
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Results are memoised in the module-level parse cache.  Raises ValueError
    ("Invalid IPv6 URL") when the netloc contains only one of '[' and ']'."""
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key; presumably so that str and
    # unicode inputs that compare equal do not share a cached result.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(key, None)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:
        # Avoid runaway growth: throw the whole cache away once it is full.
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        head = url[:colon]
        if head == 'http':
            # Fast path for the common case.  Here '#' and '?' are split off
            # without consulting uses_fragment / uses_query.
            scheme = head.lower()
            url = url[colon+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                # Exactly one of the two brackets present -> malformed.
                if ('[' in netloc) != (']' in netloc):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            result = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = result
            return result
        # The text before ':' is a scheme only if every character of it is a
        # legal scheme character; otherwise the default scheme is kept.
        if all(ch in scheme_chars for ch in head):
            scheme, url = head.lower(), url[colon+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Exactly one of the two brackets present -> malformed.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return result
208n/a
def urlunparse(data):
    """Put a parsed URL back together again.

    *data* is unpacked into (scheme, netloc, path, params, query, fragment).
    The result may be a slightly different, but equivalent URL, if the URL
    that was parsed originally had redundant delimiters, e.g. a ? with an
    empty query (the draft states that these are equivalent)."""
    scheme, netloc, path, params, query, fragment = data
    # Non-empty params are re-attached to the path behind a ';'.
    path_with_params = "%s;%s" % (path, params) if params else path
    return urlunsplit((scheme, netloc, path_with_params, query, fragment))
218n/a
def urlunsplit(data):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string.

    The data argument can be any five-item iterable.  The result may be a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had unnecessary delimiters (for example, a ? with an empty
    query; the RFC states that these are equivalent)."""
    scheme, netloc, path, query, fragment = data
    # A '//netloc' prefix is written when there is a netloc, or when the
    # scheme is one of uses_netloc and the path does not already start
    # with '//'.
    if netloc or (scheme and scheme in uses_netloc and path[:2] != '//'):
        if path and path[:1] != '/':
            path = '/' + path
        path = '//' + (netloc or '') + path
    result = path
    if scheme:
        result = scheme + ':' + result
    if query:
        result = result + '?' + query
    if fragment:
        result = result + '#' + fragment
    return result
236n/a
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty *base* returns *url* unchanged and an empty *url* returns
    *base* unchanged.  *url* is also returned as is when its scheme differs
    from the base scheme or is not listed in uses_relative."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    # The base scheme is the default scheme for the reference.
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own host: nothing is inherited.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc come from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: reuse the base path.
        path = bpath
        if params:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))

    # Relative path: replace the last segment of the base path with the
    # segments of the reference, then normalise '.' and '..'.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        # A trailing '.' becomes an empty last segment (trailing slash).
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    # Repeatedly drop the first "<name>/.." pair among the inner segments,
    # rescanning from the start after each removal, until none is left.
    # The first and the last segment are never the '..' of such a pair.
    collapsed = True
    while collapsed:
        collapsed = False
        for idx in range(1, len(segments) - 1):
            if (segments[idx] == '..'
                    and segments[idx-1] not in ('', '..')):
                del segments[idx-1:idx+1]
                collapsed = True
                break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        # A trailing "<segment>/.." collapses to one empty segment.
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
293n/a
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' not in url:
        # Nothing to strip: hand the input back untouched.
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
307n/a
308n/a# unquote method for parse_qs and parse_qsl
309n/a# Cannot use directly from urllib as it would create a circular reference
310n/a# because urllib uses urlparse methods (urljoin). If you update this function,
311n/a# update it also in urllib. This code duplication does not exist in Python 3.
312n/a
_hexdig = '0123456789ABCDEFabcdef'
# Lookup table from every two-hex-digit string (in any letter case) to the
# character with that code.
_hextochr = dict(
    (pair, chr(int(pair, 16)))
    for pair in (hi + lo for hi in _hexdig for lo in _hexdig)
)

def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    A '%' that is not followed by two hex digits is left in place.
    """
    pieces = s.split('%')
    # fastpath: no '%' at all, return the input object itself
    if len(pieces) == 1:
        return s
    result = pieces[0]
    for chunk in pieces[1:]:
        code, tail = chunk[:2], chunk[2:]
        try:
            result += _hextochr[code] + tail
        except KeyError:
            # Not a valid escape: restore the '%' that split() consumed.
            result += '%' + chunk
        except UnicodeDecodeError:
            # The decoded byte could not be combined with the surrounding
            # text; use the unicode character with that code instead.
            result += unichr(int(code, 16)) + tail
    return result
332n/a
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

        Arguments:

        qs: URL-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            URL encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        Returns a dictionary mapping each name to the list of its values,
        in the order parse_qsl() produced them.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        # Repeated names accumulate their values in one list.
        grouped.setdefault(name, []).append(value)
    return grouped
358n/a
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples, in the order the fields
    appear in qs.  Fields are separated by '&' or ';'; in both name and
    value '+' is turned into a space before %-escapes are decoded.
    """
    fields = [part for chunk in qs.split('&') for part in chunk.split(';')]
    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        name, equals, value = field.partition('=')
        if not equals:
            if strict_parsing:
                # Call form of raise: valid on both Python 2 and Python 3.
                raise ValueError("bad query field: %r" % (field,))
            # Handle case of a control-name with no equal sign: it is kept
            # (with an empty value) only when blanks are wanted.
            if not keep_blank_values:
                continue
        if value or keep_blank_values:
            result.append((unquote(name.replace('+', ' ')),
                           unquote(value.replace('+', ' '))))

    return result
398n/a
399n/a
# Default input for test() when no file name is given on the command line.
# test() splits every non-blank line on whitespace: the first word is the URL
# to parse and join, and a line of the form "<url> = <URL:expected>" also
# states the expected join result.  The first URL read becomes the base for
# all later joins.
test_input = """
 http://a/b/c/d

 g:h = <URL:g:h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 g = <URL:http://a/b/c/g>
 ./g = <URL:http://a/b/c/g>
 g/ = <URL:http://a/b/c/g/>
 /g = <URL:http://a/g>
 //g = <URL:http://g>
 ?y = <URL:http://a/b/c/d?y>
 g?y = <URL:http://a/b/c/g?y>
 g?y/./x = <URL:http://a/b/c/g?y/./x>
 . = <URL:http://a/b/c/>
 ./ = <URL:http://a/b/c/>
 .. = <URL:http://a/b/>
 ../ = <URL:http://a/b/>
 ../g = <URL:http://a/b/g>
 ../.. = <URL:http://a/>
 ../../g = <URL:http://a/g>
 ../../../g = <URL:http://a/../g>
 ./../g = <URL:http://a/b/g>
 ./g/. = <URL:http://a/b/c/g/>
 /./g = <URL:http://a/./g>
 g/./h = <URL:http://a/b/c/g/h>
 g/../h = <URL:http://a/b/c/h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 http:?y = <URL:http://a/b/c/d?y>
 http:g?y = <URL:http://a/b/c/g?y>
 http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
433n/a
def test():
    """Command-line self test: parse and join a list of URLs and print them.

    Input comes from the file named by the first command-line argument
    ('-' means standard input), or from test_input when no argument is
    given.  Each non-blank line is split on whitespace; its first word is
    parsed with urlparse() and joined onto the base with urljoin().  The
    first URL read becomes the base for the following lines.  A line of the
    form "<url> = <expected>" additionally prints an EXPECTED notice when
    the wrapped join result differs from <expected>.
    """
    import sys
    base = ''
    opened = None  # file object this function opened itself, if any
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        try:
            from cStringIO import StringIO
        except ImportError:
            try:
                from StringIO import StringIO
            except ImportError:
                # Neither Python 2 module is available.
                from io import StringIO
        fp = StringIO(test_input)
    try:
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            # Single parenthesised argument: prints the same text whether
            # print is a statement or a function.
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % (words[2],))
    finally:
        # Close only a file opened here; sys.stdin is left alone.
        if opened is not None:
            opened.close()

if __name__ == '__main__':
    test()