Python code coverage for Lib/urlparse.py

#	count	content
1	n/a	"""Parse (absolute and relative) URLs.
2	n/a
3	n/a	urlparse module is based upon the following RFC specifications.
4	n/a
5	n/a	RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6	n/a	and L. Masinter, January 2005.
7	n/a
8	n/a	RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter
9	n/a	and L.Masinter, December 1999.
10	n/a
11	n/a	RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
12	n/a	Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13	n/a
14	n/a	RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
15	n/a
16	n/a	RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
17	n/a	1995.
18	n/a
19	n/a	RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
20	n/a	McCahill, December 1994
21	n/a
22	n/a	RFC 3986 is considered the current standard and any future changes to
23	n/a	urlparse module should conform with it. The urlparse module is
24	n/a	currently not entirely compliant with this RFC due to defacto
25	n/a	scenarios for parsing, and for backward compatibility purposes, some
26	n/a	parsing quirks from older RFCs are retained. The testcases in
27	n/a	test_urlparse.py provides a good indicator of parsing behavior.
28	n/a
29	2	"""
30	n/a
31	2	__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
32	2	"urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
33	n/a
34	n/a	# A classification of schemes ('' means apply by default)
35	2	uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
36	2	'wais', 'file', 'https', 'shttp', 'mms',
37	2	'prospero', 'rtsp', 'rtspu', '', 'sftp']
38	2	uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
39	2	'imap', 'wais', 'file', 'mms', 'https', 'shttp',
40	2	'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
41	2	'svn', 'svn+ssh', 'sftp','nfs','git', 'git+ssh']
42	2	non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
43	2	'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
44	2	uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
45	2	'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
46	2	'mms', '', 'sftp']
47	2	uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
48	2	'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
49	2	uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
50	2	'nntp', 'wais', 'https', 'shttp', 'snews',
51	2	'file', 'prospero', '']
52	n/a
53	n/a	# Characters valid in scheme names
54	2	scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
55	n/a	'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
56	n/a	'0123456789'
57	n/a	'+-.')
58	n/a
59	2	MAX_CACHE_SIZE = 20
60	2	_parse_cache = {}
61	n/a
62	2	def clear_cache():
63	n/a	"""Clear the parse cache."""
64	20	_parse_cache.clear()
65	n/a
66	n/a
67	4	class ResultMixin(object):
68	2	"""Shared methods for the parsed result objects."""
69	n/a
70	2	@property
71	n/a	def username(self):
72	69	netloc = self.netloc
73	69	if "@" in netloc:
74	6	userinfo = netloc.rsplit("@", 1)[0]
75	6	if ":" in userinfo:
76	2	userinfo = userinfo.split(":", 1)[0]
77	6	return userinfo
78	63	return None
79	n/a
80	2	@property
81	n/a	def password(self):
82	69	netloc = self.netloc
83	69	if "@" in netloc:
84	6	userinfo = netloc.rsplit("@", 1)[0]
85	6	if ":" in userinfo:
86	2	return userinfo.split(":", 1)[1]
87	67	return None
88	n/a
89	2	@property
90	n/a	def hostname(self):
91	86	netloc = self.netloc.split('@')[-1]
92	86	if '[' in netloc and ']' in netloc:
93	12	return netloc.split(']')[0][1:].lower()
94	74	elif ':' in netloc:
95	4	return netloc.split(':')[0].lower()
96	70	elif netloc == '':
97	6	return None
98	n/a	else:
99	64	return netloc.lower()
100	n/a
101	2	@property
102	n/a	def port(self):
103	87	netloc = self.netloc.split('@')[-1].split(']')[-1]
104	87	if ':' in netloc:
105	12	port = netloc.split(':')[1]
106	12	return int(port, 10)
107	n/a	else:
108	75	return None
109	n/a
110	2	from collections import namedtuple
111	n/a
112	4	class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
113	n/a
114	2	__slots__ = ()
115	n/a
116	2	def geturl(self):
117	67	return urlunsplit(self)
118	n/a
119	n/a
120	4	class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
121	n/a
122	2	__slots__ = ()
123	n/a
124	2	def geturl(self):
125	65	return urlunparse(self)
126	n/a
127	n/a
128	2	def urlparse(url, scheme='', allow_fragments=True):
129	n/a	"""Parse a URL into 6 components:
130	n/a	<scheme>://<netloc>/<path>;<params>?<query>#<fragment>
131	n/a	Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
132	n/a	Note that we don't break the components up in smaller bits
133	n/a	(e.g. netloc is a single string) and we don't expand % escapes."""
134	1467	tuple = urlsplit(url, scheme, allow_fragments)
135	1462	scheme, netloc, url, query, fragment = tuple
136	1462	if scheme in uses_params and ';' in url:
137	132	url, params = _splitparams(url)
138	n/a	else:
139	1330	params = ''
140	1462	return ParseResult(scheme, netloc, url, params, query, fragment)
141	n/a
142	2	def _splitparams(url):
143	132	if '/' in url:
144	123	i = url.find(';', url.rfind('/'))
145	123	if i < 0:
146	4	return url, ''
147	n/a	else:
148	9	i = url.find(';')
149	128	return url[:i], url[i+1:]
150	n/a
151	2	def _splitnetloc(url, start=0):
152	260	delim = len(url) # position of end of domain part of url, default is end
153	1040	for c in '/?#': # look for delimiters; the order is NOT important
154	780	wdelim = url.find(c, start) # find first of this delim
155	780	if wdelim >= 0: # if found
156	253	delim = min(delim, wdelim) # use earliest delim position
157	260	return url[start:delim], url[delim:] # return (domain, rest)
158	n/a
159	2	def urlsplit(url, scheme='', allow_fragments=True):
160	n/a	"""Parse a URL into 5 components:
161	n/a	<scheme>://<netloc>/<path>?<query>#<fragment>
162	n/a	Return a 5-tuple: (scheme, netloc, path, query, fragment).
163	n/a	Note that we don't break the components up in smaller bits
164	n/a	(e.g. netloc is a single string) and we don't expand % escapes."""
165	1601	allow_fragments = bool(allow_fragments)
166	1601	key = url, scheme, allow_fragments, type(url), type(scheme)
167	1601	cached = _parse_cache.get(key, None)
168	1601	if cached:
169	1182	return cached
170	419	if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
171	20	clear_cache()
172	419	netloc = query = fragment = ''
173	419	i = url.find(':')
174	419	if i > 0:
175	273	if url[:i] == 'http': # optimize the common case
176	218	scheme = url[:i].lower()
177	218	url = url[i+1:]
178	218	if url[:2] == '//':
179	217	netloc, url = _splitnetloc(url, 2)
180	217	if (('[' in netloc and ']' not in netloc) or
181	214	(']' in netloc and '[' not in netloc)):
182	4	raise ValueError("Invalid IPv6 URL")
183	214	if allow_fragments and '#' in url:
184	14	url, fragment = url.split('#', 1)
185	214	if '?' in url:
186	17	url, query = url.split('?', 1)
187	214	v = SplitResult(scheme, netloc, url, query, fragment)
188	214	_parse_cache[key] = v
189	214	return v
190	345	for c in url[:i]:
191	290	if c not in scheme_chars:
192	0	break
193	n/a	else:
194	55	scheme, url = url[:i].lower(), url[i+1:]
195	n/a
196	201	if url[:2] == '//':
197	43	netloc, url = _splitnetloc(url, 2)
198	43	if (('[' in netloc and ']' not in netloc) or
199	42	(']' in netloc and '[' not in netloc)):
200	1	raise ValueError("Invalid IPv6 URL")
201	200	if allow_fragments and scheme in uses_fragment and '#' in url:
202	21	url, fragment = url.split('#', 1)
203	200	if scheme in uses_query and '?' in url:
204	18	url, query = url.split('?', 1)
205	200	v = SplitResult(scheme, netloc, url, query, fragment)
206	200	_parse_cache[key] = v
207	200	return v
208	n/a
209	2	def urlunparse(data):
210	n/a	"""Put a parsed URL back together again. This may result in a
211	n/a	slightly different, but equivalent URL, if the URL that was parsed
212	n/a	originally had redundant delimiters, e.g. a ? with an empty query
213	n/a	(the draft states that these are equivalent)."""
214	498	scheme, netloc, url, params, query, fragment = data
215	498	if params:
216	25	url = "%s;%s" % (url, params)
217	498	return urlunsplit((scheme, netloc, url, query, fragment))
218	n/a
219	2	def urlunsplit(data):
220	n/a	"""Combine the elements of a tuple as returned by urlsplit() into a
221	n/a	complete URL as a string. The data argument can be any five-item iterable.
222	n/a	This may result in a slightly different, but equivalent URL, if the URL that
223	n/a	was parsed originally had unnecessary delimiters (for example, a ? with an
224	n/a	empty query; the RFC states that these are equivalent)."""
225	588	scheme, netloc, url, query, fragment = data
226	588	if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
227	331	if url and url[:1] != '/': url = '/' + url
228	331	url = '//' + (netloc or '') + url
229	588	if scheme:
230	337	url = scheme + ':' + url
231	588	if query:
232	65	url = url + '?' + query
233	588	if fragment:
234	81	url = url + '#' + fragment
235	588	return url
236	n/a
237	2	def urljoin(base, url, allow_fragments=True):
238	n/a	"""Join a base URL and a possibly relative URL to form an absolute
239	n/a	interpretation of the latter."""
240	140	if not base:
241	0	return url
242	140	if not url:
243	3	return base
244	n/a	bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
245	137	urlparse(base, '', allow_fragments)
246	n/a	scheme, netloc, path, params, query, fragment = \
247	137	urlparse(url, bscheme, allow_fragments)
248	137	if scheme != bscheme or scheme not in uses_relative:
249	3	return url
250	134	if scheme in uses_netloc:
251	134	if netloc:
252	29	return urlunparse((scheme, netloc, path,
253	29	params, query, fragment))
254	105	netloc = bnetloc
255	105	if path[:1] == '/':
256	7	return urlunparse((scheme, netloc, path,
257	7	params, query, fragment))
258	98	if not path:
259	7	path = bpath
260	7	if not params:
261	5	params = bparams
262	n/a	else:
263	2	path = path[:-1]
264	2	return urlunparse((scheme, netloc, path,
265	2	params, query, fragment))
266	5	if not query:
267	3	query = bquery
268	5	return urlunparse((scheme, netloc, path,
269	5	params, query, fragment))
270	91	segments = bpath.split('/')[:-1] + path.split('/')
271	n/a	# XXX The stuff below is bogus in various ways...
272	91	if segments[-1] == '.':
273	6	segments[-1] = ''
274	108	while '.' in segments:
275	17	segments.remove('.')
276	91	while 1:
277	128	i = 1
278	128	n = len(segments) - 1
279	364	while i < n:
280	273	if (segments[i] == '..'
281	43	and segments[i-1] not in ('', '..')):
282	37	del segments[i-1:i+1]
283	37	break
284	236	i = i+1
285	n/a	else:
286	91	break
287	91	if segments == ['', '..']:
288	0	segments[-1] = ''
289	91	elif len(segments) >= 2 and segments[-1] == '..':
290	6	segments[-2:] = ['']
291	91	return urlunparse((scheme, netloc, '/'.join(segments),
292	91	params, query, fragment))
293	n/a
294	2	def urldefrag(url):
295	n/a	"""Removes any existing fragment from URL.
296	n/a
297	n/a	Returns a tuple of the defragmented URL and the fragment. If
298	n/a	the URL contained no fragments, the second element is the
299	n/a	empty string.
300	n/a	"""
301	10	if '#' in url:
302	5	s, n, p, a, q, frag = urlparse(url)
303	5	defrag = urlunparse((s, n, p, a, q, ''))
304	5	return defrag, frag
305	n/a	else:
306	5	return url, ''
307	n/a
308	n/a	# unquote method for parse_qs and parse_qsl
309	n/a	# Cannot use directly from urllib as it would create a circular reference
310	n/a	# because urllib uses urlparse methods (urljoin). If you update this function,
311	n/a	# update it also in urllib. This code duplication does not existin in Python3.
312	n/a
313	2	_hexdig = '0123456789ABCDEFabcdef'
314	1016	_hextochr = dict((a+b, chr(int(a+b,16)))
315	2	for a in _hexdig for b in _hexdig)
316	n/a
317	2	def unquote(s):
318	n/a	"""unquote('abc%20def') -> 'abc def'."""
319	406	res = s.split('%')
320	n/a	# fastpath
321	406	if len(res) == 1:
322	390	return s
323	16	s = res[0]
324	32	for item in res[1:]:
325	16	try:
326	16	s += _hextochr[item[:2]] + item[2:]
327	0	except KeyError:
328	0	s += '%' + item
329	0	except UnicodeDecodeError:
330	0	s += unichr(int(item[:2], 16)) + item[2:]
331	16	return s
332	n/a
333	2	def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
334	n/a	"""Parse a query given as a string argument.
335	n/a
336	n/a	Arguments:
337	n/a
338	n/a	qs: URL-encoded query string to be parsed
339	n/a
340	n/a	keep_blank_values: flag indicating whether blank values in
341	n/a	URL encoded queries should be treated as blank strings.
342	n/a	A true value indicates that blanks should be retained as
343	n/a	blank strings. The default false value indicates that
344	n/a	blank values are to be ignored and treated as if they were
345	n/a	not included.
346	n/a
347	n/a	strict_parsing: flag indicating what to do with parsing errors.
348	n/a	If false (the default), errors are silently ignored.
349	n/a	If true, errors raise a ValueError exception.
350	n/a	"""
351	102	dict = {}
352	245	for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
353	143	if name in dict:
354	4	dict[name].append(value)
355	n/a	else:
356	139	dict[name] = [value]
357	82	return dict
358	n/a
359	2	def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
360	n/a	"""Parse a query given as a string argument.
361	n/a
362	n/a	Arguments:
363	n/a
364	n/a	qs: URL-encoded query string to be parsed
365	n/a
366	n/a	keep_blank_values: flag indicating whether blank values in
367	n/a	URL encoded queries should be treated as blank strings. A
368	n/a	true value indicates that blanks should be retained as blank
369	n/a	strings. The default false value indicates that blank values
370	n/a	are to be ignored and treated as if they were not included.
371	n/a
372	n/a	strict_parsing: flag indicating what to do with parsing errors. If
373	n/a	false (the default), errors are silently ignored. If true,
374	n/a	errors raise a ValueError exception.
375	n/a
376	n/a	Returns a list, as G-d intended.
377	n/a	"""
378	769	pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
379	142	r = []
380	433	for name_value in pairs:
381	311	if not name_value and not strict_parsing:
382	52	continue
383	259	nv = name_value.split('=', 1)
384	259	if len(nv) != 2:
385	27	if strict_parsing:
386	20	raise ValueError, "bad query field: %r" % (name_value,)
387	n/a	# Handle case of a control-name with no equal sign
388	7	if keep_blank_values:
389	1	nv.append('')
390	n/a	else:
391	0	continue
392	233	if len(nv[1]) or keep_blank_values:
393	203	name = unquote(nv[0].replace('+', ' '))
394	203	value = unquote(nv[1].replace('+', ' '))
395	203	r.append((name, value))
396	n/a
397	122	return r
398	n/a
399	n/a
400	n/a	test_input = """
401	n/a	http://a/b/c/d
402	n/a
403	n/a	g:h = <URL:g:h>
404	n/a	http:g = <URL:http://a/b/c/g>
405	n/a	http: = <URL:http://a/b/c/d>
406	n/a	g = <URL:http://a/b/c/g>
407	n/a	./g = <URL:http://a/b/c/g>
408	n/a	g/ = <URL:http://a/b/c/g/>
409	n/a	/g = <URL:http://a/g>
410	n/a	//g = <URL:http://g>
411	n/a	?y = <URL:http://a/b/c/d?y>
412	n/a	g?y = <URL:http://a/b/c/g?y>
413	n/a	g?y/./x = <URL:http://a/b/c/g?y/./x>
414	n/a	. = <URL:http://a/b/c/>
415	n/a	./ = <URL:http://a/b/c/>
416	n/a	.. = <URL:http://a/b/>
417	n/a	../ = <URL:http://a/b/>
418	n/a	../g = <URL:http://a/b/g>
419	n/a	../.. = <URL:http://a/>
420	n/a	../../g = <URL:http://a/g>
421	n/a	../../../g = <URL:http://a/../g>
422	n/a	./../g = <URL:http://a/b/g>
423	n/a	./g/. = <URL:http://a/b/c/g/>
424	n/a	/./g = <URL:http://a/./g>
425	n/a	g/./h = <URL:http://a/b/c/g/h>
426	n/a	g/../h = <URL:http://a/b/c/h>
427	n/a	http:g = <URL:http://a/b/c/g>
428	n/a	http: = <URL:http://a/b/c/d>
429	n/a	http:?y = <URL:http://a/b/c/d?y>
430	n/a	http:g?y = <URL:http://a/b/c/g?y>
431	n/a	http:g?y/./x = <URL:http://a/b/c/g?y/./x>
432	2	"""
433	n/a
434	2	def test():
435	0	import sys
436	0	base = ''
437	0	if sys.argv[1:]:
438	0	fn = sys.argv[1]
439	0	if fn == '-':
440	0	fp = sys.stdin
441	n/a	else:
442	0	fp = open(fn)
443	n/a	else:
444	0	try:
445	0	from cStringIO import StringIO
446	0	except ImportError:
447	0	from StringIO import StringIO
448	0	fp = StringIO(test_input)
449	0	for line in fp:
450	0	words = line.split()
451	0	if not words:
452	0	continue
453	0	url = words[0]
454	0	parts = urlparse(url)
455	0	print '%-10s : %s' % (url, parts)
456	0	abs = urljoin(base, url)
457	0	if not base:
458	0	base = abs
459	0	wrapped = '<URL:%s>' % abs
460	0	print '%-10s = %s' % (url, wrapped)
461	0	if len(words) == 3 and words[1] == '=':
462	0	if wrapped != words[2]:
463	0	print 'EXPECTED', words[2], '!!!!!!!!!!'
464	n/a
465	2	if __name__ == '__main__':
466	0	test()