Python code coverage for Lib/urllib2.py

#	count	content
1	n/a	"""An extensible library for opening URLs using a variety of protocols
2	n/a
3	n/a	The simplest way to use this module is to call the urlopen function,
4	n/a	which accepts a string containing a URL or a Request object (described
5	n/a	below). It opens the URL and returns the results as file-like
6	n/a	object; the returned object has some extra methods described below.
7	n/a
8	n/a	The OpenerDirector manages a collection of Handler objects that do
9	n/a	all the actual work. Each Handler implements a particular protocol or
10	n/a	option. The OpenerDirector is a composite object that invokes the
11	n/a	Handlers needed to open the requested URL. For example, the
12	n/a	HTTPHandler performs HTTP GET and POST requests and deals with
13	n/a	non-error returns. The HTTPRedirectHandler automatically deals with
14	n/a	HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15	n/a	deals with digest authentication.
16	n/a
17	n/a	urlopen(url, data=None) -- Basic usage is the same as original
18	n/a	urllib. pass the url and optionally data to post to an HTTP URL, and
19	n/a	get a file-like object back. One difference is that you can also pass
20	n/a	a Request instance instead of URL. Raises a URLError (subclass of
21	n/a	IOError); for HTTP errors, raises an HTTPError, which can also be
22	n/a	treated as a valid response.
23	n/a
24	n/a	build_opener -- Function that creates a new OpenerDirector instance.
25	n/a	Will install the default handlers. Accepts one or more Handlers as
26	n/a	arguments, either instances or Handler classes that it will
27	n/a	instantiate. If one of the argument is a subclass of the default
28	n/a	handler, the argument will be installed instead of the default.
29	n/a
30	n/a	install_opener -- Installs a new opener as the default opener.
31	n/a
32	n/a	objects of interest:
33	n/a
34	n/a	OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
35	n/a	the Handler classes, while dealing with requests and responses.
36	n/a
37	n/a	Request -- An object that encapsulates the state of a request. The
38	n/a	state can be as simple as the URL. It can also include extra HTTP
39	n/a	headers, e.g. a User-Agent.
40	n/a
41	n/a	BaseHandler --
42	n/a
43	n/a	exceptions:
44	n/a	URLError -- A subclass of IOError, individual protocols have their own
45	n/a	specific subclass.
46	n/a
47	n/a	HTTPError -- Also a valid HTTP response, so you can treat an HTTP error
48	n/a	as an exceptional event or valid response.
49	n/a
50	n/a	internals:
51	n/a	BaseHandler and parent
52	n/a	_call_chain conventions
53	n/a
54	n/a	Example usage:
55	n/a
56	n/a	import urllib2
57	n/a
58	n/a	# set up authentication info
59	n/a	authinfo = urllib2.HTTPBasicAuthHandler()
60	n/a	authinfo.add_password(realm='PDQ Application',
61	n/a	uri='https://mahler:8092/site-updates.py',
62	n/a	user='klem',
63	n/a	passwd='geheim$parole')
64	n/a
65	n/a	proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
66	n/a
67	n/a	# build a new opener that adds authentication and caching FTP handlers
68	n/a	opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
69	n/a
70	n/a	# install it
71	n/a	urllib2.install_opener(opener)
72	n/a
73	n/a	f = urllib2.urlopen('http://www.python.org/')
74	n/a
75	n/a
76	1	"""
77	n/a
78	n/a	# XXX issues:
79	n/a	# If an authentication error handler that tries to perform
80	n/a	# authentication for some reason but fails, how should the error be
81	n/a	# signalled? The client needs to know the HTTP error code. But if
82	n/a	# the handler knows that the problem was, e.g., that it didn't know
83	n/a	# that hash algo that requested in the challenge, it would be good to
84	n/a	# pass that information along to the client, too.
85	n/a	# ftp errors aren't handled cleanly
86	n/a	# check digest against correct (i.e. non-apache) implementation
87	n/a
88	n/a	# Possible extensions:
89	n/a	# complex proxies XXX not sure what exactly was meant by this
90	n/a	# abstract factory for opener
91	n/a
92	1	import base64
93	1	import hashlib
94	1	import httplib
95	1	import mimetools
96	1	import os
97	1	import posixpath
98	1	import random
99	1	import re
100	1	import socket
101	1	import sys
102	1	import time
103	1	import urlparse
104	1	import bisect
105	n/a
106	1	try:
107	1	from cStringIO import StringIO
108	0	except ImportError:
109	0	from StringIO import StringIO
110	n/a
111	1	from urllib import (unwrap, unquote, splittype, splithost, quote,
112	n/a	addinfourl, splitport,
113	n/a	splitattr, ftpwrapper, splituser, splitpasswd, splitvalue)
114	n/a
115	n/a	# support for FileHandler, proxies via environment variables
116	1	from urllib import localhost, url2pathname, getproxies, proxy_bypass
117	n/a
118	n/a	# used in User-Agent header sent
119	1	__version__ = sys.version[:3]
120	n/a
121	1	_opener = None
122	1	def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
123	n/a	global _opener
124	35	if _opener is None:
125	1	_opener = build_opener()
126	35	return _opener.open(url, data, timeout)
127	n/a
128	1	def install_opener(opener):
129	n/a	global _opener
130	0	_opener = opener
131	n/a
132	n/a	# do these error classes make sense?
133	n/a	# make sure all of the IOError stuff is overridden. we just want to be
134	n/a	# subtypes.
135	n/a
136	2	class URLError(IOError):
137	n/a	# URLError is a sub-type of IOError, but it doesn't share any of
138	n/a	# the implementation. need to override __init__ and __str__.
139	n/a	# It sets self.args for compatibility with other EnvironmentError
140	n/a	# subclasses, but args doesn't have the typical format with errno in
141	n/a	# slot 0 and strerror in slot 1. This may be better than nothing.
142	1	def __init__(self, reason):
143	19	self.args = reason,
144	19	self.reason = reason
145	n/a
146	1	def __str__(self):
147	1	return '<urlopen error %s>' % self.reason
148	n/a
149	2	class HTTPError(URLError, addinfourl):
150	1	"""Raised when HTTP error occurs, but also acts like non-error return"""
151	1	__super_init = addinfourl.__init__
152	n/a
153	1	def __init__(self, url, code, msg, hdrs, fp):
154	6	self.code = code
155	6	self.msg = msg
156	6	self.hdrs = hdrs
157	6	self.fp = fp
158	6	self.filename = url
159	n/a	# The addinfourl classes depend on fp being a valid file
160	n/a	# object. In some cases, the HTTPError may not have a valid
161	n/a	# file object. If this happens, the simplest workaround is to
162	n/a	# not initialize the base classes.
163	6	if fp is not None:
164	5	self.__super_init(fp, hdrs, url, code)
165	n/a
166	1	def __str__(self):
167	0	return 'HTTP Error %s: %s' % (self.code, self.msg)
168	n/a
169	n/a	# copied from cookielib.py
170	1	_cut_port_re = re.compile(r":\d+$")
171	1	def request_host(request):
172	n/a	"""Return request-host, as defined by RFC 2965.
173	n/a
174	n/a	Variation from RFC: returned value is lowercased, for convenient
175	n/a	comparison.
176	n/a
177	n/a	"""
178	323	url = request.get_full_url()
179	323	host = urlparse.urlparse(url)[1]
180	323	if host == "":
181	9	host = request.get_header("Host", "")
182	n/a
183	n/a	# remove port, if present
184	323	host = _cut_port_re.sub("", host, 1)
185	323	return host.lower()
186	n/a
187	2	class Request:
188	n/a
189	1	def __init__(self, url, data=None, headers={},
190	1	origin_req_host=None, unverifiable=False):
191	n/a	# unwrap('<URL:type://host/path>') --> 'type://host/path'
192	350	self.__original = unwrap(url)
193	350	self.type = None
194	n/a	# self.__r_type is what's left after doing the splittype
195	350	self.host = None
196	350	self.port = None
197	350	self._tunnel_host = None
198	350	self.data = data
199	350	self.headers = {}
200	391	for key, value in headers.items():
201	41	self.add_header(key, value)
202	350	self.unredirected_hdrs = {}
203	350	if origin_req_host is None:
204	323	origin_req_host = request_host(self)
205	350	self.origin_req_host = origin_req_host
206	350	self.unverifiable = unverifiable
207	n/a
208	1	def __getattr__(self, attr):
209	n/a	# XXX this is a fallback mechanism to guard against these
210	n/a	# methods getting called in a non-standard order. this may be
211	n/a	# too complicated and/or unnecessary.
212	n/a	# XXX should the __r_XXX attributes be public?
213	142	if attr[:12] == '_Request__r_':
214	75	name = attr[12:]
215	75	if hasattr(Request, 'get_' + name):
216	75	getattr(self, 'get_' + name)()
217	75	return getattr(self, attr)
218	67	raise AttributeError, attr
219	n/a
220	1	def get_method(self):
221	92	if self.has_data():
222	10	return "POST"
223	n/a	else:
224	82	return "GET"
225	n/a
226	n/a	# XXX these helper methods are lame
227	n/a
228	1	def add_data(self, data):
229	1	self.data = data
230	n/a
231	1	def has_data(self):
232	155	return self.data is not None
233	n/a
234	1	def get_data(self):
235	11	return self.data
236	n/a
237	1	def get_full_url(self):
238	1100	return self.__original
239	n/a
240	1	def get_type(self):
241	247	if self.type is None:
242	127	self.type, self.__r_type = splittype(self.__original)
243	127	if self.type is None:
244	1	raise ValueError, "unknown url type: %s" % self.__original
245	246	return self.type
246	n/a
247	1	def get_host(self):
248	199	if self.host is None:
249	105	self.host, self.__r_host = splithost(self.__r_type)
250	105	if self.host:
251	97	self.host = unquote(self.host)
252	199	return self.host
253	n/a
254	1	def get_selector(self):
255	116	return self.__r_host
256	n/a
257	1	def set_proxy(self, host, type):
258	23	if self.type == 'https' and not self._tunnel_host:
259	2	self._tunnel_host = self.host
260	n/a	else:
261	21	self.type = type
262	21	self.__r_host = self.__original
263	n/a
264	23	self.host = host
265	n/a
266	1	def has_proxy(self):
267	55	return self.__r_host == self.__original
268	n/a
269	1	def get_origin_req_host(self):
270	27	return self.origin_req_host
271	n/a
272	1	def is_unverifiable(self):
273	244	return self.unverifiable
274	n/a
275	1	def add_header(self, key, val):
276	n/a	# useful for something like authentication
277	56	self.headers[key.capitalize()] = val
278	n/a
279	1	def add_unredirected_header(self, key, val):
280	n/a	# will not be added to a redirected request
281	205	self.unredirected_hdrs[key.capitalize()] = val
282	n/a
283	1	def has_header(self, header_name):
284	307	return (header_name in self.headers or
285	306	header_name in self.unredirected_hdrs)
286	n/a
287	1	def get_header(self, header_name, default=None):
288	167	return self.headers.get(
289	167	header_name,
290	167	self.unredirected_hdrs.get(header_name, default))
291	n/a
292	1	def header_items(self):
293	2	hdrs = self.unredirected_hdrs.copy()
294	2	hdrs.update(self.headers)
295	2	return hdrs.items()
296	n/a
297	2	class OpenerDirector:
298	1	def __init__(self):
299	30	client_version = "Python-urllib/%s" % __version__
300	30	self.addheaders = [('User-agent', client_version)]
301	n/a	# manage the individual handlers
302	30	self.handlers = []
303	30	self.handle_open = {}
304	30	self.handle_error = {}
305	30	self.process_response = {}
306	30	self.process_request = {}
307	n/a
308	1	def add_handler(self, handler):
309	174	if not hasattr(handler, "add_parent"):
310	1	raise TypeError("expected BaseHandler instance, got %r" %
311	1	type(handler))
312	n/a
313	173	added = False
314	2019	for meth in dir(handler):
315	1846	if meth in ["redirect_request", "do_open", "proxy_open"]:
316	n/a	# oops, coincidental match
317	67	continue
318	n/a
319	1779	i = meth.find("_")
320	1779	protocol = meth[:i]
321	1779	condition = meth[i+1:]
322	n/a
323	1779	if condition.startswith("error"):
324	99	j = condition.find("_") + i + 1
325	99	kind = meth[j+1:]
326	99	try:
327	99	kind = int(kind)
328	25	except ValueError:
329	25	pass
330	99	lookup = self.handle_error.get(protocol, {})
331	99	self.handle_error[protocol] = lookup
332	1680	elif condition == "open":
333	104	kind = protocol
334	104	lookup = self.handle_open
335	1576	elif condition == "response":
336	32	kind = protocol
337	32	lookup = self.process_response
338	1544	elif condition == "request":
339	33	kind = protocol
340	33	lookup = self.process_request
341	n/a	else:
342	0	continue
343	n/a
344	268	handlers = lookup.setdefault(kind, [])
345	268	if handlers:
346	21	bisect.insort(handlers, handler)
347	n/a	else:
348	247	handlers.append(handler)
349	268	added = True
350	n/a
351	173	if added:
352	n/a	# the handlers must work in an specific order, the order
353	n/a	# is specified in a Handler attribute
354	161	bisect.insort(self.handlers, handler)
355	161	handler.add_parent(self)
356	n/a
357	1	def close(self):
358	n/a	# Only exists for backwards compatibility.
359	0	pass
360	n/a
361	1	def _call_chain(self, chain, kind, meth_name, *args):
362	n/a	# Handlers raise an exception if no one else should try to handle
363	n/a	# the request, or return None if they can't but another handler
364	n/a	# could. Otherwise, they return the response.
365	204	handlers = chain.get(kind, ())
366	238	for handler in handlers:
367	125	func = getattr(handler, meth_name)
368	n/a
369	125	result = func(*args)
370	104	if result is not None:
371	70	return result
372	n/a
373	1	def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
374	n/a	# accept a URL or a Request object
375	83	if isinstance(fullurl, basestring):
376	60	req = Request(fullurl, data)
377	n/a	else:
378	23	req = fullurl
379	23	if data is not None:
380	0	req.add_data(data)
381	n/a
382	83	req.timeout = timeout
383	83	protocol = req.get_type()
384	n/a
385	n/a	# pre-process request
386	82	meth_name = protocol+"_request"
387	127	for processor in self.process_request.get(protocol, []):
388	45	meth = getattr(processor, meth_name)
389	45	req = meth(req)
390	n/a
391	82	response = self._open(req, data)
392	n/a
393	n/a	# post-process response
394	71	meth_name = protocol+"_response"
395	105	for processor in self.process_response.get(protocol, []):
396	44	meth = getattr(processor, meth_name)
397	44	response = meth(req, response)
398	n/a
399	61	return response
400	n/a
401	1	def _open(self, req, data=None):
402	82	result = self._call_chain(self.handle_open, 'default',
403	82	'default_open', req)
404	82	if result:
405	0	return result
406	n/a
407	82	protocol = req.get_type()
408	82	result = self._call_chain(self.handle_open, protocol, protocol +
409	82	'_open', req)
410	74	if result:
411	63	return result
412	n/a
413	11	return self._call_chain(self.handle_open, 'unknown',
414	11	'unknown_open', req)
415	n/a
416	1	def error(self, proto, *args):
417	22	if proto in ('http', 'https'):
418	n/a	# XXX http[s] protocols are special-cased
419	22	dict = self.handle_error['http'] # https is not different than http
420	22	proto = args[2] # YUCK!
421	22	meth_name = 'http_error_%s' % proto
422	22	http_err = 1
423	22	orig_args = args
424	n/a	else:
425	0	dict = self.handle_error
426	0	meth_name = proto + '_error'
427	0	http_err = 0
428	22	args = (dict, proto, meth_name) + args
429	22	result = self._call_chain(*args)
430	14	if result:
431	7	return result
432	n/a
433	7	if http_err:
434	7	args = (dict, 'default', 'http_error_default') + orig_args
435	7	return self._call_chain(*args)
436	n/a
437	n/a	# XXX probably also want an abstract factory that knows when it makes
438	n/a	# sense to skip a superclass in favor of a subclass and when it might
439	n/a	# make sense to include both
440	n/a
441	1	def build_opener(*handlers):
442	n/a	"""Create an opener object from a list of handlers.
443	n/a
444	n/a	The opener will use several default handlers, including support
445	n/a	for HTTP, FTP and when applicable, HTTPS.
446	n/a
447	n/a	If any of the handlers passed as arguments are subclasses of the
448	n/a	default handlers, the default handlers will not be used.
449	n/a	"""
450	14	import types
451	14	def isclass(obj):
452	190	return isinstance(obj, (types.ClassType, type))
453	n/a
454	14	opener = OpenerDirector()
455	14	default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
456	14	HTTPDefaultErrorHandler, HTTPRedirectHandler,
457	14	FTPHandler, FileHandler, HTTPErrorProcessor]
458	14	if hasattr(httplib, 'HTTPS'):
459	14	default_classes.append(HTTPSHandler)
460	14	skip = set()
461	140	for klass in default_classes:
462	297	for check in handlers:
463	171	if isclass(check):
464	63	if issubclass(check, klass):
465	4	skip.add(klass)
466	108	elif isinstance(check, klass):
467	7	skip.add(klass)
468	24	for klass in skip:
469	10	default_classes.remove(klass)
470	n/a
471	130	for klass in default_classes:
472	116	opener.add_handler(klass())
473	n/a
474	33	for h in handlers:
475	19	if isclass(h):
476	7	h = h()
477	19	opener.add_handler(h)
478	14	return opener
479	n/a
480	2	class BaseHandler:
481	1	handler_order = 500
482	n/a
483	1	def add_parent(self, parent):
484	145	self.parent = parent
485	n/a
486	1	def close(self):
487	n/a	# Only exists for backwards compatibility
488	0	pass
489	n/a
490	1	def __lt__(self, other):
491	256	if not hasattr(other, "handler_order"):
492	n/a	# Try to preserve the old behavior of having custom classes
493	n/a	# inserted after default ones (works only for custom user
494	n/a	# classes which are not aware of handler_order).
495	0	return True
496	256	return self.handler_order < other.handler_order
497	n/a
498	n/a
499	2	class HTTPErrorProcessor(BaseHandler):
500	1	"""Process HTTP error responses."""
501	1	handler_order = 1000 # after all other processing
502	n/a
503	1	def http_response(self, request, response):
504	44	code, msg, hdrs = response.code, response.msg, response.info()
505	n/a
506	n/a	# According to RFC 2616, "2xx" code indicates that the client's
507	n/a	# request was successfully received, understood, and accepted.
508	44	if not (200 <= code < 300):
509	13	response = self.parent.error(
510	13	'http', request, response, code, msg, hdrs)
511	n/a
512	34	return response
513	n/a
514	1	https_response = http_response
515	n/a
516	2	class HTTPDefaultErrorHandler(BaseHandler):
517	1	def http_error_default(self, req, fp, code, msg, hdrs):
518	2	raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
519	n/a
520	2	class HTTPRedirectHandler(BaseHandler):
521	n/a	# maximum number of redirections to any single URL
522	n/a	# this is needed because of the state that cookies introduce
523	1	max_repeats = 4
524	n/a	# maximum total number of redirections (regardless of URL) before
525	n/a	# assuming we're in a loop
526	1	max_redirections = 10
527	n/a
528	1	def redirect_request(self, req, fp, code, msg, headers, newurl):
529	n/a	"""Return a Request or None in response to a redirect.
530	n/a
531	n/a	This is called by the http_error_30x methods when a
532	n/a	redirection response is received. If a redirection should
533	n/a	take place, return a new Request to allow http_error_30x to
534	n/a	perform the redirect. Otherwise, raise HTTPError if no-one
535	n/a	else should try to handle this url. Return None if you can't
536	n/a	but another Handler might.
537	n/a	"""
538	26	m = req.get_method()
539	26	if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
540	4	or code in (301, 302, 303) and m == "POST"):
541	n/a	# Strictly (according to RFC 2616), 301 or 302 in response
542	n/a	# to a POST MUST NOT cause a redirection without confirmation
543	n/a	# from the user (of urllib2, in this case). In practice,
544	n/a	# essentially all clients do redirect in this case, so we
545	n/a	# do the same.
546	n/a	# be conciliant with URIs containing a space
547	25	newurl = newurl.replace(' ', '%20')
548	60	newheaders = dict((k,v) for k,v in req.headers.items()
549	10	if k.lower() not in ("content-length", "content-type")
550	n/a	)
551	25	return Request(newurl,
552	25	headers=newheaders,
553	25	origin_req_host=req.get_origin_req_host(),
554	25	unverifiable=True)
555	n/a	else:
556	1	raise HTTPError(req.get_full_url(), code, msg, headers, fp)
557	n/a
558	n/a	# Implementation note: To avoid the server sending us into an
559	n/a	# infinite loop, the request object needs to track what URLs we
560	n/a	# have already seen. Do this by adding a handler-specific
561	n/a	# attribute to the Request object.
562	1	def http_error_302(self, req, fp, code, msg, headers):
563	n/a	# Some servers (incorrectly) return multiple Location headers
564	n/a	# (so probably same goes for URI). Use first header.
565	26	if 'location' in headers:
566	26	newurl = headers.getheaders('location')[0]
567	0	elif 'uri' in headers:
568	0	newurl = headers.getheaders('uri')[0]
569	n/a	else:
570	0	return
571	n/a
572	n/a	# fix a possible malformed URL
573	26	urlparts = urlparse.urlparse(newurl)
574	26	if not urlparts.path:
575	0	urlparts = list(urlparts)
576	0	urlparts[2] = "/"
577	26	newurl = urlparse.urlunparse(urlparts)
578	n/a
579	26	newurl = urlparse.urljoin(req.get_full_url(), newurl)
580	n/a
581	n/a	# XXX Probably want to forget about the state of the current
582	n/a	# request, although that might interact poorly with other
583	n/a	# handlers that also use handler-specific request attributes
584	26	new = self.redirect_request(req, fp, code, msg, headers, newurl)
585	25	if new is None:
586	0	return
587	n/a
588	n/a	# loop detection
589	n/a	# .redirect_dict has a key url if url was previously visited.
590	25	if hasattr(req, 'redirect_dict'):
591	14	visited = new.redirect_dict = req.redirect_dict
592	14	if (visited.get(newurl, 0) >= self.max_repeats or
593	13	len(visited) >= self.max_redirections):
594	2	raise HTTPError(req.get_full_url(), code,
595	2	self.inf_msg + msg, headers, fp)
596	n/a	else:
597	11	visited = new.redirect_dict = req.redirect_dict = {}
598	23	visited[newurl] = visited.get(newurl, 0) + 1
599	n/a
600	n/a	# Don't close the fp until we are sure that we won't use it
601	n/a	# with HTTPError.
602	23	fp.read()
603	23	fp.close()
604	n/a
605	23	return self.parent.open(new, timeout=req.timeout)
606	n/a
607	1	http_error_301 = http_error_303 = http_error_307 = http_error_302
608	n/a
609	1	inf_msg = "The HTTP server returned a redirect error that would " \
610	n/a	"lead to an infinite loop.\n" \
611	n/a	"The last 30x error message was:\n"
612	n/a
613	n/a
614	1	def _parse_proxy(proxy):
615	n/a	"""Return (scheme, user, password, host/port) given a URL or an authority.
616	n/a
617	n/a	If a URL is supplied, it must have an authority (host:port) component.
618	n/a	According to RFC 3986, having an authority component means the URL must
619	n/a	have two slashes after the scheme:
620	n/a
621	n/a	>>> _parse_proxy('file:/ftp.example.com/')
622	n/a	Traceback (most recent call last):
623	n/a	ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
624	n/a
625	n/a	The first three items of the returned tuple may be None.
626	n/a
627	n/a	Examples of authority parsing:
628	n/a
629	n/a	>>> _parse_proxy('proxy.example.com')
630	n/a	(None, None, None, 'proxy.example.com')
631	n/a	>>> _parse_proxy('proxy.example.com:3128')
632	n/a	(None, None, None, 'proxy.example.com:3128')
633	n/a
634	n/a	The authority component may optionally include userinfo (assumed to be
635	n/a	username:password):
636	n/a
637	n/a	>>> _parse_proxy('joe:password@proxy.example.com')
638	n/a	(None, 'joe', 'password', 'proxy.example.com')
639	n/a	>>> _parse_proxy('joe:password@proxy.example.com:3128')
640	n/a	(None, 'joe', 'password', 'proxy.example.com:3128')
641	n/a
642	n/a	Same examples, but with URLs instead:
643	n/a
644	n/a	>>> _parse_proxy('http://proxy.example.com/')
645	n/a	('http', None, None, 'proxy.example.com')
646	n/a	>>> _parse_proxy('http://proxy.example.com:3128/')
647	n/a	('http', None, None, 'proxy.example.com:3128')
648	n/a	>>> _parse_proxy('http://joe:password@proxy.example.com/')
649	n/a	('http', 'joe', 'password', 'proxy.example.com')
650	n/a	>>> _parse_proxy('http://joe:password@proxy.example.com:3128')
651	n/a	('http', 'joe', 'password', 'proxy.example.com:3128')
652	n/a
653	n/a	Everything after the authority is ignored:
654	n/a
655	n/a	>>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
656	n/a	('ftp', 'joe', 'password', 'proxy.example.com')
657	n/a
658	n/a	Test for no trailing '/' case:
659	n/a
660	n/a	>>> _parse_proxy('http://joe:password@proxy.example.com')
661	n/a	('http', 'joe', 'password', 'proxy.example.com')
662	n/a
663	n/a	"""
664	30	scheme, r_scheme = splittype(proxy)
665	30	if not r_scheme.startswith("/"):
666	n/a	# authority
667	12	scheme = None
668	12	authority = proxy
669	n/a	else:
670	n/a	# URL
671	18	if not r_scheme.startswith("//"):
672	1	raise ValueError("proxy URL with no authority: %r" % proxy)
673	n/a	# We have an authority, so for RFC 3986-compliant URLs (by ss 3.
674	n/a	# and 3.3.), path is empty or starts with '/'
675	17	end = r_scheme.find("/", 2)
676	17	if end == -1:
677	13	end = None
678	17	authority = r_scheme[2:end]
679	29	userinfo, hostport = splituser(authority)
680	29	if userinfo is not None:
681	6	user, password = splitpasswd(userinfo)
682	n/a	else:
683	23	user = password = None
684	29	return scheme, user, password, hostport
685	n/a
686	2	class ProxyHandler(BaseHandler):
687	n/a	# Proxies must be in front
688	1	handler_order = 100
689	n/a
690	1	def __init__(self, proxies=None):
691	19	if proxies is None:
692	10	proxies = getproxies()
693	19	assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
694	19	self.proxies = proxies
695	28	for type, url in proxies.items():
696	9	setattr(self, '%s_open' % type,
697	9	lambda r, proxy=url, type=type, meth=self.proxy_open: \
698	19	meth(r, proxy, type))
699	n/a
700	1	def proxy_open(self, req, proxy, type):
701	19	orig_type = req.get_type()
702	19	proxy_type, user, password, hostport = _parse_proxy(proxy)
703	n/a
704	19	if proxy_type is None:
705	8	proxy_type = orig_type
706	n/a
707	19	if req.host and proxy_bypass(req.host):
708	1	return None
709	n/a
710	18	if user and password:
711	0	user_pass = '%s:%s' % (unquote(user), unquote(password))
712	0	creds = base64.b64encode(user_pass).strip()
713	0	req.add_header('Proxy-authorization', 'Basic ' + creds)
714	18	hostport = unquote(hostport)
715	18	req.set_proxy(hostport, proxy_type)
716	n/a
717	18	if orig_type == proxy_type or orig_type == 'https':
718	n/a	# let other handlers take care of it
719	18	return None
720	n/a	else:
721	n/a	# need to start over, because the other handlers don't
722	n/a	# grok the proxy's URL type
723	n/a	# e.g. if we have a constructor arg proxies like so:
724	n/a	# {'http': 'ftp://proxy.example.com'}, we may end up turning
725	n/a	# a request for http://acme.example.com/a into one for
726	n/a	# ftp://proxy.example.com/a
727	0	return self.parent.open(req, timeout=req.timeout)
728	n/a
729	2	class HTTPPasswordMgr:
730	n/a
731	1	def __init__(self):
732	11	self.passwd = {}
733	n/a
734	1	def add_password(self, realm, uri, user, passwd):
735	n/a	# uri could be a single URI or a sequence
736	20	if isinstance(uri, basestring):
737	20	uri = [uri]
738	20	if not realm in self.passwd:
739	14	self.passwd[realm] = {}
740	60	for default_port in True, False:
741	40	reduced_uri = tuple(
742	80	[self.reduce_uri(u, default_port) for u in uri])
743	40	self.passwd[realm][reduced_uri] = (user, passwd)
744	n/a
745	1	def find_user_password(self, realm, authuri):
746	40	domains = self.passwd.get(realm, {})
747	59	for default_port in True, False:
748	50	reduced_authuri = self.reduce_uri(authuri, default_port)
749	130	for uris, authinfo in domains.iteritems():
750	191	for uri in uris:
751	111	if self.is_suburi(uri, reduced_authuri):
752	31	return authinfo
753	9	return None, None
754	n/a
755	1	def reduce_uri(self, uri, default_port=True):
756	n/a	"""Accept authority or URI and extract only the authority and path."""
757	n/a	# note HTTP URLs do not have a userinfo component
758	90	parts = urlparse.urlsplit(uri)
759	90	if parts[1]:
760	n/a	# URI
761	57	scheme = parts[0]
762	57	authority = parts[1]
763	57	path = parts[2] or '/'
764	n/a	else:
765	n/a	# host or host:port
766	33	scheme = None
767	33	authority = uri
768	33	path = '/'
769	90	host, port = splitport(authority)
770	90	if default_port and port is None and scheme is not None:
771	31	dport = {"http": 80,
772	31	"https": 443,
773	31	}.get(scheme)
774	31	if dport is not None:
775	31	authority = "%s:%d" % (host, dport)
776	90	return authority, path
777	n/a
778	1	def is_suburi(self, base, test):
779	n/a	"""Check if test is below base in a URI tree
780	n/a
781	n/a	Both args must be URIs in reduced form.
782	n/a	"""
783	111	if base == test:
784	29	return True
785	82	if base[0] != test[0]:
786	75	return False
787	7	common = posixpath.commonprefix((base[1], test[1]))
788	7	if len(common) == len(base[1]):
789	2	return True
790	5	return False
791	n/a
792	n/a
793	2	class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
794	n/a
795	1	def find_user_password(self, realm, authuri):
796	0	user, password = HTTPPasswordMgr.find_user_password(self, realm,
797	0	authuri)
798	0	if user is not None:
799	0	return user, password
800	0	return HTTPPasswordMgr.find_user_password(self, None, authuri)
801	n/a
802	n/a
803	2	class AbstractBasicAuthHandler:
804	n/a
805	n/a	# XXX this allows for multiple auth-schemes, but will stupidly pick
806	n/a	# the last one with a realm specified.
807	n/a
808	n/a	# allow for double- and single-quoted realm values
809	n/a	# (single quotes are a violation of the RFC, but appear in the wild)
810	1	rx = re.compile('(?:.,)[ \t]*([^ \t]+)[ \t]+'
811	1	'realm=(["\'])(.*?)\\2', re.I)
812	n/a
813	n/a	# XXX could pre-emptively send auth info already accepted (RFC 2617,
814	n/a	# end of section 2, and section 1.2 immediately after "credentials"
815	n/a	# production).
816	n/a
817	1	def __init__(self, password_mgr=None):
818	9	if password_mgr is None:
819	2	password_mgr = HTTPPasswordMgr()
820	9	self.passwd = password_mgr
821	9	self.add_password = self.passwd.add_password
822	9	self.retried = 0
823	n/a
824	1	def http_error_auth_reqed(self, authreq, host, req, headers):
825	n/a	# host may be an authority (without userinfo) or a URL with an
826	n/a	# authority
827	n/a	# XXX could be multiple headers
828	8	authreq = headers.get(authreq, None)
829	n/a
830	8	if self.retried > 5:
831	n/a	# retry sending the username:password 5 times before failing.
832	0	raise HTTPError(req.get_full_url(), 401, "basic auth failed",
833	0	headers, None)
834	n/a	else:
835	8	self.retried += 1
836	n/a
837	8	if authreq:
838	8	mo = AbstractBasicAuthHandler.rx.search(authreq)
839	8	if mo:
840	8	scheme, quote, realm = mo.groups()
841	8	if scheme.lower() == 'basic':
842	8	return self.retry_http_basic_auth(host, req, realm)
843	n/a
844	1	def retry_http_basic_auth(self, host, req, realm):
845	8	user, pw = self.passwd.find_user_password(realm, host)
846	8	if pw is not None:
847	4	raw = "%s:%s" % (user, pw)
848	4	auth = 'Basic %s' % base64.b64encode(raw).strip()
849	4	if req.headers.get(self.auth_header, None) == auth:
850	0	return None
851	4	req.add_unredirected_header(self.auth_header, auth)
852	4	return self.parent.open(req, timeout=req.timeout)
853	n/a	else:
854	4	return None
855	n/a
856	n/a
857	2	class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
858	n/a
859	1	auth_header = 'Authorization'
860	n/a
861	1	def http_error_401(self, req, fp, code, msg, headers):
862	6	url = req.get_full_url()
863	6	return self.http_error_auth_reqed('www-authenticate',
864	6	url, req, headers)
865	n/a
866	n/a
867	2	class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
868	n/a
869	1	auth_header = 'Proxy-authorization'
870	n/a
871	1	def http_error_407(self, req, fp, code, msg, headers):
872	n/a	# http_error_auth_reqed requires that there is no userinfo component in
873	n/a	# authority. Assume there isn't one, since urllib2 does not (and
874	n/a	# should not, RFC 3986 s. 3.2.1) support requests for URLs containing
875	n/a	# userinfo.
876	2	authority = req.get_host()
877	2	return self.http_error_auth_reqed('proxy-authenticate',
878	2	authority, req, headers)
879	n/a
880	n/a
881	1	def randombytes(n):
882	n/a	"""Return n random bytes."""
883	n/a	# Use /dev/urandom if it is available. Fall back to random module
884	n/a	# if not. It might be worthwhile to extend this function to use
885	n/a	# other platform-specific mechanisms for getting random bytes.
886	7	if os.path.exists("/dev/urandom"):
887	7	f = open("/dev/urandom")
888	7	s = f.read(n)
889	7	f.close()
890	7	return s
891	n/a	else:
892	0	L = [chr(random.randrange(0, 256)) for i in range(n)]
893	0	return "".join(L)
894	n/a
895	2	class AbstractDigestAuthHandler:
896	n/a	# Digest authentication is specified in RFC 2617.
897	n/a
898	n/a	# XXX The client does not inspect the Authentication-Info header
899	n/a	# in a successful response.
900	n/a
901	n/a	# XXX It should be possible to test this implementation against
902	n/a	# a mock server that just generates a static set of challenges.
903	n/a
904	n/a	# XXX qop="auth-int" supports is shaky
905	n/a
906	1	def __init__(self, passwd=None):
907	5	if passwd is None:
908	4	passwd = HTTPPasswordMgr()
909	5	self.passwd = passwd
910	5	self.add_password = self.passwd.add_password
911	5	self.retried = 0
912	5	self.nonce_count = 0
913	5	self.last_nonce = None
914	n/a
915	1	def reset_retry_count(self):
916	4	self.retried = 0
917	n/a
918	1	def http_error_auth_reqed(self, auth_header, host, req, headers):
919	12	authreq = headers.get(auth_header, None)
920	12	if self.retried > 5:
921	n/a	# Don't fail endlessly - if we failed once, we'll probably
922	n/a	# fail a second time. Hm. Unless the Password Manager is
923	n/a	# prompting for the information. Crap. This isn't great
924	n/a	# but it's better than the current 'repeat until recursion
925	n/a	# depth exceeded' approach <wink>
926	1	raise HTTPError(req.get_full_url(), 401, "digest auth failed",
927	1	headers, None)
928	n/a	else:
929	11	self.retried += 1
930	11	if authreq:
931	11	scheme = authreq.split()[0]
932	11	if scheme.lower() == 'digest':
933	9	return self.retry_http_digest_auth(req, authreq)
934	n/a
935	1	def retry_http_digest_auth(self, req, auth):
936	9	token, challenge = auth.split(' ', 1)
937	9	chal = parse_keqv_list(parse_http_list(challenge))
938	9	auth = self.get_authorization(req, chal)
939	8	if auth:
940	7	auth_val = 'Digest %s' % auth
941	7	if req.headers.get(self.auth_header, None) == auth_val:
942	0	return None
943	7	req.add_unredirected_header(self.auth_header, auth_val)
944	7	resp = self.parent.open(req, timeout=req.timeout)
945	1	return resp
946	n/a
947	1	def get_cnonce(self, nonce):
948	n/a	# The cnonce-value is an opaque
949	n/a	# quoted string value provided by the client and used by both client
950	n/a	# and server to avoid chosen plaintext attacks, to provide mutual
951	n/a	# authentication, and to provide some message integrity protection.
952	n/a	# This isn't a fabulous effort, but it's probably Good Enough.
953	7	dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
954	7	randombytes(8))).hexdigest()
955	7	return dig[:16]
956	n/a
957	1	def get_authorization(self, req, chal):
958	9	try:
959	9	realm = chal['realm']
960	9	nonce = chal['nonce']
961	9	qop = chal.get('qop')
962	9	algorithm = chal.get('algorithm', 'MD5')
963	n/a	# mod_digest doesn't send an opaque, even though it isn't
964	n/a	# supposed to be optional
965	9	opaque = chal.get('opaque', None)
966	0	except KeyError:
967	0	return None
968	n/a
969	9	H, KD = self.get_algorithm_impls(algorithm)
970	9	if H is None:
971	0	return None
972	n/a
973	9	user, pw = self.passwd.find_user_password(realm, req.get_full_url())
974	9	if user is None:
975	1	return None
976	n/a
977	n/a	# XXX not implemented yet
978	8	if req.has_data():
979	0	entdig = self.get_entity_digest(req.get_data(), chal)
980	n/a	else:
981	8	entdig = None
982	n/a
983	8	A1 = "%s:%s:%s" % (user, realm, pw)
984	8	A2 = "%s:%s" % (req.get_method(),
985	n/a	# XXX selector: what about proxies and full urls
986	8	req.get_selector())
987	8	if qop == 'auth':
988	7	if nonce == self.last_nonce:
989	0	self.nonce_count += 1
990	n/a	else:
991	7	self.nonce_count = 1
992	7	self.last_nonce = nonce
993	n/a
994	7	ncvalue = '%08x' % self.nonce_count
995	7	cnonce = self.get_cnonce(nonce)
996	7	noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
997	7	respdig = KD(H(A1), noncebit)
998	1	elif qop is None:
999	0	respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
1000	n/a	else:
1001	n/a	# XXX handle auth-int.
1002	1	raise URLError("qop '%s' is not supported." % qop)
1003	n/a
1004	n/a	# XXX should the partial digests be encoded too?
1005	n/a
1006	7	base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
1007	7	'response="%s"' % (user, realm, nonce, req.get_selector(),
1008	7	respdig)
1009	7	if opaque:
1010	0	base += ', opaque="%s"' % opaque
1011	7	if entdig:
1012	0	base += ', digest="%s"' % entdig
1013	7	base += ', algorithm="%s"' % algorithm
1014	7	if qop:
1015	7	base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
1016	7	return base
1017	n/a
1018	1	def get_algorithm_impls(self, algorithm):
1019	n/a	# algorithm should be case-insensitive according to RFC2617
1020	9	algorithm = algorithm.upper()
1021	n/a	# lambdas assume digest modules are imported at the top level
1022	9	if algorithm == 'MD5':
1023	30	H = lambda x: hashlib.md5(x).hexdigest()
1024	0	elif algorithm == 'SHA':
1025	0	H = lambda x: hashlib.sha1(x).hexdigest()
1026	n/a	# XXX MD5-sess
1027	16	KD = lambda s, d: H("%s:%s" % (s, d))
1028	9	return H, KD
1029	n/a
1030	1	def get_entity_digest(self, data, chal):
1031	n/a	# XXX not implemented yet
1032	0	return None
1033	n/a
1034	n/a
1035	2	class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
1036	n/a	"""An authentication protocol defined by RFC 2069
1037	n/a
1038	n/a	Digest authentication improves on basic authentication because it
1039	n/a	does not transmit passwords in the clear.
1040	1	"""
1041	n/a
1042	1	auth_header = 'Authorization'
1043	1	handler_order = 490 # before Basic auth
1044	n/a
1045	1	def http_error_401(self, req, fp, code, msg, headers):
1046	2	host = urlparse.urlparse(req.get_full_url())[1]
1047	2	retry = self.http_error_auth_reqed('www-authenticate',
1048	2	host, req, headers)
1049	2	self.reset_retry_count()
1050	2	return retry
1051	n/a
1052	n/a
1053	2	class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
1054	n/a
1055	1	auth_header = 'Proxy-Authorization'
1056	1	handler_order = 490 # before Basic auth
1057	n/a
1058	1	def http_error_407(self, req, fp, code, msg, headers):
1059	10	host = req.get_host()
1060	10	retry = self.http_error_auth_reqed('proxy-authenticate',
1061	10	host, req, headers)
1062	2	self.reset_retry_count()
1063	2	return retry
1064	n/a
1065	2	class AbstractHTTPHandler(BaseHandler):
1066	n/a
1067	1	def __init__(self, debuglevel=0):
1068	32	self._debuglevel = debuglevel
1069	n/a
1070	1	def set_http_debuglevel(self, level):
1071	0	self._debuglevel = level
1072	n/a
1073	1	def do_request_(self, request):
1074	53	host = request.get_host()
1075	53	if not host:
1076	0	raise URLError('no host given')
1077	n/a
1078	53	if request.has_data(): # POST
1079	11	data = request.get_data()
1080	11	if not request.has_header('Content-type'):
1081	6	request.add_unredirected_header(
1082	6	'Content-type',
1083	6	'application/x-www-form-urlencoded')
1084	11	if not request.has_header('Content-length'):
1085	6	request.add_unredirected_header(
1086	6	'Content-length', '%d' % len(data))
1087	n/a
1088	53	sel_host = host
1089	53	if request.has_proxy():
1090	11	scheme, sel = splittype(request.get_selector())
1091	11	sel_host, sel_path = splithost(sel)
1092	n/a
1093	53	if not request.has_header('Host'):
1094	40	request.add_unredirected_header('Host', sel_host)
1095	98	for name, value in self.parent.addheaders:
1096	45	name = name.capitalize()
1097	45	if not request.has_header(name):
1098	36	request.add_unredirected_header(name, value)
1099	n/a
1100	53	return request
1101	n/a
1102	1	def do_open(self, http_class, req):
1103	n/a	"""Return an addinfourl object for the request, using http_class.
1104	n/a
1105	n/a	http_class must implement the HTTPConnection API from httplib.
1106	n/a	The addinfourl return value is a file-like object. It also
1107	n/a	has methods and attributes including:
1108	n/a	- info(): return a mimetools.Message object for the headers
1109	n/a	- geturl(): return the original request URL
1110	n/a	- code: HTTP status code
1111	n/a	"""
1112	45	host = req.get_host()
1113	45	if not host:
1114	0	raise URLError('no host given')
1115	n/a
1116	45	h = http_class(host, timeout=req.timeout) # will parse host:port
1117	45	h.set_debuglevel(self._debuglevel)
1118	n/a
1119	45	headers = dict(req.headers)
1120	45	headers.update(req.unredirected_hdrs)
1121	n/a	# We want to make an HTTP/1.1 request, but the addinfourl
1122	n/a	# class isn't prepared to deal with a persistent connection.
1123	n/a	# It will try to read all remaining data from the socket,
1124	n/a	# which will block while the server waits for the next request.
1125	n/a	# So make sure the connection gets closed after the (only)
1126	n/a	# request.
1127	45	headers["Connection"] = "close"
1128	45	headers = dict(
1129	235	(name.title(), val) for name, val in headers.items())
1130	n/a
1131	45	if req._tunnel_host:
1132	1	tunnel_headers = {}
1133	1	proxy_auth_hdr = "Proxy-Authorization"
1134	1	if proxy_auth_hdr in headers:
1135	1	tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
1136	n/a	# Proxy-Authorization should not be sent to origin
1137	n/a	# server.
1138	1	del headers[proxy_auth_hdr]
1139	1	h.set_tunnel(req._tunnel_host, headers=tunnel_headers)
1140	n/a
1141	45	try:
1142	45	h.request(req.get_method(), req.get_selector(), req.data, headers)
1143	43	try:
1144	43	r = h.getresponse(buffering=True)
1145	3	except TypeError: #buffering kw not supported
1146	3	r = h.getresponse()
1147	2	except socket.error, err: # XXX what error?
1148	2	raise URLError(err)
1149	n/a
1150	n/a	# Pick apart the HTTPResponse object to get the addinfourl
1151	n/a	# object initialized properly.
1152	n/a
1153	n/a	# Wrap the HTTPResponse object in socket's file object adapter
1154	n/a	# for Windows. That adapter calls recv(), so delegate recv()
1155	n/a	# to read(). This weird wrapping allows the returned object to
1156	n/a	# have readline() and readlines() methods.
1157	n/a
1158	n/a	# XXX It might be better to extract the read buffering code
1159	n/a	# out of socket._fileobject() and into a base class.
1160	n/a
1161	43	r.recv = r.read
1162	43	fp = socket._fileobject(r, close=True)
1163	n/a
1164	43	resp = addinfourl(fp, r.msg, req.get_full_url())
1165	43	resp.code = r.status
1166	43	resp.msg = r.reason
1167	43	return resp
1168	n/a
1169	n/a
1170	2	class HTTPHandler(AbstractHTTPHandler):
1171	n/a
1172	1	def http_open(self, req):
1173	41	return self.do_open(httplib.HTTPConnection, req)
1174	n/a
1175	1	http_request = AbstractHTTPHandler.do_request_
1176	n/a
1177	1	if hasattr(httplib, 'HTTPS'):
1178	2	class HTTPSHandler(AbstractHTTPHandler):
1179	n/a
1180	1	def https_open(self, req):
1181	0	return self.do_open(httplib.HTTPSConnection, req)
1182	n/a
1183	1	https_request = AbstractHTTPHandler.do_request_
1184	n/a
1185	2	class HTTPCookieProcessor(BaseHandler):
1186	1	def __init__(self, cookiejar=None):
1187	2	import cookielib
1188	2	if cookiejar is None:
1189	0	cookiejar = cookielib.CookieJar()
1190	2	self.cookiejar = cookiejar
1191	n/a
1192	1	def http_request(self, request):
1193	3	self.cookiejar.add_cookie_header(request)
1194	3	return request
1195	n/a
1196	1	def http_response(self, request, response):
1197	3	self.cookiejar.extract_cookies(response, request)
1198	3	return response
1199	n/a
1200	1	https_request = http_request
1201	1	https_response = http_response
1202	n/a
1203	2	class UnknownHandler(BaseHandler):
1204	1	def unknown_open(self, req):
1205	3	type = req.get_type()
1206	3	raise URLError('unknown url type: %s' % type)
1207	n/a
1208	1	def parse_keqv_list(l):
1209	n/a	"""Parse list of key=value strings where keys are not duplicated."""
1210	9	parsed = {}
1211	36	for elt in l:
1212	27	k, v = elt.split('=', 1)
1213	27	if v[0] == '"' and v[-1] == '"':
1214	27	v = v[1:-1]
1215	27	parsed[k] = v
1216	9	return parsed
1217	n/a
1218	1	def parse_http_list(s):
1219	n/a	"""Parse lists as described by RFC 2068 Section 2.
1220	n/a
1221	n/a	In particular, parse comma-separated lists where the elements of
1222	n/a	the list may include quoted-strings. A quoted-string could
1223	n/a	contain a comma. A non-quoted string could have quotes in the
1224	n/a	middle. Neither commas nor quotes count if they are escaped.
1225	n/a	Only double-quotes count, not single-quotes.
1226	n/a	"""
1227	13	res = []
1228	13	part = ''
1229	n/a
1230	13	escape = quote = False
1231	742	for cur in s:
1232	729	if escape:
1233	3	part += cur
1234	3	escape = False
1235	3	continue
1236	726	if quote:
1237	462	if cur == '\\':
1238	3	escape = True
1239	3	continue
1240	459	elif cur == '"':
1241	35	quote = False
1242	459	part += cur
1243	459	continue
1244	n/a
1245	264	if cur == ',':
1246	38	res.append(part)
1247	38	part = ''
1248	38	continue
1249	n/a
1250	226	if cur == '"':
1251	35	quote = True
1252	n/a
1253	226	part += cur
1254	n/a
1255	n/a	# append last part
1256	13	if part:
1257	4	res.append(part)
1258	n/a
1259	55	return [part.strip() for part in res]
1260	n/a
1261	2	class FileHandler(BaseHandler):
1262	n/a	# Use local file or FTP depending on form of URL
1263	1	def file_open(self, req):
1264	16	url = req.get_selector()
1265	16	if url[:2] == '//' and url[2:3] != '/':
1266	1	req.type = 'ftp'
1267	1	return self.parent.open(req)
1268	n/a	else:
1269	15	return self.open_local_file(req)
1270	n/a
1271	n/a	# names for the localhost
1272	1	names = None
1273	1	def get_names(self):
1274	4	if FileHandler.names is None:
1275	1	try:
1276	1	FileHandler.names = tuple(
1277	1	socket.gethostbyname_ex('localhost')[2] +
1278	1	socket.gethostbyname_ex(socket.gethostname())[2])
1279	0	except socket.gaierror:
1280	0	FileHandler.names = (socket.gethostbyname('localhost'),)
1281	4	return FileHandler.names
1282	n/a
1283	n/a	# not entirely sure what the rules are here
1284	1	def open_local_file(self, req):
1285	15	import email.utils
1286	15	import mimetypes
1287	15	host = req.get_host()
1288	15	filename = req.get_selector()
1289	15	localfile = url2pathname(filename)
1290	15	try:
1291	15	stats = os.stat(localfile)
1292	9	size = stats.st_size
1293	9	modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
1294	9	mtype = mimetypes.guess_type(filename)[0]
1295	9	headers = mimetools.Message(StringIO(
1296	9	'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
1297	9	(mtype or 'text/plain', size, modified)))
1298	9	if host:
1299	6	host, port = splitport(host)
1300	9	if not host or \
1301	6	(not port and socket.gethostbyname(host) in self.get_names()):
1302	6	if host:
1303	3	origurl = 'file://' + host + filename
1304	n/a	else:
1305	3	origurl = 'file://' + filename
1306	6	return addinfourl(open(localfile, 'rb'), headers, origurl)
1307	6	except OSError, msg:
1308	n/a	# urllib2 users shouldn't expect OSErrors coming from urlopen()
1309	6	raise URLError(msg)
1310	3	raise URLError('file not on local host')
1311	n/a
1312	2	class FTPHandler(BaseHandler):
1313	1	def ftp_open(self, req):
1314	12	import ftplib
1315	12	import mimetypes
1316	12	host = req.get_host()
1317	12	if not host:
1318	0	raise URLError('ftp error: no host given')
1319	12	host, port = splitport(host)
1320	12	if port is None:
1321	11	port = ftplib.FTP_PORT
1322	n/a	else:
1323	1	port = int(port)
1324	n/a
1325	n/a	# username/password handling
1326	12	user, host = splituser(host)
1327	12	if user:
1328	0	user, passwd = splitpasswd(user)
1329	n/a	else:
1330	12	passwd = None
1331	12	host = unquote(host)
1332	12	user = unquote(user or '')
1333	12	passwd = unquote(passwd or '')
1334	n/a
1335	12	try:
1336	12	host = socket.gethostbyname(host)
1337	0	except socket.error, msg:
1338	0	raise URLError(msg)
1339	12	path, attrs = splitattr(req.get_selector())
1340	12	dirs = path.split('/')
1341	12	dirs = map(unquote, dirs)
1342	12	dirs, file = dirs[:-1], dirs[-1]
1343	12	if dirs and not dirs[0]:
1344	12	dirs = dirs[1:]
1345	12	try:
1346	12	fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
1347	12	type = file and 'I' or 'D'
1348	13	for attr in attrs:
1349	1	attr, value = splitvalue(attr)
1350	1	if attr.lower() == 'type' and \
1351	1	value in ('a', 'A', 'i', 'I', 'd', 'D'):
1352	1	type = value.upper()
1353	12	fp, retrlen = fw.retrfile(file, type)
1354	9	headers = ""
1355	9	mtype = mimetypes.guess_type(req.get_full_url())[0]
1356	9	if mtype:
1357	1	headers += "Content-type: %s\n" % mtype
1358	9	if retrlen is not None and retrlen >= 0:
1359	5	headers += "Content-length: %d\n" % retrlen
1360	9	sf = StringIO(headers)
1361	9	headers = mimetools.Message(sf)
1362	9	return addinfourl(fp, headers, req.get_full_url())
1363	3	except ftplib.all_errors, msg:
1364	3	raise URLError, ('ftp error: %s' % msg), sys.exc_info()[2]
1365	n/a
1366	1	def connect_ftp(self, user, passwd, host, port, dirs, timeout):
1367	4	fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
1368	n/a	## fw.ftp.set_debuglevel(1)
1369	4	return fw
1370	n/a
1371	2	class CacheFTPHandler(FTPHandler):
1372	n/a	# XXX would be nice to have pluggable cache strategies
1373	n/a	# XXX this stuff is definitely not thread safe
1374	1	def __init__(self):
1375	2	self.cache = {}
1376	2	self.timeout = {}
1377	2	self.soonest = 0
1378	2	self.delay = 60
1379	2	self.max_conns = 16
1380	n/a
1381	1	def setTimeout(self, t):
1382	2	self.delay = t
1383	n/a
1384	1	def setMaxConns(self, m):
1385	0	self.max_conns = m
1386	n/a
1387	1	def connect_ftp(self, user, passwd, host, port, dirs, timeout):
1388	5	key = user, host, port, '/'.join(dirs), timeout
1389	5	if key in self.cache:
1390	3	self.timeout[key] = time.time() + self.delay
1391	n/a	else:
1392	2	self.cache[key] = ftpwrapper(user, passwd, host, port, dirs, timeout)
1393	2	self.timeout[key] = time.time() + self.delay
1394	5	self.check_cache()
1395	5	return self.cache[key]
1396	n/a
1397	1	def check_cache(self):
1398	n/a	# first check for old ones
1399	5	t = time.time()
1400	5	if self.soonest <= t:
1401	5	for k, v in self.timeout.items():
1402	3	if v < t:
1403	1	self.cache[k].close()
1404	1	del self.cache[k]
1405	1	del self.timeout[k]
1406	5	self.soonest = min(self.timeout.values())
1407	n/a
1408	n/a	# then check the size
1409	5	if len(self.cache) == self.max_conns:
1410	0	for k, v in self.timeout.items():
1411	0	if v == self.soonest:
1412	0	del self.cache[k]
1413	0	del self.timeout[k]
1414	0	break
1415	0	self.soonest = min(self.timeout.values())