Python code coverage for Lib/robotparser.py

#	count	content
1	n/a	""" robotparser.py
2	n/a
3	n/a	Copyright (C) 2000 Bastian Kleineidam
4	n/a
5	n/a	You can choose between two licenses when using this package:
6	n/a	1) GNU GPLv2
7	n/a	2) PSF license for Python 2.2
8	n/a
9	n/a	The robots.txt Exclusion Protocol is implemented as specified in
10	n/a	http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
11	1	"""
12	1	import urlparse
13	1	import urllib
14	n/a
15	1	__all__ = ["RobotFileParser"]
16	n/a
17	n/a
18	2	class RobotFileParser:
19	n/a	""" This class provides a set of methods to read, parse and answer
20	n/a	questions about a single robots.txt file.
21	n/a
22	1	"""
23	n/a
24	1	def __init__(self, url=''):
25	15	self.entries = []
26	15	self.default_entry = None
27	15	self.disallow_all = False
28	15	self.allow_all = False
29	15	self.set_url(url)
30	15	self.last_checked = 0
31	n/a
32	1	def mtime(self):
33	n/a	"""Returns the time the robots.txt file was last fetched.
34	n/a
35	n/a	This is useful for long-running web spiders that need to
36	n/a	check for new robots.txt files periodically.
37	n/a
38	n/a	"""
39	0	return self.last_checked
40	n/a
41	1	def modified(self):
42	n/a	"""Sets the time the robots.txt file was last fetched to the
43	n/a	current time.
44	n/a
45	n/a	"""
46	0	import time
47	0	self.last_checked = time.time()
48	n/a
49	1	def set_url(self, url):
50	n/a	"""Sets the URL referring to a robots.txt file."""
51	16	self.url = url
52	16	self.host, self.path = urlparse.urlparse(url)[1:3]
53	n/a
54	1	def read(self):
55	n/a	"""Reads the robots.txt URL and feeds it to the parser."""
56	2	opener = URLopener()
57	2	f = opener.open(self.url)
58	57	lines = [line.strip() for line in f]
59	2	f.close()
60	2	self.errcode = opener.errcode
61	2	if self.errcode in (401, 403):
62	1	self.disallow_all = True
63	1	elif self.errcode >= 400:
64	0	self.allow_all = True
65	1	elif self.errcode == 200 and lines:
66	1	self.parse(lines)
67	n/a
68	1	def _add_entry(self, entry):
69	9	if "*" in entry.useragents:
70	n/a	# the default entry is considered last
71	2	self.default_entry = entry
72	n/a	else:
73	7	self.entries.append(entry)
74	n/a
75	1	def parse(self, lines):
76	n/a	"""parse the input lines from a robots.txt file.
77	n/a	We allow that a user-agent: line is not preceded by
78	n/a	one or more blank lines."""
79	n/a	# states:
80	n/a	# 0: start state
81	n/a	# 1: saw user-agent line
82	n/a	# 2: saw an allow or disallow line
83	14	state = 0
84	14	linenumber = 0
85	14	entry = Entry()
86	n/a
87	118	for line in lines:
88	104	linenumber += 1
89	104	if not line:
90	5	if state == 1:
91	0	entry = Entry()
92	0	state = 0
93	5	elif state == 2:
94	4	self._add_entry(entry)
95	4	entry = Entry()
96	4	state = 0
97	n/a	# remove optional comment and strip line
98	104	i = line.find('#')
99	104	if i >= 0:
100	12	line = line[:i]
101	104	line = line.strip()
102	104	if not line:
103	34	continue
104	70	line = line.split(':', 1)
105	70	if len(line) == 2:
106	70	line[0] = line[0].strip().lower()
107	70	line[1] = urllib.unquote(line[1].strip())
108	70	if line[0] == "user-agent":
109	24	if state == 2:
110	5	self._add_entry(entry)
111	5	entry = Entry()
112	24	entry.useragents.append(line[1])
113	24	state = 1
114	46	elif line[0] == "disallow":
115	39	if state != 0:
116	39	entry.rulelines.append(RuleLine(line[1], False))
117	39	state = 2
118	7	elif line[0] == "allow":
119	7	if state != 0:
120	7	entry.rulelines.append(RuleLine(line[1], True))
121	7	state = 2
122	14	if state == 2:
123	13	self.entries.append(entry)
124	n/a
125	n/a
126	1	def can_fetch(self, useragent, url):
127	n/a	"""using the parsed robots.txt decide if useragent can fetch url"""
128	44	if self.disallow_all:
129	1	return False
130	43	if self.allow_all:
131	0	return True
132	n/a	# search for given user agent matches
133	n/a	# the first match counts
134	43	url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
135	50	for entry in self.entries:
136	46	if entry.applies_to(useragent):
137	39	return entry.allowance(url)
138	n/a	# try the default entry last
139	4	if self.default_entry:
140	4	return self.default_entry.allowance(url)
141	n/a	# agent not found ==> access granted
142	0	return True
143	n/a
144	n/a
145	1	def __str__(self):
146	0	return ''.join([str(entry) + "\n" for entry in self.entries])
147	n/a
148	n/a
149	2	class RuleLine:
150	n/a	"""A rule line is a single "Allow:" (allowance==True) or "Disallow:"
151	1	(allowance==False) followed by a path."""
152	1	def __init__(self, path, allowance):
153	46	if path == '' and not allowance:
154	n/a	# an empty value means allow all
155	1	allowance = True
156	46	self.path = urllib.quote(path)
157	46	self.allowance = allowance
158	n/a
159	1	def applies_to(self, filename):
160	82	return self.path == "*" or filename.startswith(self.path)
161	n/a
162	1	def __str__(self):
163	0	return (self.allowance and "Allow" or "Disallow") + ": " + self.path
164	n/a
165	n/a
166	2	class Entry:
167	1	"""An entry has one or more user-agents and zero or more rulelines"""
168	1	def __init__(self):
169	23	self.useragents = []
170	23	self.rulelines = []
171	n/a
172	1	def __str__(self):
173	0	ret = []
174	0	for agent in self.useragents:
175	0	ret.extend(["User-agent: ", agent, "\n"])
176	0	for line in self.rulelines:
177	0	ret.extend([str(line), "\n"])
178	0	return ''.join(ret)
179	n/a
180	1	def applies_to(self, useragent):
181	n/a	"""check if this entry applies to the specified agent"""
182	n/a	# split the name token and make it lower case
183	46	useragent = useragent.split("/")[0].lower()
184	55	for agent in self.useragents:
185	48	if agent == '*':
186	n/a	# we have the catch-all agent
187	16	return True
188	32	agent = agent.lower()
189	32	if agent in useragent:
190	23	return True
191	7	return False
192	n/a
193	1	def allowance(self, filename):
194	n/a	"""Preconditions:
195	n/a	- our agent applies to this entry
196	n/a	- filename is URL decoded"""
197	89	for line in self.rulelines:
198	82	if line.applies_to(filename):
199	36	return line.allowance
200	7	return True
201	n/a
202	2	class URLopener(urllib.FancyURLopener):
203	1	def __init__(self, *args):
204	2	urllib.FancyURLopener.__init__(self, *args)
205	2	self.errcode = 200
206	n/a
207	1	def prompt_user_passwd(self, host, realm):
208	n/a	## If robots.txt file is accessible only with a password,
209	n/a	## we act as if the file wasn't there.
210	1	return None, None
211	n/a
212	1	def http_error_default(self, url, fp, errcode, errmsg, headers):
213	1	self.errcode = errcode
214	1	return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
215	1	errmsg, headers)