ยปCore Development>Code coverage>Lib/robotparser.py

Python code coverage for Lib/robotparser.py

#countcontent
1n/a""" robotparser.py
2n/a
3n/a Copyright (C) 2000 Bastian Kleineidam
4n/a
5n/a You can choose between two licenses when using this package:
6n/a 1) GNU GPLv2
7n/a 2) PSF license for Python 2.2
8n/a
9n/a The robots.txt Exclusion Protocol is implemented as specified in
10n/a http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
111"""
121import urlparse
131import urllib
14n/a
151__all__ = ["RobotFileParser"]
16n/a
17n/a
182class RobotFileParser:
19n/a """ This class provides a set of methods to read, parse and answer
20n/a questions about a single robots.txt file.
21n/a
221 """
23n/a
241 def __init__(self, url=''):
2515 self.entries = []
2615 self.default_entry = None
2715 self.disallow_all = False
2815 self.allow_all = False
2915 self.set_url(url)
3015 self.last_checked = 0
31n/a
321 def mtime(self):
33n/a """Returns the time the robots.txt file was last fetched.
34n/a
35n/a This is useful for long-running web spiders that need to
36n/a check for new robots.txt files periodically.
37n/a
38n/a """
390 return self.last_checked
40n/a
411 def modified(self):
42n/a """Sets the time the robots.txt file was last fetched to the
43n/a current time.
44n/a
45n/a """
460 import time
470 self.last_checked = time.time()
48n/a
491 def set_url(self, url):
50n/a """Sets the URL referring to a robots.txt file."""
5116 self.url = url
5216 self.host, self.path = urlparse.urlparse(url)[1:3]
53n/a
541 def read(self):
55n/a """Reads the robots.txt URL and feeds it to the parser."""
562 opener = URLopener()
572 f = opener.open(self.url)
5857 lines = [line.strip() for line in f]
592 f.close()
602 self.errcode = opener.errcode
612 if self.errcode in (401, 403):
621 self.disallow_all = True
631 elif self.errcode >= 400:
640 self.allow_all = True
651 elif self.errcode == 200 and lines:
661 self.parse(lines)
67n/a
681 def _add_entry(self, entry):
699 if "*" in entry.useragents:
70n/a # the default entry is considered last
712 self.default_entry = entry
72n/a else:
737 self.entries.append(entry)
74n/a
751 def parse(self, lines):
76n/a """parse the input lines from a robots.txt file.
77n/a We allow that a user-agent: line is not preceded by
78n/a one or more blank lines."""
79n/a # states:
80n/a # 0: start state
81n/a # 1: saw user-agent line
82n/a # 2: saw an allow or disallow line
8314 state = 0
8414 linenumber = 0
8514 entry = Entry()
86n/a
87118 for line in lines:
88104 linenumber += 1
89104 if not line:
905 if state == 1:
910 entry = Entry()
920 state = 0
935 elif state == 2:
944 self._add_entry(entry)
954 entry = Entry()
964 state = 0
97n/a # remove optional comment and strip line
98104 i = line.find('#')
99104 if i >= 0:
10012 line = line[:i]
101104 line = line.strip()
102104 if not line:
10334 continue
10470 line = line.split(':', 1)
10570 if len(line) == 2:
10670 line[0] = line[0].strip().lower()
10770 line[1] = urllib.unquote(line[1].strip())
10870 if line[0] == "user-agent":
10924 if state == 2:
1105 self._add_entry(entry)
1115 entry = Entry()
11224 entry.useragents.append(line[1])
11324 state = 1
11446 elif line[0] == "disallow":
11539 if state != 0:
11639 entry.rulelines.append(RuleLine(line[1], False))
11739 state = 2
1187 elif line[0] == "allow":
1197 if state != 0:
1207 entry.rulelines.append(RuleLine(line[1], True))
1217 state = 2
12214 if state == 2:
12313 self.entries.append(entry)
124n/a
125n/a
1261 def can_fetch(self, useragent, url):
127n/a """using the parsed robots.txt decide if useragent can fetch url"""
12844 if self.disallow_all:
1291 return False
13043 if self.allow_all:
1310 return True
132n/a # search for given user agent matches
133n/a # the first match counts
13443 url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
13550 for entry in self.entries:
13646 if entry.applies_to(useragent):
13739 return entry.allowance(url)
138n/a # try the default entry last
1394 if self.default_entry:
1404 return self.default_entry.allowance(url)
141n/a # agent not found ==> access granted
1420 return True
143n/a
144n/a
1451 def __str__(self):
1460 return ''.join([str(entry) + "\n" for entry in self.entries])
147n/a
148n/a
1492class RuleLine:
150n/a """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
1511 (allowance==False) followed by a path."""
1521 def __init__(self, path, allowance):
15346 if path == '' and not allowance:
154n/a # an empty value means allow all
1551 allowance = True
15646 self.path = urllib.quote(path)
15746 self.allowance = allowance
158n/a
1591 def applies_to(self, filename):
16082 return self.path == "*" or filename.startswith(self.path)
161n/a
1621 def __str__(self):
1630 return (self.allowance and "Allow" or "Disallow") + ": " + self.path
164n/a
165n/a
1662class Entry:
1671 """An entry has one or more user-agents and zero or more rulelines"""
1681 def __init__(self):
16923 self.useragents = []
17023 self.rulelines = []
171n/a
1721 def __str__(self):
1730 ret = []
1740 for agent in self.useragents:
1750 ret.extend(["User-agent: ", agent, "\n"])
1760 for line in self.rulelines:
1770 ret.extend([str(line), "\n"])
1780 return ''.join(ret)
179n/a
1801 def applies_to(self, useragent):
181n/a """check if this entry applies to the specified agent"""
182n/a # split the name token and make it lower case
18346 useragent = useragent.split("/")[0].lower()
18455 for agent in self.useragents:
18548 if agent == '*':
186n/a # we have the catch-all agent
18716 return True
18832 agent = agent.lower()
18932 if agent in useragent:
19023 return True
1917 return False
192n/a
1931 def allowance(self, filename):
194n/a """Preconditions:
195n/a - our agent applies to this entry
196n/a - filename is URL decoded"""
19789 for line in self.rulelines:
19882 if line.applies_to(filename):
19936 return line.allowance
2007 return True
201n/a
2022class URLopener(urllib.FancyURLopener):
2031 def __init__(self, *args):
2042 urllib.FancyURLopener.__init__(self, *args)
2052 self.errcode = 200
206n/a
2071 def prompt_user_passwd(self, host, realm):
208n/a ## If robots.txt file is accessible only with a password,
209n/a ## we act as if the file wasn't there.
2101 return None, None
211n/a
2121 def http_error_default(self, url, fp, errcode, errmsg, headers):
2131 self.errcode = errcode
2141 return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
2151 errmsg, headers)