
Python code coverage for Lib/urllib/robotparser.py

1n/a""" robotparser.py
2n/a
3n/a Copyright (C) 2000 Bastian Kleineidam
4n/a
5n/a You can choose between two licenses when using this package:
6n/a 1) GNU GPLv2
7n/a 2) PSF license for Python 2.2
8n/a
9n/a The robots.txt Exclusion Protocol is implemented as specified in
10n/a http://www.robotstxt.org/norobots-rfc.txt
11n/a"""
12n/a
13n/aimport collections
14n/aimport urllib.parse
15n/aimport urllib.request
16n/a
17n/a__all__ = ["RobotFileParser"]
18n/a
class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif err.code >= 400 and err.code < 500:
                self.allow_all = True
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        A user-agent: line does not have to be preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
                elif line[0] == "crawl-delay":
                    if state != 0:
                        # before trying to convert to int we need to make
                        # sure that robots.txt has valid syntax; otherwise
                        # this would crash
                        if line[1].strip().isdigit():
                            entry.delay = int(line[1])
                        state = 2
                elif line[0] == "request-rate":
                    if state != 0:
                        numbers = line[1].split('/')
                        # check if all values are sane
                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
                                and numbers[1].strip().isdigit()):
                            req_rate = collections.namedtuple('req_rate',
                                                              'requests seconds')
                            # record the parsed rate as a namedtuple instance
                            # on this entry
                            entry.req_rate = req_rate(int(numbers[0]),
                                                      int(numbers[1]))
                        state = 2
        if state == 2:
            self._add_entry(entry)

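    # Example: given a hypothetical robots.txt such as
    #
    #     User-agent: *
    #     Crawl-delay: 5
    #     Request-rate: 10/60
    #     Disallow: /private/
    #     Allow: /private/public.html
    #
    # parse() turns each blank-line-separated block into one Entry; a block
    # whose user-agent is "*" becomes default_entry rather than being
    # appended to self.entries.
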
    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
                                       parsed_url.params, parsed_url.query,
                                       parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def crawl_delay(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
        # fall back to the catch-all ("*") entry, if any
        if self.default_entry:
            return self.default_entry.delay
        return None

    def request_rate(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
        # fall back to the catch-all ("*") entry, if any
        if self.default_entry:
            return self.default_entry.req_rate
        return None

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []
        self.delay = None
        self.req_rate = None

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
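
A minimal usage sketch of the module above; the host, paths and user-agent
string are placeholders, not values taken from this file:

    import urllib.robotparser

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url("https://example.com/robots.txt")   # placeholder URL
    rp.read()                                      # fetch and parse robots.txt

    # Ask whether a hypothetical crawler may fetch a given URL.
    print(rp.can_fetch("ExampleBot", "https://example.com/private/page.html"))

    # Politeness hints, if the file declared them (None otherwise).
    print(rp.crawl_delay("ExampleBot"))
    print(rp.request_rate("ExampleBot"))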