
Python code coverage for Lib/test/test_robotparser.py
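
The file below exercises urllib.robotparser.RobotFileParser, the standard-library parser that reads a robots.txt file and decides whether a given user agent may fetch a URL (plus the Crawl-delay and Request-rate extensions). As a quick orientation before the listing, a minimal usage sketch follows; the rules and URLs in it are illustrative only and do not come from the tests.

import urllib.robotparser

parser = urllib.robotparser.RobotFileParser()
# The tests feed rules directly via parse(); a real crawler would usually
# call parser.set_url('http://example.com/robots.txt') and parser.read().
parser.parse([
    'User-agent: *',
    'Disallow: /private/',
])
print(parser.can_fetch('test_robotparser', '/public/index.html'))   # True
print(parser.can_fetch('test_robotparser', '/private/data.html'))   # False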

import io
import os
import unittest
import urllib.robotparser
from collections import namedtuple
from test import support
from http.server import BaseHTTPRequestHandler, HTTPServer
try:
    import threading
except ImportError:
    threading = None


class BaseRobotTest:
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
    """
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


class BaseRequestRateTest(BaseRobotTest):

    def test_request_rate(self):
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                if self.crawl_delay:
                    self.assertEqual(
                        self.parser.crawl_delay(agent), self.crawl_delay
                    )
                if self.request_rate:
                    self.assertEqual(
                        self.parser.request_rate(agent).requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        self.parser.request_rate(agent).seconds,
                        self.request_rate.seconds
                    )


class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
    """
    agent = 'figtree'
    request_rate = namedtuple('req_rate', 'requests seconds')(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'
    # crawl_delay and request_rate are not asserted for this agent, but the
    # attributes must still be defined so the inherited test_request_rate
    # can skip those checks while the robots.txt is still parsed
    request_rate = None
    crawl_delay = None


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
    """
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
    """
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # also test that Allow and Disallow work well with each other
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
    """
    agent = 'Googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # the order of the User-agent entries should be respected; note that
    # this file is incorrect because "Googlebot" is a substring of
    # "Googlebot-Mobile"
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong. You need
    # to specify the URLs from more specific to more general
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    request_rate = namedtuple('req_rate', 'requests seconds')(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


@unittest.skipUnless(threading, 'threading required for this test')
class PasswordProtectedSiteTestCase(unittest.TestCase):

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @support.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))


class NetworkTestCase(unittest.TestCase):

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with support.transient_internet(cls.base_url):
            cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)
        self.assertFalse(self.parser.crawl_delay('*'))
        self.assertFalse(self.parser.request_rate('*'))

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)
        self.assertIsNone(parser.crawl_delay('*'))
        self.assertIsNone(parser.request_rate('*'))

if __name__ == '__main__':
    unittest.main()
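
A closing usage note, not part of the module itself: within a CPython checkout this file is normally run through the regression-test runner, for example ./python -m test -v test_robotparser, and the network-dependent cases additionally require the network resource (-u network). An equivalent programmatic sketch using only the unittest API, assuming the test package is importable:

import unittest

# Load and run this module's test classes; verbosity=2 prints one line per test.
suite = unittest.defaultTestLoader.loadTestsFromName('test.test_robotparser')
unittest.TextTestRunner(verbosity=2).run(suite)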