» Core Development > Code coverage > Lib/packaging/pypi/simple.py

Python code coverage for Lib/packaging/pypi/simple.py

# | count | content
1n/a"""Spider using the screen-scraping "simple" PyPI API.
2n/a
3n/aThis module contains the class Crawler, a simple spider that
4n/acan be used to find and retrieve distributions from a project index
5n/a(like the Python Package Index), using its so-called simple API (see
6n/areference implementation available at http://pypi.python.org/simple/).
7n/a"""
8n/a
9n/aimport http.client
10n/aimport re
11n/aimport socket
12n/aimport sys
13n/aimport urllib.request
14n/aimport urllib.parse
15n/aimport urllib.error
16n/aimport os
17n/a
18n/afrom fnmatch import translate
19n/afrom functools import wraps
20n/afrom packaging import logger
21n/afrom packaging.metadata import Metadata
22n/afrom packaging.version import get_version_predicate
23n/afrom packaging import __version__ as packaging_version
24n/afrom packaging.pypi.base import BaseClient
25n/afrom packaging.pypi.dist import (ReleasesList, EXTENSIONS,
26n/a get_infos_from_url, MD5_HASH)
27n/afrom packaging.pypi.errors import (PackagingPyPIError, DownloadError,
28n/a UnableToDownload, CantParseArchiveName,
29n/a ReleaseNotFound, ProjectNotFound)
30n/afrom packaging.pypi.mirrors import get_mirrors
31n/a
# Names exported by ``from packaging.pypi.simple import *``.
__all__ = ['Crawler', 'DEFAULT_SIMPLE_INDEX_URL']

# -- Constants -----------------------------------------------
# Index queried when the caller does not supply one.
DEFAULT_SIMPLE_INDEX_URL = "http://a.pypi.python.org/simple/"
# Host glob patterns that may be followed; "*" matches every host.
DEFAULT_HOSTS = ("*",)
# Default socket timeout, in seconds.
SOCKET_TIMEOUT = 15
# User-Agent header value: interpreter major.minor plus packaging version.
USER_AGENT = "Python-urllib/%s.%s packaging/%s" % (
    tuple(sys.version_info[:2]) + (packaging_version,))
40n/a
# -- Regexps -------------------------------------------------
# All patterns are raw strings so that regex escapes such as ``\s`` are not
# interpreted (and warned about) as string escape sequences.

# URL fragment of the form ``egg=<name>``; group 1 is the name.
EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.]+)$')
# ``href`` attribute, quoted or not; group 1 is the attribute value.
HREF = re.compile(r"""href\s*=\s*['"]?([^'"> ]+)""", re.I)
# Bound ``match`` of a pattern recognising a leading ``scheme:`` prefix.
URL_SCHEME = re.compile(r'([-+.a-z0-9]{2,}):', re.I).match

# This pattern matches a character entity reference (a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference).
ENTITY_SUB = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?').sub
# Tag carrying a ``rel`` attribute; group 1 is the tag content, group 2 the
# value of ``rel``.
REL = re.compile(r"""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)
50n/a
51n/a
def socket_timeout(timeout=SOCKET_TIMEOUT):
    """Decorator to add a socket timeout when requesting pages on PyPI.

    The decorated method runs with the process-wide default socket timeout
    set to ``self._timeout`` when the instance defines it, otherwise to the
    *timeout* given to the decorator.  The previous default timeout is
    restored afterwards, even if the method raises.

    :param timeout: fallback timeout in seconds, used when the instance has
                    no ``_timeout`` attribute.
    """
    def wrapper(func):
        @wraps(func)
        def wrapped(self, *args, **kwargs):
            # Do not rebind ``timeout`` here: assigning to it would make the
            # name local to ``wrapped`` and raise UnboundLocalError whenever
            # the instance lacks ``_timeout``.
            effective_timeout = getattr(self, "_timeout", timeout)
            old_timeout = socket.getdefaulttimeout()
            socket.setdefaulttimeout(effective_timeout)
            try:
                return func(self, *args, **kwargs)
            finally:
                socket.setdefaulttimeout(old_timeout)
        return wrapped
    return wrapper
68n/a
69n/a
def with_mirror_support():
    """Decorator that retries a method call across the configured mirrors.

    Whenever the decorated method raises DownloadError, the call is retried:
    first against the same index until ``_mirrors_max_tries`` retries have
    been spent, then after switching to the next mirror.  The cached
    ``_projects`` are cleared before every retry.  UnableToDownload is raised
    once no mirror is left to switch to.
    """
    def wrapper(func):
        @wraps(func)
        def wrapped(self, *args, **kwargs):
            while True:
                try:
                    return func(self, *args, **kwargs)
                except DownloadError:
                    if self._mirrors_tries < self._mirrors_max_tries:
                        # still allowed to retry on the current index
                        self._mirrors_tries += 1
                    else:
                        # retries exhausted: move on to the next index_url
                        try:
                            self._switch_to_next_mirror()
                        except KeyError:
                            raise UnableToDownload("Tried all mirrors")
                    self._projects.clear()
        return wrapped
    return wrapper
90n/a
91n/a
class Crawler(BaseClient):
    """Provides useful tools to request the Python Package Index simple API.

    You can specify both mirrors and mirrors_url, but mirrors_url will only be
    used if mirrors is set to None.

    :param index_url: the url of the simple index to search on.
    :param prefer_final: if the version is not mentioned, and the last
                         version is not a "final" one (alpha, beta, etc.),
                         pick up the last final version.
    :param prefer_source: if the distribution type is not mentioned, pick up
                          the source one if available.
    :param follow_externals: tell if following external links is needed or
                             not. Default is False.
    :param hosts: a list of hosts allowed to be processed while using
                  follow_externals=True. Default behavior is to follow all
                  hosts.
    :param mirrors_url: the url to look on for DNS records giving mirror
                        addresses.
    :param mirrors: a list of mirrors (see PEP 381).
    :param timeout: time in seconds after which a request is considered to
                    have timed out.
    :param mirrors_max_tries: number of times to retry a failing request
                              before switching to another mirror.
    """

    def __init__(self, index_url=DEFAULT_SIMPLE_INDEX_URL, prefer_final=False,
                 prefer_source=True, hosts=DEFAULT_HOSTS,
                 follow_externals=False, mirrors_url=None, mirrors=None,
                 timeout=SOCKET_TIMEOUT, mirrors_max_tries=0):
        super(Crawler, self).__init__(prefer_final, prefer_source)
        self.follow_externals = follow_externals

        # mirroring attributes.
        parsed = urllib.parse.urlparse(index_url)
        self.scheme = parsed[0]
        # local indexes use the platform path separator as terminator
        if self.scheme == 'file':
            ender = os.path.sep
        else:
            ender = '/'
        if not index_url.endswith(ender):
            index_url += ender
        # if no mirrors are defined, use the method described in PEP 381.
        if mirrors is None:
            mirrors = get_mirrors(mirrors_url)
        self._mirrors = set(mirrors)
        self._mirrors_used = set()
        self.index_url = index_url
        self._mirrors_max_tries = mirrors_max_tries
        self._mirrors_tries = 0
        self._timeout = timeout

        # create a regexp to match all given hosts
        self._allowed_hosts = re.compile('|'.join(map(translate, hosts))).match

        # we keep an index of pages we have processed, in order to avoid
        # scanning them multiple times (eg. if there are multiple pages
        # pointing to one)
        self._processed_urls = []
        self._projects = {}

    @with_mirror_support()
    def search_projects(self, name=None, **kwargs):
        """Search the index for projects containing the given name.

        ``*`` in *name* is a wildcard.  A name without wildcard is matched
        loosely (surrounded by wildcards).  When *name* is None every project
        listed on the index page matches.

        Return a list with one entry per matching link, each built by
        ``self._get_project`` from the project name.
        """
        if name is None:
            name = ''
        if '*' not in name:
            name = "%s%s%s" % ('*.?', name, '*.?')
        name = name.replace('*', '[^<]*')  # avoid matching end tag
        # the index page is read as bytes, so the pattern must be bytes too
        pattern = ('<a[^>]*>(%s)</a>' % name).encode('utf-8')
        projectname = re.compile(pattern, re.I)

        with self._open_url(self.index_url) as index:
            index_content = index.read()

        return [self._get_project(match.group(1).decode('utf-8'))
                for match in projectname.finditer(index_content)]

    def get_releases(self, requirements, prefer_final=None,
                     force_update=False):
        """Search for releases and return a ReleasesList object containing
        the results.

        Results already stored in ``self._projects`` are returned as is
        unless *force_update* is true.  Raise ProjectNotFound when
        processing the project page registered no release for the project.
        """
        predicate = get_version_predicate(requirements)
        project_key = predicate.name.lower()
        if project_key in self._projects and not force_update:
            return self._projects.get(project_key)
        prefer_final = self._get_prefer_final(prefer_final)
        logger.debug('Reading info on PyPI about %s', predicate.name)
        self._process_index_page(predicate.name)

        if project_key not in self._projects:
            raise ProjectNotFound

        releases = self._projects.get(project_key)
        releases.sort_releases(prefer_final=prefer_final)
        return releases

    def get_release(self, requirements, prefer_final=None):
        """Return only one release that fulfills the given requirements.

        Raise ReleaseNotFound when no release matches.
        """
        predicate = get_version_predicate(requirements)
        releases = self.get_releases(predicate, prefer_final)
        release = releases.get_last(predicate)
        if not release:
            raise ReleaseNotFound("No release matches the given criterias")
        return release

    def get_distributions(self, project_name, version):
        """Return the distributions found on the index for the specific given
        release"""
        # as the default behavior of get_release is to return a release
        # containing the distributions, just alias it.
        return self.get_release("%s (%s)" % (project_name, version))

    def get_metadata(self, project_name, version):
        """Return the release, with its metadata filled in.

        When the release has no metadata yet, one distribution is unpacked
        and its PKG-INFO file is used to build the Metadata object.
        """
        release = self.get_distributions(project_name, version)
        if not release.metadata:
            location = release.get_distribution().unpack()
            pkg_info = os.path.join(location, 'PKG-INFO')
            release.metadata = Metadata(pkg_info)
        return release

    def _switch_to_next_mirror(self):
        """Switch to the next mirror (eg. point self.index_url to the next
        mirror url).

        The new index url always ends with "/simple/".

        Raise a KeyError if all mirrors have been tried.
        """
        self._mirrors_used.add(self.index_url)
        # set.pop() raises KeyError once the set of mirrors is exhausted
        index_url = self._mirrors.pop()
        # XXX use urllib.parse for a real check of missing scheme part
        if not index_url.startswith(("http://", "https://", "file://")):
            index_url = "http://%s" % index_url

        # normalise the tail so that neither ".../simple" (missing slash) nor
        # ".../simple/" (would get "/simple/" appended twice) is mishandled
        index_url = index_url.rstrip("/")
        if not index_url.endswith("/simple"):
            index_url += "/simple"

        self.index_url = index_url + "/"

    def _is_browsable(self, url):
        """Tell if the given URL can be browsed or not.

        It uses the follow_externals and the hosts list to tell if the given
        url is browsable or not.
        """
        # if index_url is contained in the given URL, we are browsing the
        # index, and it's always "browsable".
        # local files are always considered browsable resources
        if self.index_url in url or urllib.parse.urlparse(url)[0] == "file":
            return True
        if not self.follow_externals:
            return False
        netloc = urllib.parse.urlparse(url)[1]
        return bool(self._allowed_hosts(netloc))

    def _is_distribution(self, link):
        """Tell if the given URL matches to a distribution name or not.
        """
        #XXX find a better way to check that links are distributions
        # Using a regexp ?
        return any(ext in link for ext in EXTENSIONS)

    def _register_release(self, release=None, release_info=None):
        """Register a new release.

        Both a release or a dict of release_info can be provided, the preferred
        way (eg. the quicker) is the dict one.  The release_info dict must
        hold the 'name', 'version' and 'dist_type' keys; remaining items are
        passed to ``add_release`` as keyword arguments.  The given dict is
        not modified.

        Return the list of existing releases for the given project.
        """
        # work on a private copy: keys are popped from it below
        release_info = dict(release_info) if release_info else {}

        # Check if the project already has a list of releases (referring to
        # the project name). If not, create a new release list.
        # Then, add the release to the list.
        name = release.name if release else release_info['name']
        project_key = name.lower()
        if project_key not in self._projects:
            self._projects[project_key] = ReleasesList(name,
                                                       index=self._index)
        releases = self._projects[project_key]

        if release:
            releases.add_release(release=release)
        else:
            release_info.pop('name')
            version = release_info.pop('version')
            dist_type = release_info.pop('dist_type')
            releases.add_release(version, dist_type, **release_info)
        return releases

    def _process_url(self, url, project_name=None, follow_links=True):
        """Process an url and search for distributions packages.

        For each URL found, if it's a download, register a release for it.
        If it's a homepage and we can follow links, process it too.

        :param url: the url to process
        :param project_name: the project name we are searching for.
        :param follow_links: Do not want to follow links more than from one
                             level. This parameter tells if we want to follow
                             the links we find (eg. run recursively this
                             method on it)
        """
        with self._open_url(url) as f:
            base_url = f.url
            if url in self._processed_urls:
                return
            self._processed_urls.append(url)
            link_matcher = self._get_link_matcher(url)
            for link, is_download in link_matcher(f.read().decode(),
                                                  base_url):
                if link in self._processed_urls:
                    continue
                if self._is_distribution(link) or is_download:
                    self._processed_urls.append(link)
                    # it's a distribution, so register a release for it
                    try:
                        infos = get_infos_from_url(
                            link, project_name,
                            is_external=self.index_url not in url)
                    except CantParseArchiveName as e:
                        logger.warning(
                            "version has not been parsed: %s", e)
                    else:
                        self._register_release(release_info=infos)
                elif self._is_browsable(link) and follow_links:
                    # only one level of links is followed
                    self._process_url(link, project_name,
                                      follow_links=False)

    def _get_link_matcher(self, url):
        """Returns the right link matcher function of the given url
        """
        if self.index_url in url:
            return self._simple_link_matcher
        return self._default_link_matcher

    def _get_full_url(self, url, base_url):
        """Return *url* with HTML entities decoded, resolved against
        *base_url*."""
        return urllib.parse.urljoin(base_url, self._htmldecode(url))

    def _simple_link_matcher(self, content, base_url):
        """Yield all links with a rel="download" or rel="homepage".

        This matches the simple index requirements for matching links.
        Links matching MD5_HASH are always yielded as downloads.  Homepage
        links are only yielded when they are browsable (see _is_browsable).

        :param content: the content of the page we want to parse
        :param base_url: the url of this page.
        """
        for href in HREF.finditer(content):
            url = self._get_full_url(href.group(1), base_url)
            if MD5_HASH.match(url):
                yield (url, True)

        for rel_match in REL.finditer(content):
            # search for rel links.
            tag, rel = rel_match.groups()
            rels = [s.strip() for s in rel.lower().split(',')]
            is_download = 'download' in rels
            if not (is_download or 'homepage' in rels):
                continue
            for href in HREF.finditer(tag):
                url = self._get_full_url(href.group(1), base_url)
                if is_download or self._is_browsable(url):
                    # yield a (url, is_download) pair
                    yield (url, is_download)

    def _default_link_matcher(self, content, base_url):
        """Yield all browsable links found on the page.
        """
        for href in HREF.finditer(content):
            url = self._get_full_url(href.group(1), base_url)
            if self._is_browsable(url):
                yield (url, False)

    @with_mirror_support()
    def _process_index_page(self, name):
        """Find and process a PyPI page for the given project name.

        :param name: the name of the project to find the page
        """
        # Browse and index the content of the given PyPI page.
        ender = os.path.sep if self.scheme == 'file' else '/'
        url = self.index_url + name + ender
        self._process_url(url, name)

    @socket_timeout()
    def _open_url(self, url):
        """Open a urllib request, handling HTTP authentication, and local
        files support.

        Credentials embedded in an http(s) url (``user:pass@host``) are sent
        as a Basic ``Authorization`` header.  An HTTPError is returned, not
        raised.  Raise PackagingPyPIError for invalid urls and DownloadError
        for the other network failures.
        """
        import base64

        scheme, netloc, path, params, query, frag = urllib.parse.urlparse(url)

        # authentication stuff: split "user:password@host" on the last "@"
        auth = host = None
        if scheme in ('http', 'https'):
            user, delim, host = netloc.rpartition('@')
            if delim:
                auth = user

        # add index.html automatically for filesystem paths
        if scheme == 'file' and url.endswith(os.path.sep):
            url += "index.html"

        # add authorization headers if auth is provided
        if auth:
            credentials = urllib.parse.unquote(auth).encode('utf-8')
            auth = "Basic " + base64.b64encode(credentials).decode('ascii')
            # the request itself is made on the url without credentials
            new_url = urllib.parse.urlunparse((
                scheme, host, path, params, query, frag))
            request = urllib.request.Request(new_url)
            request.add_header("Authorization", auth)
        else:
            request = urllib.request.Request(url)
        request.add_header('User-Agent', USER_AGENT)
        try:
            fp = urllib.request.urlopen(request)
        except (ValueError, http.client.InvalidURL) as v:
            msg = ' '.join([str(arg) for arg in v.args])
            raise PackagingPyPIError('%s %s' % (url, msg)) from v
        except urllib.error.HTTPError as v:
            # handed back to the caller, which reads it like a response
            return v
        except urllib.error.URLError as v:
            raise DownloadError(
                "Download error for %s: %s" % (url, v.reason)) from v
        except http.client.BadStatusLine as v:
            raise DownloadError('%s returned a bad status line. '
                                'The server might be down, %s'
                                % (url, v.line)) from v
        except http.client.HTTPException as v:
            raise DownloadError(
                "Download error for %s: %s" % (url, v)) from v
        except socket.timeout as v:
            raise DownloadError("The server timeouted") from v

        if auth:
            # Put authentication info back into request URL if same host,
            # so that links found on the page will work
            s2, h2, path2, param2, query2, frag2 = \
                urllib.parse.urlparse(fp.url)
            if s2 == scheme and h2 == host:
                fp.url = urllib.parse.urlunparse(
                    (s2, netloc, path2, param2, query2, frag2))
        return fp

    def _decode_entity(self, match):
        """Return the character for an entity reference matched by
        ENTITY_SUB.

        References that cannot be decoded (unknown name, code point out of
        range) are returned unchanged, so that text such as the ``&b`` in
        ``?a=1&b=2`` survives decoding.
        """
        what = match.group(1)
        try:
            if what.startswith('#x'):
                codepoint = int(what[2:], 16)
            elif what.startswith('#'):
                codepoint = int(what[1:])
            else:
                from html.entities import name2codepoint
                codepoint = name2codepoint[what]
            return chr(codepoint)
        except (KeyError, ValueError, OverflowError):
            return match.group(0)

    def _htmldecode(self, text):
        """Decode HTML entities in the given text."""
        return ENTITY_SUB(self._decode_entity, text)