
/env/lib/python3.3/site-packages/pip/index.py

https://github.com/wantsomechocolate/MosaicMaker
  1. """Routines related to PyPI, indexes"""
  2. import sys
  3. import os
  4. import re
  5. import gzip
  6. import mimetypes
  7. import posixpath
  8. import pkg_resources
  9. import random
  10. import socket
  11. import ssl
  12. import string
  13. import zlib
  14. try:
  15. import threading
  16. except ImportError:
  17. import dummy_threading as threading
  18. from pip.log import logger
  19. from pip.util import Inf, normalize_name, splitext, is_prerelease
  20. from pip.exceptions import DistributionNotFound, BestVersionAlreadyInstalled,\
  21. InstallationError
  22. from pip.backwardcompat import (WindowsError, BytesIO,
  23. Queue, urlparse,
  24. URLError, HTTPError, u,
  25. product, url2pathname,
  26. Empty as QueueEmpty)
  27. from pip.backwardcompat import CertificateError
  28. from pip.download import urlopen, path_to_url2, url_to_path, geturl, Urllib2HeadRequest
  29. from pip.wheel import Wheel, wheel_ext, wheel_setuptools_support, setuptools_requirement
  30. from pip.pep425tags import supported_tags, supported_tags_noarch, get_platform
  31. from pip.vendor import html5lib
  32. __all__ = ['PackageFinder']
  33. DEFAULT_MIRROR_HOSTNAME = "last.pypi.python.org"
  34. class PackageFinder(object):
  35. """This finds packages.
  36. This is meant to match easy_install's technique for looking for
  37. packages, by reading pages and looking for appropriate links
  38. """

    def __init__(self, find_links, index_urls,
            use_mirrors=False, mirrors=None, main_mirror_url=None,
            use_wheel=False, allow_external=[], allow_insecure=[],
            allow_all_external=False, allow_all_insecure=False,
            allow_all_prereleases=False):
        self.find_links = find_links
        self.index_urls = index_urls
        self.dependency_links = []
        self.cache = PageCache()
        # These are boring links that have already been logged somehow:
        self.logged_links = set()

        if use_mirrors:
            self.mirror_urls = self._get_mirror_urls(mirrors, main_mirror_url)
            logger.info('Using PyPI mirrors: %s' % ', '.join(self.mirror_urls))
        else:
            self.mirror_urls = []

        self.use_wheel = use_wheel

        # Do we allow (safe and verifiable) externally hosted files?
        self.allow_external = set(normalize_name(n) for n in allow_external)

        # Which names are allowed to install insecure and unverifiable files?
        self.allow_insecure = set(normalize_name(n) for n in allow_insecure)

        # Do we allow all (safe and verifiable) externally hosted files?
        self.allow_all_external = allow_all_external

        # Do we allow unsafe and unverifiable files?
        self.allow_all_insecure = allow_all_insecure

        # Stores if we ignored any external links so that we can instruct
        # end users how to install them if no distributions are available
        self.need_warn_external = False

        # Stores if we ignored any unsafe links so that we can instruct
        # end users how to install them if no distributions are available
        self.need_warn_insecure = False

        # Do we want to allow _all_ pre-releases?
        self.allow_all_prereleases = allow_all_prereleases

    @property
    def use_wheel(self):
        return self._use_wheel

    @use_wheel.setter
    def use_wheel(self, value):
        self._use_wheel = value
        if self._use_wheel and not wheel_setuptools_support():
            raise InstallationError("pip's wheel support requires %s." % setuptools_requirement)

    def add_dependency_links(self, links):
        ## FIXME: this shouldn't be a global list; it should only
        ## apply to requirements of the package that specifies the
        ## dependency_links value
        ## FIXME: also, we should track comes_from (i.e., use Link)
        self.dependency_links.extend(links)

    def _sort_locations(self, locations):
        """
        Sort locations into "files" (archives) and "urls", and return
        a pair of lists (files, urls)
        """
        files = []
        urls = []

        # puts the url for the given file path into the appropriate list
        def sort_path(path):
            url = path_to_url2(path)
            if mimetypes.guess_type(url, strict=False)[0] == 'text/html':
                urls.append(url)
            else:
                files.append(url)

        for url in locations:
            is_local_path = os.path.exists(url)
            is_file_url = url.startswith('file:')
            is_find_link = url in self.find_links

            if is_local_path or is_file_url:
                if is_local_path:
                    path = url
                else:
                    path = url_to_path(url)

                if is_find_link and os.path.isdir(path):
                    path = os.path.realpath(path)
                    for item in os.listdir(path):
                        sort_path(os.path.join(path, item))
                elif is_file_url and os.path.isdir(path):
                    urls.append(url)
                elif os.path.isfile(path):
                    sort_path(path)
            else:
                urls.append(url)

        return files, urls

    def _link_sort_key(self, link_tuple):
        """
        Function used to generate link sort key for link tuples.
        The greater the return value, the more preferred it is.
        If not finding wheels, then sorted by version only.
        If finding wheels, then the sort order is by version, then:
          1. existing installs
          2. wheels ordered via Wheel.support_index_min()
          3. source archives
        Note: it was considered to embed this logic into the Link
              comparison operators, but then different sdist links
              with the same version, would have to be considered equal
        """
        parsed_version, link, _ = link_tuple
        if self.use_wheel:
            support_num = len(supported_tags)
            if link == InfLink:  # existing install
                pri = 1
            elif link.wheel:
                # all wheel links are known to be supported at this stage
                pri = -(link.wheel.support_index_min())
            else:  # sdist
                pri = -(support_num)
            return (parsed_version, pri)
        else:
            return parsed_version

    def _sort_versions(self, applicable_versions):
        """
        Bring the latest version (and wheels) to the front, but maintain the
        existing ordering as secondary. See the docstring for `_link_sort_key`
        for details. This function is isolated for easier unit testing.
        """
        return sorted(applicable_versions, key=self._link_sort_key, reverse=True)

    def find_requirement(self, req, upgrade):

        def mkurl_pypi_url(url):
            loc = posixpath.join(url, url_name)
            # For maximum compatibility with easy_install, ensure the path
            # ends in a trailing slash. Although this isn't in the spec
            # (and PyPI can handle it without the slash) some other index
            # implementations might break if they relied on easy_install's behavior.
            if not loc.endswith('/'):
                loc = loc + '/'
            return loc

        url_name = req.url_name

        # Only check main index if index URL is given:
        main_index_url = None
        if self.index_urls:
            # Check that we have the url_name correctly spelled:
            main_index_url = Link(mkurl_pypi_url(self.index_urls[0]), trusted=True)

            # This will also cache the page, so it's okay that we get it again later:
            page = self._get_page(main_index_url, req)
            if page is None:
                url_name = self._find_url_name(Link(self.index_urls[0], trusted=True), url_name, req) or req.url_name

        # Combine index URLs with mirror URLs here to allow
        # adding more index URLs from requirements files
        all_index_urls = self.index_urls + self.mirror_urls

        if url_name is not None:
            locations = [
                mkurl_pypi_url(url)
                for url in all_index_urls] + self.find_links
        else:
            locations = list(self.find_links)
        for version in req.absolute_versions:
            if url_name is not None and main_index_url is not None:
                locations = [
                    posixpath.join(main_index_url.url, version)] + locations

        file_locations, url_locations = self._sort_locations(locations)
        _flocations, _ulocations = self._sort_locations(self.dependency_links)
        file_locations.extend(_flocations)

        # We trust every url that the user has given us whether it was given
        # via --index-url, --use-mirrors/--mirror, or --find-links or a
        # default option thereof
        locations = [Link(url, trusted=True) for url in url_locations]

        # We explicitly do not trust links that came from dependency_links
        locations.extend([Link(url) for url in _ulocations])

        logger.debug('URLs to search for versions for %s:' % req)
        for location in locations:
            logger.debug('* %s' % location)

        found_versions = []
        found_versions.extend(
            self._package_versions(
                # We trust every directly linked archive in find_links
                [Link(url, '-f', trusted=True) for url in self.find_links], req.name.lower()))

        page_versions = []
        for page in self._get_pages(locations, req):
            logger.debug('Analyzing links from page %s' % page.url)
            logger.indent += 2
            try:
                page_versions.extend(self._package_versions(page.links, req.name.lower()))
            finally:
                logger.indent -= 2

        dependency_versions = list(self._package_versions(
            [Link(url) for url in self.dependency_links], req.name.lower()))
        if dependency_versions:
            logger.info('dependency_links found: %s' % ', '.join([link.url for parsed, link, version in dependency_versions]))

        file_versions = list(self._package_versions(
            [Link(url) for url in file_locations], req.name.lower()))

        if not found_versions and not page_versions and not dependency_versions and not file_versions:
            logger.fatal('Could not find any downloads that satisfy the requirement %s' % req)

            if self.need_warn_external:
                logger.warn("Some externally hosted files were ignored (use "
                            "--allow-external %s to allow)." % req.name)

            if self.need_warn_insecure:
                logger.warn("Some insecure and unverifiable files were ignored"
                            " (use --allow-insecure %s to allow)." % req.name)

            raise DistributionNotFound('No distributions at all found for %s' % req)

        installed_version = []
        if req.satisfied_by is not None:
            installed_version = [(req.satisfied_by.parsed_version, InfLink, req.satisfied_by.version)]

        if file_versions:
            file_versions.sort(reverse=True)
            logger.info('Local files found: %s' % ', '.join([url_to_path(link.url) for parsed, link, version in file_versions]))

        # This is an intentional priority ordering
        all_versions = installed_version + file_versions + found_versions + page_versions + dependency_versions

        applicable_versions = []
        for (parsed_version, link, version) in all_versions:
            if version not in req.req:
                logger.info("Ignoring link %s, version %s doesn't match %s"
                            % (link, version, ','.join([''.join(s) for s in req.req.specs])))
                continue
            elif is_prerelease(version) and not (self.allow_all_prereleases or req.prereleases):
                # If this version isn't the already installed one, then
                # ignore it if it's a pre-release.
                if link is not InfLink:
                    logger.info("Ignoring link %s, version %s is a pre-release (use --pre to allow)." % (link, version))
                    continue
            applicable_versions.append((parsed_version, link, version))

        applicable_versions = self._sort_versions(applicable_versions)

        existing_applicable = bool([link for parsed_version, link, version in applicable_versions if link is InfLink])

        if not upgrade and existing_applicable:
            if applicable_versions[0][1] is InfLink:
                logger.info('Existing installed version (%s) is most up-to-date and satisfies requirement'
                            % req.satisfied_by.version)
            else:
                logger.info('Existing installed version (%s) satisfies requirement (most up-to-date version is %s)'
                            % (req.satisfied_by.version, applicable_versions[0][2]))
            return None

        if not applicable_versions:
            logger.fatal('Could not find a version that satisfies the requirement %s (from versions: %s)'
                         % (req, ', '.join([version for parsed_version, link, version in all_versions])))

            if self.need_warn_external:
                logger.warn("Some externally hosted files were ignored (use "
                            "--allow-external to allow).")

            if self.need_warn_insecure:
                logger.warn("Some insecure and unverifiable files were ignored"
                            " (use --allow-insecure %s to allow)." % req.name)

            raise DistributionNotFound('No distributions matching the version for %s' % req)

        if applicable_versions[0][1] is InfLink:
            # We have an existing version, and it's the best version
            logger.info('Installed version (%s) is most up-to-date (past versions: %s)'
                        % (req.satisfied_by.version, ', '.join([version for parsed_version, link, version in applicable_versions[1:]]) or 'none'))
            raise BestVersionAlreadyInstalled

        if len(applicable_versions) > 1:
            logger.info('Using version %s (newest of versions: %s)' %
                        (applicable_versions[0][2], ', '.join([version for parsed_version, link, version in applicable_versions])))

        selected_version = applicable_versions[0][1]

        # TODO: Remove after 1.4 has been released
        if (selected_version.internal is not None
                and not selected_version.internal):
            logger.warn("You are installing an externally hosted file. Future "
                        "versions of pip will default to disallowing "
                        "externally hosted files.")

        if (selected_version.verifiable is not None
                and not selected_version.verifiable):
            logger.warn("You are installing a potentially insecure and "
                        "unverifiable file. Future versions of pip will "
                        "default to disallowing insecure files.")

        return selected_version

    def _find_url_name(self, index_url, url_name, req):
        """Finds the true URL name of a package, when the given name isn't quite correct.
        This is usually used to implement case-insensitivity."""
        if not index_url.url.endswith('/'):
            # Vaguely part of the PyPI API... weird but true.
            ## FIXME: bad to modify this?
            index_url.url += '/'
        page = self._get_page(index_url, req)
        if page is None:
            logger.fatal('Cannot fetch index base URL %s' % index_url)
            return
        norm_name = normalize_name(req.url_name)
        for link in page.links:
            base = posixpath.basename(link.path.rstrip('/'))
            if norm_name == normalize_name(base):
                logger.notify('Real name of requirement %s is %s' % (url_name, base))
                return base
        return None

    def _get_pages(self, locations, req):
        """Yields (page, page_url) from the given locations, skipping
        locations that have errors, and adding download/homepage links"""
        pending_queue = Queue()
        for location in locations:
            pending_queue.put(location)
        done = []
        seen = set()
        threads = []
        for i in range(min(10, len(locations))):
            t = threading.Thread(target=self._get_queued_page, args=(req, pending_queue, done, seen))
            t.setDaemon(True)
            threads.append(t)
            t.start()
        for t in threads:
            t.join()
        return done

    _log_lock = threading.Lock()

    def _get_queued_page(self, req, pending_queue, done, seen):
        while 1:
            try:
                location = pending_queue.get(False)
            except QueueEmpty:
                return
            if location in seen:
                continue
            seen.add(location)
            page = self._get_page(location, req)
            if page is None:
                continue
            done.append(page)
            for link in page.rel_links():
                normalized = normalize_name(req.name).lower()

                if (not normalized in self.allow_external
                        and not self.allow_all_external):
                    self.need_warn_external = True
                    logger.debug("Not searching %s for files because external "
                                 "urls are disallowed." % link)
                    continue

                if (link.trusted is not None
                        and not link.trusted
                        and not normalized in self.allow_insecure
                        and not self.allow_all_insecure):  # TODO: Remove after release
                    logger.debug("Not searching %s for urls, it is an "
                                 "untrusted link and cannot produce safe or "
                                 "verifiable files." % link)
                    self.need_warn_insecure = True
                    continue

                pending_queue.put(link)

    _egg_fragment_re = re.compile(r'#egg=([^&]*)')
    _egg_info_re = re.compile(r'([a-z0-9_.]+)-([a-z0-9_.-]+)', re.I)
    _py_version_re = re.compile(r'-py([123]\.?[0-9]?)$')

    def _sort_links(self, links):
        "Returns elements of links in order, non-egg links first, egg links second, while eliminating duplicates"
        eggs, no_eggs = [], []
        seen = set()
        for link in links:
            if link not in seen:
                seen.add(link)
                if link.egg_fragment:
                    eggs.append(link)
                else:
                    no_eggs.append(link)
        return no_eggs + eggs

    def _package_versions(self, links, search_name):
        for link in self._sort_links(links):
            for v in self._link_package_versions(link, search_name):
                yield v

    def _known_extensions(self):
        extensions = ('.tar.gz', '.tar.bz2', '.tar', '.tgz', '.zip')
        if self.use_wheel:
            return extensions + (wheel_ext,)
        return extensions

    def _link_package_versions(self, link, search_name):
        """
        Return an iterable of triples (pkg_resources_version_key,
        link, python_version) that can be extracted from the given
        link.

        Meant to be overridden by subclasses, not called by clients.
        """
        platform = get_platform()

        version = None
        if link.egg_fragment:
            egg_info = link.egg_fragment
        else:
            egg_info, ext = link.splitext()
            if not ext:
                if link not in self.logged_links:
                    logger.debug('Skipping link %s; not a file' % link)
                    self.logged_links.add(link)
                return []
            if egg_info.endswith('.tar'):
                # Special double-extension case:
                egg_info = egg_info[:-4]
                ext = '.tar' + ext
            if ext not in self._known_extensions():
                if link not in self.logged_links:
                    logger.debug('Skipping link %s; unknown archive format: %s' % (link, ext))
                    self.logged_links.add(link)
                return []
            if "macosx10" in link.path and ext == '.zip':
                if link not in self.logged_links:
                    logger.debug('Skipping link %s; macosx10 one' % (link))
                    self.logged_links.add(link)
                return []
            if link.wheel and link.wheel.name.lower() == search_name.lower():
                version = link.wheel.version
                if not link.wheel.supported():
                    logger.debug('Skipping %s because it is not compatible with this Python' % link)
                    return []

                # This is a dirty hack to prevent installing Binary Wheels from
                # PyPI or one of its mirrors unless it is a Windows Binary
                # Wheel. This is paired with a change to PyPI disabling
                # uploads for the same. Once we have a mechanism for enabling
                # support for binary wheels on linux that deals with the
                # inherent problems of binary distribution this can be
                # removed.
                comes_from = getattr(link, "comes_from", None)
                if (not platform.startswith('win')
                        and comes_from is not None
                        and urlparse.urlparse(comes_from.url).netloc.endswith(
                            "pypi.python.org")):
                    if not link.wheel.supported(tags=supported_tags_noarch):
                        logger.debug(
                            "Skipping %s because it is a pypi-hosted binary "
                            "Wheel on an unsupported platform" % link
                        )
                        return []

        if not version:
            version = self._egg_info_matches(egg_info, search_name, link)
        if version is None:
            logger.debug('Skipping link %s; wrong project name (not %s)' % (link, search_name))
            return []

        if (link.internal is not None
                and not link.internal
                and not normalize_name(search_name).lower() in self.allow_external
                and not self.allow_all_external):
            # We have a link that we are sure is external, so we should skip
            # it unless we are allowing externals
            logger.debug("Skipping %s because it is externally hosted." % link)
            self.need_warn_external = True
            return []

        if (link.verifiable is not None
                and not link.verifiable
                and not normalize_name(search_name).lower() in self.allow_insecure
                and not self.allow_all_insecure):  # TODO: Remove after release
            # We have a link whose integrity we are sure we cannot verify,
            # so we should skip it unless we are allowing unsafe installs
            # for this requirement.
            logger.debug("Skipping %s because it is an insecure and "
                         "unverifiable file." % link)
            self.need_warn_insecure = True
            return []

        match = self._py_version_re.search(version)
        if match:
            version = version[:match.start()]
            py_version = match.group(1)
            if py_version != sys.version[:3]:
                logger.debug('Skipping %s because Python version is incorrect' % link)
                return []
        logger.debug('Found link %s, version: %s' % (link, version))
        return [(pkg_resources.parse_version(version),
                 link,
                 version)]

    def _egg_info_matches(self, egg_info, search_name, link):
        match = self._egg_info_re.search(egg_info)
        if not match:
            logger.debug('Could not parse version from link: %s' % link)
            return None
        name = match.group(0).lower()
        # To match the "safe" name that pkg_resources creates:
        name = name.replace('_', '-')
        # project name and version must be separated by a dash
        look_for = search_name.lower() + "-"
        if name.startswith(look_for):
            return match.group(0)[len(look_for):]
        else:
            return None

    def _get_page(self, link, req):
        return HTMLPage.get_page(link, req, cache=self.cache)

    def _get_mirror_urls(self, mirrors=None, main_mirror_url=None):
        """Retrieves a list of URLs from the main mirror DNS entry
        unless a list of mirror URLs are passed.
        """
        if not mirrors:
            mirrors = get_mirrors(main_mirror_url)
            # Should this be made "less random"? E.g. netselect like?
            random.shuffle(mirrors)

        mirror_urls = set()
        for mirror_url in mirrors:
            mirror_url = mirror_url.rstrip('/')
            # Make sure we have a valid URL
            if not any([mirror_url.startswith(scheme) for scheme in ["http://", "https://", "file://"]]):
                mirror_url = "http://%s" % mirror_url
            if not mirror_url.endswith("/simple"):
                mirror_url = "%s/simple" % mirror_url
            mirror_urls.add(mirror_url + '/')

        return list(mirror_urls)
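
# A minimal usage sketch (an assumption for illustration, not part of this
# module): pip's install command normally builds the finder and queries it once
# per requirement, roughly like:
#
#     finder = PackageFinder(find_links=[],
#                            index_urls=['https://pypi.python.org/simple/'])
#     link = finder.find_requirement(req, upgrade=False)  # req: an InstallRequirement
#
# find_requirement() returns the selected Link, returns None when an already
# installed version satisfies the requirement and upgrade is False, and raises
# DistributionNotFound or BestVersionAlreadyInstalled otherwise.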


class PageCache(object):
    """Cache of HTML pages"""

    failure_limit = 3

    def __init__(self):
        self._failures = {}
        self._pages = {}
        self._archives = {}

    def too_many_failures(self, url):
        return self._failures.get(url, 0) >= self.failure_limit

    def get_page(self, url):
        return self._pages.get(url)

    def is_archive(self, url):
        return self._archives.get(url, False)

    def set_is_archive(self, url, value=True):
        self._archives[url] = value

    def add_page_failure(self, url, level):
        self._failures[url] = self._failures.get(url, 0) + level

    def add_page(self, urls, page):
        for url in urls:
            self._pages[url] = page


class HTMLPage(object):
    """Represents one page, along with its URL"""

    ## FIXME: these regexes are horrible hacks:
    _homepage_re = re.compile(r'<th>\s*home\s*page', re.I)
    _download_re = re.compile(r'<th>\s*download\s+url', re.I)
    _href_re = re.compile('href=(?:"([^"]*)"|\'([^\']*)\'|([^>\\s\\n]*))', re.I | re.S)

    def __init__(self, content, url, headers=None, trusted=None):
        self.content = content
        self.parsed = html5lib.parse(self.content, namespaceHTMLElements=False)
        self.url = url
        self.headers = headers
        self.trusted = trusted

    def __str__(self):
        return self.url

    @classmethod
    def get_page(cls, link, req, cache=None, skip_archives=True):
        url = link.url
        url = url.split('#', 1)[0]
        if cache.too_many_failures(url):
            return None

        # Check for VCS schemes that do not support lookup as web pages.
        from pip.vcs import VcsSupport
        for scheme in VcsSupport.schemes:
            if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
                logger.debug('Cannot look at %(scheme)s URL %(link)s' % locals())
                return None

        if cache is not None:
            inst = cache.get_page(url)
            if inst is not None:
                return inst
        try:
            if skip_archives:
                if cache is not None:
                    if cache.is_archive(url):
                        return None
                filename = link.filename
                for bad_ext in ['.tar', '.tar.gz', '.tar.bz2', '.tgz', '.zip']:
                    if filename.endswith(bad_ext):
                        content_type = cls._get_content_type(url)
                        if content_type.lower().startswith('text/html'):
                            break
                        else:
                            logger.debug('Skipping page %s because of Content-Type: %s' % (link, content_type))
                            if cache is not None:
                                cache.set_is_archive(url)
                            return None
            logger.debug('Getting page %s' % url)

            # Tack index.html onto file:// URLs that point to directories
            (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(url)
            if scheme == 'file' and os.path.isdir(url2pathname(path)):
                # add trailing slash if not present so urljoin doesn't trim final segment
                if not url.endswith('/'):
                    url += '/'
                url = urlparse.urljoin(url, 'index.html')
                logger.debug(' file: URL is directory, getting %s' % url)

            resp = urlopen(url)

            real_url = geturl(resp)
            headers = resp.info()
            contents = resp.read()
            encoding = headers.get('Content-Encoding', None)
            # XXX need to handle exceptions and add testing for this
            if encoding is not None:
                if encoding == 'gzip':
                    contents = gzip.GzipFile(fileobj=BytesIO(contents)).read()
                if encoding == 'deflate':
                    contents = zlib.decompress(contents)

            # The check for archives above only works if the url ends with
            # something that looks like an archive. However that is not a
            # requirement. For instance http://sourceforge.net/projects/docutils/files/docutils/0.8.1/docutils-0.8.1.tar.gz/download
            # redirects to http://superb-dca3.dl.sourceforge.net/project/docutils/docutils/0.8.1/docutils-0.8.1.tar.gz
            # Unless we issue a HEAD request on every url we cannot know
            # ahead of time for sure if something is HTML or not. However we
            # can check after we've downloaded it.
            content_type = headers.get('Content-Type', 'unknown')
            if not content_type.lower().startswith("text/html"):
                logger.debug('Skipping page %s because of Content-Type: %s' %
                             (link, content_type))
                if cache is not None:
                    cache.set_is_archive(url)
                return None

            inst = cls(u(contents), real_url, headers, trusted=link.trusted)
        except (HTTPError, URLError, socket.timeout, socket.error, OSError, WindowsError):
            e = sys.exc_info()[1]
            desc = str(e)
            if isinstance(e, socket.timeout):
                log_meth = logger.info
                level = 1
                desc = 'timed out'
            elif isinstance(e, URLError):
                # ssl/certificate error
                if hasattr(e, 'reason') and (isinstance(e.reason, ssl.SSLError) or isinstance(e.reason, CertificateError)):
                    desc = 'There was a problem confirming the ssl certificate: %s' % e
                    log_meth = logger.notify
                else:
                    log_meth = logger.info
                if hasattr(e, 'reason') and isinstance(e.reason, socket.timeout):
                    desc = 'timed out'
                    level = 1
                else:
                    level = 2
            elif isinstance(e, HTTPError) and e.code == 404:
                ## FIXME: notify?
                log_meth = logger.info
                level = 2
            else:
                log_meth = logger.info
                level = 1
            log_meth('Could not fetch URL %s: %s' % (link, desc))
            log_meth('Will skip URL %s when looking for download links for %s' % (link.url, req))
            if cache is not None:
                cache.add_page_failure(url, level)
            return None
        if cache is not None:
            cache.add_page([url, real_url], inst)
        return inst

    @staticmethod
    def _get_content_type(url):
        """Get the Content-Type of the given url, using a HEAD request"""
        scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
        if not scheme in ('http', 'https', 'ftp', 'ftps'):
            ## FIXME: some warning or something?
            ## assertion error?
            return ''

        req = Urllib2HeadRequest(url, headers={'Host': netloc})
        resp = urlopen(req)
        try:
            if hasattr(resp, 'code') and resp.code != 200 and scheme not in ('ftp', 'ftps'):
                ## FIXME: doesn't handle redirects
                return ''
            return resp.info().get('content-type', '')
        finally:
            resp.close()

    @property
    def api_version(self):
        if not hasattr(self, "_api_version"):
            _api_version = None

            metas = [x for x in self.parsed.findall(".//meta")
                     if x.get("name", "").lower() == "api-version"]
            if metas:
                try:
                    _api_version = int(metas[0].get("value", None))
                except (TypeError, ValueError):
                    _api_version = None
            self._api_version = _api_version
        return self._api_version

    @property
    def base_url(self):
        if not hasattr(self, "_base_url"):
            base = self.parsed.find(".//base")
            if base is not None and base.get("href"):
                self._base_url = base.get("href")
            else:
                self._base_url = self.url
        return self._base_url

    @property
    def links(self):
        """Yields all links in the page"""
        for anchor in self.parsed.findall(".//a"):
            if anchor.get("href"):
                href = anchor.get("href")
                url = self.clean_link(urlparse.urljoin(self.base_url, href))

                # Determine if this link is internal. If that distinction
                # doesn't make sense in this context, then we don't make
                # any distinction.
                internal = None
                if self.api_version and self.api_version >= 2:
                    # Only api_versions >= 2 have a distinction between
                    # external and internal links
                    internal = bool(anchor.get("rel")
                                    and "internal" in anchor.get("rel").split())

                yield Link(url, self, internal=internal)

    def rel_links(self):
        for url in self.explicit_rel_links():
            yield url
        for url in self.scraped_rel_links():
            yield url

    def explicit_rel_links(self, rels=('homepage', 'download')):
        """Yields all links with the given relations"""
        rels = set(rels)

        for anchor in self.parsed.findall(".//a"):
            if anchor.get("rel") and anchor.get("href"):
                found_rels = set(anchor.get("rel").split())
                # Determine the intersection between what rels were found and
                # what rels were being looked for
                if found_rels & rels:
                    href = anchor.get("href")
                    url = self.clean_link(urlparse.urljoin(self.base_url, href))
                    yield Link(url, self, trusted=False)

    def scraped_rel_links(self):
        # Can we get rid of this horrible horrible method?
        for regex in (self._homepage_re, self._download_re):
            match = regex.search(self.content)
            if not match:
                continue
            href_match = self._href_re.search(self.content, pos=match.end())
            if not href_match:
                continue
            url = href_match.group(1) or href_match.group(2) or href_match.group(3)
            if not url:
                continue
            url = self.clean_link(urlparse.urljoin(self.base_url, url))
            yield Link(url, self, trusted=False)

    _clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I)

    def clean_link(self, url):
        """Makes sure a link is fully encoded. That is, if a ' ' shows up in
        the link, it will be rewritten to %20 (while not over-quoting
        % or other characters)."""
        return self._clean_re.sub(
            lambda match: '%%%2x' % ord(match.group(0)), url)
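
# Illustrative example of clean_link() above (assumed input, shown only for
# clarity): characters outside the allowed set are percent-encoded, so a
# scraped href such as "some dir/pkg-1.0.tar.gz" becomes
# "some%20dir/pkg-1.0.tar.gz", while '%' and normal URL punctuation are left
# untouched.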


class Link(object):

    def __init__(self, url, comes_from=None, internal=None, trusted=None):
        self.url = url
        self.comes_from = comes_from
        self.internal = internal
        self.trusted = trusted

        # Set whether it's a wheel
        self.wheel = None
        if url != Inf and self.splitext()[1] == wheel_ext:
            self.wheel = Wheel(self.filename)

    def __str__(self):
        if self.comes_from:
            return '%s (from %s)' % (self.url, self.comes_from)
        else:
            return str(self.url)

    def __repr__(self):
        return '<Link %s>' % self

    def __eq__(self, other):
        return self.url == other.url

    def __ne__(self, other):
        return self.url != other.url

    def __lt__(self, other):
        return self.url < other.url

    def __le__(self, other):
        return self.url <= other.url

    def __gt__(self, other):
        return self.url > other.url

    def __ge__(self, other):
        return self.url >= other.url

    def __hash__(self):
        return hash(self.url)

    @property
    def filename(self):
        _, netloc, path, _, _ = urlparse.urlsplit(self.url)
        name = posixpath.basename(path.rstrip('/')) or netloc
        assert name, ('URL %r produced no filename' % self.url)
        return name

    @property
    def scheme(self):
        return urlparse.urlsplit(self.url)[0]

    @property
    def path(self):
        return urlparse.urlsplit(self.url)[2]

    def splitext(self):
        return splitext(posixpath.basename(self.path.rstrip('/')))

    @property
    def url_without_fragment(self):
        scheme, netloc, path, query, fragment = urlparse.urlsplit(self.url)
        return urlparse.urlunsplit((scheme, netloc, path, query, None))

    _egg_fragment_re = re.compile(r'#egg=([^&]*)')

    @property
    def egg_fragment(self):
        match = self._egg_fragment_re.search(self.url)
        if not match:
            return None
        return match.group(1)

    _hash_re = re.compile(r'(sha1|sha224|sha384|sha256|sha512|md5)=([a-f0-9]+)')

    @property
    def hash(self):
        match = self._hash_re.search(self.url)
        if match:
            return match.group(2)
        return None

    @property
    def hash_name(self):
        match = self._hash_re.search(self.url)
        if match:
            return match.group(1)
        return None

    @property
    def show_url(self):
        return posixpath.basename(self.url.split('#', 1)[0].split('?', 1)[0])

    @property
    def verifiable(self):
        """
        Returns True if this link can be verified after download, False if it
        cannot, and None if we cannot determine.
        """
        trusted = self.trusted or getattr(self.comes_from, "trusted", None)
        if trusted is not None and trusted:
            # This link came from a trusted source. It *may* be verifiable but
            # first we need to see if this page is operating under the new
            # API version.
            try:
                api_version = getattr(self.comes_from, "api_version", None)
                api_version = int(api_version)
            except (ValueError, TypeError):
                api_version = None

            if api_version is None or api_version <= 1:
                # This link is either trusted, or it came from a trusted source,
                # however it is not operating under the API version 2 so
                # we can't make any claims about if it's safe or not
                return

            if self.hash:
                # This link came from a trusted source and it has a hash, so we
                # can consider it safe.
                return True
            else:
                # This link came from a trusted source, using the new API
                # version, and it does not have a hash. It is NOT verifiable
                return False
        elif trusted is not None:
            # This link came from an untrusted source and we cannot trust it
            return False
  837. #An "Infinite Link" that compares greater than other links
  838. InfLink = Link(Inf) #this object is not currently used as a sortable
  839. def get_requirement_from_url(url):
  840. """Get a requirement from the URL, if possible. This looks for #egg
  841. in the URL"""
  842. link = Link(url)
  843. egg_info = link.egg_fragment
  844. if not egg_info:
  845. egg_info = splitext(link.filename)[0]
  846. return package_to_requirement(egg_info)
  847. def package_to_requirement(package_name):
  848. """Translate a name like Foo-1.2 to Foo==1.3"""
  849. match = re.search(r'^(.*?)-(dev|\d.*)', package_name)
  850. if match:
  851. name = match.group(1)
  852. version = match.group(2)
  853. else:
  854. name = package_name
  855. version = ''
  856. if version:
  857. return '%s==%s' % (name, version)
  858. else:
  859. return name
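
# Hypothetical inputs showing the two helpers above (illustration only):
#
#     get_requirement_from_url('http://example.com/foo-1.2.tar.gz#egg=foo')
#     # -> 'foo'  (the #egg fragment wins and carries no version)
#     package_to_requirement('Foo-1.2')    # -> 'Foo==1.2'
#     package_to_requirement('Foo-dev')    # -> 'Foo==dev'
#     package_to_requirement('Foo')        # -> 'Foo'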


def get_mirrors(hostname=None):
    """Return the list of mirrors from the last record found on the DNS
    entry::

    >>> from pip.index import get_mirrors
    >>> get_mirrors()
    ['a.pypi.python.org', 'b.pypi.python.org', 'c.pypi.python.org',
    'd.pypi.python.org']

    Originally written for the distutils2 project by Alexis Metaireau.
    """
    if hostname is None:
        hostname = DEFAULT_MIRROR_HOSTNAME

    # return the last mirror registered on PyPI.
    last_mirror_hostname = None
    try:
        last_mirror_hostname = socket.gethostbyname_ex(hostname)[0]
    except socket.gaierror:
        return []
    if not last_mirror_hostname or last_mirror_hostname == DEFAULT_MIRROR_HOSTNAME:
        last_mirror_hostname = "z.pypi.python.org"
    end_letter = last_mirror_hostname.split(".", 1)

    # determine the list from the last one.
    return ["%s.%s" % (s, end_letter[1]) for s in string_range(end_letter[0])]


def string_range(last):
    """Compute the range of string between "a" and last.

    This works for simple "a to z" lists, but also for "a to zz" lists.
    """
    for k in range(len(last)):
        for x in product(string.ascii_lowercase, repeat=k + 1):
            result = ''.join(x)
            yield result
            if result == last:
                return
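
# Rough sketch of what string_range() yields (illustrative, not a doctest):
#
#     list(string_range('c'))    # -> ['a', 'b', 'c']
#     list(string_range('ab'))   # -> ['a', 'b', ..., 'z', 'aa', 'ab']
#
# get_mirrors() combines this with the DNS-derived "last" mirror name, so a
# last mirror of g.pypi.python.org expands to a.pypi.python.org through
# g.pypi.python.org.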