
/setuptools/package_index.py

https://bitbucket.org/mumak/distribute
  1. """PyPI and direct package downloading"""
  2. import sys, os.path, re, urlparse, urllib, urllib2, shutil, random, socket, cStringIO
  3. import base64
  4. import httplib
  5. from pkg_resources import *
  6. from distutils import log
  7. from distutils.errors import DistutilsError
  8. try:
  9. from hashlib import md5
  10. except ImportError:
  11. from md5 import md5
  12. from fnmatch import translate
  13. EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.]+)$')
  14. HREF = re.compile("""href\\s*=\\s*['"]?([^'"> ]+)""", re.I)
  15. # this is here to fix emacs' cruddy broken syntax highlighting
  16. PYPI_MD5 = re.compile(
  17. '<a href="([^"#]+)">([^<]+)</a>\n\s+\\(<a (?:title="MD5 hash"\n\s+)'
  18. 'href="[^?]+\?:action=show_md5&amp;digest=([0-9a-f]{32})">md5</a>\\)'
  19. )
  20. URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):',re.I).match
  21. EXTENSIONS = ".tar.gz .tar.bz2 .tar .zip .tgz".split()
  22. __all__ = [
  23. 'PackageIndex', 'distros_for_url', 'parse_bdist_wininst',
  24. 'interpret_distro_name',
  25. ]
  26. _SOCKET_TIMEOUT = 15
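# Illustrative behavior of the patterns above (added for clarity; the values
# are assumed examples, not part of the original module):
#   EGG_FRAGMENT.match('egg=foo-1.0').group(1)  -> 'foo-1.0'
#   URL_SCHEME('http://example.com/')           -> a match object ('http')
#   URL_SCHEME('C:/local/path')                 -> None ('C' is too short)
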
def parse_bdist_wininst(name):
    """Return (base, py_ver, plat) or (None, None, None) for possible .exe name"""
    lower = name.lower()
    base, py_ver, plat = None, None, None

    if lower.endswith('.exe'):
        if lower.endswith('.win32.exe'):
            base = name[:-10]
            plat = 'win32'
        elif lower.startswith('.win32-py',-16):
            py_ver = name[-7:-4]
            base = name[:-16]
            plat = 'win32'
        elif lower.endswith('.win-amd64.exe'):
            base = name[:-14]
            plat = 'win-amd64'
        elif lower.startswith('.win-amd64-py',-20):
            py_ver = name[-7:-4]
            base = name[:-20]
            plat = 'win-amd64'
    return base,py_ver,plat

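# A quick sketch of the expected splits (illustrative filenames, added for
# clarity):
#   parse_bdist_wininst('foo-1.0.win32.exe')        -> ('foo-1.0', None, 'win32')
#   parse_bdist_wininst('foo-1.0.win32-py2.7.exe')  -> ('foo-1.0', '2.7', 'win32')
#   parse_bdist_wininst('foo-1.0.tar.gz')           -> (None, None, None)
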
def egg_info_for_url(url):
    scheme, server, path, parameters, query, fragment = urlparse.urlparse(url)
    base = urllib2.unquote(path.split('/')[-1])
    if '#' in base: base, fragment = base.split('#',1)
    return base,fragment

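# Illustrative example (added for clarity; an assumed URL):
#   egg_info_for_url('http://example.com/dl/foo-1.0.tar.gz#egg=foo')
#   -> ('foo-1.0.tar.gz', 'egg=foo')
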
def distros_for_url(url, metadata=None):
    """Yield egg or source distribution objects that might be found at a URL"""
    base, fragment = egg_info_for_url(url)
    for dist in distros_for_location(url, base, metadata): yield dist
    if fragment:
        match = EGG_FRAGMENT.match(fragment)
        if match:
            for dist in interpret_distro_name(
                url, match.group(1), metadata, precedence = CHECKOUT_DIST
            ):
                yield dist

def distros_for_location(location, basename, metadata=None):
    """Yield egg or source distribution objects based on basename"""
    if basename.endswith('.egg.zip'):
        basename = basename[:-4]    # strip the .zip
    if basename.endswith('.egg') and '-' in basename:
        # only one, unambiguous interpretation
        return [Distribution.from_location(location, basename, metadata)]
    if basename.endswith('.exe'):
        win_base, py_ver, platform = parse_bdist_wininst(basename)
        if win_base is not None:
            return interpret_distro_name(
                location, win_base, metadata, py_ver, BINARY_DIST, platform
            )
    # Try source distro extensions (.zip, .tgz, etc.)
    #
    for ext in EXTENSIONS:
        if basename.endswith(ext):
            basename = basename[:-len(ext)]
            return interpret_distro_name(location, basename, metadata)
    return []   # no extension matched

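# Illustrative walk-through (added for clarity): for basename 'foo-1.0.tar.gz'
# the '.egg.zip', '.egg', and '.exe' checks all miss, the '.tar.gz' extension
# is stripped, and interpret_distro_name() is left to propose project/version
# splits of 'foo-1.0'.
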
def distros_for_filename(filename, metadata=None):
    """Yield possible egg or source distribution objects based on a filename"""
    return distros_for_location(
        normalize_path(filename), os.path.basename(filename), metadata
    )

def interpret_distro_name(location, basename, metadata,
    py_version=None, precedence=SOURCE_DIST, platform=None
):
    """Generate alternative interpretations of a source distro name

    Note: if `location` is a filesystem filename, you should call
    ``pkg_resources.normalize_path()`` on it before passing it to this
    routine!
    """
    # Generate alternative interpretations of a source distro name
    # Because some packages are ambiguous as to name/versions split
    # e.g. "adns-python-1.1.0", "egenix-mx-commercial", etc.
    # So, we generate each possible interpretation (e.g. "adns, python-1.1.0"
    # "adns-python, 1.1.0", and "adns-python-1.1.0, no version"). In practice,
    # the spurious interpretations should be ignored, because in the event
    # there's also an "adns" package, the spurious "python-1.1.0" version will
    # compare lower than any numeric version number, and is therefore unlikely
    # to match a request for it. It's still a potential problem, though, and
    # in the long run PyPI and the distutils should go for "safe" names and
    # versions in distribution archive names (sdist and bdist).

    parts = basename.split('-')
    if not py_version:
        for i,p in enumerate(parts[2:]):
            if len(p)==5 and p.startswith('py2.'):
                return  # It's a bdist_dumb, not an sdist -- bail out

    for p in range(1,len(parts)+1):
        yield Distribution(
            location, metadata, '-'.join(parts[:p]), '-'.join(parts[p:]),
            py_version=py_version, precedence = precedence,
            platform = platform
        )

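# Illustrative example (added for clarity): for basename 'adns-python-1.1.0'
# the loop above yields three candidate (project, version) splits:
#   ('adns', 'python-1.1.0'), ('adns-python', '1.1.0'),
#   and ('adns-python-1.1.0', '').
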
REL = re.compile("""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)
# this line is here to fix emacs' cruddy broken syntax highlighting

def find_external_links(url, page):
    """Find rel="homepage" and rel="download" links in `page`, yielding URLs"""
    for match in REL.finditer(page):
        tag, rel = match.groups()
        rels = map(str.strip, rel.lower().split(','))
        if 'homepage' in rels or 'download' in rels:
            for match in HREF.finditer(tag):
                yield urlparse.urljoin(url, htmldecode(match.group(1)))

    for tag in ("<th>Home Page", "<th>Download URL"):
        pos = page.find(tag)
        if pos!=-1:
            match = HREF.search(page,pos)
            if match:
                yield urlparse.urljoin(url, htmldecode(match.group(1)))

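# Illustrative example (added for clarity; an assumed page snippet): given
#   <a rel="download" href="/files/foo-1.0.tar.gz">...</a>
# on a page fetched from 'http://example.com/pkg', find_external_links()
# yields 'http://example.com/files/foo-1.0.tar.gz'.
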
user_agent = "Python-urllib/%s distribute/%s" % (
    sys.version[:3], require('distribute')[0].version
)

class PackageIndex(Environment):
    """A distribution index that scans web pages for download URLs"""

    def __init__(self, index_url="http://pypi.python.org/simple", hosts=('*',),
        *args, **kw
    ):
        Environment.__init__(self,*args,**kw)
        self.index_url = index_url + "/"[:not index_url.endswith('/')]
        self.scanned_urls = {}
        self.fetched_urls = {}
        self.package_pages = {}
        self.allows = re.compile('|'.join(map(translate,hosts))).match
        self.to_scan = []

    def process_url(self, url, retrieve=False):
        """Evaluate a URL as a possible download, and maybe retrieve it"""
        if url in self.scanned_urls and not retrieve:
            return
        self.scanned_urls[url] = True
        if not URL_SCHEME(url):
            self.process_filename(url)
            return
        else:
            dists = list(distros_for_url(url))
            if dists:
                if not self.url_ok(url):
                    return
                self.debug("Found link: %s", url)

        if dists or not retrieve or url in self.fetched_urls:
            map(self.add, dists)
            return  # don't need the actual page

        if not self.url_ok(url):
            self.fetched_urls[url] = True
            return

        self.info("Reading %s", url)
        f = self.open_url(url, "Download error on %s: %%s -- Some packages may not be found!" % url)
        if f is None: return
        self.fetched_urls[url] = self.fetched_urls[f.url] = True
        if 'html' not in f.headers.get('content-type', '').lower():
            f.close()   # not html, we can't process it
            return

        base = f.url    # handle redirects
        page = f.read()
        if not isinstance(page, str):  # We are in Python 3 and got bytes. We want str.
            if isinstance(f, urllib2.HTTPError):
                # Errors have no charset, assume latin1:
                charset = 'latin-1'
            else:
                charset = f.headers.get_param('charset') or 'latin-1'
            page = page.decode(charset, "ignore")
        f.close()
        for match in HREF.finditer(page):
            link = urlparse.urljoin(base, htmldecode(match.group(1)))
            self.process_url(link)
        if url.startswith(self.index_url) and getattr(f,'code',None)!=404:
            page = self.process_index(url, page)

    def process_filename(self, fn, nested=False):
        # process filenames or directories
        if not os.path.exists(fn):
            self.warn("Not found: %s", fn)
            return

        if os.path.isdir(fn) and not nested:
            path = os.path.realpath(fn)
            for item in os.listdir(path):
                self.process_filename(os.path.join(path,item), True)

        dists = distros_for_filename(fn)
        if dists:
            self.debug("Found: %s", fn)
            map(self.add, dists)

    def url_ok(self, url, fatal=False):
        s = URL_SCHEME(url)
        if (s and s.group(1).lower()=='file') or self.allows(urlparse.urlparse(url)[1]):
            return True
        msg = "\nLink to %s ***BLOCKED*** by --allow-hosts\n"
        if fatal:
            raise DistutilsError(msg % url)
        else:
            self.warn(msg, url)

    def scan_egg_links(self, search_path):
        for item in search_path:
            if os.path.isdir(item):
                for entry in os.listdir(item):
                    if entry.endswith('.egg-link'):
                        self.scan_egg_link(item, entry)

    def scan_egg_link(self, path, entry):
        lines = filter(None, map(str.strip, open(os.path.join(path, entry))))
        if len(lines)==2:
            for dist in find_distributions(os.path.join(path, lines[0])):
                dist.location = os.path.join(path, *lines)
                dist.precedence = SOURCE_DIST
                self.add(dist)

    def process_index(self,url,page):
        """Process the contents of a PyPI page"""
        def scan(link):
            # Process a URL to see if it's for a package page
            if link.startswith(self.index_url):
                parts = map(
                    urllib2.unquote, link[len(self.index_url):].split('/')
                )
                if len(parts)==2 and '#' not in parts[1]:
                    # it's a package page, sanitize and index it
                    pkg = safe_name(parts[0])
                    ver = safe_version(parts[1])
                    self.package_pages.setdefault(pkg.lower(),{})[link] = True
                    return to_filename(pkg), to_filename(ver)
            return None, None

        # process an index page into the package-page index
        for match in HREF.finditer(page):
            try:
                scan( urlparse.urljoin(url, htmldecode(match.group(1))) )
            except ValueError:
                pass

        pkg, ver = scan(url)    # ensure this page is in the page index
        if pkg:
            # process individual package page
            for new_url in find_external_links(url, page):
                # Process the found URL
                base, frag = egg_info_for_url(new_url)
                if base.endswith('.py') and not frag:
                    if ver:
                        new_url+='#egg=%s-%s' % (pkg,ver)
                    else:
                        self.need_version_info(url)
                self.scan_url(new_url)

            return PYPI_MD5.sub(
                lambda m: '<a href="%s#md5=%s">%s</a>' % m.group(1,3,2), page
            )
        else:
            return ""   # no sense double-scanning non-package pages

    def need_version_info(self, url):
        self.scan_all(
            "Page at %s links to .py file(s) without version info; an index "
            "scan is required.", url
        )

    def scan_all(self, msg=None, *args):
        if self.index_url not in self.fetched_urls:
            if msg: self.warn(msg,*args)
            self.info(
                "Scanning index of all packages (this may take a while)"
            )
        self.scan_url(self.index_url)

    def find_packages(self, requirement):
        self.scan_url(self.index_url + requirement.unsafe_name+'/')

        if not self.package_pages.get(requirement.key):
            # Fall back to safe version of the name
            self.scan_url(self.index_url + requirement.project_name+'/')

        if not self.package_pages.get(requirement.key):
            # We couldn't find the target package, so search the index page too
            self.not_found_in_index(requirement)

        for url in list(self.package_pages.get(requirement.key,())):
            # scan each page that might be related to the desired package
            self.scan_url(url)

    def obtain(self, requirement, installer=None):
        self.prescan(); self.find_packages(requirement)
        for dist in self[requirement.key]:
            if dist in requirement:
                return dist
            self.debug("%s does not match %s", requirement, dist)
        return super(PackageIndex, self).obtain(requirement,installer)

    def check_md5(self, cs, info, filename, tfp):
        if re.match('md5=[0-9a-f]{32}$', info):
            self.debug("Validating md5 checksum for %s", filename)
            if cs.hexdigest()!=info[4:]:
                tfp.close()
                os.unlink(filename)
                raise DistutilsError(
                    "MD5 validation failed for "+os.path.basename(filename)+
                    "; possible download problem?"
                )

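    # Note (added for clarity): `info` is the URL fragment recorded by
    # _download_to(), expected to look like 'md5=' followed by 32 hex digits,
    # e.g. 'md5=0123456789abcdef0123456789abcdef' (an illustrative value).
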
    def add_find_links(self, urls):
        """Add `urls` to the list that will be prescanned for searches"""
        for url in urls:
            if (
                self.to_scan is None        # if we have already "gone online"
                or not URL_SCHEME(url)      # or it's a local file/directory
                or url.startswith('file:')
                or list(distros_for_url(url))   # or a direct package link
            ):
                # then go ahead and process it now
                self.scan_url(url)
            else:
                # otherwise, defer retrieval till later
                self.to_scan.append(url)

    def prescan(self):
        """Scan urls scheduled for prescanning (e.g. --find-links)"""
        if self.to_scan:
            map(self.scan_url, self.to_scan)
        self.to_scan = None     # from now on, go ahead and process immediately

    def not_found_in_index(self, requirement):
        if self[requirement.key]:   # we've seen at least one distro
            meth, msg = self.info, "Couldn't retrieve index page for %r"
        else:   # no distros seen for this name, might be misspelled
            meth, msg = (self.warn,
                "Couldn't find index page for %r (maybe misspelled?)")
        meth(msg, requirement.unsafe_name)
        self.scan_all()

    def download(self, spec, tmpdir):
        """Locate and/or download `spec` to `tmpdir`, returning a local path

        `spec` may be a ``Requirement`` object, or a string containing a URL,
        an existing local filename, or a project/version requirement spec
        (i.e. the string form of a ``Requirement`` object). If it is the URL
        of a .py file with an unambiguous ``#egg=name-version`` tag (i.e., one
        that escapes ``-`` as ``_`` throughout), a trivial ``setup.py`` is
        automatically created alongside the downloaded file.

        If `spec` is a ``Requirement`` object or a string containing a
        project/version requirement spec, this method returns the location of
        a matching distribution (possibly after downloading it to `tmpdir`).
        If `spec` is a locally existing file or directory name, it is simply
        returned unchanged. If `spec` is a URL, it is downloaded to a subpath
        of `tmpdir`, and the local filename is returned. Various errors may be
        raised if a problem occurs during downloading.
        """
        if not isinstance(spec,Requirement):
            scheme = URL_SCHEME(spec)
            if scheme:
                # It's a url, download it to tmpdir
                found = self._download_url(scheme.group(1), spec, tmpdir)
                base, fragment = egg_info_for_url(spec)
                if base.endswith('.py'):
                    found = self.gen_setup(found,fragment,tmpdir)
                return found
            elif os.path.exists(spec):
                # Existing file or directory, just return it
                return spec
            else:
                try:
                    spec = Requirement.parse(spec)
                except ValueError:
                    raise DistutilsError(
                        "Not a URL, existing file, or requirement spec: %r" %
                        (spec,)
                    )
        return getattr(self.fetch_distribution(spec, tmpdir),'location',None)

    def fetch_distribution(self,
        requirement, tmpdir, force_scan=False, source=False, develop_ok=False,
        local_index=None
    ):
        """Obtain a distribution suitable for fulfilling `requirement`

        `requirement` must be a ``pkg_resources.Requirement`` instance.
        If necessary, or if the `force_scan` flag is set, the requirement is
        searched for in the (online) package index as well as the locally
        installed packages. If a distribution matching `requirement` is found,
        the returned distribution's ``location`` is the value you would have
        gotten from calling the ``download()`` method with the matching
        distribution's URL or filename. If no matching distribution is found,
        ``None`` is returned.

        If the `source` flag is set, only source distributions and source
        checkout links will be considered. Unless the `develop_ok` flag is
        set, development and system eggs (i.e., those using the ``.egg-info``
        format) will be ignored.
        """
        # process a Requirement
        self.info("Searching for %s", requirement)
        skipped = {}
        dist = None

        def find(req, env=None):
            if env is None:
                env = self
            # Find a matching distribution; may be called more than once
            for dist in env[req.key]:
                if dist.precedence==DEVELOP_DIST and not develop_ok:
                    if dist not in skipped:
                        self.warn("Skipping development or system egg: %s",dist)
                        skipped[dist] = 1
                    continue

                if dist in req and (dist.precedence<=SOURCE_DIST or not source):
                    self.info("Best match: %s", dist)
                    return dist.clone(
                        location=self.download(dist.location, tmpdir)
                    )

        if force_scan:
            self.prescan()
            self.find_packages(requirement)
            dist = find(requirement)

        if local_index is not None:
            dist = dist or find(requirement, local_index)

        if dist is None and self.to_scan is not None:
            self.prescan()
            dist = find(requirement)

        if dist is None and not force_scan:
            self.find_packages(requirement)
            dist = find(requirement)

        if dist is None:
            self.warn(
                "No local packages or download links found for %s%s",
                (source and "a source distribution of " or ""),
                requirement,
            )
        return dist

    def fetch(self, requirement, tmpdir, force_scan=False, source=False):
        """Obtain a file suitable for fulfilling `requirement`

        DEPRECATED; use the ``fetch_distribution()`` method now instead. For
        backward compatibility, this routine is identical but returns the
        ``location`` of the downloaded distribution instead of a distribution
        object.
        """
        dist = self.fetch_distribution(requirement,tmpdir,force_scan,source)
        if dist is not None:
            return dist.location
        return None

    def gen_setup(self, filename, fragment, tmpdir):
        match = EGG_FRAGMENT.match(fragment)
        dists = match and [d for d in
            interpret_distro_name(filename, match.group(1), None) if d.version
        ] or []

        if len(dists)==1:   # unambiguous ``#egg`` fragment
            basename = os.path.basename(filename)

            # Make sure the file has been downloaded to the temp dir.
            if os.path.dirname(filename) != tmpdir:
                dst = os.path.join(tmpdir, basename)
                from setuptools.command.easy_install import samefile
                if not samefile(filename, dst):
                    shutil.copy2(filename, dst)
                    filename=dst

            file = open(os.path.join(tmpdir, 'setup.py'), 'w')
            file.write(
                "from setuptools import setup\n"
                "setup(name=%r, version=%r, py_modules=[%r])\n"
                % (
                    dists[0].project_name, dists[0].version,
                    os.path.splitext(basename)[0]
                )
            )
            file.close()
            return filename

        elif match:
            raise DistutilsError(
                "Can't unambiguously interpret project/version identifier %r; "
                "any dashes in the name or version should be escaped using "
                "underscores. %r" % (fragment,dists)
            )
        else:
            raise DistutilsError(
                "Can't process plain .py files without an '#egg=name-version'"
                " suffix to enable automatic setup script generation."
            )

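    # Illustrative example (added for clarity; an assumed URL): downloading
    # 'http://example.com/foo.py#egg=foo-1.0' yields exactly one unambiguous
    # interpretation, so the generated setup.py contains:
    #   setup(name='foo', version='1.0', py_modules=['foo'])
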
    dl_blocksize = 8192

    def _download_to(self, url, filename):
        self.info("Downloading %s", url)
        # Download the file
        fp, tfp, info = None, None, None
        try:
            if '#' in url:
                url, info = url.split('#', 1)
            fp = self.open_url(url)
            if isinstance(fp, urllib2.HTTPError):
                raise DistutilsError(
                    "Can't download %s: %s %s" % (url, fp.code,fp.msg)
                )
            cs = md5()
            headers = fp.info()
            blocknum = 0
            bs = self.dl_blocksize
            size = -1
            if "content-length" in headers:
                # Some servers return multiple Content-Length headers :(
                content_length = headers.get("Content-Length")
                size = int(content_length)
            self.reporthook(url, filename, blocknum, bs, size)
            tfp = open(filename,'wb')
            while True:
                block = fp.read(bs)
                if block:
                    cs.update(block)
                    tfp.write(block)
                    blocknum += 1
                    self.reporthook(url, filename, blocknum, bs, size)
                else:
                    break
            if info: self.check_md5(cs, info, filename, tfp)
            return headers
        finally:
            if fp: fp.close()
            if tfp: tfp.close()

    def reporthook(self, url, filename, blocknum, blksize, size):
        pass    # no-op

    def open_url(self, url, warning=None):
        if url.startswith('file:'):
            return local_open(url)
        try:
            return open_with_auth(url)
        except (ValueError, httplib.InvalidURL), v:
            msg = ' '.join([str(arg) for arg in v.args])
            if warning:
                self.warn(warning, msg)
            else:
                raise DistutilsError('%s %s' % (url, msg))
        except urllib2.HTTPError, v:
            return v
        except urllib2.URLError, v:
            if warning:
                self.warn(warning, v.reason)
            else:
                raise DistutilsError("Download error for %s: %s"
                                     % (url, v.reason))
        except httplib.BadStatusLine, v:
            if warning:
                self.warn(warning, v.line)
            else:
                raise DistutilsError('%s returned a bad status line. '
                                     'The server might be down, %s' %
                                     (url, v.line))
        except httplib.HTTPException, v:
            if warning:
                self.warn(warning, v)
            else:
                raise DistutilsError("Download error for %s: %s"
                                     % (url, v))

    def _download_url(self, scheme, url, tmpdir):
        # Determine download filename
        #
        name = filter(None,urlparse.urlparse(url)[2].split('/'))
        if name:
            name = name[-1]
            while '..' in name:
                name = name.replace('..','.').replace('\\','_')
        else:
            name = "__downloaded__"     # default if URL has no path contents

        if name.endswith('.egg.zip'):
            name = name[:-4]    # strip the extra .zip before download

        filename = os.path.join(tmpdir,name)

        # Download the file
        #
        if scheme=='svn' or scheme.startswith('svn+'):
            return self._download_svn(url, filename)
        elif scheme=='file':
            return urllib.url2pathname(urlparse.urlparse(url)[2])
        else:
            self.url_ok(url, True)      # raises error if not allowed
            return self._attempt_download(url, filename)

    def scan_url(self, url):
        self.process_url(url, True)

    def _attempt_download(self, url, filename):
        headers = self._download_to(url, filename)
        if 'html' in headers.get('content-type','').lower():
            return self._download_html(url, headers, filename)
        else:
            return filename

    def _download_html(self, url, headers, filename):
        file = open(filename)
        for line in file:
            if line.strip():
                # Check for a subversion index page
                if re.search(r'<title>([^- ]+ - )?Revision \d+:', line):
                    # it's a subversion index page:
                    file.close()
                    os.unlink(filename)
                    return self._download_svn(url, filename)
                break   # not an index page
        file.close()
        os.unlink(filename)
        raise DistutilsError("Unexpected HTML page found at "+url)

    def _download_svn(self, url, filename):
        url = url.split('#',1)[0]   # remove any fragment for svn's sake
        self.info("Doing subversion checkout from %s to %s", url, filename)
        os.system("svn checkout -q %s %s" % (url, filename))
        return filename

    def debug(self, msg, *args):
        log.debug(msg, *args)

    def info(self, msg, *args):
        log.info(msg, *args)

    def warn(self, msg, *args):
        log.warn(msg, *args)

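# Illustrative usage sketch (added for clarity; the URL and requirement below
# are assumed example values, and nothing here is executed):
#   pi = PackageIndex()
#   pi.add_find_links(['http://example.com/downloads/'])
#   dist = pi.fetch_distribution(Requirement.parse('foo>=1.0'), '/tmp/build')
#   # `dist` is a pkg_resources Distribution whose .location points at the
#   # downloaded archive, or None if nothing matched.
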
# This pattern matches a character entity reference (a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference).
entity_sub = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?').sub

def uchr(c):
    if not isinstance(c, int):
        return c
    if c>255: return unichr(c)
    return chr(c)

def decode_entity(match):
    what = match.group(1)
    if what.startswith('#x'):
        what = int(what[2:], 16)
    elif what.startswith('#'):
        what = int(what[1:])
    else:
        from htmlentitydefs import name2codepoint
        what = name2codepoint.get(what, match.group(0))
    return uchr(what)

def htmldecode(text):
    """Decode HTML entities in the given text."""
    return entity_sub(decode_entity, text)

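# Illustrative examples (added for clarity):
#   htmldecode('&lt;b&gt;')  -> '<b>'
#   htmldecode('&#65;')      -> 'A'
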
def socket_timeout(timeout=15):
    def _socket_timeout(func):
        def _socket_timeout(*args, **kwargs):
            old_timeout = socket.getdefaulttimeout()
            socket.setdefaulttimeout(timeout)
            try:
                return func(*args, **kwargs)
            finally:
                socket.setdefaulttimeout(old_timeout)
        return _socket_timeout
    return _socket_timeout

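# Illustrative usage (added for clarity): socket_timeout() is a decorator
# factory; this module applies it to open_with_auth() further below:
#   open_with_auth = socket_timeout(_SOCKET_TIMEOUT)(open_with_auth)
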
def _encode_auth(auth):
    """
    A function compatible with Python 2.3-3.3 that will encode
    auth from a URL suitable for an HTTP header.
    >>> _encode_auth('username%3Apassword')
    u'dXNlcm5hbWU6cGFzc3dvcmQ='
    """
    auth_s = urllib2.unquote(auth)
    # convert to bytes
    auth_bytes = auth_s.encode()
    # use the legacy interface for Python 2.3 support
    encoded_bytes = base64.encodestring(auth_bytes)
    # convert back to a string
    encoded = encoded_bytes.decode()
    # strip the trailing newline appended by encodestring()
    return encoded.rstrip()

def open_with_auth(url):
    """Open a urllib2 request, handling HTTP authentication"""

    scheme, netloc, path, params, query, frag = urlparse.urlparse(url)

    if scheme in ('http', 'https'):
        auth, host = urllib2.splituser(netloc)
    else:
        auth = None

    if auth:
        auth = "Basic " + _encode_auth(auth)
        new_url = urlparse.urlunparse((scheme,host,path,params,query,frag))
        request = urllib2.Request(new_url)
        request.add_header("Authorization", auth)
    else:
        request = urllib2.Request(url)

    request.add_header('User-Agent', user_agent)
    fp = urllib2.urlopen(request)

    if auth:
        # Put authentication info back into request URL if same host,
        # so that links found on the page will work
        s2, h2, path2, param2, query2, frag2 = urlparse.urlparse(fp.url)
        if s2==scheme and h2==host:
            fp.url = urlparse.urlunparse((s2,netloc,path2,param2,query2,frag2))

    return fp

# adding a timeout to avoid freezing package_index
open_with_auth = socket_timeout(_SOCKET_TIMEOUT)(open_with_auth)

def fix_sf_url(url):
    return url      # backward compatibility

def local_open(url):
    """Read a local path, with special support for directories"""
    scheme, server, path, param, query, frag = urlparse.urlparse(url)
    filename = urllib.url2pathname(path)
    if os.path.isfile(filename):
        return urllib2.urlopen(url)
    elif path.endswith('/') and os.path.isdir(filename):
        files = []
        for f in os.listdir(filename):
            if f=='index.html':
                fp = open(os.path.join(filename,f),'rb')
                body = fp.read()
                fp.close()
                break
            elif os.path.isdir(os.path.join(filename,f)):
                f+='/'
            files.append("<a href=%r>%s</a>" % (f,f))
        else:
            body = ("<html><head><title>%s</title>" % url) + \
                "</head><body>%s</body></html>" % '\n'.join(files)
        status, message = 200, "OK"
    else:
        status, message, body = 404, "Path not found", "Not found"

    return urllib2.HTTPError(url, status, message,
            {'content-type':'text/html'}, cStringIO.StringIO(body))

# this line is a kludge to keep the trailing blank lines for pje's editor