PageRenderTime 50ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/modules/miscutil/lib/filedownloadutils.py

https://github.com/bambistas/invenio
Python | 342 lines | 319 code | 0 blank | 23 comment | 0 complexity | c1e1e498d3c003e057d1cec64b7a9991 MD5 | raw file
Possible License(s): GPL-2.0
  1. # -*- coding: utf-8 -*-
  2. ##
  3. ## This file is part of Invenio.
  4. ## Copyright (C) 2012 CERN.
  5. ##
  6. ## Invenio is free software; you can redistribute it and/or
  7. ## modify it under the terms of the GNU General Public License as
  8. ## published by the Free Software Foundation; either version 2 of the
  9. ## License, or (at your option) any later version.
  10. ##
  11. ## Invenio is distributed in the hope that it will be useful, but
  12. ## WITHOUT ANY WARRANTY; without even the implied warranty of
  13. ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. ## General Public License for more details.
  15. ##
  16. ## You should have received a copy of the GNU General Public License
  17. ## along with Invenio; if not, write to the Free Software Foundation, Inc.,
  18. ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
  19. """
  20. File handling utilities.
  21. Main API usage:
  22. >>> from filedownloadutils import download_url
  23. >>> new_file = download_url("http://duckduckgo.com", content_type="html")
  24. Raises InvenioFileDownloadError exception.
  25. """
  26. import urllib2
  27. import time
  28. import os
  29. import socket
  30. import urllib
  31. import tempfile
  32. import shutil
  33. import sys
  34. from invenio.urlutils import make_invenio_opener
  35. URL_OPENER = make_invenio_opener('filedownloadutils')
  36. from invenio.config import (CFG_TMPSHAREDDIR,
  37. CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS,
  38. CFG_WEBSUBMIT_STORAGEDIR)
  39. #: block size when performing I/O.
  40. CFG_FILEUTILS_BLOCK_SIZE = 1024 * 8
  41. class InvenioFileDownloadError(Exception):
  42. """A generic download exception."""
  43. def __init__(self, msg, code=None):
  44. Exception.__init__(self, msg)
  45. self.code = code
  46. class InvenioFileCopyError(Exception):
  47. """A generic file copy exception."""
  48. pass
  49. def download_url(url, content_type=None, download_to_file=None,
  50. retry_count=10, timeout=10.0):
  51. """
  52. Will download a file from given URL (either local or external) to the
  53. desired path (or generate one if none is given). Local files are copied
  54. directly.
  55. The function will retry a number of times based on retry_count (default 10)
  56. parameter and sleeps a number of seconds based on given timeout
  57. (default 10.0 sec) after each failed request.
  58. Returns the path to the downloaded file if successful.
  59. Otherwise an exception is raised.
  60. Given a content_type and an external URL, the function will make sure
  61. that the desired content_type is equal to the content-type of returned
  62. file.
  63. @param url: where the file lives on the interwebs
  64. @type url: string
  65. @param content_type: desired content_type to check for in external URLs.
  66. (optional)
  67. @type content_type: string
  68. @param download_to_file: where the file should live after download.
  69. (optional)
  70. @type download_to_file: string
  71. @param retry_count: number of times to retry. Defaults to 10.
  72. (optional)
  73. @type retry_count: int
  74. @param timeout: number of seconds to sleep between attempts.
  75. Defaults to 10.0 seconds. (optional)
  76. @type timeout: float
  77. @return: the path of the downloaded/copied file
  78. @raise InvenioFileDownloadError: raised upon URL/HTTP errors, file errors or wrong format
  79. """
  80. if not download_to_file:
  81. download_to_file = safe_mkstemp(suffix=".tmp",
  82. prefix="filedownloadutils_")
  83. try:
  84. if is_url_a_local_file(url):
  85. downloaded_file = download_local_file(url,
  86. download_to_file)
  87. else:
  88. downloaded_file = download_external_url(url,
  89. download_to_file,
  90. content_type=content_type,
  91. retry_count=retry_count,
  92. timeout=timeout)
  93. except InvenioFileDownloadError:
  94. raise
  95. return downloaded_file
  96. def download_external_url(url, download_to_file, content_type=None,
  97. retry_count=10, timeout=10.0, verbose=False):
  98. """
  99. Download a url (if it corresponds to a remote file) and return a
  100. local url to it. If format is specified, a check will be performed
  101. in order to make sure that the format of the downloaded file is equal
  102. to the expected format.
  103. @param url: the URL to download
  104. @type url: string
  105. @param download_to_file: the path to download the file to
  106. @type download_to_file: string
  107. @param content_type: the content_type of the file (optional)
  108. @type content_type: string
  109. @param retry_count: max number of retries for downloading the file
  110. @type retry_count: int
  111. @param timeout: time to sleep in between attemps
  112. @type timeout: int
  113. @return: the path to the download local file
  114. @rtype: string
  115. @raise StandardError: if the download failed
  116. """
  117. error_str = ""
  118. error_code = None
  119. retry_attempt = 0
  120. while retry_attempt < retry_count:
  121. try:
  122. # Attempt to download the external file
  123. request = open_url(url)
  124. if request.code == 200 and "Refresh" in request.headers:
  125. # PDF is being generated, they ask us to wait for
  126. # n seconds.
  127. # New arxiv responses, we are not sure if the old ones are
  128. # deactivated
  129. try:
  130. retry_after = int(request.headers["Refresh"])
  131. # We make sure that we do not retry too often even if
  132. # they tell us to retry after 1s
  133. retry_after = max(retry_after, timeout)
  134. except ValueError:
  135. retry_after = timeout
  136. if verbose:
  137. msg = "retrying after %ss" % (retry_after,)
  138. print >> sys.stderr, msg
  139. time.sleep(retry_after)
  140. retry_attempt += 1
  141. continue
  142. except urllib2.HTTPError, e:
  143. error_code = e.code
  144. error_str = str(e)
  145. retry_after = timeout
  146. # This handling is the same as OAI queries.
  147. # We are getting 503 errors when PDFs are being generated
  148. if e.code == 503 and "Retry-After" in e.headers:
  149. # PDF is being generated, they ask us to wait for n seconds
  150. try:
  151. retry_after = int(e.headers["Retry-After"])
  152. # We make sure that we do not retry too often even if
  153. # they tell us to retry after 1s
  154. retry_after = max(retry_after, timeout)
  155. except ValueError:
  156. pass
  157. if verbose:
  158. msg = "retrying after %ss" % (retry_after,)
  159. print >> sys.stderr, msg
  160. time.sleep(retry_after)
  161. retry_attempt += 1
  162. except (urllib2.URLError,
  163. socket.timeout,
  164. socket.gaierror,
  165. socket.error), e:
  166. if verbose:
  167. error_str = str(e)
  168. msg = "socket error, retrying after %ss" % (timeout,)
  169. print >> sys.stderr, msg
  170. time.sleep(timeout)
  171. retry_attempt += 1
  172. else:
  173. # When we get here, it means that the download was a success.
  174. try:
  175. finalize_download(url, download_to_file, content_type, request)
  176. finally:
  177. request.close()
  178. return download_to_file
  179. # All the attempts were used, but no successfull download - so raise error
  180. msg = 'URL could not be opened: %s' % (error_str,)
  181. raise InvenioFileDownloadError(msg, code=error_code)
  182. def finalize_download(url, download_to_file, content_type, request):
  183. """
  184. Finalizes the download operation by doing various checks, such as format
  185. type, size check etc.
  186. """
  187. # If format is given, a format check is performed.
  188. if content_type and content_type not in request.headers['content-type']:
  189. msg = 'The downloaded file is not of the desired format'
  190. raise InvenioFileDownloadError(msg)
  191. # Save the downloaded file to desired or generated location.
  192. to_file = open(download_to_file, 'w')
  193. try:
  194. try:
  195. while True:
  196. block = request.read(CFG_FILEUTILS_BLOCK_SIZE)
  197. if not block:
  198. break
  199. to_file.write(block)
  200. except Exception, e:
  201. msg = "Error when downloading %s into %s: %s" % \
  202. (url, download_to_file, e)
  203. raise InvenioFileDownloadError(msg)
  204. finally:
  205. to_file.close()
  206. # Check Size
  207. filesize = os.path.getsize(download_to_file)
  208. if filesize == 0:
  209. raise InvenioFileDownloadError("%s seems to be empty" % (url,))
  210. # download successful, return the new path
  211. return download_to_file
  212. def download_local_file(filename, download_to_file):
  213. """
  214. Copies a local file to Invenio's temporary directory.
  215. @param filename: the name of the file to copy
  216. @type filename: string
  217. @param download_to_file: the path to save the file to
  218. @type download_to_file: string
  219. @return: the path of the temporary file created
  220. @rtype: string
  221. @raise StandardError: if something went wrong
  222. """
  223. # Try to copy.
  224. try:
  225. path = urllib2.urlparse.urlsplit(urllib.unquote(filename))[2]
  226. if os.path.abspath(path) != path:
  227. msg = "%s is not a normalized path (would be %s)." \
  228. % (path, os.path.normpath(path))
  229. raise InvenioFileCopyError(msg)
  230. allowed_path_list = CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS
  231. allowed_path_list.append(CFG_TMPSHAREDDIR)
  232. allowed_path_list.append(CFG_WEBSUBMIT_STORAGEDIR)
  233. for allowed_path in allowed_path_list:
  234. if path.startswith(allowed_path):
  235. shutil.copy(path, download_to_file)
  236. if os.path.getsize(download_to_file) == 0:
  237. os.remove(download_to_file)
  238. msg = "%s seems to be empty" % (filename,)
  239. raise InvenioFileCopyError(msg)
  240. break
  241. else:
  242. msg = "%s is not in one of the allowed paths." % (path,)
  243. raise InvenioFileCopyError()
  244. except Exception, e:
  245. msg = "Impossible to copy the local file '%s' to %s: %s" % \
  246. (filename, download_to_file, str(e))
  247. raise InvenioFileCopyError(msg)
  248. return download_to_file
  249. def is_url_a_local_file(url):
  250. """Return True if the given URL is pointing to a local file."""
  251. protocol = urllib2.urlparse.urlsplit(url)[0]
  252. return protocol in ('', 'file')
  253. def safe_mkstemp(suffix, prefix='filedownloadutils_'):
  254. """Create a temporary filename that don't have any '.' inside a part
  255. from the suffix."""
  256. tmpfd, tmppath = tempfile.mkstemp(suffix=suffix,
  257. prefix=prefix,
  258. dir=CFG_TMPSHAREDDIR)
  259. # Close the file and leave the responsability to the client code to
  260. # correctly open/close it.
  261. os.close(tmpfd)
  262. if '.' not in suffix:
  263. # Just in case format is empty
  264. return tmppath
  265. while '.' in os.path.basename(tmppath)[:-len(suffix)]:
  266. os.remove(tmppath)
  267. tmpfd, tmppath = tempfile.mkstemp(suffix=suffix,
  268. prefix=prefix,
  269. dir=CFG_TMPSHAREDDIR)
  270. os.close(tmpfd)
  271. return tmppath
  272. def open_url(url, headers=None):
  273. """
  274. Opens a URL. If headers are passed as argument, no check is performed and
  275. the URL will be opened.
  276. @param url: the URL to open
  277. @type url: string
  278. @param headers: the headers to use
  279. @type headers: dictionary
  280. @return: a file-like object as returned by urllib2.urlopen.
  281. """
  282. request = urllib2.Request(url)
  283. if headers:
  284. for key, value in headers.items():
  285. request.add_header(key, value)
  286. return URL_OPENER.open(request)