PageRenderTime 48ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/scrapy/core/downloader/handlers/ftp.py

https://gitlab.com/e0/scrapy
Python | 104 lines | 83 code | 0 blank | 21 comment | 0 complexity | 449cd5775b78bebff50578c7f063ae0d MD5 | raw file
  1. """
  2. An asynchronous FTP file download handler for scrapy which somehow emulates an http response.
  3. FTP connection parameters are passed using the request meta field:
  4. - ftp_user (required)
  5. - ftp_password (required)
  6. - ftp_passive (by default, enabled) sets FTP connection passive mode
  7. - ftp_local_filename
  8. - If not given, file data will come in the response.body, as a normal scrapy Response,
  9. which implies that the entire file will be held in memory.
  10. - if given, file data will be saved in a local file with the given name
  11. This helps when downloading very big files to avoid memory issues. In addition, for
  12. convenience the local file name will also be given in the response body.
  13. The status of the built HTTP-like response will be, by default
  14. - 200 in case of success
  15. - 404 in case specified file was not found in the server (ftp code 550)
  16. or raise corresponding ftp exception otherwise
  17. The mapping from server FTP command return codes to HTTP response codes is defined in the
  18. CODE_MAPPING attribute of the handler class. The key 'default' is used for any code
  19. that is not explicitly present among the map keys. You may need to override this
  20. mapping if you want a different behaviour than the default.
  21. In case of status 200 request, response.headers will come with two keys:
  22. 'Local Filename' - with the value of the local filename if given
  23. 'Size' - with size of the downloaded data
  24. """
  25. import re
  26. from io import BytesIO
  27. from six.moves.urllib.parse import urlparse, unquote
  28. from twisted.internet import reactor
  29. from twisted.protocols.ftp import FTPClient, CommandFailed
  30. from twisted.internet.protocol import Protocol, ClientCreator
  31. from scrapy.http import Response
  32. from scrapy.responsetypes import responsetypes
  33. class ReceivedDataProtocol(Protocol):
  34. def __init__(self, filename=None):
  35. self.__filename = filename
  36. self.body = open(filename, "w") if filename else BytesIO()
  37. self.size = 0
  38. def dataReceived(self, data):
  39. self.body.write(data)
  40. self.size += len(data)
  41. @property
  42. def filename(self):
  43. return self.__filename
  44. def close(self):
  45. self.body.close() if self.filename else self.body.seek(0)
  46. _CODE_RE = re.compile("\d+")
  47. class FTPDownloadHandler(object):
  48. CODE_MAPPING = {
  49. "550": 404,
  50. "default": 503,
  51. }
  52. def __init__(self, setting):
  53. pass
  54. def download_request(self, request, spider):
  55. parsed_url = urlparse(request.url)
  56. creator = ClientCreator(reactor, FTPClient, request.meta["ftp_user"],
  57. request.meta["ftp_password"],
  58. passive=request.meta.get("ftp_passive", 1))
  59. return creator.connectTCP(parsed_url.hostname, parsed_url.port or 21).addCallback(self.gotClient,
  60. request, unquote(parsed_url.path))
  61. def gotClient(self, client, request, filepath):
  62. self.client = client
  63. protocol = ReceivedDataProtocol(request.meta.get("ftp_local_filename"))
  64. return client.retrieveFile(filepath, protocol)\
  65. .addCallbacks(callback=self._build_response,
  66. callbackArgs=(request, protocol),
  67. errback=self._failed,
  68. errbackArgs=(request,))
  69. def _build_response(self, result, request, protocol):
  70. self.result = result
  71. respcls = responsetypes.from_args(url=request.url)
  72. protocol.close()
  73. body = protocol.filename or protocol.body.read()
  74. headers = {"local filename": protocol.filename or '', "size": protocol.size}
  75. return respcls(url=request.url, status=200, body=body, headers=headers)
  76. def _failed(self, result, request):
  77. message = result.getErrorMessage()
  78. if result.type == CommandFailed:
  79. m = _CODE_RE.search(message)
  80. if m:
  81. ftpcode = m.group()
  82. httpcode = self.CODE_MAPPING.get(ftpcode, self.CODE_MAPPING["default"])
  83. return Response(url=request.url, status=httpcode, body=message)
  84. raise result.type(result.value)