
/awscli/customizations/s3/s3handler.py

https://gitlab.com/github-cloud-corp/aws-cli
# Copyright 2013 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from collections import namedtuple
import logging
import math
import os
import sys

from awscli.customizations.s3.utils import (
    find_chunksize, adjust_chunksize_to_upload_limits, MAX_UPLOAD_SIZE,
    find_bucket_key, relative_path, PrintTask, create_warning)
from awscli.customizations.s3.executor import Executor
from awscli.customizations.s3 import tasks
from awscli.customizations.s3.transferconfig import RuntimeConfig
from awscli.compat import six
from awscli.compat import queue


LOGGER = logging.getLogger(__name__)

CommandResult = namedtuple('CommandResult',
                           ['num_tasks_failed', 'num_tasks_warned'])


class S3Handler(object):
    """
    This class sets up the process to perform the tasks sent to it. It
    sources ``self.executor``, from which threads inside the class pull
    tasks to complete.
    """
    MAX_IO_QUEUE_SIZE = 20

    def __init__(self, session, params, result_queue=None,
                 runtime_config=None):
        self.session = session
        if runtime_config is None:
            runtime_config = RuntimeConfig.defaults()
        self._runtime_config = runtime_config
        # The write_queue has potential for optimizations, so the constant
        # for maxsize is scoped to this class (as opposed to constants.py)
        # so we have the ability to change this value later.
        self.write_queue = queue.Queue(maxsize=self.MAX_IO_QUEUE_SIZE)
        self.result_queue = result_queue
        if not self.result_queue:
            self.result_queue = queue.Queue()
        self.params = {
            'dryrun': False, 'quiet': False, 'acl': None,
            'guess_mime_type': True, 'sse_c_copy_source': None,
            'sse_c_copy_source_key': None, 'sse': None,
            'sse_c': None, 'sse_c_key': None, 'sse_kms_key_id': None,
            'storage_class': None, 'website_redirect': None,
            'content_type': None, 'cache_control': None,
            'content_disposition': None, 'content_encoding': None,
            'content_language': None, 'expires': None, 'grants': None,
            'only_show_errors': False, 'is_stream': False,
            'paths_type': None, 'expected_size': None, 'metadata': None,
            'metadata_directive': None, 'ignore_glacier_warnings': False,
            'force_glacier_transfer': False
        }
        self.params['region'] = params['region']
        for key in self.params.keys():
            if key in params:
                self.params[key] = params[key]
        self.multi_threshold = self._runtime_config['multipart_threshold']
        self.chunksize = self._runtime_config['multipart_chunksize']
        LOGGER.debug("Using a multipart threshold of %s and a part size of %s",
                     self.multi_threshold, self.chunksize)
        self.executor = Executor(
            num_threads=self._runtime_config['max_concurrent_requests'],
            result_queue=self.result_queue,
            quiet=self.params['quiet'],
            only_show_errors=self.params['only_show_errors'],
            max_queue_size=self._runtime_config['max_queue_size'],
            write_queue=self.write_queue
        )
        self._multipart_uploads = []
        self._multipart_downloads = []

    def call(self, files):
        """
        This function pulls ``FileInfo`` or ``TaskInfo`` objects from the
        ``files`` list. Each object is checked to see whether it requires a
        multipart operation, and the necessary attributes are added if so.
        Each object is then wrapped in a ``BasicTask`` object, which is
        essentially a unit of work for a thread to execute. These tasks are
        then submitted to the main executor.
        """
        try:
            self.executor.start()
            total_files, total_parts = self._enqueue_tasks(files)
            self.executor.print_thread.set_total_files(total_files)
            self.executor.print_thread.set_total_parts(total_parts)
            self.executor.initiate_shutdown()
            self._finalize_shutdown()
        except Exception as e:
            LOGGER.debug('Exception caught during task execution: %s',
                         str(e), exc_info=True)
            self.result_queue.put(PrintTask(message=str(e), error=True))
            self.executor.initiate_shutdown(
                priority=self.executor.IMMEDIATE_PRIORITY)
            self._finalize_shutdown()
        except KeyboardInterrupt:
            self.result_queue.put(PrintTask(message=("Cleaning up. "
                                                     "Please wait..."),
                                            error=True))
            self.executor.initiate_shutdown(
                priority=self.executor.IMMEDIATE_PRIORITY)
            self._finalize_shutdown()
        return CommandResult(self.executor.num_tasks_failed,
                             self.executor.num_tasks_warned)

    def _finalize_shutdown(self):
        # Run all remaining tasks needed to completely shut down the
        # S3 handler. This method will block until shutdown is complete.
        # The order here is important. We need to wait until all the
        # tasks have been completed before we can clean up. Otherwise
        # we can have race conditions where we're trying to clean up
        # uploads/downloads that are still in progress.
        self.executor.wait_until_shutdown()
        self._cleanup()

    def _cleanup(self):
        # And finally we need to make a pass through all the existing
        # multipart uploads and abort any pending multipart uploads.
        self._abort_pending_multipart_uploads()
        self._remove_pending_downloads()

    def _abort_pending_multipart_uploads(self):
        # Precondition: this method is assumed to be called when there are
        # no ongoing uploads (the executor has been shut down).
        for upload, filename in self._multipart_uploads:
            if upload.is_cancelled() or upload.in_progress():
                # Cancel any upload that has started but is not complete.
                upload.cancel_upload(self._cancel_upload, args=(filename,))

    def _remove_pending_downloads(self):
        # The downloads case is easier than the uploads case because we don't
        # need to make any service calls. To properly clean up we just need
        # to go through the multipart downloads that were in progress but
        # cancelled and remove the local file.
        for context, local_filename in self._multipart_downloads:
            if (context.is_cancelled() or context.is_started()) and \
                    os.path.exists(local_filename):
                # The file is in an inconsistent state (not all the parts
                # were written to the file) so we should remove the
                # local file rather than leave it in a bad state. We don't
                # want to remove the files if the download has *not* been
                # started because we haven't touched the file yet, so it's
                # better to leave the old version of the file rather than
                # deleting the file entirely.
                os.remove(local_filename)
            context.cancel()

    def _cancel_upload(self, upload_id, filename):
        bucket, key = find_bucket_key(filename.dest)
        params = {
            'Bucket': bucket,
            'Key': key,
            'UploadId': upload_id,
        }
        LOGGER.debug("Aborting multipart upload for: %s", key)
        filename.client.abort_multipart_upload(**params)

    def _enqueue_tasks(self, files):
        total_files = 0
        total_parts = 0
        for filename in files:
            num_uploads = 1
            is_multipart_task = self._is_multipart_task(filename)
            too_large = False
            if hasattr(filename, 'size'):
                too_large = filename.size > MAX_UPLOAD_SIZE
            if too_large and filename.operation_name == 'upload':
                warning_message = "File exceeds s3 upload limit of 5 TB."
                warning = create_warning(relative_path(filename.src),
                                         warning_message)
                self.result_queue.put(warning)
            # Warn and skip over glacier incompatible tasks.
            elif not self.params.get('force_glacier_transfer') and \
                    not filename.is_glacier_compatible():
                LOGGER.debug(
                    'Encountered glacier object s3://%s. Not performing '
                    '%s on object.' % (filename.src, filename.operation_name))
                if not self.params['ignore_glacier_warnings']:
                    warning = create_warning(
                        's3://' + filename.src,
                        'Object is of storage class GLACIER. Unable to '
                        'perform %s operations on GLACIER objects. You must '
                        'restore the object to be able to perform the '
                        'operation.' %
                        filename.operation_name
                    )
                    self.result_queue.put(warning)
                continue
            elif is_multipart_task and not self.params['dryrun']:
                # If we're in dryrun mode, then we don't need the
                # real multipart tasks. We can just use a BasicTask
                # in the else clause below, which will print out the
                # fact that it's transferring a file rather than
                # the specific part tasks required to perform the
                # transfer.
                num_uploads = self._enqueue_multipart_tasks(filename)
            else:
                task = tasks.BasicTask(
                    session=self.session, filename=filename,
                    parameters=self.params,
                    result_queue=self.result_queue)
                self.executor.submit(task)
            total_files += 1
            total_parts += num_uploads
        return total_files, total_parts

    def _is_multipart_task(self, filename):
        # First we need to determine if it's an operation that even
        # qualifies for multipart upload.
        if hasattr(filename, 'size'):
            above_multipart_threshold = filename.size > self.multi_threshold
            if above_multipart_threshold:
                if filename.operation_name in ('upload', 'download',
                                               'move', 'copy'):
                    return True
                else:
                    return False
            else:
                return False
        else:
            return False
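    # Editorial sketch (an assumption, not part of the original module): with
    # the stock runtime config the multipart threshold and part size both
    # default to 8 MB, so the check above amounts to roughly
    #     filename.size > 8 * 1024 * 1024
    # for the upload/download/move/copy operations; anything smaller, or any
    # other operation, is handled by a single BasicTask instead.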

    def _enqueue_multipart_tasks(self, filename):
        num_uploads = 1
        if filename.operation_name == 'upload':
            num_uploads = self._enqueue_multipart_upload_tasks(filename)
        elif filename.operation_name == 'move':
            if filename.src_type == 'local' and filename.dest_type == 's3':
                num_uploads = self._enqueue_multipart_upload_tasks(
                    filename, remove_local_file=True)
            elif filename.src_type == 's3' and filename.dest_type == 'local':
                num_uploads = self._enqueue_range_download_tasks(
                    filename, remove_remote_file=True)
            elif filename.src_type == 's3' and filename.dest_type == 's3':
                num_uploads = self._enqueue_multipart_copy_tasks(
                    filename, remove_remote_file=True)
            else:
                raise ValueError("Unknown transfer type of %s -> %s" %
                                 (filename.src_type, filename.dest_type))
        elif filename.operation_name == 'copy':
            num_uploads = self._enqueue_multipart_copy_tasks(
                filename, remove_remote_file=False)
        elif filename.operation_name == 'download':
            num_uploads = self._enqueue_range_download_tasks(filename)
        return num_uploads

    def _enqueue_range_download_tasks(self, filename,
                                      remove_remote_file=False):
        num_downloads = int(filename.size / self.chunksize)
        context = tasks.MultipartDownloadContext(num_downloads)
        create_file_task = tasks.CreateLocalFileTask(
            context=context, filename=filename,
            result_queue=self.result_queue)
        self.executor.submit(create_file_task)
        self._do_enqueue_range_download_tasks(
            filename=filename, chunksize=self.chunksize,
            num_downloads=num_downloads, context=context,
            remove_remote_file=remove_remote_file
        )
        complete_file_task = tasks.CompleteDownloadTask(
            context=context, filename=filename,
            result_queue=self.result_queue,
            params=self.params, io_queue=self.write_queue)
        self.executor.submit(complete_file_task)
        self._multipart_downloads.append((context, filename.dest))
        if remove_remote_file:
            remove_task = tasks.RemoveRemoteObjectTask(
                filename=filename, context=context)
            self.executor.submit(remove_task)
        return num_downloads

    def _do_enqueue_range_download_tasks(self, filename, chunksize,
                                         num_downloads, context,
                                         remove_remote_file=False):
        for i in range(num_downloads):
            task = tasks.DownloadPartTask(
                part_number=i, chunk_size=chunksize,
                result_queue=self.result_queue, filename=filename,
                context=context, io_queue=self.write_queue,
                params=self.params)
            self.executor.submit(task)

    def _enqueue_multipart_upload_tasks(self, filename,
                                        remove_local_file=False):
        # First we need to create a CreateMultipartUpload task,
        # then create UploadTask objects for each of the parts.
        # And finally enqueue a CompleteMultipartUploadTask.
        chunksize = find_chunksize(filename.size, self.chunksize)
        num_uploads = int(math.ceil(filename.size /
                                    float(chunksize)))
        upload_context = self._enqueue_upload_start_task(
            chunksize, num_uploads, filename)
        self._enqueue_upload_tasks(
            num_uploads, chunksize, upload_context, filename,
            tasks.UploadPartTask)
        self._enqueue_upload_end_task(filename, upload_context)
        if remove_local_file:
            remove_task = tasks.RemoveFileTask(local_filename=filename.src,
                                               upload_context=upload_context)
            self.executor.submit(remove_task)
        return num_uploads
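    # Illustrative arithmetic for the method above (an editorial example,
    # not part of the original module): assuming the default 8 MB part size,
    # a 100 MB upload is left at 8 MB parts by find_chunksize(), so
    #     num_uploads = ceil(100 MB / 8 MB) = 13
    # parts are enqueued. For very large files find_chunksize() grows the
    # part size instead, keeping the part count within S3's 10,000-part
    # limit for multipart uploads.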

    def _enqueue_multipart_copy_tasks(self, filename,
                                      remove_remote_file=False):
        chunksize = find_chunksize(filename.size, self.chunksize)
        num_uploads = int(math.ceil(filename.size / float(chunksize)))
        upload_context = self._enqueue_upload_start_task(
            chunksize, num_uploads, filename)
        self._enqueue_upload_tasks(
            num_uploads, chunksize, upload_context, filename,
            tasks.CopyPartTask)
        self._enqueue_upload_end_task(filename, upload_context)
        if remove_remote_file:
            remove_task = tasks.RemoveRemoteObjectTask(
                filename=filename, context=upload_context)
            self.executor.submit(remove_task)
        return num_uploads

    def _enqueue_upload_start_task(self, chunksize, num_uploads, filename):
        upload_context = tasks.MultipartUploadContext(
            expected_parts=num_uploads)
        create_multipart_upload_task = tasks.CreateMultipartUploadTask(
            session=self.session, filename=filename,
            parameters=self.params,
            result_queue=self.result_queue, upload_context=upload_context)
        self.executor.submit(create_multipart_upload_task)
        self._multipart_uploads.append((upload_context, filename))
        return upload_context

    def _enqueue_upload_tasks(self, num_uploads, chunksize, upload_context,
                              filename, task_class):
        for i in range(1, (num_uploads + 1)):
            self._enqueue_upload_single_part_task(
                part_number=i,
                chunk_size=chunksize,
                upload_context=upload_context,
                filename=filename,
                task_class=task_class
            )

    def _enqueue_upload_single_part_task(self, part_number, chunk_size,
                                         upload_context, filename, task_class,
                                         payload=None):
        kwargs = {'part_number': part_number, 'chunk_size': chunk_size,
                  'result_queue': self.result_queue,
                  'upload_context': upload_context, 'filename': filename,
                  'params': self.params}
        if payload:
            kwargs['payload'] = payload
        task = task_class(**kwargs)
        self.executor.submit(task)

    def _enqueue_upload_end_task(self, filename, upload_context):
        complete_multipart_upload_task = tasks.CompleteMultipartUploadTask(
            session=self.session, filename=filename, parameters=self.params,
            result_queue=self.result_queue, upload_context=upload_context)
        self.executor.submit(complete_multipart_upload_task)
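

# Illustrative usage sketch (an editorial assumption, not part of the
# original module): the s3 subcommands normally build the FileInfo list via
# CommandArchitecture, but driving the handler directly would look roughly
# like the commented code below. The region, bucket, key, and file names are
# hypothetical placeholders, and the real FileInfo takes additional keyword
# arguments (client, src_type, dest_type, ...) that are omitted here.
#
#     import botocore.session
#     from awscli.customizations.s3.fileinfo import FileInfo
#
#     session = botocore.session.get_session()
#     handler = S3Handler(session, params={'region': 'us-west-2'})
#     files = [FileInfo(src='local-file.txt', dest='my-bucket/my-key.txt',
#                       operation_name='upload', size=1024)]
#     result = handler.call(files)
#     print(result.num_tasks_failed, result.num_tasks_warned)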


class S3StreamHandler(S3Handler):
    """
    This class is an alternative ``S3Handler`` to be used when the operation
    involves a stream since the logic is different when uploading and
    downloading streams.
    """

    # This ensures that the number of multipart chunks waiting in the
    # executor queue and in the threads is limited.
    MAX_EXECUTOR_QUEUE_SIZE = 2
    EXECUTOR_NUM_THREADS = 6

    def __init__(self, session, params, result_queue=None,
                 runtime_config=None):
        if runtime_config is None:
            # Rather than using the .defaults(), streaming
            # has different default values so that it does not
            # consume large amounts of memory.
            runtime_config = RuntimeConfig().build_config(
                max_queue_size=self.MAX_EXECUTOR_QUEUE_SIZE,
                max_concurrent_requests=self.EXECUTOR_NUM_THREADS)
        super(S3StreamHandler, self).__init__(session, params, result_queue,
                                              runtime_config)

    def _enqueue_tasks(self, files):
        total_files = 0
        total_parts = 0
        for filename in files:
            num_uploads = 1
            # If uploading a stream, it is required to read from the stream
            # to determine if the stream needs to be multipart uploaded.
            payload = None
            if filename.operation_name == 'upload':
                payload, is_multipart_task = \
                    self._pull_from_stream(self.multi_threshold)
            else:
                # Set the file size for the ``FileInfo`` object since
                # streams do not use a ``FileGenerator`` that usually
                # determines the size.
                filename.set_size_from_s3()
                is_multipart_task = self._is_multipart_task(filename)
            if is_multipart_task and not self.params['dryrun']:
                # If we're in dryrun mode, then we don't need the
                # real multipart tasks. We can just use a BasicTask
                # in the else clause below, which will print out the
                # fact that it's transferring a file rather than
                # the specific part tasks required to perform the
                # transfer.
                num_uploads = self._enqueue_multipart_tasks(filename, payload)
            else:
                task = tasks.BasicTask(
                    session=self.session, filename=filename,
                    parameters=self.params,
                    result_queue=self.result_queue,
                    payload=payload)
                self.executor.submit(task)
            total_files += 1
            total_parts += num_uploads
        return total_files, total_parts

    def _pull_from_stream(self, amount_requested):
        """
        This function pulls data from stdin until it hits the amount
        requested or there is no more left to pull in from stdin. The
        function wraps the data into a ``BytesIO`` object that is returned
        along with a boolean telling whether the amount requested is
        the amount returned.
        """
        stream_filein = sys.stdin
        if six.PY3:
            stream_filein = sys.stdin.buffer
        payload = stream_filein.read(amount_requested)
        payload_file = six.BytesIO(payload)
        return payload_file, len(payload) == amount_requested
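    # Editorial sketch (an assumption, not part of the original module): the
    # same "a full read means more data may remain" test that
    # _pull_from_stream performs, shown standalone. The 8 MB figure is just
    # an example threshold.
    #
    #     import io
    #     import sys
    #
    #     threshold = 8 * 1024 * 1024
    #     chunk = sys.stdin.buffer.read(threshold)
    #     payload = io.BytesIO(chunk)
    #     needs_multipart = len(chunk) == threshold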

    def _enqueue_multipart_tasks(self, filename, payload=None):
        num_uploads = 1
        if filename.operation_name == 'upload':
            num_uploads = self._enqueue_multipart_upload_tasks(
                filename, payload=payload)
        elif filename.operation_name == 'download':
            num_uploads = self._enqueue_range_download_tasks(filename)
        return num_uploads

    def _enqueue_range_download_tasks(self, filename,
                                      remove_remote_file=False):
        # Create the context for the multipart download.
        num_downloads = int(filename.size / self.chunksize)
        context = tasks.MultipartDownloadContext(num_downloads)
        # No file is needed for downloading a stream. So just announce
        # that it has been made since it is required for the context to
        # begin downloading.
        context.announce_file_created()
        # Submit download part tasks to the executor.
        self._do_enqueue_range_download_tasks(
            filename=filename, chunksize=self.chunksize,
            num_downloads=num_downloads, context=context,
            remove_remote_file=remove_remote_file
        )
        return num_downloads

    def _enqueue_multipart_upload_tasks(self, filename, payload=None):
        # First we need to create a CreateMultipartUpload task,
        # then create UploadTask objects for each of the parts.
        # And finally enqueue a CompleteMultipartUploadTask.
        if self.params['expected_size']:
            # If we have the expected size, we can calculate an appropriate
            # chunksize based on max parts and chunksize limits.
            chunksize = find_chunksize(int(self.params['expected_size']),
                                       self.chunksize)
        else:
            # Otherwise, we can still adjust for chunksize limits.
            chunksize = adjust_chunksize_to_upload_limits(self.chunksize)
        num_uploads = '...'
        # Submit a task to begin the multipart upload.
        upload_context = self._enqueue_upload_start_task(
            chunksize, num_uploads, filename)
        # Now submit a task to upload the initial chunk of data pulled
        # from the stream that was used to determine if a multipart upload
        # was needed.
        self._enqueue_upload_single_part_task(
            part_number=1, chunk_size=chunksize,
            upload_context=upload_context, filename=filename,
            task_class=tasks.UploadPartTask, payload=payload
        )
        # Submit tasks to upload the rest of the chunks of the data coming in
        # from standard input.
        num_uploads = self._enqueue_upload_tasks(
            num_uploads, chunksize, upload_context,
            filename, tasks.UploadPartTask
        )
        # Submit a task to notify the multipart upload is complete.
        self._enqueue_upload_end_task(filename, upload_context)
        return num_uploads

    def _enqueue_upload_tasks(self, num_uploads, chunksize, upload_context,
                              filename, task_class):
        # The previous upload occurred right after the multipart
        # upload started for a stream.
        num_uploads = 1
        while True:
            # Pull more data from standard input.
            payload, is_remaining = self._pull_from_stream(chunksize)
            # Submit an upload part task for the recently pulled data.
            self._enqueue_upload_single_part_task(
                part_number=num_uploads + 1,
                chunk_size=chunksize,
                upload_context=upload_context,
                filename=filename,
                task_class=task_class,
                payload=payload
            )
            num_uploads += 1
            if not is_remaining:
                break
        # Once there is no more data left, announce to the context how
        # many parts are being uploaded so it knows when it can quit.
        upload_context.announce_total_parts(num_uploads)
        return num_uploads
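

# Editorial sketch (an assumption, not part of the original module): the
# streaming path above is what backs piping to and from ``-`` in the CLI,
# e.g. ``tar czf - mydir | aws s3 cp - s3://my-bucket/archive.tar.gz``.
# Part 1 is uploaded from the buffered threshold read in
# _enqueue_multipart_upload_tasks, and _enqueue_upload_tasks then keeps
# pulling chunksize-sized reads as parts 2, 3, ... until a short read
# signals end of input, at which point announce_total_parts() tells the
# upload context how many parts to expect before completing the upload.
# A hypothetical trace with a 20 MB stream and an 8 MB threshold/chunksize:
#
#     part 1: 8 MB  (threshold read is full, so go multipart)
#     part 2: 8 MB  (full read, keep pulling)
#     part 3: 4 MB  (short read: announce_total_parts(3), then complete)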