PageRenderTime 57ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/src/s3ql/backends/s3c.py

https://code.google.com/p/s3ql/
Python | 707 lines | 552 code | 79 blank | 76 comment | 53 complexity | 5f223030f5627eccd2689781a6123583 MD5 | raw file
Possible License(s): GPL-3.0
  1. '''
  2. backends/s3c.py - this file is part of S3QL (http://s3ql.googlecode.com)
  3. Copyright (C) Nikolaus Rath <Nikolaus@rath.org>
  4. This program can be distributed under the terms of the GNU GPLv3.
  5. '''
  6. from __future__ import division, print_function, absolute_import
  7. from ..common import BUFSIZE, QuietError
  8. from .common import AbstractBackend, NoSuchObject, retry, AuthorizationError, http_connection, \
  9. AuthenticationError
  10. from .common import DanglingStorageURL as DanglingStorageURL_common
  11. from base64 import b64encode
  12. from email.utils import parsedate_tz, mktime_tz
  13. from urlparse import urlsplit
  14. import errno
  15. import hashlib
  16. import hmac
  17. import httplib
  18. import logging
  19. import re
  20. import tempfile
  21. import time
  22. import urllib
  23. import xml.etree.cElementTree as ElementTree
  24. C_DAY_NAMES = [ 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun' ]
  25. C_MONTH_NAMES = [ 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec' ]
  26. XML_CONTENT_RE = re.compile('^application/xml(?:;\s+|$)', re.IGNORECASE)
  27. log = logging.getLogger("backends.s3c")
class Backend(AbstractBackend):
    """A backend to store data in some S3 compatible storage service.

    This class uses standard HTTP connections to connect to the server.
    The backend guarantees only immediate get after create consistency.
    """

    def __init__(self, storage_url, login, password, use_ssl):
        '''Initialize backend object and open connection to the server.

        *storage_url* must have the form backend://host[:port]/bucket[/prefix].
        *login* and *password* are used to sign requests. *use_ssl* selects
        HTTPS (default port 443; port 80 otherwise).
        '''
        super(Backend, self).__init__()

        (host, port, bucket_name, prefix) = self._parse_storage_url(storage_url, use_ssl)

        self.bucket_name = bucket_name
        self.prefix = prefix
        self.hostname = host
        self.port = port
        self.use_ssl = use_ssl
        self.conn = self._get_conn()
        self.password = password
        self.login = login
        # XML namespace that list responses are expected to use
        self.namespace = 'http://s3.amazonaws.com/doc/2006-03-01/'

    @staticmethod
    def _parse_storage_url(storage_url, use_ssl):
        '''Extract information from storage URL

        Return a tuple *(host, port, bucket_name, prefix)*. Raises
        `QuietError` if the URL is malformed.
        '''
        hit = re.match(r'^[a-zA-Z0-9]+://' # Backend
                       r'([^/:]+)' # Hostname
                       r'(?::([0-9]+))?' # Port
                       r'/([^/]+)' # Bucketname
                       r'(?:/(.*))?$', # Prefix
                       storage_url)
        if not hit:
            raise QuietError('Invalid storage URL')

        hostname = hit.group(1)
        if hit.group(2):
            # Explicit port in the URL wins
            port = int(hit.group(2))
        elif use_ssl:
            port = 443
        else:
            port = 80
        bucketname = hit.group(3)
        prefix = hit.group(4) or ''

        return (hostname, port, bucketname, prefix)

    def _get_conn(self):
        '''Return connection to server'''
        return http_connection(self.hostname, self.port, self.use_ssl)

    def is_temp_failure(self, exc): #IGNORE:W0613
        '''Return true if exc indicates a temporary error

        Return true if the given exception indicates a temporary problem. Most instance methods
        automatically retry the request in this case, so the caller does not need to worry about
        temporary failures.

        However, in some cases (e.g. when reading or writing an object), the request cannot
        automatically be retried. In these cases this method can be used to check for temporary
        problems and so that the request can be manually restarted if applicable.
        '''
        # Retryable server-side error responses
        if isinstance(exc, (InternalError, BadDigest, IncompleteBody, RequestTimeout,
                            OperationAborted, SlowDown, RequestTimeTooSkewed,
                            httplib.IncompleteRead)):
            return True

        # Server closed connection
        elif (isinstance(exc, httplib.BadStatusLine)
              and (not exc.line or exc.line == "''")):
            return True

        # Network-level interruptions
        elif (isinstance(exc, IOError) and
              exc.errno in (errno.EPIPE, errno.ECONNRESET, errno.ETIMEDOUT,
                            errno.EINTR)):
            return True

        # Any 5xx status is treated as a transient server problem
        elif isinstance(exc, HTTPError) and exc.status >= 500 and exc.status <= 599:
            return True

        return False

    @retry
    def delete(self, key, force=False):
        '''Delete the specified object

        If *force* is true, do not raise an exception if the object
        does not exist.
        '''
        log.debug('delete(%s)', key)
        try:
            resp = self._do_request('DELETE', '/%s%s' % (self.prefix, key))
            # DELETE responses are expected to have an empty body
            assert resp.length == 0
        except NoSuchKey:
            if force:
                pass
            else:
                raise NoSuchObject(key)

    def list(self, prefix=''):
        '''List keys in backend

        Returns an iterator over all keys in the backend. This method
        handles temporary errors.
        '''
        log.debug('list(%s): start', prefix)

        marker = ''
        waited = 0
        # Initial retry delay in seconds; doubled on each retry (capped below)
        interval = 1 / 50
        iterator = self._list(prefix, marker)
        while True:
            try:
                marker = iterator.next()
                waited = 0
            except StopIteration:
                break
            except Exception as exc:
                if not self.is_temp_failure(exc):
                    raise
                # Give up after retrying for an hour without progress
                if waited > 60 * 60:
                    log.error('list(): Timeout exceeded, re-raising %s exception',
                              type(exc).__name__)
                    raise
                log.info('Encountered %s exception (%s), retrying call to s3c.Backend.list()',
                         type(exc).__name__, exc)
                if hasattr(exc, 'retry_after') and exc.retry_after:
                    # Honor the delay requested by the server
                    interval = exc.retry_after
                time.sleep(interval)
                waited += interval
                interval = min(5*60, 2*interval)
                # Restart listing from the last key that was retrieved
                iterator = self._list(prefix, marker)
            else:
                yield marker

    def _list(self, prefix='', start=''):
        '''List keys in backend, starting with *start*

        Returns an iterator over all keys in the backend. This method
        does not retry on errors.
        '''
        keys_remaining = True
        marker = start
        prefix = self.prefix + prefix

        while keys_remaining:
            log.debug('list(%s): requesting with marker=%s', prefix, marker)

            keys_remaining = None
            resp = self._do_request('GET', '/', query_string={ 'prefix': prefix,
                                                               'marker': marker,
                                                               'max-keys': 1000 })

            if not XML_CONTENT_RE.match(resp.getheader('Content-Type')):
                raise RuntimeError('unexpected content type: %s' % resp.getheader('Content-Type'))

            # Parse incrementally so the whole XML document need not be
            # held in memory
            itree = iter(ElementTree.iterparse(resp, events=("start", "end")))
            (event, root) = itree.next()

            namespace = re.sub(r'^\{(.+)\}.+$', r'\1', root.tag)
            if namespace != self.namespace:
                raise RuntimeError('Unsupported namespace: %s' % namespace)

            try:
                for (event, el) in itree:
                    if event != 'end':
                        continue

                    if el.tag == '{%s}IsTruncated' % self.namespace:
                        keys_remaining = (el.text == 'true')

                    elif el.tag == '{%s}Contents' % self.namespace:
                        marker = el.findtext('{%s}Key' % self.namespace)
                        # Strip the backend-wide key prefix before yielding
                        yield marker[len(self.prefix):]
                        # Free elements that have already been processed
                        root.clear()

            except GeneratorExit:
                # Need to read rest of response
                while True:
                    buf = resp.read(BUFSIZE)
                    if buf == '':
                        break
                break

            if keys_remaining is None:
                raise RuntimeError('Could not parse body')

    @retry
    def lookup(self, key):
        """Return metadata for given key"""
        log.debug('lookup(%s)', key)

        try:
            resp = self._do_request('HEAD', '/%s%s' % (self.prefix, key))
            # HEAD responses must not carry a body
            assert resp.length == 0
        except HTTPError as exc:
            if exc.status == 404:
                raise NoSuchObject(key)
            else:
                raise

        return extractmeta(resp)

    @retry
    def get_size(self, key):
        '''Return size of object stored under *key*'''
        log.debug('get_size(%s)', key)

        try:
            resp = self._do_request('HEAD', '/%s%s' % (self.prefix, key))
            assert resp.length == 0
        except HTTPError as exc:
            if exc.status == 404:
                raise NoSuchObject(key)
            else:
                raise

        for (name, val) in resp.getheaders():
            if name.lower() == 'content-length':
                return int(val)
        raise RuntimeError('HEAD request did not return Content-Length')

    @retry
    def open_read(self, key):
        """Open object for reading

        Return a file-like object. Data can be read using the `read` method. Metadata is stored in
        its *metadata* attribute and can be modified by the caller at will. The object must be
        closed explicitly.
        """
        try:
            resp = self._do_request('GET', '/%s%s' % (self.prefix, key))
        except NoSuchKey:
            raise NoSuchObject(key)

        return ObjectR(key, resp, self, extractmeta(resp))

    def open_write(self, key, metadata=None, is_compressed=False):
        """Open object for writing

        `metadata` can be a dict of additional attributes to store with the object. Returns a file-
        like object. The object must be closed explicitly. After closing, the *get_obj_size* may be
        used to retrieve the size of the stored object (which may differ from the size of the
        written data).

        The *is_compressed* parameter indicates that the caller is going to write compressed data,
        and may be used to avoid recompression by the backend.

        Since Amazon S3 does not support chunked uploads, the entire data will
        be buffered in memory before upload.
        """
        log.debug('open_write(%s): start', key)

        headers = dict()
        if metadata:
            # Metadata travels in x-amz-meta-* headers
            for (hdr, val) in metadata.iteritems():
                headers['x-amz-meta-%s' % hdr] = val

        return ObjectW(key, self, headers)

    @retry
    def copy(self, src, dest):
        """Copy data stored under key `src` to key `dest`

        If `dest` already exists, it will be overwritten. The copying is done on
        the remote side.
        """
        log.debug('copy(%s, %s): start', src, dest)

        try:
            resp = self._do_request('PUT', '/%s%s' % (self.prefix, dest),
                                    headers={ 'x-amz-copy-source': '/%s/%s%s' % (self.bucket_name,
                                                                                 self.prefix, src)})
            # Discard response body
            resp.read()
        except NoSuchKey:
            raise NoSuchObject(src)

    def _do_request(self, method, path, subres=None, query_string=None,
                    headers=None, body=None):
        '''Send request, read and return response object

        Follows up to 10 chained redirects and converts error responses
        into `HTTPError` / `S3Error` exceptions.
        '''

        log.debug('_do_request(): start with parameters (%r, %r, %r, %r, %r, %r)',
                  method, path, subres, query_string, headers, body)

        if headers is None:
            headers = dict()

        headers['connection'] = 'keep-alive'

        if not body:
            headers['content-length'] = '0'

        redirect_count = 0
        while True:

            resp = self._send_request(method, path, headers, subres, query_string, body)
            log.debug('_do_request(): request-id: %s', resp.getheader('x-amz-request-id'))

            if (resp.status < 300 or resp.status > 399):
                break

            # Assume redirect
            new_url = resp.getheader('Location')
            if new_url is None:
                break
            log.info('_do_request(): redirected to %s', new_url)

            redirect_count += 1
            if redirect_count > 10:
                raise RuntimeError('Too many chained redirections')

            # Pylint can't infer SplitResult Types
            #pylint: disable=E1103
            o = urlsplit(new_url)
            if o.scheme:
                # Do not silently switch between plain HTTP and SSL
                if isinstance(self.conn, httplib.HTTPConnection) and o.scheme != 'http':
                    raise RuntimeError('Redirect to non-http URL')
                elif isinstance(self.conn, httplib.HTTPSConnection) and o.scheme != 'https':
                    raise RuntimeError('Redirect to non-https URL')

            if o.hostname != self.hostname or o.port != self.port:
                # Redirect to a different server: open a new connection
                self.hostname = o.hostname
                self.port = o.port
                self.conn = self._get_conn()
            else:
                raise RuntimeError('Redirect to different path on same host')

            if body and not isinstance(body, bytes):
                # Rewind file-like body so it can be sent again
                body.seek(0)

            # Read and discard body
            log.debug('Response body: %s', resp.read())

        # We need to call read() at least once for httplib to consider this
        # request finished, even if there is no response body.
        if resp.length == 0:
            resp.read()

        # Success
        if resp.status >= 200 and resp.status <= 299:
            return resp

        # If method == HEAD, server must not return response body
        # even in case of errors
        if method.upper() == 'HEAD':
            raise HTTPError(resp.status, resp.reason)

        content_type = resp.getheader('Content-Type')
        if not content_type or not XML_CONTENT_RE.match(content_type):
            raise HTTPError(resp.status, resp.reason, resp.getheaders(), resp.read())

        # Error
        tree = ElementTree.parse(resp).getroot()
        raise get_S3Error(tree.findtext('Code'), tree.findtext('Message'))

    def clear(self):
        """Delete all objects in backend

        Note that this method may not be able to see (and therefore also not
        delete) recently uploaded objects.
        """
        # We have to cache keys, because otherwise we can't use the
        # http connection to delete keys.
        for (no, s3key) in enumerate(list(self)):
            if no != 0 and no % 1000 == 0:
                log.info('clear(): deleted %d objects so far..', no)

            log.debug('clear(): deleting key %s', s3key)

            # Ignore missing objects when clearing bucket
            self.delete(s3key, True)

    def __str__(self):
        return 's3c://%s/%s/%s' % (self.hostname, self.bucket_name, self.prefix)

    def _send_request(self, method, path, headers, subres=None, query_string=None, body=None):
        '''Add authentication and send request

        Note that *headers* is modified in-place. Returns the response object.
        '''
        # See http://docs.amazonwebservices.com/AmazonS3/latest/dev/RESTAuthentication.html

        # Lowercase headers
        keys = list(headers.iterkeys())
        for key in keys:
            key_l = key.lower()
            if key_l == key:
                continue
            headers[key_l] = headers[key]
            del headers[key]

        # Date, can't use strftime because it's locale dependent
        now = time.gmtime()
        headers['date'] = ('%s, %02d %s %04d %02d:%02d:%02d GMT'
                           % (C_DAY_NAMES[now.tm_wday],
                              now.tm_mday,
                              C_MONTH_NAMES[now.tm_mon - 1],
                              now.tm_year, now.tm_hour,
                              now.tm_min, now.tm_sec))

        # Build the canonical string-to-sign
        auth_strs = [method, '\n']
        for hdr in ('content-md5', 'content-type', 'date'):
            if hdr in headers:
                auth_strs.append(headers[hdr])
            auth_strs.append('\n')

        for hdr in sorted(x for x in headers if x.startswith('x-amz-')):
            # Fold multi-line header values into a single line
            val = ' '.join(re.split(r'\s*\n\s*', headers[hdr].strip()))
            auth_strs.append('%s:%s\n' % (hdr, val))

        # Always include bucket name in path for signing
        sign_path = urllib.quote('/%s%s' % (self.bucket_name, path))
        auth_strs.append(sign_path)
        if subres:
            auth_strs.append('?%s' % subres)

        # False positive, hashlib *does* have sha1 member
        #pylint: disable=E1101
        signature = b64encode(hmac.new(self.password, ''.join(auth_strs), hashlib.sha1).digest())
        headers['authorization'] = 'AWS %s:%s' % (self.login, signature)

        # Construct full path
        if not self.hostname.startswith(self.bucket_name):
            path = '/%s%s' % (self.bucket_name, path)
        path = urllib.quote(path)
        if query_string:
            s = urllib.urlencode(query_string, doseq=True)
            if subres:
                path += '?%s&%s' % (subres, s)
            else:
                path += '?%s' % s
        elif subres:
            path += '?%s' % subres

        try:
            log.debug('_send_request(): sending request for %s', path)
            self.conn.request(method, path, body, headers)
            log.debug('_send_request(): Reading response..')
            return self.conn.getresponse()
        except:
            # We probably can't use the connection anymore
            self.conn.close()
            raise
  386. class ObjectR(object):
  387. '''An S3 object open for reading'''
  388. def __init__(self, key, resp, backend, metadata=None):
  389. self.key = key
  390. self.resp = resp
  391. self.md5_checked = False
  392. self.backend = backend
  393. self.metadata = metadata
  394. # False positive, hashlib *does* have md5 member
  395. #pylint: disable=E1101
  396. self.md5 = hashlib.md5()
  397. def read(self, size=None):
  398. '''Read object data
  399. For integrity checking to work, this method has to be called until
  400. it returns an empty string, indicating that all data has been read
  401. (and verified).
  402. '''
  403. # chunked encoding handled by httplib
  404. buf = self.resp.read(size)
  405. # Check MD5 on EOF
  406. if not buf and not self.md5_checked:
  407. etag = self.resp.getheader('ETag').strip('"')
  408. self.md5_checked = True
  409. if etag != self.md5.hexdigest():
  410. log.warn('ObjectR(%s).close(): MD5 mismatch: %s vs %s', self.key, etag,
  411. self.md5.hexdigest())
  412. raise BadDigest('BadDigest', 'ETag header does not agree with calculated MD5')
  413. return buf
  414. self.md5.update(buf)
  415. return buf
  416. def __enter__(self):
  417. return self
  418. def __exit__(self, *a):
  419. return False
  420. def close(self):
  421. '''Close object'''
  422. pass
  423. class ObjectW(object):
  424. '''An S3 object open for writing
  425. All data is first cached in memory, upload only starts when
  426. the close() method is called.
  427. '''
  428. def __init__(self, key, backend, headers):
  429. self.key = key
  430. self.backend = backend
  431. self.headers = headers
  432. self.closed = False
  433. self.obj_size = 0
  434. self.fh = tempfile.TemporaryFile(bufsize=0) # no Python buffering
  435. # False positive, hashlib *does* have md5 member
  436. #pylint: disable=E1101
  437. self.md5 = hashlib.md5()
  438. def write(self, buf):
  439. '''Write object data'''
  440. self.fh.write(buf)
  441. self.md5.update(buf)
  442. self.obj_size += len(buf)
  443. def is_temp_failure(self, exc):
  444. return self.backend.is_temp_failure(exc)
  445. @retry
  446. def close(self):
  447. '''Close object and upload data'''
  448. # Access to protected member ok
  449. #pylint: disable=W0212
  450. log.debug('ObjectW(%s).close(): start', self.key)
  451. self.closed = True
  452. self.headers['Content-Length'] = self.obj_size
  453. self.fh.seek(0)
  454. resp = self.backend._do_request('PUT', '/%s%s' % (self.backend.prefix, self.key),
  455. headers=self.headers, body=self.fh)
  456. etag = resp.getheader('ETag').strip('"')
  457. assert resp.length == 0
  458. if etag != self.md5.hexdigest():
  459. log.warn('ObjectW(%s).close(): MD5 mismatch (%s vs %s)', self.key, etag,
  460. self.md5.hexdigest)
  461. try:
  462. self.backend.delete(self.key)
  463. except:
  464. log.exception('Objectw(%s).close(): unable to delete corrupted object!',
  465. self.key)
  466. raise BadDigest('BadDigest', 'Received ETag does not agree with our calculations.')
  467. def __enter__(self):
  468. return self
  469. def __exit__(self, *a):
  470. self.close()
  471. return False
  472. def get_obj_size(self):
  473. if not self.closed:
  474. raise RuntimeError('Object must be closed first.')
  475. return self.obj_size
  476. def get_S3Error(code, msg):
  477. '''Instantiate most specific S3Error subclass'''
  478. return globals().get(code, S3Error)(code, msg)
  479. def extractmeta(resp):
  480. '''Extract metadata from HTTP response object'''
  481. meta = dict()
  482. for (name, val) in resp.getheaders():
  483. hit = re.match(r'^x-amz-meta-(.+)$', name)
  484. if not hit:
  485. continue
  486. meta[hit.group(1)] = val
  487. return meta
  488. class HTTPError(Exception):
  489. '''
  490. Represents an HTTP error returned by S3.
  491. '''
  492. def __init__(self, status, msg, headers=None, body=None):
  493. super(HTTPError, self).__init__()
  494. self.status = status
  495. self.msg = msg
  496. self.headers = headers
  497. self.body = body
  498. self.retry_after = None
  499. if self.headers is not None:
  500. self._set_retry_after()
  501. def _set_retry_after(self):
  502. '''Parse headers for Retry-After value'''
  503. val = None
  504. for (k, v) in self.headers:
  505. if k.lower() == 'retry-after':
  506. hit = re.match(r'^\s*([0-9]+)\s*$', v)
  507. if hit:
  508. val = int(v)
  509. else:
  510. date = parsedate_tz(v)
  511. if date is None:
  512. log.warn('Unable to parse header: %s: %s', k, v)
  513. continue
  514. val = mktime_tz(*date) - time.time()
  515. if val is not None:
  516. if val > 300 or val < 0:
  517. log.warn('Ignoring invalid retry-after value of %.3f', val)
  518. else:
  519. self.retry_after = val
  520. def __str__(self):
  521. return '%d %s' % (self.status, self.msg)
  522. class S3Error(Exception):
  523. '''
  524. Represents an error returned by S3. For possible codes, see
  525. http://docs.amazonwebservices.com/AmazonS3/latest/API/ErrorResponses.html
  526. '''
  527. def __init__(self, code, msg):
  528. super(S3Error, self).__init__(msg)
  529. self.code = code
  530. self.msg = msg
  531. def __str__(self):
  532. return '%s: %s' % (self.code, self.msg)
  533. class NoSuchKey(S3Error): pass
  534. class AccessDenied(S3Error, AuthorizationError): pass
  535. class BadDigest(S3Error): pass
  536. class IncompleteBody(S3Error): pass
  537. class InternalError(S3Error): pass
  538. class InvalidAccessKeyId(S3Error, AuthenticationError): pass
  539. class InvalidSecurity(S3Error, AuthenticationError): pass
  540. class SignatureDoesNotMatch(S3Error, AuthenticationError): pass
  541. class OperationAborted(S3Error): pass
  542. class RequestTimeout(S3Error): pass
  543. class SlowDown(S3Error): pass
  544. class RequestTimeTooSkewed(S3Error): pass
  545. class DanglingStorageURL(S3Error, DanglingStorageURL_common): pass