# /web/lib/cloudstorage/common.py
# Copyright 2012 Google Inc. All Rights Reserved.

"""Helpers shared by cloudstorage_stub and cloudstorage_api."""


__all__ = ['CS_XML_NS',
           'CSFileStat',
           'dt_str_to_posix',
           'local_api_url',
           'LOCAL_GCS_ENDPOINT',
           'local_run',
           'get_access_token',
           'get_metadata',
           'GCSFileStat',
           'http_time_to_posix',
           'memory_usage',
           'posix_time_to_http',
           'posix_to_dt_str',
           'set_access_token',
           'validate_options',
           'validate_bucket_name',
           'validate_bucket_path',
           'validate_file_path',
          ]

import calendar
import datetime
from email import utils as email_utils
import logging
import os
import re

try:
  from google.appengine.api import runtime
except ImportError:
  # Fall back to the alternate location of the runtime module when the
  # api package is not importable in this environment.
  from google.appengine import runtime


_GCS_BUCKET_REGEX_BASE = r'[a-z0-9\.\-_]{3,63}'
_GCS_BUCKET_REGEX = re.compile(_GCS_BUCKET_REGEX_BASE + r'$')
_GCS_BUCKET_PATH_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'$')
_GCS_PATH_PREFIX_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'.*')
_GCS_FULLPATH_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'/.*')
_GCS_METADATA = ['x-goog-meta-',
                 'content-disposition',
                 'cache-control',
                 'content-encoding']
_GCS_OPTIONS = _GCS_METADATA + ['x-goog-acl']
CS_XML_NS = 'http://doc.s3.amazonaws.com/2006-03-01'
LOCAL_GCS_ENDPOINT = '/_ah/gcs'
_access_token = ''


_MAX_GET_BUCKET_RESULT = 1000


def set_access_token(access_token):
  """Set the shared access token to authenticate with Google Cloud Storage.

  When set, the library will always attempt to communicate with the
  real Google Cloud Storage with this token even when running on dev appserver.
  Note the token could expire so it's up to you to renew it.

  When absent, the library will automatically request and refresh a token
  on appserver, or when on dev appserver, talk to a Google Cloud Storage
  stub.

  Args:
    access_token: you can get one by running 'gsutil -d ls' and copying the
      str after 'Bearer'.
  """
  global _access_token
  _access_token = access_token


def get_access_token():
  """Returns the shared access token."""
  return _access_token
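
# Example (illustrative only, not part of the original module): pin the
# library to a token copied from `gsutil -d ls` output. The token value is
# hypothetical.
#
#   set_access_token('ya29.EXAMPLE_TOKEN')
#   assert get_access_token() == 'ya29.EXAMPLE_TOKEN'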


class GCSFileStat(object):
  """Container for GCS file stat."""

  def __init__(self,
               filename,
               st_size,
               etag,
               st_ctime,
               content_type=None,
               metadata=None,
               is_dir=False):
    """Initialize.

    For files, the non optional arguments are always set.
    For directories, only filename and is_dir is set.

    Args:
      filename: a Google Cloud Storage filename of form '/bucket/filename'.
      st_size: file size in bytes. long compatible.
      etag: hex digest of the md5 hash of the file's content. str.
      st_ctime: posix file creation time. float compatible.
      content_type: content type. str.
      metadata: a str->str dict of user specified options when creating
        the file. Possible keys are x-goog-meta-, content-disposition,
        content-encoding, and cache-control.
      is_dir: True if this represents a directory. False if this is a real file.
    """
    self.filename = filename
    self.is_dir = is_dir
    self.st_size = None
    self.st_ctime = None
    self.etag = None
    self.content_type = content_type
    self.metadata = metadata

    if not is_dir:
      self.st_size = long(st_size)
      self.st_ctime = float(st_ctime)
      if etag[0] == '"' and etag[-1] == '"':
        etag = etag[1:-1]
      self.etag = etag

  def __repr__(self):
    if self.is_dir:
      return '(directory: %s)' % self.filename

    return (
        '(filename: %(filename)s, st_size: %(st_size)s, '
        'st_ctime: %(st_ctime)s, etag: %(etag)s, '
        'content_type: %(content_type)s, '
        'metadata: %(metadata)s)' %
        dict(filename=self.filename,
             st_size=self.st_size,
             st_ctime=self.st_ctime,
             etag=self.etag,
             content_type=self.content_type,
             metadata=self.metadata))

  def __cmp__(self, other):
    if not isinstance(other, self.__class__):
      raise ValueError('Argument to cmp must have the same type. '
                       'Expect %s, got %s' % (self.__class__.__name__,
                                              other.__class__.__name__))
    if self.filename > other.filename:
      return 1
    elif self.filename < other.filename:
      return -1
    return 0

  def __hash__(self):
    if self.etag:
      return hash(self.etag)
    return hash(self.filename)


CSFileStat = GCSFileStat
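
# Example (illustrative only, not part of the original module): stat objects
# order by filename and strip the quotes GCS puts around ETag values.
#
#   a = GCSFileStat('/bucket/a.txt', 10, '"abc123"', 1381000000.0)
#   b = GCSFileStat('/bucket/b.txt', 20, 'def456', 1381000001.0)
#   assert sorted([b, a])[0] is a
#   assert a.etag == 'abc123'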


def get_metadata(headers):
  """Get user defined options from HTTP response headers."""
  return dict((k, v) for k, v in headers.iteritems()
              if any(k.lower().startswith(valid) for valid in _GCS_METADATA))
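
# Example (illustrative only): only headers whose names start with one of the
# _GCS_METADATA prefixes survive the filter.
#
#   get_metadata({'x-goog-meta-owner': 'me', 'Date': 'today'})
#   # -> {'x-goog-meta-owner': 'me'}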


def validate_bucket_name(name):
  """Validate a Google Storage bucket name.

  Args:
    name: a Google Storage bucket name with no prefix or suffix.

  Raises:
    ValueError: if name is invalid.
  """
  _validate_path(name)
  if not _GCS_BUCKET_REGEX.match(name):
    raise ValueError('Bucket should be 3-63 characters long using only a-z,'
                     '0-9, underscore, dash or dot but got %s' % name)


def validate_bucket_path(path):
  """Validate a Google Cloud Storage bucket path.

  Args:
    path: a Google Storage bucket path. It should have form '/bucket'.

  Raises:
    ValueError: if path is invalid.
  """
  _validate_path(path)
  if not _GCS_BUCKET_PATH_REGEX.match(path):
    raise ValueError('Bucket should have format /bucket '
                     'but got %s' % path)


def validate_file_path(path):
  """Validate a Google Cloud Storage file path.

  Args:
    path: a Google Storage file path. It should have form '/bucket/filename'.

  Raises:
    ValueError: if path is invalid.
  """
  _validate_path(path)
  if not _GCS_FULLPATH_REGEX.match(path):
    raise ValueError('Path should have format /bucket/filename '
                     'but got %s' % path)
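
# Example (illustrative only):
#
#   validate_file_path('/my-bucket/some/file.txt')   # passes
#   validate_file_path('/my-bucket')                 # raises ValueError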


def _process_path_prefix(path_prefix):
  """Validate and process a Google Cloud Storage path prefix.

  Args:
    path_prefix: a Google Cloud Storage path prefix of format '/bucket/prefix'
      or '/bucket/' or '/bucket'.

  Raises:
    ValueError: if path is invalid.

  Returns:
    a tuple of /bucket and prefix. prefix can be None.
  """
  _validate_path(path_prefix)
  if not _GCS_PATH_PREFIX_REGEX.match(path_prefix):
    raise ValueError('Path prefix should have format /bucket, /bucket/, '
                     'or /bucket/prefix but got %s.' % path_prefix)
  bucket_name_end = path_prefix.find('/', 1)
  bucket = path_prefix
  prefix = None
  if bucket_name_end != -1:
    bucket = path_prefix[:bucket_name_end]
    prefix = path_prefix[bucket_name_end + 1:] or None
  return bucket, prefix
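
# Example (illustrative only):
#
#   _process_path_prefix('/my-bucket/images/')   # -> ('/my-bucket', 'images/')
#   _process_path_prefix('/my-bucket')           # -> ('/my-bucket', None)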


def _validate_path(path):
  """Basic validation of Google Storage paths.

  Args:
    path: a Google Storage path. It should have form '/bucket/filename'
      or '/bucket'.

  Raises:
    ValueError: if path is invalid.
    TypeError: if path is not of type basestring.
  """
  if not path:
    raise ValueError('Path is empty')
  if not isinstance(path, basestring):
    raise TypeError('Path should be a string but is %s (%s).' %
                    (path.__class__, path))


def validate_options(options):
  """Validate Google Cloud Storage options.

  Args:
    options: a str->basestring dict of options to pass to Google Cloud Storage.

  Raises:
    ValueError: if option is not supported.
    TypeError: if option is not of type str or value of an option
      is not of type basestring.
  """
  if not options:
    return

  for k, v in options.iteritems():
    if not isinstance(k, str):
      raise TypeError('option %r should be a str.' % k)
    if not any(k.lower().startswith(valid) for valid in _GCS_OPTIONS):
      raise ValueError('option %s is not supported.' % k)
    if not isinstance(v, basestring):
      raise TypeError('value %r for option %s should be of type basestring.' %
                      (v, k))
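
# Example (illustrative only):
#
#   validate_options({'x-goog-acl': 'public-read',
#                     'x-goog-meta-owner': 'me'})          # passes
#   validate_options({'x-goog-if-generation-match': '0'})  # raises ValueError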


def http_time_to_posix(http_time):
  """Convert HTTP time format to posix time.

  See http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.3.1
  for http time format.

  Args:
    http_time: time in RFC 2616 format. e.g.
      "Mon, 20 Nov 1995 19:12:08 GMT".

  Returns:
    A float of secs from unix epoch.
  """
  if http_time is not None:
    return email_utils.mktime_tz(email_utils.parsedate_tz(http_time))


def posix_time_to_http(posix_time):
  """Convert posix time to HTTP header time format.

  Args:
    posix_time: unix time.

  Returns:
    A datetime str in RFC 2616 format.
  """
  if posix_time:
    return email_utils.formatdate(posix_time, usegmt=True)
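
# Example (illustrative only): the two HTTP-date helpers round-trip.
#
#   t = http_time_to_posix('Mon, 20 Nov 1995 19:12:08 GMT')
#   posix_time_to_http(t)   # -> 'Mon, 20 Nov 1995 19:12:08 GMT'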


_DT_FORMAT = '%Y-%m-%dT%H:%M:%S'


def dt_str_to_posix(dt_str):
  """format str to posix.

  datetime str is of format %Y-%m-%dT%H:%M:%S.%fZ,
  e.g. 2013-04-12T00:22:27.978Z. According to ISO 8601, T is a separator
  between date and time when they are on the same line.
  Z indicates UTC (zero meridian).

  A pointer: http://www.cl.cam.ac.uk/~mgk25/iso-time.html

  This is used to parse LastModified node from GCS's GET bucket XML response.

  Args:
    dt_str: A datetime str.

  Returns:
    A float of secs from unix epoch. By posix definition, epoch is midnight
    1970/1/1 UTC.
  """
  parsable, _ = dt_str.split('.')
  dt = datetime.datetime.strptime(parsable, _DT_FORMAT)
  return calendar.timegm(dt.utctimetuple())


def posix_to_dt_str(posix):
  """Reverse of dt_str_to_posix.

  This is used by GCS stub to generate GET bucket XML response.

  Args:
    posix: A float of secs from unix epoch.

  Returns:
    A datetime str.
  """
  dt = datetime.datetime.utcfromtimestamp(posix)
  dt_str = dt.strftime(_DT_FORMAT)
  return dt_str + '.000Z'
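
# Example (illustrative only): the two conversions round-trip at whole-second
# precision (dt_str_to_posix drops the fractional part).
#
#   dt_str_to_posix('2013-04-12T00:22:27.978Z')   # -> 1365726147
#   posix_to_dt_str(1365726147)                   # -> '2013-04-12T00:22:27.000Z'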


def local_run():
  """Whether we should hit GCS dev appserver stub."""
  server_software = os.environ.get('SERVER_SOFTWARE')
  if server_software is None:
    return True
  if 'remote_api' in server_software:
    return False
  if server_software.startswith(('Development', 'testutil')):
    return True
  return False


def local_api_url():
  """Return URL for GCS emulation on dev appserver."""
  return 'http://%s%s' % (os.environ.get('HTTP_HOST'), LOCAL_GCS_ENDPOINT)


def memory_usage(method):
  """Log memory usage before and after a method."""

  def wrapper(*args, **kwargs):
    logging.info('Memory before method %s is %s.',
                 method.__name__, runtime.memory_usage().current())
    result = method(*args, **kwargs)
    logging.info('Memory after method %s is %s',
                 method.__name__, runtime.memory_usage().current())
    return result
  return wrapper
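
# Example (illustrative only): decorate any callable to log its memory
# footprint before and after it runs. Both names below are hypothetical.
#
#   @memory_usage
#   def generate_report():
#     return build_report()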


def _add_ns(tagname):
  return '{%(ns)s}%(tag)s' % {'ns': CS_XML_NS,
                              'tag': tagname}


_T_CONTENTS = _add_ns('Contents')
_T_LAST_MODIFIED = _add_ns('LastModified')
_T_ETAG = _add_ns('ETag')
_T_KEY = _add_ns('Key')
_T_SIZE = _add_ns('Size')
_T_PREFIX = _add_ns('Prefix')
_T_COMMON_PREFIXES = _add_ns('CommonPrefixes')
_T_NEXT_MARKER = _add_ns('NextMarker')
_T_IS_TRUNCATED = _add_ns('IsTruncated')
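
# Example (illustrative only): the namespaced tags match ElementTree's fully
# qualified names when parsing a GET bucket XML listing.
#
#   _T_CONTENTS
#   # -> '{http://doc.s3.amazonaws.com/2006-03-01}Contents'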