
/pandas/io/tests/test_gbq.py

http://github.com/wesm/pandas
Possible License(s): BSD-3-Clause, Apache-2.0
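Integration and unit tests for pandas' Google BigQuery connector (pandas.io.gbq).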
import re
from datetime import datetime
import nose
import pytz
import platform
from time import sleep

import numpy as np
from distutils.version import StrictVersion
from pandas import compat

from pandas import NaT
from pandas.compat import u, range
from pandas.core.frame import DataFrame
import pandas.io.gbq as gbq
import pandas.util.testing as tm
from pandas.compat.numpy import np_datetime64_compat

PROJECT_ID = None
PRIVATE_KEY_JSON_PATH = None
PRIVATE_KEY_JSON_CONTENTS = None
DATASET_ID = 'pydata_pandas_bq_testing'
TABLE_ID = 'new_test'
DESTINATION_TABLE = "{0}.{1}".format(DATASET_ID + "1", TABLE_ID)

VERSION = platform.python_version()

_IMPORTS = False
_GOOGLE_API_CLIENT_INSTALLED = False
_GOOGLE_API_CLIENT_VALID_VERSION = False
_HTTPLIB2_INSTALLED = False
_SETUPTOOLS_INSTALLED = False
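
# NOTE: the integration tests below are skipped (via the _skip_* helpers)
# unless the module-level constants above are filled in. The values shown
# here are illustrative placeholders, not real credentials:
#
#   PROJECT_ID = 'my-gcp-project-id'              # a Google Cloud project
#   PRIVATE_KEY_JSON_PATH = '/path/to/key.json'   # service account key file
#   PRIVATE_KEY_JSON_CONTENTS = '{"client_email": ..., "private_key": ...}'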


def _skip_if_no_project_id():
    if not PROJECT_ID:
        raise nose.SkipTest(
            "Cannot run integration tests without a project id")


def _skip_if_no_private_key_path():
    if not PRIVATE_KEY_JSON_PATH:
        raise nose.SkipTest("Cannot run integration tests without a "
                            "private key json file path")


def _skip_if_no_private_key_contents():
    if not PRIVATE_KEY_JSON_CONTENTS:
        raise nose.SkipTest("Cannot run integration tests without a "
                            "private key json contents")

_skip_if_no_project_id()
_skip_if_no_private_key_path()
_skip_if_no_private_key_contents()


def _test_imports():
    global _GOOGLE_API_CLIENT_INSTALLED, _GOOGLE_API_CLIENT_VALID_VERSION, \
        _HTTPLIB2_INSTALLED, _SETUPTOOLS_INSTALLED

    try:
        import pkg_resources
        _SETUPTOOLS_INSTALLED = True
    except ImportError:
        _SETUPTOOLS_INSTALLED = False

    if compat.PY3:
        google_api_minimum_version = '1.4.1'
    else:
        google_api_minimum_version = '1.2.0'

    if _SETUPTOOLS_INSTALLED:
        try:
            try:
                from googleapiclient.discovery import build  # noqa
                from googleapiclient.errors import HttpError  # noqa
            except:
                from apiclient.discovery import build  # noqa
                from apiclient.errors import HttpError  # noqa

            from oauth2client.client import OAuth2WebServerFlow  # noqa
            from oauth2client.client import AccessTokenRefreshError  # noqa
            from oauth2client.file import Storage  # noqa
            from oauth2client.tools import run_flow  # noqa
            _GOOGLE_API_CLIENT_INSTALLED = True
            _GOOGLE_API_CLIENT_VERSION = pkg_resources.get_distribution(
                'google-api-python-client').version

            if (StrictVersion(_GOOGLE_API_CLIENT_VERSION) >=
                    StrictVersion(google_api_minimum_version)):
                _GOOGLE_API_CLIENT_VALID_VERSION = True

        except ImportError:
            _GOOGLE_API_CLIENT_INSTALLED = False

    try:
        import httplib2  # noqa
        _HTTPLIB2_INSTALLED = True
    except ImportError:
        _HTTPLIB2_INSTALLED = False

    if not _SETUPTOOLS_INSTALLED:
        raise ImportError('Could not import pkg_resources (setuptools).')

    if not _GOOGLE_API_CLIENT_INSTALLED:
        raise ImportError('Could not import Google API Client.')

    if not _GOOGLE_API_CLIENT_VALID_VERSION:
        raise ImportError("pandas requires google-api-python-client >= {0} "
                          "for Google BigQuery support, "
                          "current version {1}"
                          .format(google_api_minimum_version,
                                  _GOOGLE_API_CLIENT_VERSION))

    if not _HTTPLIB2_INSTALLED:
        raise ImportError(
            "pandas requires httplib2 for Google BigQuery support")

    # Bug fix for https://github.com/pydata/pandas/issues/12572
    # We need to know that a supported version of oauth2client is installed
    # Test that either of the following is installed:
    # - SignedJwtAssertionCredentials from oauth2client.client
    # - ServiceAccountCredentials from oauth2client.service_account
    # SignedJwtAssertionCredentials is available in oauth2client < 2.0.0
    # ServiceAccountCredentials is available in oauth2client >= 2.0.0
    oauth2client_v1 = True
    oauth2client_v2 = True

    try:
        from oauth2client.client import SignedJwtAssertionCredentials  # noqa
    except ImportError:
        oauth2client_v1 = False

    try:
        from oauth2client.service_account import ServiceAccountCredentials  # noqa
    except ImportError:
        oauth2client_v2 = False

    if not oauth2client_v1 and not oauth2client_v2:
        raise ImportError("Missing oauth2client required for BigQuery "
                          "service account support")


def test_requirements():
    try:
        _test_imports()
    except (ImportError, NotImplementedError) as import_exception:
        raise nose.SkipTest(import_exception)


def clean_gbq_environment(private_key=None):
    dataset = gbq._Dataset(PROJECT_ID, private_key=private_key)

    for i in range(1, 10):
        if DATASET_ID + str(i) in dataset.datasets():
            dataset_id = DATASET_ID + str(i)
            table = gbq._Table(PROJECT_ID, dataset_id,
                               private_key=private_key)
            for j in range(1, 20):
                if TABLE_ID + str(j) in dataset.tables(dataset_id):
                    table.delete(TABLE_ID + str(j))

            dataset.delete(dataset_id)


def make_mixed_dataframe_v2(test_size):
    # create df to test for all BQ datatypes except RECORD
    bools = np.random.randint(2, size=(1, test_size)).astype(bool)
    flts = np.random.randn(1, test_size)
    ints = np.random.randint(1, 10, size=(1, test_size))
    strs = np.random.randint(1, 10, size=(1, test_size)).astype(str)
    times = [datetime.now(pytz.timezone('US/Arizona'))
             for t in range(test_size)]
    return DataFrame({'bools': bools[0],
                      'flts': flts[0],
                      'ints': ints[0],
                      'strs': strs[0],
                      'times': times[0]},
                     index=range(test_size))
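
# Rough sketch of how the mixed frame above is expected to map onto BigQuery
# column types (RECORD is deliberately not covered): bools -> BOOLEAN,
# flts -> FLOAT, ints -> INTEGER, strs -> STRING, times -> TIMESTAMP.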


def test_generate_bq_schema_deprecated():
    # 11121 Deprecation of generate_bq_schema
    with tm.assert_produces_warning(FutureWarning):
        df = make_mixed_dataframe_v2(10)
        gbq.generate_bq_schema(df)
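
# gbq.generate_bq_schema() is deprecated (GH11121); the tests further below
# rely on the private gbq._generate_bq_schema() helper instead.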


class TestGBQConnectorIntegration(tm.TestCase):

    def setUp(self):
        test_requirements()
        _skip_if_no_project_id()

        self.sut = gbq.GbqConnector(PROJECT_ID)

    def test_should_be_able_to_make_a_connector(self):
        self.assertTrue(self.sut is not None,
                        'Could not create a GbqConnector')

    def test_should_be_able_to_get_valid_credentials(self):
        credentials = self.sut.get_credentials()
        self.assertFalse(credentials.invalid, 'Returned credentials invalid')

    def test_should_be_able_to_get_a_bigquery_service(self):
        bigquery_service = self.sut.get_service()
        self.assertTrue(bigquery_service is not None, 'No service returned')

    def test_should_be_able_to_get_schema_from_query(self):
        schema, pages = self.sut.run_query('SELECT 1')
        self.assertTrue(schema is not None)

    def test_should_be_able_to_get_results_from_query(self):
        schema, pages = self.sut.run_query('SELECT 1')
        self.assertTrue(pages is not None)


class TestGBQConnectorServiceAccountKeyPathIntegration(tm.TestCase):

    def setUp(self):
        test_requirements()
        _skip_if_no_project_id()
        _skip_if_no_private_key_path()

        self.sut = gbq.GbqConnector(PROJECT_ID,
                                    private_key=PRIVATE_KEY_JSON_PATH)

    def test_should_be_able_to_make_a_connector(self):
        self.assertTrue(self.sut is not None,
                        'Could not create a GbqConnector')

    def test_should_be_able_to_get_valid_credentials(self):
        credentials = self.sut.get_credentials()
        self.assertFalse(credentials.invalid, 'Returned credentials invalid')

    def test_should_be_able_to_get_a_bigquery_service(self):
        bigquery_service = self.sut.get_service()
        self.assertTrue(bigquery_service is not None, 'No service returned')

    def test_should_be_able_to_get_schema_from_query(self):
        schema, pages = self.sut.run_query('SELECT 1')
        self.assertTrue(schema is not None)

    def test_should_be_able_to_get_results_from_query(self):
        schema, pages = self.sut.run_query('SELECT 1')
        self.assertTrue(pages is not None)


class TestGBQConnectorServiceAccountKeyContentsIntegration(tm.TestCase):

    def setUp(self):
        test_requirements()
        _skip_if_no_project_id()
        _skip_if_no_private_key_contents()

        self.sut = gbq.GbqConnector(PROJECT_ID,
                                    private_key=PRIVATE_KEY_JSON_CONTENTS)

    def test_should_be_able_to_make_a_connector(self):
        self.assertTrue(self.sut is not None,
                        'Could not create a GbqConnector')

    def test_should_be_able_to_get_valid_credentials(self):
        credentials = self.sut.get_credentials()
        self.assertFalse(credentials.invalid, 'Returned credentials invalid')

    def test_should_be_able_to_get_a_bigquery_service(self):
        bigquery_service = self.sut.get_service()
        self.assertTrue(bigquery_service is not None, 'No service returned')

    def test_should_be_able_to_get_schema_from_query(self):
        schema, pages = self.sut.run_query('SELECT 1')
        self.assertTrue(schema is not None)

    def test_should_be_able_to_get_results_from_query(self):
        schema, pages = self.sut.run_query('SELECT 1')
        self.assertTrue(pages is not None)


class GBQUnitTests(tm.TestCase):

    def setUp(self):
        test_requirements()

    def test_import_google_api_python_client(self):
        if compat.PY2:
            with tm.assertRaises(ImportError):
                from googleapiclient.discovery import build  # noqa
                from googleapiclient.errors import HttpError  # noqa
            from apiclient.discovery import build  # noqa
            from apiclient.errors import HttpError  # noqa
        else:
            from googleapiclient.discovery import build  # noqa
            from googleapiclient.errors import HttpError  # noqa

    def test_should_return_bigquery_integers_as_python_floats(self):
        result = gbq._parse_entry(1, 'INTEGER')
        tm.assert_equal(result, float(1))

    def test_should_return_bigquery_floats_as_python_floats(self):
        result = gbq._parse_entry(1, 'FLOAT')
        tm.assert_equal(result, float(1))

    def test_should_return_bigquery_timestamps_as_numpy_datetime(self):
        result = gbq._parse_entry('0e9', 'TIMESTAMP')
        tm.assert_equal(result, np_datetime64_compat('1970-01-01T00:00:00Z'))

    def test_should_return_bigquery_booleans_as_python_booleans(self):
        result = gbq._parse_entry('false', 'BOOLEAN')
        tm.assert_equal(result, False)

    def test_should_return_bigquery_strings_as_python_strings(self):
        result = gbq._parse_entry('STRING', 'STRING')
        tm.assert_equal(result, 'STRING')
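
    # Note: BigQuery INTEGER values are parsed into Python floats (see the
    # first _parse_entry test above), presumably so that NULL integers can be
    # represented as NaN in the resulting float64 column.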

    def test_to_gbq_should_fail_if_invalid_table_name_passed(self):
        with tm.assertRaises(gbq.NotFoundException):
            gbq.to_gbq(DataFrame(), 'invalid_table_name', project_id="1234")

    def test_to_gbq_with_no_project_id_given_should_fail(self):
        with tm.assertRaises(TypeError):
            gbq.to_gbq(DataFrame(), 'dataset.tablename')

    def test_read_gbq_with_no_project_id_given_should_fail(self):
        with tm.assertRaises(TypeError):
            gbq.read_gbq('SELECT "1" as NUMBER_1')

    def test_that_parse_data_works_properly(self):
        test_schema = {'fields': [
            {'mode': 'NULLABLE', 'name': 'VALID_STRING', 'type': 'STRING'}]}
        test_page = [{'f': [{'v': 'PI'}]}]

        test_output = gbq._parse_data(test_schema, test_page)
        correct_output = DataFrame({'VALID_STRING': ['PI']})
        tm.assert_frame_equal(test_output, correct_output)

    def test_read_gbq_with_invalid_private_key_json_should_fail(self):
        with tm.assertRaises(gbq.InvalidPrivateKeyFormat):
            gbq.read_gbq('SELECT 1', project_id='x', private_key='y')

    def test_read_gbq_with_empty_private_key_json_should_fail(self):
        with tm.assertRaises(gbq.InvalidPrivateKeyFormat):
            gbq.read_gbq('SELECT 1', project_id='x', private_key='{}')

    def test_read_gbq_with_private_key_json_wrong_types_should_fail(self):
        with tm.assertRaises(gbq.InvalidPrivateKeyFormat):
            gbq.read_gbq(
                'SELECT 1', project_id='x',
                private_key='{ "client_email" : 1, "private_key" : True }')

    def test_read_gbq_with_empty_private_key_file_should_fail(self):
        with tm.ensure_clean() as empty_file_path:
            with tm.assertRaises(gbq.InvalidPrivateKeyFormat):
                gbq.read_gbq('SELECT 1', project_id='x',
                             private_key=empty_file_path)

    def test_read_gbq_with_corrupted_private_key_json_should_fail(self):
        _skip_if_no_private_key_contents()

        with tm.assertRaises(gbq.InvalidPrivateKeyFormat):
            gbq.read_gbq(
                'SELECT 1', project_id='x',
                private_key=re.sub('[a-z]', '9', PRIVATE_KEY_JSON_CONTENTS))
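
# The integration tests below exercise gbq.read_gbq() against a live BigQuery
# project. The basic call pattern they rely on is (illustrative only; the
# private_key argument is optional):
#
#   df = gbq.read_gbq('SELECT "PI" as VALID_STRING', project_id=PROJECT_ID,
#                     private_key=PRIVATE_KEY_JSON_PATH)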


class TestReadGBQIntegration(tm.TestCase):

    @classmethod
    def setUpClass(cls):
        # - GLOBAL CLASS FIXTURES -
        # put here any instruction you want to execute only *ONCE* *BEFORE*
        # executing *ALL* tests described below.

        _skip_if_no_project_id()

        test_requirements()

    def setUp(self):
        # - PER-TEST FIXTURES -
        # put here any instruction you want to be run *BEFORE* *EVERY* test is
        # executed.
        pass

    @classmethod
    def tearDownClass(cls):
        # - GLOBAL CLASS FIXTURES -
        # put here any instruction you want to execute only *ONCE* *AFTER*
        # executing all tests.
        pass

    def tearDown(self):
        # - PER-TEST FIXTURES -
        # put here any instructions you want to be run *AFTER* *EVERY* test is
        # executed.
        pass

    def test_should_read_as_service_account_with_key_path(self):
        _skip_if_no_private_key_path()
        query = 'SELECT "PI" as VALID_STRING'
        df = gbq.read_gbq(query, project_id=PROJECT_ID,
                          private_key=PRIVATE_KEY_JSON_PATH)
        tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']}))

    def test_should_read_as_service_account_with_key_contents(self):
        _skip_if_no_private_key_contents()
        query = 'SELECT "PI" as VALID_STRING'
        df = gbq.read_gbq(query, project_id=PROJECT_ID,
                          private_key=PRIVATE_KEY_JSON_CONTENTS)
        tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']}))

    def test_should_properly_handle_valid_strings(self):
        query = 'SELECT "PI" as VALID_STRING'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']}))

    def test_should_properly_handle_empty_strings(self):
        query = 'SELECT "" as EMPTY_STRING'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame({'EMPTY_STRING': [""]}))

    def test_should_properly_handle_null_strings(self):
        query = 'SELECT STRING(NULL) as NULL_STRING'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame({'NULL_STRING': [None]}))

    def test_should_properly_handle_valid_integers(self):
        query = 'SELECT INTEGER(3) as VALID_INTEGER'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame({'VALID_INTEGER': [3]}))

    def test_should_properly_handle_null_integers(self):
        query = 'SELECT INTEGER(NULL) as NULL_INTEGER'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame({'NULL_INTEGER': [np.nan]}))

    def test_should_properly_handle_valid_floats(self):
        query = 'SELECT PI() as VALID_FLOAT'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame(
            {'VALID_FLOAT': [3.141592653589793]}))

    def test_should_properly_handle_null_floats(self):
        query = 'SELECT FLOAT(NULL) as NULL_FLOAT'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame({'NULL_FLOAT': [np.nan]}))

    def test_should_properly_handle_timestamp_unix_epoch(self):
        query = 'SELECT TIMESTAMP("1970-01-01 00:00:00") as UNIX_EPOCH'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame(
            {'UNIX_EPOCH': [np.datetime64('1970-01-01T00:00:00.000000Z')]}))

    def test_should_properly_handle_arbitrary_timestamp(self):
        query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") as VALID_TIMESTAMP'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame({
            'VALID_TIMESTAMP': [np.datetime64('2004-09-15T05:00:00.000000Z')]
        }))

    def test_should_properly_handle_null_timestamp(self):
        query = 'SELECT TIMESTAMP(NULL) as NULL_TIMESTAMP'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame({'NULL_TIMESTAMP': [NaT]}))

    def test_should_properly_handle_true_boolean(self):
        query = 'SELECT BOOLEAN(TRUE) as TRUE_BOOLEAN'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame({'TRUE_BOOLEAN': [True]}))

    def test_should_properly_handle_false_boolean(self):
        query = 'SELECT BOOLEAN(FALSE) as FALSE_BOOLEAN'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame({'FALSE_BOOLEAN': [False]}))

    def test_should_properly_handle_null_boolean(self):
        query = 'SELECT BOOLEAN(NULL) as NULL_BOOLEAN'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame({'NULL_BOOLEAN': [None]}))

    def test_unicode_string_conversion_and_normalization(self):
        correct_test_datatype = DataFrame(
            {'UNICODE_STRING': [u("\xe9\xfc")]}
        )

        unicode_string = "\xc3\xa9\xc3\xbc"

        if compat.PY3:
            unicode_string = unicode_string.encode('latin-1').decode('utf8')

        query = 'SELECT "{0}" as UNICODE_STRING'.format(unicode_string)

        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, correct_test_datatype)

    def test_index_column(self):
        query = "SELECT 'a' as STRING_1, 'b' as STRING_2"
        result_frame = gbq.read_gbq(
            query, project_id=PROJECT_ID, index_col="STRING_1")
        correct_frame = DataFrame(
            {'STRING_1': ['a'], 'STRING_2': ['b']}).set_index("STRING_1")
        tm.assert_equal(result_frame.index.name, correct_frame.index.name)

    def test_column_order(self):
        query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3"
        col_order = ['STRING_3', 'STRING_1', 'STRING_2']
        result_frame = gbq.read_gbq(
            query, project_id=PROJECT_ID, col_order=col_order)
        correct_frame = DataFrame({'STRING_1': ['a'], 'STRING_2': ['b'],
                                   'STRING_3': ['c']})[col_order]
        tm.assert_frame_equal(result_frame, correct_frame)

    def test_column_order_plus_index(self):
        query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3"
        col_order = ['STRING_3', 'STRING_2']
        result_frame = gbq.read_gbq(query, project_id=PROJECT_ID,
                                    index_col='STRING_1', col_order=col_order)
        correct_frame = DataFrame(
            {'STRING_1': ['a'], 'STRING_2': ['b'], 'STRING_3': ['c']})
        correct_frame.set_index('STRING_1', inplace=True)
        correct_frame = correct_frame[col_order]
        tm.assert_frame_equal(result_frame, correct_frame)

    def test_malformed_query(self):
        with tm.assertRaises(gbq.GenericGBQException):
            gbq.read_gbq("SELCET * FORM [publicdata:samples.shakespeare]",
                         project_id=PROJECT_ID)

    def test_bad_project_id(self):
        with tm.assertRaises(gbq.GenericGBQException):
            gbq.read_gbq("SELECT 1", project_id='001')

    def test_bad_table_name(self):
        with tm.assertRaises(gbq.GenericGBQException):
            gbq.read_gbq("SELECT * FROM [publicdata:samples.nope]",
                         project_id=PROJECT_ID)

    def test_download_dataset_larger_than_200k_rows(self):
        test_size = 200005
        # Test for known BigQuery bug in datasets larger than 100k rows
        # http://stackoverflow.com/questions/19145587/bq-py-not-paging-results
        df = gbq.read_gbq("SELECT id FROM [publicdata:samples.wikipedia] "
                          "GROUP EACH BY id ORDER BY id ASC LIMIT {0}"
                          .format(test_size),
                          project_id=PROJECT_ID)
        self.assertEqual(len(df.drop_duplicates()), test_size)

    def test_zero_rows(self):
        # Bug fix for https://github.com/pydata/pandas/issues/10273
        df = gbq.read_gbq("SELECT title, id "
                          "FROM [publicdata:samples.wikipedia] "
                          "WHERE timestamp=-9999999",
                          project_id=PROJECT_ID)
        page_array = np.zeros(
            (0,), dtype=[('title', object), ('id', np.dtype(float))])
        expected_result = DataFrame(page_array, columns=['title', 'id'])
        self.assert_frame_equal(df, expected_result)

    def test_legacy_sql(self):
        legacy_sql = "SELECT id FROM [publicdata.samples.wikipedia] LIMIT 10"

        # Test that a legacy sql statement fails when
        # setting dialect='standard'
        with tm.assertRaises(gbq.GenericGBQException):
            gbq.read_gbq(legacy_sql, project_id=PROJECT_ID,
                         dialect='standard')

        # Test that a legacy sql statement succeeds when
        # setting dialect='legacy'
        df = gbq.read_gbq(legacy_sql, project_id=PROJECT_ID,
                          dialect='legacy')
        self.assertEqual(len(df.drop_duplicates()), 10)

    def test_standard_sql(self):
        standard_sql = "SELECT DISTINCT id FROM " \
                       "`publicdata.samples.wikipedia` LIMIT 10"

        # Test that a standard sql statement fails when using
        # the legacy SQL dialect (default value)
        with tm.assertRaises(gbq.GenericGBQException):
            gbq.read_gbq(standard_sql, project_id=PROJECT_ID)

        # Test that a standard sql statement succeeds when
        # setting dialect='standard'
        df = gbq.read_gbq(standard_sql, project_id=PROJECT_ID,
                          dialect='standard')
        self.assertEqual(len(df.drop_duplicates()), 10)

    def test_invalid_option_for_sql_dialect(self):
        sql_statement = "SELECT DISTINCT id FROM " \
                        "`publicdata.samples.wikipedia` LIMIT 10"

        # Test that an invalid option for `dialect` raises ValueError
        with tm.assertRaises(ValueError):
            gbq.read_gbq(sql_statement, project_id=PROJECT_ID,
                         dialect='invalid')

        # Test that a correct option for dialect succeeds
        # to make sure ValueError was due to invalid dialect
        gbq.read_gbq(sql_statement, project_id=PROJECT_ID,
                     dialect='standard')


class TestToGBQIntegration(tm.TestCase):
    # Changes to BigQuery table schema may take up to 2 minutes as of May 2015
    # As a workaround to this issue, each test should use a unique table name.
    # Make sure to modify the for loop range in the tearDownClass when a new
    # test is added. See `Issue 191
    # <https://code.google.com/p/google-bigquery/issues/detail?id=191>`__

    @classmethod
    def setUpClass(cls):
        # - GLOBAL CLASS FIXTURES -
        # put here any instruction you want to execute only *ONCE* *BEFORE*
        # executing *ALL* tests described below.

        _skip_if_no_project_id()

        test_requirements()
        clean_gbq_environment()

        gbq._Dataset(PROJECT_ID).create(DATASET_ID + "1")

    def setUp(self):
        # - PER-TEST FIXTURES -
        # put here any instruction you want to be run *BEFORE* *EVERY* test is
        # executed.
        self.dataset = gbq._Dataset(PROJECT_ID)
        self.table = gbq._Table(PROJECT_ID, DATASET_ID + "1")

    @classmethod
    def tearDownClass(cls):
        # - GLOBAL CLASS FIXTURES -
        # put here any instruction you want to execute only *ONCE* *AFTER*
        # executing all tests.
        clean_gbq_environment()

    def tearDown(self):
        # - PER-TEST FIXTURES -
        # put here any instructions you want to be run *AFTER* *EVERY* test is
        # executed.
        pass

    def test_upload_data(self):
        destination_table = DESTINATION_TABLE + "1"

        test_size = 20001
        df = make_mixed_dataframe_v2(test_size)

        gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000)

        sleep(30)  # <- Curses Google!!!

        result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}"
                              .format(destination_table),
                              project_id=PROJECT_ID)
        self.assertEqual(result['NUM_ROWS'][0], test_size)
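
    # The fixed 30s pause after to_gbq() above is presumably needed because
    # rows written via BigQuery streaming inserts are not immediately visible
    # to queries; without it the COUNT(*) check can run before all chunks
    # have become queryable.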

    def test_upload_data_if_table_exists_fail(self):
        destination_table = DESTINATION_TABLE + "2"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        self.table.create(TABLE_ID + "2", gbq._generate_bq_schema(df))

        # Test the default value of if_exists is 'fail'
        with tm.assertRaises(gbq.TableCreationError):
            gbq.to_gbq(df, destination_table, PROJECT_ID)

        # Test the if_exists parameter with value 'fail'
        with tm.assertRaises(gbq.TableCreationError):
            gbq.to_gbq(df, destination_table, PROJECT_ID, if_exists='fail')

    def test_upload_data_if_table_exists_append(self):
        destination_table = DESTINATION_TABLE + "3"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        df_different_schema = tm.makeMixedDataFrame()

        # Initialize table with sample data
        gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000)

        # Test the if_exists parameter with value 'append'
        gbq.to_gbq(df, destination_table, PROJECT_ID, if_exists='append')

        sleep(30)  # <- Curses Google!!!

        result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}"
                              .format(destination_table),
                              project_id=PROJECT_ID)
        self.assertEqual(result['NUM_ROWS'][0], test_size * 2)

        # Try inserting with a different schema, confirm failure
        with tm.assertRaises(gbq.InvalidSchema):
            gbq.to_gbq(df_different_schema, destination_table,
                       PROJECT_ID, if_exists='append')

    def test_upload_data_if_table_exists_replace(self):
        destination_table = DESTINATION_TABLE + "4"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        df_different_schema = tm.makeMixedDataFrame()

        # Initialize table with sample data
        gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000)

        # Test the if_exists parameter with the value 'replace'.
        gbq.to_gbq(df_different_schema, destination_table,
                   PROJECT_ID, if_exists='replace')

        sleep(30)  # <- Curses Google!!!

        result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}"
                              .format(destination_table),
                              project_id=PROJECT_ID)
        self.assertEqual(result['NUM_ROWS'][0], 5)

    def test_google_upload_errors_should_raise_exception(self):
        destination_table = DESTINATION_TABLE + "5"

        test_timestamp = datetime.now(pytz.timezone('US/Arizona'))
        bad_df = DataFrame({'bools': [False, False], 'flts': [0.0, 1.0],
                            'ints': [0, '1'], 'strs': ['a', 1],
                            'times': [test_timestamp, test_timestamp]},
                           index=range(2))

        with tm.assertRaises(gbq.StreamingInsertError):
            gbq.to_gbq(bad_df, destination_table, PROJECT_ID, verbose=True)

    def test_generate_schema(self):
        df = tm.makeMixedDataFrame()
        schema = gbq._generate_bq_schema(df)

        test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'},
                                  {'name': 'B', 'type': 'FLOAT'},
                                  {'name': 'C', 'type': 'STRING'},
                                  {'name': 'D', 'type': 'TIMESTAMP'}]}

        self.assertEqual(schema, test_schema)

    def test_create_table(self):
        destination_table = TABLE_ID + "6"
        test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'},
                                  {'name': 'B', 'type': 'FLOAT'},
                                  {'name': 'C', 'type': 'STRING'},
                                  {'name': 'D', 'type': 'TIMESTAMP'}]}
        self.table.create(destination_table, test_schema)
        self.assertTrue(self.table.exists(destination_table),
                        'Expected table to exist')

    def test_table_does_not_exist(self):
        self.assertTrue(not self.table.exists(TABLE_ID + "7"),
                        'Expected table not to exist')

    def test_delete_table(self):
        destination_table = TABLE_ID + "8"
        test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'},
                                  {'name': 'B', 'type': 'FLOAT'},
                                  {'name': 'C', 'type': 'STRING'},
                                  {'name': 'D', 'type': 'TIMESTAMP'}]}
        self.table.create(destination_table, test_schema)
        self.table.delete(destination_table)
        self.assertTrue(not self.table.exists(destination_table),
                        'Expected table not to exist')

    def test_list_table(self):
        destination_table = TABLE_ID + "9"
        test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'},
                                  {'name': 'B', 'type': 'FLOAT'},
                                  {'name': 'C', 'type': 'STRING'},
                                  {'name': 'D', 'type': 'TIMESTAMP'}]}
        self.table.create(destination_table, test_schema)
        self.assertTrue(
            destination_table in self.dataset.tables(DATASET_ID + "1"),
            'Expected table list to contain table {0}'
            .format(destination_table))

    def test_list_dataset(self):
        dataset_id = DATASET_ID + "1"
        self.assertTrue(dataset_id in self.dataset.datasets(),
                        'Expected dataset list to contain dataset {0}'
                        .format(dataset_id))

    def test_list_table_zero_results(self):
        dataset_id = DATASET_ID + "2"
        self.dataset.create(dataset_id)
        table_list = gbq._Dataset(PROJECT_ID).tables(dataset_id)
        self.assertEqual(len(table_list), 0,
                         'Expected gbq.list_table() to return 0')

    def test_create_dataset(self):
        dataset_id = DATASET_ID + "3"
        self.dataset.create(dataset_id)
        self.assertTrue(dataset_id in self.dataset.datasets(),
                        'Expected dataset to exist')

    def test_delete_dataset(self):
        dataset_id = DATASET_ID + "4"
        self.dataset.create(dataset_id)
        self.dataset.delete(dataset_id)
        self.assertTrue(dataset_id not in self.dataset.datasets(),
                        'Expected dataset not to exist')

    def test_dataset_exists(self):
        dataset_id = DATASET_ID + "5"
        self.dataset.create(dataset_id)
        self.assertTrue(self.dataset.exists(dataset_id),
                        'Expected dataset to exist')

    def create_table_data_dataset_does_not_exist(self):
        dataset_id = DATASET_ID + "6"
        table_id = TABLE_ID + "1"
        table_with_new_dataset = gbq._Table(PROJECT_ID, dataset_id)
        df = make_mixed_dataframe_v2(10)
        table_with_new_dataset.create(table_id, gbq._generate_bq_schema(df))
        self.assertTrue(self.dataset.exists(dataset_id),
                        'Expected dataset to exist')
        self.assertTrue(table_with_new_dataset.exists(table_id),
                        'Expected table to exist')

    def test_dataset_does_not_exist(self):
        self.assertTrue(not self.dataset.exists(DATASET_ID + "_not_found"),
                        'Expected dataset not to exist')


class TestToGBQIntegrationServiceAccountKeyPath(tm.TestCase):
    # Changes to BigQuery table schema may take up to 2 minutes as of May 2015
    # As a workaround to this issue, each test should use a unique table name.
    # Make sure to modify the for loop range in the tearDownClass when a new
    # test is added
    # See `Issue 191
    # <https://code.google.com/p/google-bigquery/issues/detail?id=191>`__

    @classmethod
    def setUpClass(cls):
        # - GLOBAL CLASS FIXTURES -
        # put here any instruction you want to execute only *ONCE* *BEFORE*
        # executing *ALL* tests described below.

        _skip_if_no_project_id()
        _skip_if_no_private_key_path()

        test_requirements()
        clean_gbq_environment(PRIVATE_KEY_JSON_PATH)

    def setUp(self):
        # - PER-TEST FIXTURES -
        # put here any instruction you want to be run *BEFORE* *EVERY* test
        # is executed.
        pass

    @classmethod
    def tearDownClass(cls):
        # - GLOBAL CLASS FIXTURES -
        # put here any instruction you want to execute only *ONCE* *AFTER*
        # executing all tests.
        clean_gbq_environment(PRIVATE_KEY_JSON_PATH)

    def tearDown(self):
        # - PER-TEST FIXTURES -
        # put here any instructions you want to be run *AFTER* *EVERY* test
        # is executed.
        pass

    def test_upload_data_as_service_account_with_key_path(self):
        destination_table = DESTINATION_TABLE + "11"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)

        gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000,
                   private_key=PRIVATE_KEY_JSON_PATH)

        sleep(30)  # <- Curses Google!!!

        result = gbq.read_gbq(
            "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table),
            project_id=PROJECT_ID,
            private_key=PRIVATE_KEY_JSON_PATH)

        self.assertEqual(result['NUM_ROWS'][0], test_size)


class TestToGBQIntegrationServiceAccountKeyContents(tm.TestCase):
    # Changes to BigQuery table schema may take up to 2 minutes as of May 2015
    # As a workaround to this issue, each test should use a unique table name.
    # Make sure to modify the for loop range in the tearDownClass when a new
    # test is added
    # See `Issue 191
    # <https://code.google.com/p/google-bigquery/issues/detail?id=191>`__

    @classmethod
    def setUpClass(cls):
        # - GLOBAL CLASS FIXTURES -
        # put here any instruction you want to execute only *ONCE* *BEFORE*
        # executing *ALL* tests described below.

        _skip_if_no_project_id()
        _skip_if_no_private_key_contents()

        test_requirements()
        clean_gbq_environment(PRIVATE_KEY_JSON_CONTENTS)

    def setUp(self):
        # - PER-TEST FIXTURES -
        # put here any instruction you want to be run *BEFORE* *EVERY* test
        # is executed.
        pass

    @classmethod
    def tearDownClass(cls):
        # - GLOBAL CLASS FIXTURES -
        # put here any instruction you want to execute only *ONCE* *AFTER*
        # executing all tests.
        clean_gbq_environment(PRIVATE_KEY_JSON_CONTENTS)

    def tearDown(self):
        # - PER-TEST FIXTURES -
        # put here any instructions you want to be run *AFTER* *EVERY* test
        # is executed.
        pass

    def test_upload_data_as_service_account_with_key_contents(self):
        destination_table = DESTINATION_TABLE + "12"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)

        gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000,
                   private_key=PRIVATE_KEY_JSON_CONTENTS)

        sleep(30)  # <- Curses Google!!!

        result = gbq.read_gbq(
            "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table),
            project_id=PROJECT_ID,
            private_key=PRIVATE_KEY_JSON_CONTENTS)

        self.assertEqual(result['NUM_ROWS'][0], test_size)


if __name__ == '__main__':
    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                   exit=False)