
/pandas/io/tests/test_gbq.py

http://github.com/wesm/pandas
Possible License(s): BSD-3-Clause, Apache-2.0
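Integration and unit tests for pandas' Google BigQuery connector (pandas.io.gbq).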
import re
from datetime import datetime
import nose
import pytz
import platform
from time import sleep

import numpy as np
from distutils.version import StrictVersion
from pandas import compat

from pandas import NaT
from pandas.compat import u, range
from pandas.core.frame import DataFrame
import pandas.io.gbq as gbq
import pandas.util.testing as tm
from pandas.compat.numpy import np_datetime64_compat

PROJECT_ID = None
PRIVATE_KEY_JSON_PATH = None
PRIVATE_KEY_JSON_CONTENTS = None
DATASET_ID = 'pydata_pandas_bq_testing'
TABLE_ID = 'new_test'
DESTINATION_TABLE = "{0}.{1}".format(DATASET_ID + "1", TABLE_ID)

VERSION = platform.python_version()

_IMPORTS = False
_GOOGLE_API_CLIENT_INSTALLED = False
_GOOGLE_API_CLIENT_VALID_VERSION = False
_HTTPLIB2_INSTALLED = False
_SETUPTOOLS_INSTALLED = False
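
# NOTE: the integration tests below are skipped (via the _skip_* helpers)
# unless the module-level constants above are filled in. The values shown
# here are illustrative placeholders, not real credentials:
#
#   PROJECT_ID = 'my-gcp-project-id'              # a Google Cloud project
#   PRIVATE_KEY_JSON_PATH = '/path/to/key.json'   # service account key file
#   PRIVATE_KEY_JSON_CONTENTS = '{"client_email": ..., "private_key": ...}'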


def _skip_if_no_project_id():
    if not PROJECT_ID:
        raise nose.SkipTest(
            "Cannot run integration tests without a project id")


def _skip_if_no_private_key_path():
    if not PRIVATE_KEY_JSON_PATH:
        raise nose.SkipTest("Cannot run integration tests without a "
                            "private key json file path")


def _skip_if_no_private_key_contents():
    if not PRIVATE_KEY_JSON_CONTENTS:
        raise nose.SkipTest("Cannot run integration tests without a "
                            "private key json contents")

_skip_if_no_project_id()
_skip_if_no_private_key_path()
_skip_if_no_private_key_contents()


def _test_imports():
    global _GOOGLE_API_CLIENT_INSTALLED, _GOOGLE_API_CLIENT_VALID_VERSION, \
        _HTTPLIB2_INSTALLED, _SETUPTOOLS_INSTALLED

    try:
        import pkg_resources
        _SETUPTOOLS_INSTALLED = True
    except ImportError:
        _SETUPTOOLS_INSTALLED = False

    if compat.PY3:
        google_api_minimum_version = '1.4.1'
    else:
        google_api_minimum_version = '1.2.0'

    if _SETUPTOOLS_INSTALLED:
        try:
            try:
                from googleapiclient.discovery import build  # noqa
                from googleapiclient.errors import HttpError  # noqa
            except:
                from apiclient.discovery import build  # noqa
                from apiclient.errors import HttpError  # noqa

            from oauth2client.client import OAuth2WebServerFlow  # noqa
            from oauth2client.client import AccessTokenRefreshError  # noqa
            from oauth2client.file import Storage  # noqa
            from oauth2client.tools import run_flow  # noqa
            _GOOGLE_API_CLIENT_INSTALLED = True
            _GOOGLE_API_CLIENT_VERSION = pkg_resources.get_distribution(
                'google-api-python-client').version

            if (StrictVersion(_GOOGLE_API_CLIENT_VERSION) >=
                    StrictVersion(google_api_minimum_version)):
                _GOOGLE_API_CLIENT_VALID_VERSION = True

        except ImportError:
            _GOOGLE_API_CLIENT_INSTALLED = False

    try:
        import httplib2  # noqa
        _HTTPLIB2_INSTALLED = True
    except ImportError:
        _HTTPLIB2_INSTALLED = False

    if not _SETUPTOOLS_INSTALLED:
        raise ImportError('Could not import pkg_resources (setuptools).')

    if not _GOOGLE_API_CLIENT_INSTALLED:
        raise ImportError('Could not import Google API Client.')

    if not _GOOGLE_API_CLIENT_VALID_VERSION:
        raise ImportError("pandas requires google-api-python-client >= {0} "
                          "for Google BigQuery support, "
                          "current version {1}"
                          .format(google_api_minimum_version,
                                  _GOOGLE_API_CLIENT_VERSION))

    if not _HTTPLIB2_INSTALLED:
        raise ImportError(
            "pandas requires httplib2 for Google BigQuery support")

    # Bug fix for https://github.com/pydata/pandas/issues/12572
    # We need to know that a supported version of oauth2client is installed
    # Test that either of the following is installed:
    # - SignedJwtAssertionCredentials from oauth2client.client
    # - ServiceAccountCredentials from oauth2client.service_account
    # SignedJwtAssertionCredentials is available in oauth2client < 2.0.0
    # ServiceAccountCredentials is available in oauth2client >= 2.0.0
    oauth2client_v1 = True
    oauth2client_v2 = True

    try:
        from oauth2client.client import SignedJwtAssertionCredentials  # noqa
    except ImportError:
        oauth2client_v1 = False

    try:
        from oauth2client.service_account import ServiceAccountCredentials  # noqa
    except ImportError:
        oauth2client_v2 = False

    if not oauth2client_v1 and not oauth2client_v2:
        raise ImportError("Missing oauth2client required for BigQuery "
                          "service account support")


def test_requirements():
    try:
        _test_imports()
    except (ImportError, NotImplementedError) as import_exception:
        raise nose.SkipTest(import_exception)


def clean_gbq_environment(private_key=None):
    dataset = gbq._Dataset(PROJECT_ID, private_key=private_key)

    for i in range(1, 10):
        if DATASET_ID + str(i) in dataset.datasets():
            dataset_id = DATASET_ID + str(i)
            table = gbq._Table(PROJECT_ID, dataset_id,
                               private_key=private_key)
            for j in range(1, 20):
                if TABLE_ID + str(j) in dataset.tables(dataset_id):
                    table.delete(TABLE_ID + str(j))

            dataset.delete(dataset_id)


def make_mixed_dataframe_v2(test_size):
    # create df to test for all BQ datatypes except RECORD
    bools = np.random.randint(2, size=(1, test_size)).astype(bool)
    flts = np.random.randn(1, test_size)
    ints = np.random.randint(1, 10, size=(1, test_size))
    strs = np.random.randint(1, 10, size=(1, test_size)).astype(str)
    times = [datetime.now(pytz.timezone('US/Arizona'))
             for t in range(test_size)]
    return DataFrame({'bools': bools[0],
                      'flts': flts[0],
                      'ints': ints[0],
                      'strs': strs[0],
                      'times': times[0]},
                     index=range(test_size))
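
# Rough sketch of how the mixed frame above is expected to map onto BigQuery
# column types (RECORD is deliberately not covered): bools -> BOOLEAN,
# flts -> FLOAT, ints -> INTEGER, strs -> STRING, times -> TIMESTAMP.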


def test_generate_bq_schema_deprecated():
    # 11121 Deprecation of generate_bq_schema
    with tm.assert_produces_warning(FutureWarning):
        df = make_mixed_dataframe_v2(10)
        gbq.generate_bq_schema(df)
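
# gbq.generate_bq_schema() is deprecated (GH11121); the tests further below
# rely on the private gbq._generate_bq_schema() helper instead.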


class TestGBQConnectorIntegration(tm.TestCase):

    def setUp(self):
        test_requirements()
        _skip_if_no_project_id()

        self.sut = gbq.GbqConnector(PROJECT_ID)

    def test_should_be_able_to_make_a_connector(self):
        self.assertTrue(self.sut is not None,
                        'Could not create a GbqConnector')

    def test_should_be_able_to_get_valid_credentials(self):
        credentials = self.sut.get_credentials()
        self.assertFalse(credentials.invalid, 'Returned credentials invalid')

    def test_should_be_able_to_get_a_bigquery_service(self):
        bigquery_service = self.sut.get_service()
        self.assertTrue(bigquery_service is not None, 'No service returned')

    def test_should_be_able_to_get_schema_from_query(self):
        schema, pages = self.sut.run_query('SELECT 1')
        self.assertTrue(schema is not None)

    def test_should_be_able_to_get_results_from_query(self):
        schema, pages = self.sut.run_query('SELECT 1')
        self.assertTrue(pages is not None)


class TestGBQConnectorServiceAccountKeyPathIntegration(tm.TestCase):

    def setUp(self):
        test_requirements()
        _skip_if_no_project_id()
        _skip_if_no_private_key_path()

        self.sut = gbq.GbqConnector(PROJECT_ID,
                                    private_key=PRIVATE_KEY_JSON_PATH)

    def test_should_be_able_to_make_a_connector(self):
        self.assertTrue(self.sut is not None,
                        'Could not create a GbqConnector')

    def test_should_be_able_to_get_valid_credentials(self):
        credentials = self.sut.get_credentials()
        self.assertFalse(credentials.invalid, 'Returned credentials invalid')

    def test_should_be_able_to_get_a_bigquery_service(self):
        bigquery_service = self.sut.get_service()
        self.assertTrue(bigquery_service is not None, 'No service returned')

    def test_should_be_able_to_get_schema_from_query(self):
        schema, pages = self.sut.run_query('SELECT 1')
        self.assertTrue(schema is not None)

    def test_should_be_able_to_get_results_from_query(self):
        schema, pages = self.sut.run_query('SELECT 1')
        self.assertTrue(pages is not None)


class TestGBQConnectorServiceAccountKeyContentsIntegration(tm.TestCase):

    def setUp(self):
        test_requirements()
        _skip_if_no_project_id()
        _skip_if_no_private_key_contents()

        self.sut = gbq.GbqConnector(PROJECT_ID,
                                    private_key=PRIVATE_KEY_JSON_CONTENTS)

    def test_should_be_able_to_make_a_connector(self):
        self.assertTrue(self.sut is not None,
                        'Could not create a GbqConnector')

    def test_should_be_able_to_get_valid_credentials(self):
        credentials = self.sut.get_credentials()
        self.assertFalse(credentials.invalid, 'Returned credentials invalid')

    def test_should_be_able_to_get_a_bigquery_service(self):
        bigquery_service = self.sut.get_service()
        self.assertTrue(bigquery_service is not None, 'No service returned')

    def test_should_be_able_to_get_schema_from_query(self):
        schema, pages = self.sut.run_query('SELECT 1')
        self.assertTrue(schema is not None)

    def test_should_be_able_to_get_results_from_query(self):
        schema, pages = self.sut.run_query('SELECT 1')
        self.assertTrue(pages is not None)


class GBQUnitTests(tm.TestCase):

    def setUp(self):
        test_requirements()

    def test_import_google_api_python_client(self):
        if compat.PY2:
            with tm.assertRaises(ImportError):
                from googleapiclient.discovery import build  # noqa
                from googleapiclient.errors import HttpError  # noqa
            from apiclient.discovery import build  # noqa
            from apiclient.errors import HttpError  # noqa
        else:
            from googleapiclient.discovery import build  # noqa
            from googleapiclient.errors import HttpError  # noqa

    def test_should_return_bigquery_integers_as_python_floats(self):
        result = gbq._parse_entry(1, 'INTEGER')
        tm.assert_equal(result, float(1))

    def test_should_return_bigquery_floats_as_python_floats(self):
        result = gbq._parse_entry(1, 'FLOAT')
        tm.assert_equal(result, float(1))

    def test_should_return_bigquery_timestamps_as_numpy_datetime(self):
        result = gbq._parse_entry('0e9', 'TIMESTAMP')
        tm.assert_equal(result, np_datetime64_compat('1970-01-01T00:00:00Z'))

    def test_should_return_bigquery_booleans_as_python_booleans(self):
        result = gbq._parse_entry('false', 'BOOLEAN')
        tm.assert_equal(result, False)

    def test_should_return_bigquery_strings_as_python_strings(self):
        result = gbq._parse_entry('STRING', 'STRING')
        tm.assert_equal(result, 'STRING')
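
    # Note: BigQuery INTEGER values are parsed into Python floats (see the
    # first _parse_entry test above), presumably so that NULL integers can be
    # represented as NaN in the resulting float64 column.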

    def test_to_gbq_should_fail_if_invalid_table_name_passed(self):
        with tm.assertRaises(gbq.NotFoundException):
            gbq.to_gbq(DataFrame(), 'invalid_table_name', project_id="1234")

    def test_to_gbq_with_no_project_id_given_should_fail(self):
        with tm.assertRaises(TypeError):
            gbq.to_gbq(DataFrame(), 'dataset.tablename')

    def test_read_gbq_with_no_project_id_given_should_fail(self):
        with tm.assertRaises(TypeError):
            gbq.read_gbq('SELECT "1" as NUMBER_1')

    def test_that_parse_data_works_properly(self):
        test_schema = {'fields': [
            {'mode': 'NULLABLE', 'name': 'VALID_STRING', 'type': 'STRING'}]}
        test_page = [{'f': [{'v': 'PI'}]}]

        test_output = gbq._parse_data(test_schema, test_page)
        correct_output = DataFrame({'VALID_STRING': ['PI']})
        tm.assert_frame_equal(test_output, correct_output)

    def test_read_gbq_with_invalid_private_key_json_should_fail(self):
        with tm.assertRaises(gbq.InvalidPrivateKeyFormat):
            gbq.read_gbq('SELECT 1', project_id='x', private_key='y')

    def test_read_gbq_with_empty_private_key_json_should_fail(self):
        with tm.assertRaises(gbq.InvalidPrivateKeyFormat):
            gbq.read_gbq('SELECT 1', project_id='x', private_key='{}')

    def test_read_gbq_with_private_key_json_wrong_types_should_fail(self):
        with tm.assertRaises(gbq.InvalidPrivateKeyFormat):
            gbq.read_gbq(
                'SELECT 1', project_id='x',
                private_key='{ "client_email" : 1, "private_key" : True }')

    def test_read_gbq_with_empty_private_key_file_should_fail(self):
        with tm.ensure_clean() as empty_file_path:
            with tm.assertRaises(gbq.InvalidPrivateKeyFormat):
                gbq.read_gbq('SELECT 1', project_id='x',
                             private_key=empty_file_path)

    def test_read_gbq_with_corrupted_private_key_json_should_fail(self):
        _skip_if_no_private_key_contents()

        with tm.assertRaises(gbq.InvalidPrivateKeyFormat):
            gbq.read_gbq(
                'SELECT 1', project_id='x',
                private_key=re.sub('[a-z]', '9', PRIVATE_KEY_JSON_CONTENTS))
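
# The integration tests below exercise gbq.read_gbq() against a live BigQuery
# project. The basic call pattern they rely on is (illustrative only; the
# private_key argument is optional):
#
#   df = gbq.read_gbq('SELECT "PI" as VALID_STRING', project_id=PROJECT_ID,
#                     private_key=PRIVATE_KEY_JSON_PATH)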


class TestReadGBQIntegration(tm.TestCase):

    @classmethod
    def setUpClass(cls):
        # - GLOBAL CLASS FIXTURES -
        # put here any instruction you want to execute only *ONCE* *BEFORE*
        # executing *ALL* tests described below.

        _skip_if_no_project_id()

        test_requirements()

    def setUp(self):
        # - PER-TEST FIXTURES -
        # put here any instruction you want to be run *BEFORE* *EVERY* test is
        # executed.
        pass

    @classmethod
    def tearDownClass(cls):
        # - GLOBAL CLASS FIXTURES -
        # put here any instruction you want to execute only *ONCE* *AFTER*
        # executing all tests.
        pass

    def tearDown(self):
        # - PER-TEST FIXTURES -
        # put here any instructions you want to be run *AFTER* *EVERY* test is
        # executed.
        pass

    def test_should_read_as_service_account_with_key_path(self):
        _skip_if_no_private_key_path()
        query = 'SELECT "PI" as VALID_STRING'
        df = gbq.read_gbq(query, project_id=PROJECT_ID,
                          private_key=PRIVATE_KEY_JSON_PATH)
        tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']}))

    def test_should_read_as_service_account_with_key_contents(self):
        _skip_if_no_private_key_contents()
        query = 'SELECT "PI" as VALID_STRING'
        df = gbq.read_gbq(query, project_id=PROJECT_ID,
                          private_key=PRIVATE_KEY_JSON_CONTENTS)
        tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']}))

    def test_should_properly_handle_valid_strings(self):
        query = 'SELECT "PI" as VALID_STRING'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']}))

    def test_should_properly_handle_empty_strings(self):
        query = 'SELECT "" as EMPTY_STRING'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame({'EMPTY_STRING': [""]}))

    def test_should_properly_handle_null_strings(self):
        query = 'SELECT STRING(NULL) as NULL_STRING'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame({'NULL_STRING': [None]}))

    def test_should_properly_handle_valid_integers(self):
        query = 'SELECT INTEGER(3) as VALID_INTEGER'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame({'VALID_INTEGER': [3]}))

    def test_should_properly_handle_null_integers(self):
        query = 'SELECT INTEGER(NULL) as NULL_INTEGER'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame({'NULL_INTEGER': [np.nan]}))

    def test_should_properly_handle_valid_floats(self):
        query = 'SELECT PI() as VALID_FLOAT'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame(
            {'VALID_FLOAT': [3.141592653589793]}))

    def test_should_properly_handle_null_floats(self):
        query = 'SELECT FLOAT(NULL) as NULL_FLOAT'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame({'NULL_FLOAT': [np.nan]}))

    def test_should_properly_handle_timestamp_unix_epoch(self):
        query = 'SELECT TIMESTAMP("1970-01-01 00:00:00") as UNIX_EPOCH'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame(
            {'UNIX_EPOCH': [np.datetime64('1970-01-01T00:00:00.000000Z')]}))

    def test_should_properly_handle_arbitrary_timestamp(self):
        query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") as VALID_TIMESTAMP'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame({
            'VALID_TIMESTAMP': [np.datetime64('2004-09-15T05:00:00.000000Z')]
        }))

    def test_should_properly_handle_null_timestamp(self):
        query = 'SELECT TIMESTAMP(NULL) as NULL_TIMESTAMP'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame({'NULL_TIMESTAMP': [NaT]}))

    def test_should_properly_handle_true_boolean(self):
        query = 'SELECT BOOLEAN(TRUE) as TRUE_BOOLEAN'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame({'TRUE_BOOLEAN': [True]}))

    def test_should_properly_handle_false_boolean(self):
        query = 'SELECT BOOLEAN(FALSE) as FALSE_BOOLEAN'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame({'FALSE_BOOLEAN': [False]}))

    def test_should_properly_handle_null_boolean(self):
        query = 'SELECT BOOLEAN(NULL) as NULL_BOOLEAN'
        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, DataFrame({'NULL_BOOLEAN': [None]}))

    def test_unicode_string_conversion_and_normalization(self):
        correct_test_datatype = DataFrame(
            {'UNICODE_STRING': [u("\xe9\xfc")]}
        )

        unicode_string = "\xc3\xa9\xc3\xbc"

        if compat.PY3:
            unicode_string = unicode_string.encode('latin-1').decode('utf8')

        query = 'SELECT "{0}" as UNICODE_STRING'.format(unicode_string)

        df = gbq.read_gbq(query, project_id=PROJECT_ID)
        tm.assert_frame_equal(df, correct_test_datatype)

    def test_index_column(self):
        query = "SELECT 'a' as STRING_1, 'b' as STRING_2"
        result_frame = gbq.read_gbq(
            query, project_id=PROJECT_ID, index_col="STRING_1")
        correct_frame = DataFrame(
            {'STRING_1': ['a'], 'STRING_2': ['b']}).set_index("STRING_1")
        tm.assert_equal(result_frame.index.name, correct_frame.index.name)

    def test_column_order(self):
        query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3"
        col_order = ['STRING_3', 'STRING_1', 'STRING_2']
        result_frame = gbq.read_gbq(
            query, project_id=PROJECT_ID, col_order=col_order)
        correct_frame = DataFrame({'STRING_1': ['a'], 'STRING_2': ['b'],
                                   'STRING_3': ['c']})[col_order]
        tm.assert_frame_equal(result_frame, correct_frame)

    def test_column_order_plus_index(self):
        query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3"
        col_order = ['STRING_3', 'STRING_2']
        result_frame = gbq.read_gbq(query, project_id=PROJECT_ID,
                                    index_col='STRING_1', col_order=col_order)
        correct_frame = DataFrame(
            {'STRING_1': ['a'], 'STRING_2': ['b'], 'STRING_3': ['c']})
        correct_frame.set_index('STRING_1', inplace=True)
        correct_frame = correct_frame[col_order]
        tm.assert_frame_equal(result_frame, correct_frame)

    def test_malformed_query(self):
        with tm.assertRaises(gbq.GenericGBQException):
            gbq.read_gbq("SELCET * FORM [publicdata:samples.shakespeare]",
                         project_id=PROJECT_ID)

    def test_bad_project_id(self):
        with tm.assertRaises(gbq.GenericGBQException):
            gbq.read_gbq("SELECT 1", project_id='001')

    def test_bad_table_name(self):
        with tm.assertRaises(gbq.GenericGBQException):
            gbq.read_gbq("SELECT * FROM [publicdata:samples.nope]",
                         project_id=PROJECT_ID)

    def test_download_dataset_larger_than_200k_rows(self):
        test_size = 200005
        # Test for known BigQuery bug in datasets larger than 100k rows
        # http://stackoverflow.com/questions/19145587/bq-py-not-paging-results
        df = gbq.read_gbq("SELECT id FROM [publicdata:samples.wikipedia] "
                          "GROUP EACH BY id ORDER BY id ASC LIMIT {0}"
                          .format(test_size),
                          project_id=PROJECT_ID)
        self.assertEqual(len(df.drop_duplicates()), test_size)

    def test_zero_rows(self):
        # Bug fix for https://github.com/pydata/pandas/issues/10273
        df = gbq.read_gbq("SELECT title, id "
                          "FROM [publicdata:samples.wikipedia] "
                          "WHERE timestamp=-9999999",
                          project_id=PROJECT_ID)
        page_array = np.zeros(
            (0,), dtype=[('title', object), ('id', np.dtype(float))])
        expected_result = DataFrame(page_array, columns=['title', 'id'])
        self.assert_frame_equal(df, expected_result)

    def test_legacy_sql(self):
        legacy_sql = "SELECT id FROM [publicdata.samples.wikipedia] LIMIT 10"

        # Test that a legacy sql statement fails when
        # setting dialect='standard'
        with tm.assertRaises(gbq.GenericGBQException):
            gbq.read_gbq(legacy_sql, project_id=PROJECT_ID,
                         dialect='standard')

        # Test that a legacy sql statement succeeds when
        # setting dialect='legacy'
        df = gbq.read_gbq(legacy_sql, project_id=PROJECT_ID,
                          dialect='legacy')
        self.assertEqual(len(df.drop_duplicates()), 10)

    def test_standard_sql(self):
        standard_sql = "SELECT DISTINCT id FROM " \
                       "`publicdata.samples.wikipedia` LIMIT 10"

        # Test that a standard sql statement fails when using
        # the legacy SQL dialect (default value)
        with tm.assertRaises(gbq.GenericGBQException):
            gbq.read_gbq(standard_sql, project_id=PROJECT_ID)

        # Test that a standard sql statement succeeds when
        # setting dialect='standard'
        df = gbq.read_gbq(standard_sql, project_id=PROJECT_ID,
                          dialect='standard')
        self.assertEqual(len(df.drop_duplicates()), 10)

    def test_invalid_option_for_sql_dialect(self):
        sql_statement = "SELECT DISTINCT id FROM " \
                        "`publicdata.samples.wikipedia` LIMIT 10"

        # Test that an invalid option for `dialect` raises ValueError
        with tm.assertRaises(ValueError):
            gbq.read_gbq(sql_statement, project_id=PROJECT_ID,
                         dialect='invalid')

        # Test that a correct option for dialect succeeds
        # to make sure ValueError was due to invalid dialect
        gbq.read_gbq(sql_statement, project_id=PROJECT_ID,
                     dialect='standard')


class TestToGBQIntegration(tm.TestCase):
    # Changes to BigQuery table schema may take up to 2 minutes as of May 2015
    # As a workaround to this issue, each test should use a unique table name.
    # Make sure to modify the for loop range in the tearDownClass when a new
    # test is added. See `Issue 191
    # <https://code.google.com/p/google-bigquery/issues/detail?id=191>`__

    @classmethod
    def setUpClass(cls):
        # - GLOBAL CLASS FIXTURES -
        # put here any instruction you want to execute only *ONCE* *BEFORE*
        # executing *ALL* tests described below.

        _skip_if_no_project_id()

        test_requirements()
        clean_gbq_environment()

        gbq._Dataset(PROJECT_ID).create(DATASET_ID + "1")

    def setUp(self):
        # - PER-TEST FIXTURES -
        # put here any instruction you want to be run *BEFORE* *EVERY* test is
        # executed.
        self.dataset = gbq._Dataset(PROJECT_ID)
        self.table = gbq._Table(PROJECT_ID, DATASET_ID + "1")

    @classmethod
    def tearDownClass(cls):
        # - GLOBAL CLASS FIXTURES -
        # put here any instruction you want to execute only *ONCE* *AFTER*
        # executing all tests.
        clean_gbq_environment()

    def tearDown(self):
        # - PER-TEST FIXTURES -
        # put here any instructions you want to be run *AFTER* *EVERY* test is
        # executed.
        pass

    def test_upload_data(self):
        destination_table = DESTINATION_TABLE + "1"

        test_size = 20001
        df = make_mixed_dataframe_v2(test_size)

        gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000)

        sleep(30)  # <- Curses Google!!!

        result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}"
                              .format(destination_table),
                              project_id=PROJECT_ID)
        self.assertEqual(result['NUM_ROWS'][0], test_size)
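
    # The fixed 30s pause after to_gbq() above is presumably needed because
    # rows written via BigQuery streaming inserts are not immediately visible
    # to queries; without it the COUNT(*) check can run before all chunks
    # have become queryable.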

    def test_upload_data_if_table_exists_fail(self):
        destination_table = DESTINATION_TABLE + "2"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        self.table.create(TABLE_ID + "2", gbq._generate_bq_schema(df))

        # Test the default value of if_exists is 'fail'
        with tm.assertRaises(gbq.TableCreationError):
            gbq.to_gbq(df, destination_table, PROJECT_ID)

        # Test the if_exists parameter with value 'fail'
        with tm.assertRaises(gbq.TableCreationError):
            gbq.to_gbq(df, destination_table, PROJECT_ID, if_exists='fail')

    def test_upload_data_if_table_exists_append(self):
        destination_table = DESTINATION_TABLE + "3"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        df_different_schema = tm.makeMixedDataFrame()

        # Initialize table with sample data
        gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000)

        # Test the if_exists parameter with value 'append'
        gbq.to_gbq(df, destination_table, PROJECT_ID, if_exists='append')

        sleep(30)  # <- Curses Google!!!

        result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}"
                              .format(destination_table),
                              project_id=PROJECT_ID)
        self.assertEqual(result['NUM_ROWS'][0], test_size * 2)

        # Try inserting with a different schema, confirm failure
        with tm.assertRaises(gbq.InvalidSchema):
            gbq.to_gbq(df_different_schema, destination_table,
                       PROJECT_ID, if_exists='append')

    def test_upload_data_if_table_exists_replace(self):
        destination_table = DESTINATION_TABLE + "4"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        df_different_schema = tm.makeMixedDataFrame()

        # Initialize table with sample data
        gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000)

        # Test the if_exists parameter with the value 'replace'.
        gbq.to_gbq(df_different_schema, destination_table,
                   PROJECT_ID, if_exists='replace')

        sleep(30)  # <- Curses Google!!!

        result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}"
                              .format(destination_table),
                              project_id=PROJECT_ID)
        self.assertEqual(result['NUM_ROWS'][0], 5)

    def test_google_upload_errors_should_raise_exception(self):
        destination_table = DESTINATION_TABLE + "5"

        test_timestamp = datetime.now(pytz.timezone('US/Arizona'))
        bad_df = DataFrame({'bools': [False, False], 'flts': [0.0, 1.0],
                            'ints': [0, '1'], 'strs': ['a', 1],
                            'times': [test_timestamp, test_timestamp]},
                           index=range(2))

        with tm.assertRaises(gbq.StreamingInsertError):
            gbq.to_gbq(bad_df, destination_table, PROJECT_ID, verbose=True)

    def test_generate_schema(self):
        df = tm.makeMixedDataFrame()
        schema = gbq._generate_bq_schema(df)

        test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'},
                                  {'name': 'B', 'type': 'FLOAT'},
                                  {'name': 'C', 'type': 'STRING'},
                                  {'name': 'D', 'type': 'TIMESTAMP'}]}

        self.assertEqual(schema, test_schema)

    def test_create_table(self):
        destination_table = TABLE_ID + "6"
        test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'},
                                  {'name': 'B', 'type': 'FLOAT'},
                                  {'name': 'C', 'type': 'STRING'},
                                  {'name': 'D', 'type': 'TIMESTAMP'}]}
        self.table.create(destination_table, test_schema)
        self.assertTrue(self.table.exists(destination_table),
                        'Expected table to exist')

    def test_table_does_not_exist(self):
        self.assertTrue(not self.table.exists(TABLE_ID + "7"),
                        'Expected table not to exist')

    def test_delete_table(self):
        destination_table = TABLE_ID + "8"
        test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'},
                                  {'name': 'B', 'type': 'FLOAT'},
                                  {'name': 'C', 'type': 'STRING'},
                                  {'name': 'D', 'type': 'TIMESTAMP'}]}
        self.table.create(destination_table, test_schema)
        self.table.delete(destination_table)
        self.assertTrue(not self.table.exists(destination_table),
                        'Expected table not to exist')

    def test_list_table(self):
        destination_table = TABLE_ID + "9"
        test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'},
                                  {'name': 'B', 'type': 'FLOAT'},
                                  {'name': 'C', 'type': 'STRING'},
                                  {'name': 'D', 'type': 'TIMESTAMP'}]}
        self.table.create(destination_table, test_schema)
        self.assertTrue(
            destination_table in self.dataset.tables(DATASET_ID + "1"),
            'Expected table list to contain table {0}'
            .format(destination_table))

    def test_list_dataset(self):
        dataset_id = DATASET_ID + "1"
        self.assertTrue(dataset_id in self.dataset.datasets(),
                        'Expected dataset list to contain dataset {0}'
                        .format(dataset_id))

    def test_list_table_zero_results(self):
        dataset_id = DATASET_ID + "2"
        self.dataset.create(dataset_id)
        table_list = gbq._Dataset(PROJECT_ID).tables(dataset_id)
        self.assertEqual(len(table_list), 0,
                         'Expected gbq.list_table() to return 0')

    def test_create_dataset(self):
        dataset_id = DATASET_ID + "3"
        self.dataset.create(dataset_id)
        self.assertTrue(dataset_id in self.dataset.datasets(),
                        'Expected dataset to exist')

    def test_delete_dataset(self):
        dataset_id = DATASET_ID + "4"
        self.dataset.create(dataset_id)
        self.dataset.delete(dataset_id)
        self.assertTrue(dataset_id not in self.dataset.datasets(),
                        'Expected dataset not to exist')

    def test_dataset_exists(self):
        dataset_id = DATASET_ID + "5"
        self.dataset.create(dataset_id)
        self.assertTrue(self.dataset.exists(dataset_id),
                        'Expected dataset to exist')

    def create_table_data_dataset_does_not_exist(self):
        dataset_id = DATASET_ID + "6"
        table_id = TABLE_ID + "1"
        table_with_new_dataset = gbq._Table(PROJECT_ID, dataset_id)
        df = make_mixed_dataframe_v2(10)
        table_with_new_dataset.create(table_id, gbq._generate_bq_schema(df))
        self.assertTrue(self.dataset.exists(dataset_id),
                        'Expected dataset to exist')
        self.assertTrue(table_with_new_dataset.exists(table_id),
                        'Expected table to exist')

    def test_dataset_does_not_exist(self):
        self.assertTrue(not self.dataset.exists(DATASET_ID + "_not_found"),
                        'Expected dataset not to exist')


class TestToGBQIntegrationServiceAccountKeyPath(tm.TestCase):
    # Changes to BigQuery table schema may take up to 2 minutes as of May 2015
    # As a workaround to this issue, each test should use a unique table name.
    # Make sure to modify the for loop range in the tearDownClass when a new
    # test is added
    # See `Issue 191
    # <https://code.google.com/p/google-bigquery/issues/detail?id=191>`__

    @classmethod
    def setUpClass(cls):
        # - GLOBAL CLASS FIXTURES -
        # put here any instruction you want to execute only *ONCE* *BEFORE*
        # executing *ALL* tests described below.

        _skip_if_no_project_id()
        _skip_if_no_private_key_path()

        test_requirements()
        clean_gbq_environment(PRIVATE_KEY_JSON_PATH)

    def setUp(self):
        # - PER-TEST FIXTURES -
        # put here any instruction you want to be run *BEFORE* *EVERY* test
        # is executed.
        pass

    @classmethod
    def tearDownClass(cls):
        # - GLOBAL CLASS FIXTURES -
        # put here any instruction you want to execute only *ONCE* *AFTER*
        # executing all tests.
        clean_gbq_environment(PRIVATE_KEY_JSON_PATH)

    def tearDown(self):
        # - PER-TEST FIXTURES -
        # put here any instructions you want to be run *AFTER* *EVERY* test
        # is executed.
        pass

    def test_upload_data_as_service_account_with_key_path(self):
        destination_table = DESTINATION_TABLE + "11"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)

        gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000,
                   private_key=PRIVATE_KEY_JSON_PATH)

        sleep(30)  # <- Curses Google!!!

        result = gbq.read_gbq(
            "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table),
            project_id=PROJECT_ID,
            private_key=PRIVATE_KEY_JSON_PATH)

        self.assertEqual(result['NUM_ROWS'][0], test_size)


class TestToGBQIntegrationServiceAccountKeyContents(tm.TestCase):
    # Changes to BigQuery table schema may take up to 2 minutes as of May 2015
    # As a workaround to this issue, each test should use a unique table name.
    # Make sure to modify the for loop range in the tearDownClass when a new
    # test is added
    # See `Issue 191
    # <https://code.google.com/p/google-bigquery/issues/detail?id=191>`__

    @classmethod
    def setUpClass(cls):
        # - GLOBAL CLASS FIXTURES -
        # put here any instruction you want to execute only *ONCE* *BEFORE*
        # executing *ALL* tests described below.

        _skip_if_no_project_id()
        _skip_if_no_private_key_contents()

        test_requirements()
        clean_gbq_environment(PRIVATE_KEY_JSON_CONTENTS)

    def setUp(self):
        # - PER-TEST FIXTURES -
        # put here any instruction you want to be run *BEFORE* *EVERY* test
        # is executed.
        pass

    @classmethod
    def tearDownClass(cls):
        # - GLOBAL CLASS FIXTURES -
        # put here any instruction you want to execute only *ONCE* *AFTER*
        # executing all tests.
        clean_gbq_environment(PRIVATE_KEY_JSON_CONTENTS)

    def tearDown(self):
        # - PER-TEST FIXTURES -
        # put here any instructions you want to be run *AFTER* *EVERY* test
        # is executed.
        pass

    def test_upload_data_as_service_account_with_key_contents(self):
        destination_table = DESTINATION_TABLE + "12"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)

        gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000,
                   private_key=PRIVATE_KEY_JSON_CONTENTS)

        sleep(30)  # <- Curses Google!!!

        result = gbq.read_gbq(
            "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table),
            project_id=PROJECT_ID,
            private_key=PRIVATE_KEY_JSON_CONTENTS)

        self.assertEqual(result['NUM_ROWS'][0], test_size)


if __name__ == '__main__':
    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                   exit=False)