PageRenderTime 51ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/pandas/io/tests/test_gbq.py

http://github.com/pydata/pandas
Python | 290 lines | 261 code | 29 blank | 0 comment | 9 complexity | 48dafdcd850fcab03a5a7d1cbc825d4d MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. import ast
  2. import datetime
  3. import json
  4. import nose
  5. import os
  6. import pytz
  7. import shutil
  8. import subprocess
  9. import sys
  10. import platform
  11. from time import sleep
  12. import numpy as np
  13. from pandas import NaT
  14. from pandas.compat import u
  15. from pandas.core.frame import DataFrame
  16. import pandas.io.gbq as gbq
  17. import pandas.util.testing as tm
  18. PROJECT_ID = None
  19. VERSION = platform.python_version()
  20. def missing_bq():
  21. try:
  22. subprocess.call('bq')
  23. return False
  24. except OSError:
  25. return True
  26. def test_requirements():
  27. try:
  28. gbq._test_imports()
  29. except (ImportError, NotImplementedError) as import_exception:
  30. raise nose.SkipTest(import_exception)
  31. class TestGBQConnectorIntegration(tm.TestCase):
  32. def setUp(self):
  33. test_requirements()
  34. if not PROJECT_ID:
  35. raise nose.SkipTest("Cannot run integration tests without a project id")
  36. self.sut = gbq.GbqConnector(PROJECT_ID)
  37. def test_should_be_able_to_make_a_connector(self):
  38. self.assertTrue(self.sut is not None, 'Could not create a GbqConnector')
  39. def test_should_be_able_to_get_valid_credentials(self):
  40. credentials = self.sut.get_credentials()
  41. self.assertFalse(credentials.invalid, 'Returned credentials invalid')
  42. def test_should_be_able_to_get_a_bigquery_service(self):
  43. credentials = self.sut.get_credentials()
  44. bigquery_service = self.sut.get_service(credentials)
  45. self.assertTrue(bigquery_service is not None, 'No service returned')
  46. def test_should_be_able_to_get_schema_from_query(self):
  47. schema, pages = self.sut.run_query('SELECT 1')
  48. self.assertTrue(schema is not None)
  49. def test_should_be_able_to_get_results_from_query(self):
  50. schema, pages = self.sut.run_query('SELECT 1')
  51. self.assertTrue(pages is not None)
  52. class TestReadGBQUnitTests(tm.TestCase):
  53. def setUp(self):
  54. test_requirements()
  55. def test_should_return_bigquery_integers_as_python_floats(self):
  56. result = gbq._parse_entry(1, 'INTEGER')
  57. tm.assert_equal(result, float(1))
  58. def test_should_return_bigquery_floats_as_python_floats(self):
  59. result = gbq._parse_entry(1, 'FLOAT')
  60. tm.assert_equal(result, float(1))
  61. def test_should_return_bigquery_timestamps_as_numpy_datetime(self):
  62. result = gbq._parse_entry('0e9', 'TIMESTAMP')
  63. tm.assert_equal(result, np.datetime64('1970-01-01T00:00:00Z'))
  64. def test_should_return_bigquery_booleans_as_python_booleans(self):
  65. result = gbq._parse_entry('false', 'BOOLEAN')
  66. tm.assert_equal(result, False)
  67. def test_should_return_bigquery_strings_as_python_strings(self):
  68. result = gbq._parse_entry('STRING', 'STRING')
  69. tm.assert_equal(result, 'STRING')
  70. def test_to_gbq_should_fail_if_invalid_table_name_passed(self):
  71. with tm.assertRaises(gbq.NotFoundException):
  72. gbq.to_gbq(DataFrame(), 'invalid_table_name', project_id="1234")
  73. def test_to_gbq_with_no_project_id_given_should_fail(self):
  74. with tm.assertRaises(TypeError):
  75. gbq.to_gbq(DataFrame(), 'dataset.tablename')
  76. def test_read_gbq_with_no_project_id_given_should_fail(self):
  77. with tm.assertRaises(TypeError):
  78. gbq.read_gbq('SELECT "1" as NUMBER_1')
  79. def test_that_parse_data_works_properly(self):
  80. test_schema = {'fields': [{'mode': 'NULLABLE',
  81. 'name': 'VALID_STRING',
  82. 'type': 'STRING'}]}
  83. test_page = [{'f': [{'v': 'PI'}]}]
  84. test_output = gbq._parse_data(test_schema, test_page)
  85. correct_output = DataFrame({'VALID_STRING' : ['PI']})
  86. tm.assert_frame_equal(test_output, correct_output)
  87. class TestReadGBQIntegration(tm.TestCase):
  88. def setUp(self):
  89. test_requirements()
  90. if not PROJECT_ID:
  91. raise nose.SkipTest("Cannot run integration tests without a project id")
  92. def test_should_properly_handle_valid_strings(self):
  93. query = 'SELECT "PI" as VALID_STRING'
  94. df = gbq.read_gbq(query, project_id=PROJECT_ID)
  95. tm.assert_frame_equal(df, DataFrame({'VALID_STRING' : ['PI']}))
  96. def test_should_properly_handle_empty_strings(self):
  97. query = 'SELECT "" as EMPTY_STRING'
  98. df = gbq.read_gbq(query, project_id=PROJECT_ID)
  99. tm.assert_frame_equal(df, DataFrame({'EMPTY_STRING' : [""]}))
  100. def test_should_properly_handle_null_strings(self):
  101. query = 'SELECT STRING(NULL) as NULL_STRING'
  102. df = gbq.read_gbq(query, project_id=PROJECT_ID)
  103. tm.assert_frame_equal(df, DataFrame({'NULL_STRING' : [None]}))
  104. def test_should_properly_handle_valid_integers(self):
  105. query = 'SELECT INTEGER(3) as VALID_INTEGER'
  106. df = gbq.read_gbq(query, project_id=PROJECT_ID)
  107. tm.assert_frame_equal(df, DataFrame({'VALID_INTEGER' : [3]}))
  108. def test_should_properly_handle_null_integers(self):
  109. query = 'SELECT INTEGER(NULL) as NULL_INTEGER'
  110. df = gbq.read_gbq(query, project_id=PROJECT_ID)
  111. tm.assert_frame_equal(df, DataFrame({'NULL_INTEGER' : [np.nan]}))
  112. def test_should_properly_handle_valid_floats(self):
  113. query = 'SELECT PI() as VALID_FLOAT'
  114. df = gbq.read_gbq(query, project_id=PROJECT_ID)
  115. tm.assert_frame_equal(df, DataFrame({'VALID_FLOAT' : [3.141592653589793]}))
  116. def test_should_properly_handle_null_floats(self):
  117. query = 'SELECT FLOAT(NULL) as NULL_FLOAT'
  118. df = gbq.read_gbq(query, project_id=PROJECT_ID)
  119. tm.assert_frame_equal(df, DataFrame({'NULL_FLOAT' : [np.nan]}))
  120. def test_should_properly_handle_timestamp_unix_epoch(self):
  121. query = 'SELECT TIMESTAMP("1970-01-01 00:00:00") as UNIX_EPOCH'
  122. df = gbq.read_gbq(query, project_id=PROJECT_ID)
  123. tm.assert_frame_equal(df, DataFrame({'UNIX_EPOCH' : [np.datetime64('1970-01-01T00:00:00.000000Z')]}))
  124. def test_should_properly_handle_arbitrary_timestamp(self):
  125. query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") as VALID_TIMESTAMP'
  126. df = gbq.read_gbq(query, project_id=PROJECT_ID)
  127. tm.assert_frame_equal(df, DataFrame({'VALID_TIMESTAMP' : [np.datetime64('2004-09-15T05:00:00.000000Z')]}))
  128. def test_should_properly_handle_null_timestamp(self):
  129. query = 'SELECT TIMESTAMP(NULL) as NULL_TIMESTAMP'
  130. df = gbq.read_gbq(query, project_id=PROJECT_ID)
  131. tm.assert_frame_equal(df, DataFrame({'NULL_TIMESTAMP' :[NaT]}))
  132. def test_should_properly_handle_true_boolean(self):
  133. query = 'SELECT BOOLEAN(TRUE) as TRUE_BOOLEAN'
  134. df = gbq.read_gbq(query, project_id=PROJECT_ID)
  135. tm.assert_frame_equal(df, DataFrame({'TRUE_BOOLEAN' : [True]}))
  136. def test_should_properly_handle_false_boolean(self):
  137. query = 'SELECT BOOLEAN(FALSE) as FALSE_BOOLEAN'
  138. df = gbq.read_gbq(query, project_id=PROJECT_ID)
  139. tm.assert_frame_equal(df, DataFrame({'FALSE_BOOLEAN' : [False]}))
  140. def test_should_properly_handle_null_boolean(self):
  141. query = 'SELECT BOOLEAN(NULL) as NULL_BOOLEAN'
  142. df = gbq.read_gbq(query, project_id=PROJECT_ID)
  143. tm.assert_frame_equal(df, DataFrame({'NULL_BOOLEAN' : [None]}))
  144. def test_unicode_string_conversion_and_normalization(self):
  145. correct_test_datatype = DataFrame(
  146. {'UNICODE_STRING' : [u("\xe9\xfc")]}
  147. )
  148. query = 'SELECT "\xc3\xa9\xc3\xbc" as UNICODE_STRING'
  149. df = gbq.read_gbq(query, project_id=PROJECT_ID)
  150. tm.assert_frame_equal(df, correct_test_datatype)
  151. def test_index_column(self):
  152. query = "SELECT 'a' as STRING_1, 'b' as STRING_2"
  153. result_frame = gbq.read_gbq(query, project_id=PROJECT_ID, index_col="STRING_1")
  154. correct_frame = DataFrame({'STRING_1' : ['a'], 'STRING_2' : ['b']}).set_index("STRING_1")
  155. tm.assert_equal(result_frame.index.name, correct_frame.index.name)
  156. def test_column_order(self):
  157. query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3"
  158. col_order = ['STRING_3', 'STRING_1', 'STRING_2']
  159. result_frame = gbq.read_gbq(query, project_id=PROJECT_ID, col_order=col_order)
  160. correct_frame = DataFrame({'STRING_1' : ['a'], 'STRING_2' : ['b'], 'STRING_3' : ['c']})[col_order]
  161. tm.assert_frame_equal(result_frame, correct_frame)
  162. def test_column_order_plus_index(self):
  163. query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3"
  164. col_order = ['STRING_3', 'STRING_2']
  165. result_frame = gbq.read_gbq(query, project_id=PROJECT_ID, index_col='STRING_1', col_order=col_order)
  166. correct_frame = DataFrame({'STRING_1' : ['a'], 'STRING_2' : ['b'], 'STRING_3' : ['c']})
  167. correct_frame.set_index('STRING_1', inplace=True)
  168. correct_frame = correct_frame[col_order]
  169. tm.assert_frame_equal(result_frame, correct_frame)
  170. def test_malformed_query(self):
  171. with tm.assertRaises(gbq.InvalidQueryException):
  172. gbq.read_gbq("SELCET * FORM [publicdata:samples.shakespeare]", project_id=PROJECT_ID)
  173. def test_bad_project_id(self):
  174. with tm.assertRaises(gbq.NotFoundException):
  175. gbq.read_gbq("SELECT 1", project_id='001')
  176. def test_bad_table_name(self):
  177. with tm.assertRaises(gbq.NotFoundException):
  178. gbq.read_gbq("SELECT * FROM [publicdata:samples.nope]", project_id=PROJECT_ID)
  179. def test_download_dataset_larger_than_200k_rows(self):
  180. # Test for known BigQuery bug in datasets larger than 100k rows
  181. # http://stackoverflow.com/questions/19145587/bq-py-not-paging-results
  182. df = gbq.read_gbq("SELECT id FROM [publicdata:samples.wikipedia] GROUP EACH BY id ORDER BY id ASC LIMIT 200005", project_id=PROJECT_ID)
  183. self.assertEqual(len(df.drop_duplicates()), 200005)
  184. class TestToGBQIntegration(tm.TestCase):
  185. # This class requires bq.py to be installed for setup/teardown.
  186. # It will also need to be preconfigured with a default dataset,
  187. # so, be sure to `bq init` in terminal before running.
  188. def setUp(self):
  189. test_requirements()
  190. if not PROJECT_ID:
  191. raise nose.SkipTest("Cannot run integration tests without a project id")
  192. if missing_bq():
  193. raise nose.SkipTest("Cannot run to_gbq tests without bq command line client")
  194. @classmethod
  195. def setUpClass(cls):
  196. if PROJECT_ID and not missing_bq():
  197. subprocess.call(['bq','mk','pydata_pandas_bq_testing'])
  198. subprocess.call(['bq','mk','pydata_pandas_bq_testing.new_test','bools:BOOLEAN,flts:FLOAT,ints:INTEGER,strs:STRING,times:TIMESTAMP'])
  199. def test_upload_data(self):
  200. test_size = 1000001
  201. #create df to test for all BQ datatypes except RECORD
  202. bools = np.random.randint(2, size=(1,test_size)).astype(bool)
  203. flts = np.random.randn(1,test_size)
  204. ints = np.random.randint(1,10, size=(1,test_size))
  205. strs = np.random.randint(1,10, size=(1,test_size)).astype(str)
  206. times = [datetime.datetime.now(pytz.timezone('US/Arizona')) for t in xrange(test_size)]
  207. df = DataFrame({'bools':bools[0], 'flts':flts[0], 'ints':ints[0], 'strs':strs[0], 'times':times[0]}, index=range(test_size))
  208. gbq.to_gbq(df,"pydata_pandas_bq_testing.new_test", project_id=PROJECT_ID, chunksize=10000)
  209. sleep(60) # <- Curses Google!!!
  210. result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM pydata_pandas_bq_testing.new_test", project_id=PROJECT_ID)
  211. self.assertEqual(result['NUM_ROWS'][0], test_size)
  212. def test_google_upload_errors_should_raise_exception(self):
  213. test_timestamp = datetime.datetime.now(pytz.timezone('US/Arizona'))
  214. bad_df = DataFrame( {'bools': [False, False],
  215. 'flts': [0.0,1.0],
  216. 'ints': [0,'1'],
  217. 'strs': ['a', 1],
  218. 'times': [test_timestamp, test_timestamp]
  219. }, index=range(2))
  220. with tm.assertRaises(gbq.UnknownGBQException):
  221. gbq.to_gbq(bad_df, 'pydata_pandas_bq_testing.new_test', project_id = PROJECT_ID)
  222. @classmethod
  223. def tearDownClass(cls):
  224. if PROJECT_ID and not missing_bq():
  225. subprocess.call(['bq','rm','-f','pydata_pandas_bq_testing.new_test'])
  226. subprocess.call(['bq','rm','-f','pydata_pandas_bq_testing'])
if __name__ == '__main__':
    # Run this module's tests under nose: -vvs for verbose unbuffered output,
    # -x to stop at the first failure, and --pdb/--pdb-failure to drop into
    # the debugger on errors and failures.
    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                   exit=False)