
/pandas/io/gbq.py

https://github.com/ajcr/pandas
"""
Pandas module to interface with Google BigQuery.
"""
import os
import sys
import tempfile
import csv
import logging
from datetime import datetime

import pkg_resources
from distutils.version import LooseVersion
from pandas.compat import u

import pandas as pd
import numpy as np

from pandas.core.common import PandasError
from pandas.core.frame import DataFrame
from pandas.tools.merge import concat

try:
    import bq
    import bigquery_client
    import gflags as flags

    _BQ_INSTALLED = True

    _BQ_VERSION = pkg_resources.get_distribution('bigquery').version
    if LooseVersion(_BQ_VERSION) >= '2.0.17':
        _BQ_VALID_VERSION = True
    else:
        _BQ_VALID_VERSION = False

except ImportError:
    _BQ_INSTALLED = False


# Setup the logger
logger = logging.getLogger('pandas.io.gbq')

# These are some custom exceptions that the
# to_gbq() method can throw


class SchemaMissing(PandasError, IOError):
    """
    Raised when attempting to write a DataFrame to
    a new table in Google BigQuery without specifying
    a schema describing the DataFrame.
    """
    pass


class InvalidSchema(PandasError, IOError):
    """
    Raised when attempting to write a DataFrame to
    Google BigQuery with an invalid table schema.
    """
    pass


class TableExistsFail(PandasError, IOError):
    """
    Raised when attempting to write a DataFrame to
    an existing Google BigQuery table without specifying
    that a replace/update action be taken.
    """
    pass


class InvalidColumnOrder(PandasError, IOError):
    """
    Raised when the provided column order for output
    results DataFrame does not match the schema
    returned by BigQuery.
    """
    pass


def _authenticate():
    """
    For testing, we abstract the authentication to BigQuery API.
    Presently this is implemented using the bq.py Client.Get()
    method. Any exceptions raised are considered fatal, so we
    do not process them.

    Returns
    -------
    BigqueryClient : Configured connection to Google BigQuery
    """
    return bq.Client.Get()


def _parse_entry(field_value, field_type):
    """
    Given a value and the corresponding BigQuery data type,
    perform any operations needed and return in a format
    appropriate for a numpy record dictionary

    Parameters
    ----------
    field_value : Source object to be transformed
    field_type : String representation of Google BigQuery
                 data type (per schema)

    Returns
    -------
    field_value : object or primitive of type corresponding
                  to field_type
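
    Examples
    --------
    Illustrative conversions only; the API returns field values as
    strings, so numbers and booleans are given here in string form:

    >>> _parse_entry('3', 'INTEGER')
    3.0
    >>> _parse_entry('2.5', 'FLOAT')
    2.5
    >>> _parse_entry('true', 'BOOLEAN')
    True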
    """
    # Avoid any casting problems
    if field_value is None or field_value == 'null':
        return None
    if field_type == 'INTEGER' or field_type == 'FLOAT':
        field_value = float(field_value)
    elif field_type == 'TIMESTAMP':
        timestamp = datetime.utcfromtimestamp(float(field_value))
        field_value = np.datetime64(timestamp)
    elif field_type == 'BOOLEAN':
        field_value = field_value == 'true'
    elif field_type == 'STRING':
        field_value = field_value
    else:
        field_value = str(field_value)
    return field_value


def _parse_page(raw_page, col_names, col_types, col_dtypes):
    """
    Given a list of rows produced by the client.apiclient.tabledata().list(),
    build a numpy array with proper dtypes and column names as specified
    by the arguments.

    Parameters
    ----------
    raw_page : Resulting list of rows from a page retrieved via
               bigquery API
               client.apiclient.tabledata().list().execute()['rows']
    col_names: An ordered list of names for the columns
    col_types: String representation of the BigQuery DataType for that
               column
    col_dtypes: Target numpy.dtype for the column

    Returns
    -------
    page_array : numpy record array corresponding
                 to the page data
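
    Examples
    --------
    A minimal, hypothetical page in the shape this function expects
    (each row is a dict with an 'f' list of cells, each cell a dict
    with a 'v' value); the column names and dtypes are illustrative:

    >>> raw_page = [{'f': [{'v': '1'}, {'v': 'true'}]}]
    >>> arr = _parse_page(raw_page, ['id', 'flag'],
    ...                   ['INTEGER', 'BOOLEAN'],
    ...                   [np.dtype(float), np.dtype(object)])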
    """
    # Should be at most 100,000 per the API, but this could
    # be increased in the future. Should only be less than
    # this for the last page to reduce API calls
    page_row_count = len(raw_page)

    # Place to hold the results for a page of data
    page_array = np.zeros((page_row_count,), dtype=zip(col_names, col_dtypes))

    for row_num, raw_row in enumerate(raw_page):
        entries = raw_row.get('f', [])
        # Iterate over each entry - setting proper field types
        for col_num, field_type in enumerate(col_types):
            # Process the field's types using schema
            field_value = _parse_entry(entries[col_num].get('v', ''),
                                       field_type)
            # Fill the value into the final array
            page_array[row_num][col_num] = field_value

    return page_array


def _parse_data(client, job, index_col=None, col_order=None):
    """
    Iterate through the query results and piece together the
    final DataFrame. Builds a DataFrame for each page of
    results, then concatenates them together when finished.
    To save memory, we use numpy record arrays to build these
    DataFrames.

    Parameters
    ----------
    client: An instance of bq.Client
    job: An array containing the job info for a completed query
    index_col: str (optional)
        Name of result column to use for index in results DataFrame
    col_order: list() (optional)
        List of BigQuery column names in the desired order for results
        DataFrame

    Returns
    -------
    df: pandas DataFrame
        DataFrame representing results of query

    Raises
    ------
    InvalidColumnOrder:
        Raised if 'col_order' parameter doesn't match returned DataFrame
    BigqueryError:
        Raised by bigquery_client if a Google API error is encountered

    Notes
    -----
    This script relies on Google being consistent with their
    pagination API. We are using the most flexible iteration method
    that we could find in the bq.py/bigquery_client.py API's, but
    these have undergone large amounts of change recently.
    """

    # dtype Map -
    # see: http://pandas.pydata.org/pandas-docs/dev/missing_data.html#missing-data-casting-rules-and-indexing
    dtype_map = {'INTEGER': np.dtype(float),
                 'FLOAT': np.dtype(float),
                 'TIMESTAMP': 'M8[ns]'}     # This seems to be buggy without
                                            # nanosecond indicator

    # We first need the schema to get information about the columns of
    # our dataframe.
    table_dict = job['configuration']['query']['destinationTable']
    fields = client.GetTableSchema(table_dict)['fields']

    # Get the schema into a format useable to create our
    # dataframe
    col_dtypes = []
    col_types = []
    col_names = []

    # TODO: Do this in one clean step
    for field in fields:
        col_types.append(field['type'])
        # Note the encoding... numpy doesn't like titles that are UTF8, which
        # is the return type from the API
        col_names.append(field['name'].encode('ascii', 'ignore'))
        # Note, it would be nice to use 'str' types, but BigQuery doesn't have
        # a fixed length in mind - just maxes out at 64k
        col_dtypes.append(dtype_map.get(field['type'], object))

    # How many columns are there
    num_columns = len(col_names)

    # Iterate over the result rows.
    # Since Google's API now requires pagination of results,
    # we do that here. The following is repurposed from
    # bigquery_client.py :: Client._JobTableReader._ReadOnePage

    # TODO: Enable Reading From Table,
    # see Client._TableTableReader._ReadOnePage

    # Initially, no page token is set
    page_token = None

    # This number is the current max results per page
    max_rows = bigquery_client._MAX_ROWS_PER_REQUEST

    # How many rows in result set? Initialize to max_rows
    total_rows = max_rows

    # This is the starting row for a particular page...
    # is ignored if page_token is present, though
    # it may be useful if we wish to implement SQL like LIMITs
    # with minimums
    start_row = 0

    # Keep our page DataFrames until the end when we concatenate them
    dataframe_list = list()

    current_job = job['jobReference']

    # Iterate over all rows
    while start_row < total_rows:
        # Setup the parameters for getQueryResults() API Call
        kwds = dict(current_job)
        kwds['maxResults'] = max_rows
        # Sets the timeout to 0 because we assume the table is already ready.
        # This is because our previous call to Query() is synchronous
        # and will block until it's actually done
        kwds['timeoutMs'] = 0
        # Use start row if there's no page_token ... in other words, the
        # user requested to start somewhere other than the beginning...
        # presently this is not a parameter to read_gbq(), but it will be
        # added eventually.
        if page_token:
            kwds['pageToken'] = page_token
        else:
            kwds['startIndex'] = start_row
        data = client.apiclient.jobs().getQueryResults(**kwds).execute()
        if not data['jobComplete']:
            raise bigquery_client.BigqueryError('Job was not completed, or was invalid')

        # How many rows are there across all pages?
        # Note: This is presently the only reason we don't just use
        # _ReadOnePage() directly
        total_rows = int(data['totalRows'])

        page_token = data.get('pageToken', None)
        raw_page = data.get('rows', [])
        page_array = _parse_page(raw_page, col_names, col_types, col_dtypes)

        start_row += len(raw_page)
        if total_rows > 0:
            completed = (100 * start_row) / total_rows
            logger.info('Remaining Rows: ' + str(total_rows - start_row) + '('
                        + str(completed) + '% Complete)')
        else:
            logger.info('No Rows')

        dataframe_list.append(DataFrame(page_array))

        # Did we get enough rows? Note: gbq.py stopped checking for this
        # but we felt it was still a good idea.
        if not page_token and not raw_page and start_row != total_rows:
            raise bigquery_client.BigqueryInterfaceError(
                'Not enough rows returned by server. Expected: {0} Rows, But '
                'Received {1}'.format(total_rows, start_row)
            )

    # Build final dataframe
    final_df = concat(dataframe_list, ignore_index=True)

    # Reindex the DataFrame on the provided column
    if index_col is not None:
        if index_col in col_names:
            final_df.set_index(index_col, inplace=True)
            col_names.remove(index_col)
        else:
            raise InvalidColumnOrder(
                'Index column "{0}" does not exist in DataFrame.'
                .format(index_col)
            )

    # Change the order of columns in the DataFrame based on provided list
    if col_order is not None:
        if sorted(col_order) == sorted(col_names):
            final_df = final_df[col_order]
        else:
            raise InvalidColumnOrder(
                'Column order does not match this DataFrame.'
            )

    # Downcast floats to integers and objects to booleans
    # if there are no NaN's. This is presently due to a
    # limitation of numpy in handling missing data.
    final_df._data = final_df._data.downcast(dtypes='infer')
    return final_df


def to_gbq(dataframe, destination_table, schema=None, col_order=None,
           if_exists='fail', **kwargs):
    """Write a DataFrame to a Google BigQuery table.

    THIS IS AN EXPERIMENTAL LIBRARY

    If the table exists, the DataFrame will be appended. If not, a new table
    will be created, in which case the schema will have to be specified. By
    default, rows will be written in the order they appear in the DataFrame,
    though the user may specify an alternative order.

    Parameters
    ----------
    dataframe : DataFrame
        DataFrame to be written
    destination_table : string
        name of table to be written, in the form 'dataset.tablename'
    schema : sequence (optional)
        list of column types in order for data to be inserted,
        e.g. ['INTEGER', 'TIMESTAMP', 'BOOLEAN']
    col_order : sequence (optional)
        order which columns are to be inserted,
        e.g. ['primary_key', 'birthday', 'username']
    if_exists : {'fail', 'replace', 'append'} (optional)
        - fail: If table exists, do nothing.
        - replace: If table exists, drop it, recreate it, and insert data.
        - append: If table exists, insert data. Create if does not exist.
    kwargs are passed to the Client constructor

    Raises
    ------
    SchemaMissing :
        Raised if the 'if_exists' parameter is set to 'replace', but no schema
        is specified
    TableExistsFail :
        Raised if the specified 'destination_table' exists but the 'if_exists'
        parameter is set to 'fail' (the default)
    InvalidSchema :
        Raised if the 'schema' parameter does not match the provided DataFrame
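
    Examples
    --------
    A hypothetical call; the dataset, table and schema below are
    placeholders, not taken from a real project::

        to_gbq(df, 'my_dataset.my_table',
               schema=['STRING', 'INTEGER', 'TIMESTAMP'],
               if_exists='append')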
    """

    if not _BQ_INSTALLED:
        if sys.version_info >= (3, 0):
            raise NotImplementedError('gbq module does not support Python 3 '
                                      'yet')
        else:
            raise ImportError('Could not import Google BigQuery Client.')

    if not _BQ_VALID_VERSION:
        raise ImportError("pandas requires bigquery >= 2.0.17 for Google "
                          "BigQuery support, current version " + _BQ_VERSION)

    ALLOWED_TYPES = ['STRING', 'INTEGER', 'FLOAT', 'BOOLEAN', 'TIMESTAMP',
                     'RECORD']

    if if_exists == 'replace' and schema is None:
        raise SchemaMissing('Cannot replace a table without specifying the '
                            'data schema')
    else:
        client = _authenticate()
        table_reference = client.GetTableReference(destination_table)
        if client.TableExists(table_reference):
            if if_exists == 'fail':
                raise TableExistsFail('Cannot overwrite existing tables if '
                                      '\'if_exists="fail"\'')
            else:
                # Build up a string representation of the
                # table's schema. Since the table already
                # exists, we ask the API for it, which
                # is returned in a list of dictionaries
                # describing column data. Iterate over these
                # and build up a string of form:
                # "col_name1 : col_type1, col_name2 : col_type2..."
                schema_full = client.GetTableSchema(
                    dict(table_reference)
                )['fields']
                schema = ''
                for count, row in enumerate(schema_full):
                    if count > 0:
                        schema += ', '
                    schema += row['name'] + ':' + row['type']
        else:
            logger.info('Creating New Table')
            if schema is None:
                raise SchemaMissing('Cannot create a new table without '
                                    'specifying the data schema')
            else:
                columns = dataframe.columns
                if len(schema) != len(columns):
                    raise InvalidSchema('Incorrect number of columns in '
                                        'schema')
                else:
                    schema_string = ''
                    for count, name in enumerate(columns):
                        if count > 0:
                            schema_string += ', '
                        column_type = schema[count].upper()
                        if column_type in ALLOWED_TYPES:
                            schema_string += name + ':' + schema[count].lower()
                        else:
                            raise InvalidSchema('Invalid Type: ' + column_type
                                                + ". Must be one of: " +
                                                str(ALLOWED_TYPES))
                    schema = schema_string

    opts = kwargs
    opts['sync'] = True
    opts['skip_leading_rows'] = 1
    opts['encoding'] = 'UTF-8'
    opts['max_bad_records'] = 0

    # See: https://developers.google.com/bigquery/docs/reference/v2/jobs
    if if_exists == 'replace':
        opts['write_disposition'] = 'WRITE_TRUNCATE'
    elif if_exists == 'append':
        opts['write_disposition'] = 'WRITE_APPEND'

    with tempfile.NamedTemporaryFile() as csv_file:
        dataframe.to_csv(csv_file.name, index=False, encoding='utf-8')
        job = client.Load(table_reference, csv_file.name, schema=schema,
                          **opts)


def read_gbq(query, project_id=None, destination_table=None, index_col=None,
             col_order=None, **kwargs):
    """Load data from Google BigQuery.

    THIS IS AN EXPERIMENTAL LIBRARY

    The main method a user calls to load data from Google BigQuery into a
    pandas DataFrame. This is a simple wrapper for Google's bq.py and
    bigquery_client.py, which we use to get the source data. Because of this,
    this script respects the user's bq settings file, '~/.bigqueryrc', if it
    exists. Such a file can be generated using 'bq init'. Further, additional
    parameters for the query can be specified as either ``**kwds`` in the
    command, or using FLAGS provided in the 'gflags' module. Particular options
    can be found in bigquery_client.py.

    Parameters
    ----------
    query : str
        SQL-Like Query to return data values
    project_id : str (optional)
        Google BigQuery Account project ID. Optional, since it may be
        located in ~/.bigqueryrc
    index_col : str (optional)
        Name of result column to use for index in results DataFrame
    col_order : list(str) (optional)
        List of BigQuery column names in the desired order for results
        DataFrame
    destination_table : string (optional)
        If provided, send the results to the given table.
    **kwargs :
        To be passed to bq.Client.Create(). Particularly: 'trace',
        'sync', 'api', 'api_version'

    Returns
    -------
    df: DataFrame
        DataFrame representing results of query
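
    Examples
    --------
    A hypothetical query against BigQuery's public Shakespeare sample
    table; 'my-project-id' is a placeholder::

        df = read_gbq('SELECT word, word_count '
                      'FROM publicdata:samples.shakespeare LIMIT 10',
                      project_id='my-project-id',
                      col_order=['word_count', 'word'])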
    """

    if not _BQ_INSTALLED:
        if sys.version_info >= (3, 0):
            raise NotImplementedError('gbq module does not support Python 3 '
                                      'yet')
        else:
            raise ImportError('Could not import Google BigQuery Client.')

    if not _BQ_VALID_VERSION:
        raise ImportError('pandas requires bigquery >= 2.0.17 for Google '
                          'BigQuery support, current version ' + _BQ_VERSION)

    query_args = kwargs
    query_args['project_id'] = project_id
    query_args['query'] = query
    query_args['destination_table'] = destination_table
    query_args['sync'] = True

    client = _authenticate()

    job = client.Query(**query_args)

    return _parse_data(client, job, index_col=index_col, col_order=col_order)