/examples/datavault2-example/dags/upload_to_bq.py

https://github.com/gtoonstra/etl-with-airflow

# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
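
# Final stage of the datavault2 example: export the flattened "dv_star"
# information mart from Hive to Google Cloud Storage, then load it into
# BigQuery.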
from __future__ import print_function
import airflow
from datetime import timedelta
from airflow.operators.dummy_operator import DummyOperator
from acme.operators.hive_to_gcs_operator import HiveToGcsOperator
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
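
# Default arguments applied to every task in the DAG; depends_on_past
# ensures each day's tasks only start after the previous day's succeeded.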
args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(1),
    'provide_context': True,
    'depends_on_past': True
}
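
# Daily DAG; max_active_runs=1 keeps backfill runs from overlapping, and
# templated .hql files are resolved against /usr/local/airflow/sql.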
dag = airflow.DAG(
    'upload_to_bq',
    schedule_interval="@daily",
    dagrun_timeout=timedelta(minutes=60),
    template_searchpath='/usr/local/airflow/sql',
    default_args=args,
    max_active_runs=1)
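
# Sentinel task that marks the full export/load cycle as complete.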
all_done = DummyOperator(
    task_id='all_done',
    dag=dag)
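
# Run the templated upload_flat_table.hql query through HiveServer2 and
# write the result as JSON files under a date-partitioned yyyy/mm/dd
# prefix in the GCS bucket.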
t1 = HiveToGcsOperator(
    hql='bigquery/upload_flat_table.hql',
    bucket='datavault2-example',
    subdir='{{ds_nodash[:4]}}/{{ds_nodash[4:6]}}/{{ds_nodash[6:8]}}',
    file_pattern='dv_star_data-{0}.json',
    schema='dv_star',
    hiveserver2_conn_id='hiveserver2-dvstar',
    google_cloud_storage_conn_id='gcp',
    task_id='upload_flat_table',
    dag=dag)
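
# Load the exported JSON files from GCS into BigQuery, replacing the
# contents of information_mart.flat_table and autodetecting the schema.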
t2 = GoogleCloudStorageToBigQueryOperator(
    bucket='datavault2-example',
    source_objects=['{{ds_nodash[:4]}}/{{ds_nodash[4:6]}}/{{ds_nodash[6:8]}}/dv_star_data-*.json'],
    destination_project_dataset_table='information_mart.flat_table',
    source_format='NEWLINE_DELIMITED_JSON',
    write_disposition='WRITE_TRUNCATE',
    src_fmt_configs={'autodetect': True},
    bigquery_conn_id='gcp',
    google_cloud_storage_conn_id='gcp',
    task_id='gcs_to_bq',
    dag=dag)
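
# The Hive export must finish before the BigQuery load starts.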
t1 >> t2
t2 >> all_done
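
# Allow exercising the DAG directly from the command line for debugging.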
if __name__ == "__main__":
    dag.cli()