/databuilder/extractor/mysql_metadata_extractor.py

https://github.com/lyft/amundsendatabuilder · Python · 139 lines · 92 code · 26 blank · 21 comment · 9 complexity · c838908f95a195698d4712156be9d58d MD5 · raw file

  1. # Copyright Contributors to the Amundsen project.
  2. # SPDX-License-Identifier: Apache-2.0
  3. import logging
  4. from collections import namedtuple
  5. from itertools import groupby
  6. from typing import (
  7. Any, Dict, Iterator, Union,
  8. )
  9. from pyhocon import ConfigFactory, ConfigTree
  10. from databuilder import Scoped
  11. from databuilder.extractor.base_extractor import Extractor
  12. from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor
  13. from databuilder.models.table_metadata import ColumnMetadata, TableMetadata
  14. TableKey = namedtuple('TableKey', ['schema', 'table_name'])
  15. LOGGER = logging.getLogger(__name__)
  16. class MysqlMetadataExtractor(Extractor):
  17. """
  18. Extracts mysql table and column metadata from underlying meta store database using SQLAlchemyExtractor
  19. """
  20. # SELECT statement from mysql information_schema to extract table and column metadata
  21. SQL_STATEMENT = """
  22. SELECT
  23. lower(c.column_name) AS col_name,
  24. c.column_comment AS col_description,
  25. lower(c.data_type) AS col_type,
  26. lower(c.ordinal_position) AS col_sort_order,
  27. {cluster_source} AS cluster,
  28. lower(c.table_schema) AS "schema",
  29. lower(c.table_name) AS name,
  30. t.table_comment AS description,
  31. case when lower(t.table_type) = "view" then "true" else "false" end AS is_view
  32. FROM
  33. INFORMATION_SCHEMA.COLUMNS AS c
  34. LEFT JOIN
  35. INFORMATION_SCHEMA.TABLES t
  36. ON c.TABLE_NAME = t.TABLE_NAME
  37. AND c.TABLE_SCHEMA = t.TABLE_SCHEMA
  38. {where_clause_suffix}
  39. ORDER by cluster, "schema", name, col_sort_order ;
  40. """
  41. # CONFIG KEYS
  42. WHERE_CLAUSE_SUFFIX_KEY = 'where_clause_suffix'
  43. CLUSTER_KEY = 'cluster_key'
  44. USE_CATALOG_AS_CLUSTER_NAME = 'use_catalog_as_cluster_name'
  45. DATABASE_KEY = 'database_key'
  46. # Default values
  47. DEFAULT_CLUSTER_NAME = 'master'
  48. DEFAULT_CONFIG = ConfigFactory.from_dict(
  49. {WHERE_CLAUSE_SUFFIX_KEY: ' ', CLUSTER_KEY: DEFAULT_CLUSTER_NAME, USE_CATALOG_AS_CLUSTER_NAME: True}
  50. )
  51. def init(self, conf: ConfigTree) -> None:
  52. conf = conf.with_fallback(MysqlMetadataExtractor.DEFAULT_CONFIG)
  53. self._cluster = conf.get_string(MysqlMetadataExtractor.CLUSTER_KEY)
  54. if conf.get_bool(MysqlMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME):
  55. cluster_source = "c.table_catalog"
  56. else:
  57. cluster_source = f"'{self._cluster}'"
  58. self._database = conf.get_string(MysqlMetadataExtractor.DATABASE_KEY, default='mysql')
  59. self.sql_stmt = MysqlMetadataExtractor.SQL_STATEMENT.format(
  60. where_clause_suffix=conf.get_string(MysqlMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY),
  61. cluster_source=cluster_source
  62. )
  63. self._alchemy_extractor = SQLAlchemyExtractor()
  64. sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope()) \
  65. .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))
  66. self.sql_stmt = sql_alch_conf.get_string(SQLAlchemyExtractor.EXTRACT_SQL)
  67. LOGGER.info('SQL for mysql metadata: %s', self.sql_stmt)
  68. self._alchemy_extractor.init(sql_alch_conf)
  69. self._extract_iter: Union[None, Iterator] = None
  70. def extract(self) -> Union[TableMetadata, None]:
  71. if not self._extract_iter:
  72. self._extract_iter = self._get_extract_iter()
  73. try:
  74. return next(self._extract_iter)
  75. except StopIteration:
  76. return None
  77. def get_scope(self) -> str:
  78. return 'extractor.mysql_metadata'
  79. def _get_extract_iter(self) -> Iterator[TableMetadata]:
  80. """
  81. Using itertools.groupby and raw level iterator, it groups to table and yields TableMetadata
  82. :return:
  83. """
  84. for key, group in groupby(self._get_raw_extract_iter(), self._get_table_key):
  85. columns = []
  86. for row in group:
  87. last_row = row
  88. columns.append(ColumnMetadata(row['col_name'], row['col_description'],
  89. row['col_type'], row['col_sort_order']))
  90. yield TableMetadata(self._database, last_row['cluster'],
  91. last_row['schema'],
  92. last_row['name'],
  93. last_row['description'],
  94. columns,
  95. is_view=last_row['is_view'])
  96. def _get_raw_extract_iter(self) -> Iterator[Dict[str, Any]]:
  97. """
  98. Provides iterator of result row from SQLAlchemy extractor
  99. :return:
  100. """
  101. row = self._alchemy_extractor.extract()
  102. while row:
  103. yield row
  104. row = self._alchemy_extractor.extract()
  105. def _get_table_key(self, row: Dict[str, Any]) -> Union[TableKey, None]:
  106. """
  107. Table key consists of schema and table name
  108. :param row:
  109. :return:
  110. """
  111. if row:
  112. return TableKey(schema=row['schema'], table_name=row['name'])
  113. return None