From 8fb95e88a17260d0d6727f4d5e09636b128faf47 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Mon, 23 Oct 2023 12:40:42 -0700
Subject: [PATCH 01/40] feat(sqlparser): parse create DDL statements (#9002)
---
.../goldens/v2_sqlite_operator.json | 162 +++++++++++++++---
.../v2_sqlite_operator_no_dag_listener.json | 162 +++++++++++++++---
.../datahub/emitter/sql_parsing_builder.py | 9 +-
.../testing/check_sql_parser_result.py | 9 +
.../src/datahub/utilities/sqlglot_lineage.py | 92 ++++++++--
.../test_bigquery_create_view_with_cte.json | 8 +-
..._bigquery_from_sharded_table_wildcard.json | 4 +-
.../test_bigquery_nested_subqueries.json | 4 +-
..._bigquery_sharded_table_normalization.json | 4 +-
.../test_bigquery_star_with_replace.json | 6 +-
.../test_bigquery_view_from_union.json | 4 +-
.../goldens/test_create_table_ddl.json | 55 +++++-
.../goldens/test_create_view_as_select.json | 2 +-
.../test_select_from_struct_subfields.json | 2 +-
.../test_select_with_full_col_name.json | 2 +-
.../test_teradata_default_normalization.json | 2 +
16 files changed, 430 insertions(+), 97 deletions(-)
diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json
index 1a32b38ce055d..81d0a71b651d9 100644
--- a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json
+++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json
@@ -74,9 +74,7 @@
"downstream_task_ids": "['populate_cost_table']",
"inlets": "[]",
"outlets": "[]",
- "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )",
- "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}",
- "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}"
+ "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}"
},
"externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=create_cost_table",
"name": "create_cost_table",
@@ -98,7 +96,44 @@
"urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)"
],
"inputDatajobs": [],
- "fineGrainedLineages": []
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)"
+ ],
+ "confidenceScore": 1.0
+ }
+ ]
}
}
},
@@ -157,7 +192,7 @@
"customProperties": {
"run_id": "manual_run_test",
"duration": "None",
- "start_date": "2023-09-30 06:56:24.632190+00:00",
+ "start_date": "2023-10-15 20:29:10.262813+00:00",
"end_date": "None",
"execution_date": "2023-09-27 21:34:38+00:00",
"try_number": "0",
@@ -172,7 +207,7 @@
"name": "sqlite_operator_create_cost_table_manual_run_test",
"type": "BATCH_AD_HOC",
"created": {
- "time": 1696056984632,
+ "time": 1697401750262,
"actor": "urn:li:corpuser:datahub"
}
}
@@ -221,7 +256,7 @@
"aspectName": "dataProcessInstanceRunEvent",
"aspect": {
"json": {
- "timestampMillis": 1696056984632,
+ "timestampMillis": 1697401750262,
"partitionSpec": {
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
@@ -251,9 +286,7 @@
"downstream_task_ids": "['populate_cost_table']",
"inlets": "[]",
"outlets": "[]",
- "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )",
- "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}",
- "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}"
+ "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}"
},
"externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=create_cost_table",
"name": "create_cost_table",
@@ -275,7 +308,80 @@
"urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)"
],
"inputDatajobs": [],
- "fineGrainedLineages": []
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)"
+ ],
+ "confidenceScore": 1.0
+ }
+ ]
}
}
},
@@ -331,7 +437,7 @@
"aspectName": "dataProcessInstanceRunEvent",
"aspect": {
"json": {
- "timestampMillis": 1696056984947,
+ "timestampMillis": 1697401750651,
"partitionSpec": {
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
@@ -447,7 +553,7 @@
"customProperties": {
"run_id": "manual_run_test",
"duration": "None",
- "start_date": "2023-09-30 06:56:28.605901+00:00",
+ "start_date": "2023-10-15 20:29:15.013834+00:00",
"end_date": "None",
"execution_date": "2023-09-27 21:34:38+00:00",
"try_number": "0",
@@ -462,7 +568,7 @@
"name": "sqlite_operator_populate_cost_table_manual_run_test",
"type": "BATCH_AD_HOC",
"created": {
- "time": 1696056988605,
+ "time": 1697401755013,
"actor": "urn:li:corpuser:datahub"
}
}
@@ -511,7 +617,7 @@
"aspectName": "dataProcessInstanceRunEvent",
"aspect": {
"json": {
- "timestampMillis": 1696056988605,
+ "timestampMillis": 1697401755013,
"partitionSpec": {
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
@@ -621,7 +727,7 @@
"aspectName": "dataProcessInstanceRunEvent",
"aspect": {
"json": {
- "timestampMillis": 1696056989098,
+ "timestampMillis": 1697401755600,
"partitionSpec": {
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
@@ -807,7 +913,7 @@
"customProperties": {
"run_id": "manual_run_test",
"duration": "None",
- "start_date": "2023-09-30 06:56:32.888165+00:00",
+ "start_date": "2023-10-15 20:29:20.216818+00:00",
"end_date": "None",
"execution_date": "2023-09-27 21:34:38+00:00",
"try_number": "0",
@@ -822,7 +928,7 @@
"name": "sqlite_operator_transform_cost_table_manual_run_test",
"type": "BATCH_AD_HOC",
"created": {
- "time": 1696056992888,
+ "time": 1697401760216,
"actor": "urn:li:corpuser:datahub"
}
}
@@ -895,7 +1001,7 @@
"aspectName": "dataProcessInstanceRunEvent",
"aspect": {
"json": {
- "timestampMillis": 1696056992888,
+ "timestampMillis": 1697401760216,
"partitionSpec": {
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
@@ -1131,7 +1237,7 @@
"aspectName": "dataProcessInstanceRunEvent",
"aspect": {
"json": {
- "timestampMillis": 1696056993744,
+ "timestampMillis": 1697401761237,
"partitionSpec": {
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
@@ -1249,7 +1355,7 @@
"customProperties": {
"run_id": "manual_run_test",
"duration": "None",
- "start_date": "2023-09-30 06:56:37.745717+00:00",
+ "start_date": "2023-10-15 20:29:26.243934+00:00",
"end_date": "None",
"execution_date": "2023-09-27 21:34:38+00:00",
"try_number": "0",
@@ -1264,7 +1370,7 @@
"name": "sqlite_operator_cleanup_costs_manual_run_test",
"type": "BATCH_AD_HOC",
"created": {
- "time": 1696056997745,
+ "time": 1697401766243,
"actor": "urn:li:corpuser:datahub"
}
}
@@ -1313,7 +1419,7 @@
"aspectName": "dataProcessInstanceRunEvent",
"aspect": {
"json": {
- "timestampMillis": 1696056997745,
+ "timestampMillis": 1697401766243,
"partitionSpec": {
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
@@ -1425,7 +1531,7 @@
"aspectName": "dataProcessInstanceRunEvent",
"aspect": {
"json": {
- "timestampMillis": 1696056998672,
+ "timestampMillis": 1697401767373,
"partitionSpec": {
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
@@ -1543,7 +1649,7 @@
"customProperties": {
"run_id": "manual_run_test",
"duration": "None",
- "start_date": "2023-09-30 06:56:42.645806+00:00",
+ "start_date": "2023-10-15 20:29:32.075613+00:00",
"end_date": "None",
"execution_date": "2023-09-27 21:34:38+00:00",
"try_number": "0",
@@ -1558,7 +1664,7 @@
"name": "sqlite_operator_cleanup_processed_costs_manual_run_test",
"type": "BATCH_AD_HOC",
"created": {
- "time": 1696057002645,
+ "time": 1697401772075,
"actor": "urn:li:corpuser:datahub"
}
}
@@ -1607,7 +1713,7 @@
"aspectName": "dataProcessInstanceRunEvent",
"aspect": {
"json": {
- "timestampMillis": 1696057002645,
+ "timestampMillis": 1697401772075,
"partitionSpec": {
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
@@ -1719,7 +1825,7 @@
"aspectName": "dataProcessInstanceRunEvent",
"aspect": {
"json": {
- "timestampMillis": 1696057003759,
+ "timestampMillis": 1697401773454,
"partitionSpec": {
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json
index c082be693e30c..96a0f02ccec17 100644
--- a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json
+++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json
@@ -74,9 +74,7 @@
"downstream_task_ids": "['populate_cost_table']",
"inlets": "[]",
"outlets": "[]",
- "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )",
- "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}",
- "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}"
+ "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}"
},
"externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=create_cost_table",
"name": "create_cost_table",
@@ -98,7 +96,44 @@
"urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)"
],
"inputDatajobs": [],
- "fineGrainedLineages": []
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)"
+ ],
+ "confidenceScore": 1.0
+ }
+ ]
}
}
},
@@ -157,7 +192,7 @@
"customProperties": {
"run_id": "manual_run_test",
"duration": "None",
- "start_date": "2023-09-30 07:00:45.832554+00:00",
+ "start_date": "2023-10-15 20:27:26.883178+00:00",
"end_date": "None",
"execution_date": "2023-09-27 21:34:38+00:00",
"try_number": "0",
@@ -172,7 +207,7 @@
"name": "sqlite_operator_create_cost_table_manual_run_test",
"type": "BATCH_AD_HOC",
"created": {
- "time": 1696057245832,
+ "time": 1697401646883,
"actor": "urn:li:corpuser:datahub"
}
}
@@ -221,7 +256,7 @@
"aspectName": "dataProcessInstanceRunEvent",
"aspect": {
"json": {
- "timestampMillis": 1696057245832,
+ "timestampMillis": 1697401646883,
"partitionSpec": {
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
@@ -251,9 +286,7 @@
"downstream_task_ids": "['populate_cost_table']",
"inlets": "[]",
"outlets": "[]",
- "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )",
- "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}",
- "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}"
+ "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}"
},
"externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=create_cost_table",
"name": "create_cost_table",
@@ -275,7 +308,80 @@
"urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)"
],
"inputDatajobs": [],
- "fineGrainedLineages": []
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)"
+ ],
+ "confidenceScore": 1.0
+ }
+ ]
}
}
},
@@ -331,7 +437,7 @@
"aspectName": "dataProcessInstanceRunEvent",
"aspect": {
"json": {
- "timestampMillis": 1696057246734,
+ "timestampMillis": 1697401647826,
"partitionSpec": {
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
@@ -502,7 +608,7 @@
"customProperties": {
"run_id": "manual_run_test",
"duration": "None",
- "start_date": "2023-09-30 07:00:49.653938+00:00",
+ "start_date": "2023-10-15 20:27:31.398799+00:00",
"end_date": "None",
"execution_date": "2023-09-27 21:34:38+00:00",
"try_number": "0",
@@ -517,7 +623,7 @@
"name": "sqlite_operator_populate_cost_table_manual_run_test",
"type": "BATCH_AD_HOC",
"created": {
- "time": 1696057249653,
+ "time": 1697401651398,
"actor": "urn:li:corpuser:datahub"
}
}
@@ -566,7 +672,7 @@
"aspectName": "dataProcessInstanceRunEvent",
"aspect": {
"json": {
- "timestampMillis": 1696057249653,
+ "timestampMillis": 1697401651398,
"partitionSpec": {
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
@@ -676,7 +782,7 @@
"aspectName": "dataProcessInstanceRunEvent",
"aspect": {
"json": {
- "timestampMillis": 1696057250831,
+ "timestampMillis": 1697401652651,
"partitionSpec": {
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
@@ -917,7 +1023,7 @@
"customProperties": {
"run_id": "manual_run_test",
"duration": "None",
- "start_date": "2023-09-30 07:00:53.989264+00:00",
+ "start_date": "2023-10-15 20:27:37.697995+00:00",
"end_date": "None",
"execution_date": "2023-09-27 21:34:38+00:00",
"try_number": "0",
@@ -932,7 +1038,7 @@
"name": "sqlite_operator_transform_cost_table_manual_run_test",
"type": "BATCH_AD_HOC",
"created": {
- "time": 1696057253989,
+ "time": 1697401657697,
"actor": "urn:li:corpuser:datahub"
}
}
@@ -1005,7 +1111,7 @@
"aspectName": "dataProcessInstanceRunEvent",
"aspect": {
"json": {
- "timestampMillis": 1696057253989,
+ "timestampMillis": 1697401657697,
"partitionSpec": {
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
@@ -1241,7 +1347,7 @@
"aspectName": "dataProcessInstanceRunEvent",
"aspect": {
"json": {
- "timestampMillis": 1696057255628,
+ "timestampMillis": 1697401659496,
"partitionSpec": {
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
@@ -1414,7 +1520,7 @@
"customProperties": {
"run_id": "manual_run_test",
"duration": "None",
- "start_date": "2023-09-30 07:01:00.421177+00:00",
+ "start_date": "2023-10-15 20:27:45.670215+00:00",
"end_date": "None",
"execution_date": "2023-09-27 21:34:38+00:00",
"try_number": "0",
@@ -1429,7 +1535,7 @@
"name": "sqlite_operator_cleanup_costs_manual_run_test",
"type": "BATCH_AD_HOC",
"created": {
- "time": 1696057260421,
+ "time": 1697401665670,
"actor": "urn:li:corpuser:datahub"
}
}
@@ -1478,7 +1584,7 @@
"aspectName": "dataProcessInstanceRunEvent",
"aspect": {
"json": {
- "timestampMillis": 1696057260421,
+ "timestampMillis": 1697401665670,
"partitionSpec": {
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
@@ -1590,7 +1696,7 @@
"aspectName": "dataProcessInstanceRunEvent",
"aspect": {
"json": {
- "timestampMillis": 1696057262258,
+ "timestampMillis": 1697401667670,
"partitionSpec": {
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
@@ -1763,7 +1869,7 @@
"customProperties": {
"run_id": "manual_run_test",
"duration": "None",
- "start_date": "2023-09-30 07:01:05.540192+00:00",
+ "start_date": "2023-10-15 20:27:51.559194+00:00",
"end_date": "None",
"execution_date": "2023-09-27 21:34:38+00:00",
"try_number": "0",
@@ -1778,7 +1884,7 @@
"name": "sqlite_operator_cleanup_processed_costs_manual_run_test",
"type": "BATCH_AD_HOC",
"created": {
- "time": 1696057265540,
+ "time": 1697401671559,
"actor": "urn:li:corpuser:datahub"
}
}
@@ -1827,7 +1933,7 @@
"aspectName": "dataProcessInstanceRunEvent",
"aspect": {
"json": {
- "timestampMillis": 1696057265540,
+ "timestampMillis": 1697401671559,
"partitionSpec": {
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
@@ -1939,7 +2045,7 @@
"aspectName": "dataProcessInstanceRunEvent",
"aspect": {
"json": {
- "timestampMillis": 1696057267631,
+ "timestampMillis": 1697401673788,
"partitionSpec": {
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
diff --git a/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py b/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py
index 071d590f270f8..dedcfa0385f75 100644
--- a/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py
+++ b/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py
@@ -179,15 +179,16 @@ def add_lineage(
def gen_workunits(self) -> Iterable[MetadataWorkUnit]:
if self.generate_lineage:
- yield from self._gen_lineage_workunits()
+ for mcp in self._gen_lineage_mcps():
+ yield mcp.as_workunit()
if self.generate_usage_statistics:
yield from self._gen_usage_statistics_workunits()
- def _gen_lineage_workunits(self) -> Iterable[MetadataWorkUnit]:
+ def _gen_lineage_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
for downstream_urn in self._lineage_map:
upstreams: List[UpstreamClass] = []
fine_upstreams: List[FineGrainedLineageClass] = []
- for upstream_urn, edge in self._lineage_map[downstream_urn].items():
+ for edge in self._lineage_map[downstream_urn].values():
upstreams.append(edge.gen_upstream_aspect())
fine_upstreams.extend(edge.gen_fine_grained_lineage_aspects())
@@ -201,7 +202,7 @@ def _gen_lineage_workunits(self) -> Iterable[MetadataWorkUnit]:
)
yield MetadataChangeProposalWrapper(
entityUrn=downstream_urn, aspect=upstream_lineage
- ).as_workunit()
+ )
def _gen_usage_statistics_workunits(self) -> Iterable[MetadataWorkUnit]:
yield from self._usage_aggregator.generate_workunits(
diff --git a/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py b/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py
index b3b1331db768b..2b610947e9043 100644
--- a/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py
+++ b/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py
@@ -24,6 +24,7 @@ def assert_sql_result_with_resolver(
*,
expected_file: pathlib.Path,
schema_resolver: SchemaResolver,
+ allow_table_error: bool = False,
**kwargs: Any,
) -> None:
# HACK: Our BigQuery source overwrites this value and doesn't undo it.
@@ -36,6 +37,14 @@ def assert_sql_result_with_resolver(
**kwargs,
)
+ if res.debug_info.table_error:
+ if allow_table_error:
+ logger.info(
+ f"SQL parser table error: {res.debug_info.table_error}",
+ exc_info=res.debug_info.table_error,
+ )
+ else:
+ raise res.debug_info.table_error
if res.debug_info.column_error:
logger.warning(
f"SQL parser column error: {res.debug_info.column_error}",
diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
index c830ec8c02fd4..97121b368f507 100644
--- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
+++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
@@ -241,9 +241,9 @@ class SqlParsingResult(_ParserBaseModel):
)
-def _parse_statement(sql: str, dialect: str) -> sqlglot.Expression:
- statement = sqlglot.parse_one(
- sql, read=dialect, error_level=sqlglot.ErrorLevel.RAISE
+def _parse_statement(sql: sqlglot.exp.ExpOrStr, dialect: str) -> sqlglot.Expression:
+ statement: sqlglot.Expression = sqlglot.maybe_parse(
+ sql, dialect=dialect, error_level=sqlglot.ErrorLevel.RAISE
)
return statement
@@ -467,14 +467,20 @@ def _column_level_lineage( # noqa: C901
default_db: Optional[str],
default_schema: Optional[str],
) -> List[_ColumnLineageInfo]:
- if not isinstance(
- statement,
- _SupportedColumnLineageTypesTuple,
+ is_create_ddl = _is_create_table_ddl(statement)
+ if (
+ not isinstance(
+ statement,
+ _SupportedColumnLineageTypesTuple,
+ )
+ and not is_create_ddl
):
raise UnsupportedStatementTypeError(
f"Can only generate column-level lineage for select-like inner statements, not {type(statement)}"
)
+ column_lineage: List[_ColumnLineageInfo] = []
+
use_case_insensitive_cols = dialect in {
# Column identifiers are case-insensitive in BigQuery, so we need to
# do a normalization step beforehand to make sure it's resolved correctly.
@@ -580,6 +586,38 @@ def _schema_aware_fuzzy_column_resolve(
) from e
logger.debug("Qualified sql %s", statement.sql(pretty=True, dialect=dialect))
+ # Handle the create DDL case.
+ if is_create_ddl:
+ assert (
+ output_table is not None
+ ), "output_table must be set for create DDL statements"
+
+ create_schema: sqlglot.exp.Schema = statement.this
+ sqlglot_columns = create_schema.expressions
+
+ for column_def in sqlglot_columns:
+ if not isinstance(column_def, sqlglot.exp.ColumnDef):
+ # Ignore things like constraints.
+ continue
+
+ output_col = _schema_aware_fuzzy_column_resolve(
+ output_table, column_def.name
+ )
+ output_col_type = column_def.args.get("kind")
+
+ column_lineage.append(
+ _ColumnLineageInfo(
+ downstream=_DownstreamColumnRef(
+ table=output_table,
+ column=output_col,
+ column_type=output_col_type,
+ ),
+ upstreams=[],
+ )
+ )
+
+ return column_lineage
+
# Try to figure out the types of the output columns.
try:
statement = sqlglot.optimizer.annotate_types.annotate_types(
@@ -589,8 +627,6 @@ def _schema_aware_fuzzy_column_resolve(
# This is not a fatal error, so we can continue.
logger.debug("sqlglot failed to annotate types: %s", e)
- column_lineage = []
-
try:
assert isinstance(statement, _SupportedColumnLineageTypesTuple)
@@ -599,7 +635,6 @@ def _schema_aware_fuzzy_column_resolve(
(select_col.alias_or_name, select_col) for select_col in statement.selects
]
logger.debug("output columns: %s", [col[0] for col in output_columns])
- output_col: str
for output_col, original_col_expression in output_columns:
if output_col == "*":
# If schema information is available, the * will be expanded to the actual columns.
@@ -628,7 +663,7 @@ def _schema_aware_fuzzy_column_resolve(
# Generate SELECT lineage.
# Using a set here to deduplicate upstreams.
- direct_col_upstreams: Set[_ColumnRef] = set()
+ direct_raw_col_upstreams: Set[_ColumnRef] = set()
for node in lineage_node.walk():
if node.downstream:
# We only want the leaf nodes.
@@ -643,8 +678,9 @@ def _schema_aware_fuzzy_column_resolve(
if node.subfield:
normalized_col = f"{normalized_col}.{node.subfield}"
- col = _schema_aware_fuzzy_column_resolve(table_ref, normalized_col)
- direct_col_upstreams.add(_ColumnRef(table=table_ref, column=col))
+ direct_raw_col_upstreams.add(
+ _ColumnRef(table=table_ref, column=normalized_col)
+ )
else:
# This branch doesn't matter. For example, a count(*) column would go here, and
# we don't get any column-level lineage for that.
@@ -665,7 +701,16 @@ def _schema_aware_fuzzy_column_resolve(
if original_col_expression.type:
output_col_type = original_col_expression.type
- if not direct_col_upstreams:
+ # Fuzzy resolve upstream columns.
+ direct_resolved_col_upstreams = {
+ _ColumnRef(
+ table=edge.table,
+ column=_schema_aware_fuzzy_column_resolve(edge.table, edge.column),
+ )
+ for edge in direct_raw_col_upstreams
+ }
+
+ if not direct_resolved_col_upstreams:
logger.debug(f' "{output_col}" has no upstreams')
column_lineage.append(
_ColumnLineageInfo(
@@ -674,12 +719,12 @@ def _schema_aware_fuzzy_column_resolve(
column=output_col,
column_type=output_col_type,
),
- upstreams=sorted(direct_col_upstreams),
+ upstreams=sorted(direct_resolved_col_upstreams),
# logic=column_logic.sql(pretty=True, dialect=dialect),
)
)
- # TODO: Also extract referenced columns (e.g. non-SELECT lineage)
+ # TODO: Also extract referenced columns (aka auxiliary / non-SELECT lineage)
except (sqlglot.errors.OptimizeError, ValueError) as e:
raise SqlUnderstandingError(
f"sqlglot failed to compute some lineage: {e}"
@@ -700,6 +745,12 @@ def _extract_select_from_create(
return statement
+def _is_create_table_ddl(statement: sqlglot.exp.Expression) -> bool:
+ return isinstance(statement, sqlglot.exp.Create) and isinstance(
+ statement.this, sqlglot.exp.Schema
+ )
+
+
def _try_extract_select(
statement: sqlglot.exp.Expression,
) -> sqlglot.exp.Expression:
@@ -766,6 +817,7 @@ def _translate_sqlglot_type(
def _translate_internal_column_lineage(
table_name_urn_mapping: Dict[_TableName, str],
raw_column_lineage: _ColumnLineageInfo,
+ dialect: str,
) -> ColumnLineageInfo:
downstream_urn = None
if raw_column_lineage.downstream.table:
@@ -779,7 +831,9 @@ def _translate_internal_column_lineage(
)
if raw_column_lineage.downstream.column_type
else None,
- native_column_type=raw_column_lineage.downstream.column_type.sql()
+ native_column_type=raw_column_lineage.downstream.column_type.sql(
+ dialect=dialect
+ )
if raw_column_lineage.downstream.column_type
and raw_column_lineage.downstream.column_type.this
!= sqlglot.exp.DataType.Type.UNKNOWN
@@ -800,12 +854,14 @@ def _get_dialect(platform: str) -> str:
# TODO: convert datahub platform names to sqlglot dialect
if platform == "presto-on-hive":
return "hive"
+ if platform == "mssql":
+ return "tsql"
else:
return platform
def _sqlglot_lineage_inner(
- sql: str,
+ sql: sqlglot.exp.ExpOrStr,
schema_resolver: SchemaResolver,
default_db: Optional[str] = None,
default_schema: Optional[str] = None,
@@ -918,7 +974,7 @@ def _sqlglot_lineage_inner(
if column_lineage:
column_lineage_urns = [
_translate_internal_column_lineage(
- table_name_urn_mapping, internal_col_lineage
+ table_name_urn_mapping, internal_col_lineage, dialect=dialect
)
for internal_col_lineage in column_lineage
]
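
The golden-file updates that follow (for example `TEXT` becoming `STRING` for BigQuery, or `DECIMAL` becoming `NUMERIC`) fall out of the `native_column_type=...sql(dialect=dialect)` change above: output column types are now rendered in the source dialect rather than in sqlglot's generic SQL. A quick sketch of the difference, assuming a recent sqlglot version:

```python
import sqlglot

# Build a generic TEXT type node, then render it with and without a dialect.
col_type = sqlglot.exp.DataType.build("text")

print(col_type.sql())                    # generic rendering: TEXT
print(col_type.sql(dialect="bigquery"))  # dialect-aware rendering: STRING
```
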
diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_create_view_with_cte.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_create_view_with_cte.json
index f0175b4dc8892..d610b0a83f229 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_create_view_with_cte.json
+++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_create_view_with_cte.json
@@ -18,7 +18,7 @@
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
- "native_column_type": "TEXT"
+ "native_column_type": "STRING"
},
"upstreams": [
{
@@ -36,7 +36,7 @@
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
- "native_column_type": "TEXT"
+ "native_column_type": "STRING"
},
"upstreams": [
{
@@ -54,7 +54,7 @@
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
- "native_column_type": "TEXT"
+ "native_column_type": "STRING"
},
"upstreams": [
{
@@ -72,7 +72,7 @@
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
- "native_column_type": "TEXT"
+ "native_column_type": "STRING"
},
"upstreams": [
{
diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_from_sharded_table_wildcard.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_from_sharded_table_wildcard.json
index b7df5444987f2..2d3d188d28316 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_from_sharded_table_wildcard.json
+++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_from_sharded_table_wildcard.json
@@ -14,7 +14,7 @@
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
- "native_column_type": "TEXT"
+ "native_column_type": "STRING"
},
"upstreams": [
{
@@ -32,7 +32,7 @@
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
- "native_column_type": "TEXT"
+ "native_column_type": "STRING"
},
"upstreams": [
{
diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_nested_subqueries.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_nested_subqueries.json
index 67e306bebf545..41ae0885941b0 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_nested_subqueries.json
+++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_nested_subqueries.json
@@ -14,7 +14,7 @@
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
- "native_column_type": "TEXT"
+ "native_column_type": "STRING"
},
"upstreams": [
{
@@ -32,7 +32,7 @@
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
- "native_column_type": "TEXT"
+ "native_column_type": "STRING"
},
"upstreams": [
{
diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_sharded_table_normalization.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_sharded_table_normalization.json
index b7df5444987f2..2d3d188d28316 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_sharded_table_normalization.json
+++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_sharded_table_normalization.json
@@ -14,7 +14,7 @@
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
- "native_column_type": "TEXT"
+ "native_column_type": "STRING"
},
"upstreams": [
{
@@ -32,7 +32,7 @@
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
- "native_column_type": "TEXT"
+ "native_column_type": "STRING"
},
"upstreams": [
{
diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_star_with_replace.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_star_with_replace.json
index b393b2445d6c4..26f8f8f59a3ff 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_star_with_replace.json
+++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_star_with_replace.json
@@ -16,7 +16,7 @@
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
- "native_column_type": "TEXT"
+ "native_column_type": "STRING"
},
"upstreams": [
{
@@ -34,7 +34,7 @@
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
- "native_column_type": "TEXT"
+ "native_column_type": "STRING"
},
"upstreams": [
{
@@ -52,7 +52,7 @@
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
- "native_column_type": "TEXT"
+ "native_column_type": "STRING"
},
"upstreams": [
{
diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_view_from_union.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_view_from_union.json
index 53fb94300e804..83365c09f69c2 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_view_from_union.json
+++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_view_from_union.json
@@ -17,7 +17,7 @@
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
- "native_column_type": "TEXT"
+ "native_column_type": "STRING"
},
"upstreams": [
{
@@ -39,7 +39,7 @@
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
- "native_column_type": "TEXT"
+ "native_column_type": "STRING"
},
"upstreams": [
{
diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_table_ddl.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_table_ddl.json
index 4773974545bfa..cf31b71cb50f6 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_table_ddl.json
+++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_table_ddl.json
@@ -4,5 +4,58 @@
"out_tables": [
"urn:li:dataset:(urn:li:dataPlatform:sqlite,costs,PROD)"
],
- "column_lineage": null
+ "column_lineage": [
+ {
+ "downstream": {
+ "table": "urn:li:dataset:(urn:li:dataPlatform:sqlite,costs,PROD)",
+ "column": "id",
+ "column_type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.NumberType": {}
+ }
+ },
+ "native_column_type": "INTEGER"
+ },
+ "upstreams": []
+ },
+ {
+ "downstream": {
+ "table": "urn:li:dataset:(urn:li:dataPlatform:sqlite,costs,PROD)",
+ "column": "month",
+ "column_type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.StringType": {}
+ }
+ },
+ "native_column_type": "TEXT"
+ },
+ "upstreams": []
+ },
+ {
+ "downstream": {
+ "table": "urn:li:dataset:(urn:li:dataPlatform:sqlite,costs,PROD)",
+ "column": "total_cost",
+ "column_type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.NumberType": {}
+ }
+ },
+ "native_column_type": "REAL"
+ },
+ "upstreams": []
+ },
+ {
+ "downstream": {
+ "table": "urn:li:dataset:(urn:li:dataPlatform:sqlite,costs,PROD)",
+ "column": "area",
+ "column_type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.NumberType": {}
+ }
+ },
+ "native_column_type": "REAL"
+ },
+ "upstreams": []
+ }
+ ]
}
\ No newline at end of file
diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_view_as_select.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_view_as_select.json
index ff452467aa5bd..8a6b60d0f1bde 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_view_as_select.json
+++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_view_as_select.json
@@ -30,7 +30,7 @@
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
- "native_column_type": "BIGINT"
+ "native_column_type": "NUMBER"
},
"upstreams": []
},
diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_struct_subfields.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_struct_subfields.json
index 5ad847e252497..2424fcda34752 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_struct_subfields.json
+++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_struct_subfields.json
@@ -14,7 +14,7 @@
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
- "native_column_type": "DECIMAL"
+ "native_column_type": "NUMERIC"
},
"upstreams": [
{
diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_full_col_name.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_full_col_name.json
index 6ee3d2e61c39b..8dd2633eff612 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_full_col_name.json
+++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_full_col_name.json
@@ -14,7 +14,7 @@
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
- "native_column_type": "DECIMAL"
+ "native_column_type": "NUMERIC"
},
"upstreams": [
{
diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_default_normalization.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_default_normalization.json
index b0351a7e07ad2..ee80285d87f60 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_default_normalization.json
+++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_default_normalization.json
@@ -12,6 +12,7 @@
"downstream": {
"table": "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.test_lineage2,PROD)",
"column": "PatientId",
+ "column_type": null,
"native_column_type": "INTEGER()"
},
"upstreams": [
@@ -25,6 +26,7 @@
"downstream": {
"table": "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.test_lineage2,PROD)",
"column": "BMI",
+ "column_type": null,
"native_column_type": "FLOAT()"
},
"upstreams": [
From 10456c5e3cdaad14927b89bb9deee1a6df0ce92c Mon Sep 17 00:00:00 2001
From: Ellie O'Neil <110510035+eboneil@users.noreply.github.com>
Date: Mon, 23 Oct 2023 14:53:07 -0700
Subject: [PATCH 02/40] docs(ingest): update to get_workunits_internal (#9054)
---
metadata-ingestion/adding-source.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/metadata-ingestion/adding-source.md b/metadata-ingestion/adding-source.md
index e4fc950a7cdbd..a0930102c6827 100644
--- a/metadata-ingestion/adding-source.md
+++ b/metadata-ingestion/adding-source.md
@@ -62,7 +62,7 @@ Some sources use the default `SourceReport` class, but others inherit and extend
### 3. Implement the source itself
-The core for the source is the `get_workunits` method, which produces a stream of metadata events (typically MCP objects) wrapped up in a MetadataWorkUnit.
+The core for the source is the `get_workunits_internal` method, which produces a stream of metadata events (typically MCP objects) wrapped up in a MetadataWorkUnit.
The [file source](./src/datahub/ingestion/source/file.py) is a good and simple example.
The MetadataChangeEventClass is defined in the metadata models which are generated
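
As a rough illustration of the doc change above, `get_workunits_internal` is simply a generator of MetadataWorkUnits, each typically wrapping one MCP. A minimal sketch follows; it is shown as a free function for brevity (in a real connector this is a method on your Source subclass), and the dataset URN list is a hypothetical stand-in for whatever the source actually discovers.

```python
from typing import Iterable

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.metadata.schema_classes import StatusClass


def get_workunits_internal() -> Iterable[MetadataWorkUnit]:
    # Hypothetical list of dataset URNs discovered by the source.
    dataset_urns = [
        "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)",
    ]
    for urn in dataset_urns:
        # Each work unit wraps a single metadata event (here, one MCP).
        yield MetadataChangeProposalWrapper(
            entityUrn=urn,
            aspect=StatusClass(removed=False),
        ).as_workunit()
```
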
From a0ce4f333e1cbbc544a650ec3e8012a1f10aef2b Mon Sep 17 00:00:00 2001
From: Kos Korchak <97058061+kkorchak@users.noreply.github.com>
Date: Mon, 23 Oct 2023 21:21:21 -0400
Subject: [PATCH 03/40] Column level lineage and path test (#8822)
---
.../preview/EntityPaths/EntityPathsModal.tsx | 1 +
.../e2e/lineage/lineage_column_path.js | 68 +++++++++++++++++++
2 files changed, 69 insertions(+)
create mode 100644 smoke-test/tests/cypress/cypress/e2e/lineage/lineage_column_path.js
diff --git a/datahub-web-react/src/app/preview/EntityPaths/EntityPathsModal.tsx b/datahub-web-react/src/app/preview/EntityPaths/EntityPathsModal.tsx
index d5722429aaf6b..2bb76714d6119 100644
--- a/datahub-web-react/src/app/preview/EntityPaths/EntityPathsModal.tsx
+++ b/datahub-web-react/src/app/preview/EntityPaths/EntityPathsModal.tsx
@@ -39,6 +39,7 @@ export default function EntityPathsModal({ paths, resultEntityUrn, hideModal }:
return (
+ data-testid="entity-paths-modal"
Column path{paths.length > 1 && 's'} from{' '}
diff --git a/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_column_path.js b/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_column_path.js
new file mode 100644
index 0000000000000..37ca62c8d1229
--- /dev/null
+++ b/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_column_path.js
@@ -0,0 +1,68 @@
+import { aliasQuery } from "../utils";
+const DATASET_ENTITY_TYPE = 'dataset';
+const DATASET_URN = 'urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleCypressHdfsDataset,PROD)';
+const DOWNSTREAM_DATASET_URN = "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleCypressKafkaDataset,PROD)";
+const upstreamColumn = '[data-testid="node-urn:li:dataset:(urn:li:dataPlatform:kafka,SampleCypressKafkaDataset,PROD)-Upstream"] text';
+const downstreamColumn = '[data-testid="node-urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleCypressHdfsDataset,PROD)-Downstream"] text';
+
+const verifyColumnPathModal = (from, to) => {
+ cy.get('[data-testid="entity-paths-modal"]').contains(from).should("be.visible");
+ cy.get('[data-testid="entity-paths-modal"]').contains(to).should("be.visible");
+};
+
+describe("column-Level lineage and impact analysis path test", () => {
+ beforeEach(() => {
+ cy.on('uncaught:exception', (err, runnable) => { return false; });
+ cy.intercept("POST", "/api/v2/graphql", (req) => {
+ aliasQuery(req, "appConfig");
+ });
+ });
+
+ it("verify column-level lineage path at lineage praph and impact analysis ", () => {
+ // Open dataset with column-level lineage configured and navigate to lineage tab -> visualize lineage
+ cy.loginWithCredentials();
+ cy.goToEntityLineageGraph(DATASET_ENTITY_TYPE, DATASET_URN);
+
+ // Enable “show columns” toggle
+ cy.waitTextVisible("SampleCypressHdfs");
+ cy.clickOptionWithTestId("column-toggle");
+ cy.waitTextVisible("shipment_info");
+
+ // Verify functionality of column lineage
+ cy.get(upstreamColumn).eq(3).click();
+ cy.get(upstreamColumn).eq(3).prev().should('not.have.attr', 'fill', 'white');
+ cy.get(downstreamColumn).eq(2).prev().should('not.have.attr', 'stroke', 'transparent');
+ cy.get(downstreamColumn).eq(2).click();
+ cy.get(downstreamColumn).eq(2).prev().should('not.have.attr', 'fill', 'white');
+ cy.get(upstreamColumn).eq(3).prev().should('not.have.attr', 'stroke', 'transparent');
+
+ // Open dataset impact analysis view, enable column lineage
+ cy.goToDataset(DATASET_URN, "SampleCypressHdfsDataset");
+ cy.openEntityTab("Lineage");
+ cy.clickOptionWithText("Column Lineage");
+ cy.clickOptionWithText("Downstream");
+
+ // Verify upstream column lineage, test column path modal
+ cy.clickOptionWithText("Upstream");
+ cy.waitTextVisible("SampleCypressKafkaDataset");
+ cy.ensureTextNotPresent("field_bar");
+ cy.contains("Select column").click({ force: true}).wait(1000);
+ cy.get(".rc-virtual-list").contains("shipment_info").click();
+ cy.waitTextVisible("field_bar");
+ cy.clickOptionWithText("field_bar");
+ verifyColumnPathModal("shipment_info", "field_bar");
+ cy.get('[data-testid="entity-paths-modal"] [data-icon="close"]').click();
+
+ // Verify downstream column lineage, test column path modal
+ cy.goToDataset(DOWNSTREAM_DATASET_URN, "SampleCypressKafkaDataset");
+ cy.openEntityTab("Lineage");
+ cy.clickOptionWithText("Column Lineage");
+ cy.ensureTextNotPresent("shipment_info");
+ cy.contains("Select column").click({ force: true}).wait(1000);
+ cy.get(".rc-virtual-list").contains("field_bar").click();
+ cy.waitTextVisible("shipment_info");
+ cy.clickOptionWithText("shipment_info");
+ verifyColumnPathModal("shipment_info", "field_bar");
+ cy.get('[data-testid="entity-paths-modal"] [data-icon="close"]').click();
+ });
+});
\ No newline at end of file
From adf8c8db38c56250cb612b208f6e59b04c7258c6 Mon Sep 17 00:00:00 2001
From: Andrew Sikowitz
Date: Tue, 24 Oct 2023 02:59:56 -0400
Subject: [PATCH 04/40] refactor(ingest): Move sqlalchemy import out of
sql_types.py (#9065)
---
.../src/datahub/ingestion/source/sql/athena.py | 2 +-
.../src/datahub/ingestion/source/sql/sql_common.py | 2 +-
.../src/datahub/ingestion/source/sql/sql_types.py | 9 +--------
.../src/datahub/utilities/sqlalchemy_type_converter.py | 6 +++++-
metadata-ingestion/tests/unit/test_athena_source.py | 2 +-
.../unit/utilities/test_sqlalchemy_type_converter.py | 2 +-
6 files changed, 10 insertions(+), 13 deletions(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py
index dad61e5173166..06b9ad92677a2 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py
@@ -31,7 +31,6 @@
register_custom_type,
)
from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri
-from datahub.ingestion.source.sql.sql_types import MapType
from datahub.ingestion.source.sql.sql_utils import (
add_table_to_schema_container,
gen_database_container,
@@ -41,6 +40,7 @@
from datahub.metadata.schema_classes import RecordTypeClass
from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column
from datahub.utilities.sqlalchemy_type_converter import (
+ MapType,
get_schema_fields_for_sqlalchemy_column,
)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
index 6524eea8222d4..be03858ec3ef9 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
@@ -37,7 +37,6 @@
DatasetSubTypes,
)
from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
-from datahub.ingestion.source.sql.sql_types import MapType
from datahub.ingestion.source.sql.sql_utils import (
add_table_to_schema_container,
downgrade_schema_from_v2,
@@ -91,6 +90,7 @@
from datahub.utilities.lossy_collections import LossyList
from datahub.utilities.registries.domain_registry import DomainRegistry
from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport
+from datahub.utilities.sqlalchemy_type_converter import MapType
if TYPE_CHECKING:
from datahub.ingestion.source.ge_data_profiler import (
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py
index 51626891e9fef..ae47623188f42 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py
@@ -1,8 +1,6 @@
import re
from typing import Any, Dict, ValuesView
-from sqlalchemy import types
-
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
ArrayType,
BooleanType,
@@ -17,6 +15,7 @@
TimeType,
UnionType,
)
+from datahub.utilities.sqlalchemy_type_converter import MapType
# these can be obtained by running `select format_type(oid, null),* from pg_type;`
# we've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.)
@@ -369,12 +368,6 @@ def resolve_vertica_modified_type(type_string: str) -> Any:
"array": ArrayType,
}
-
-class MapType(types.TupleType):
- # Wrapper class around SQLalchemy's TupleType to increase compatibility with DataHub
- pass
-
-
# https://docs.aws.amazon.com/athena/latest/ug/data-types.html
# https://github.com/dbt-athena/dbt-athena/tree/main
ATHENA_SQL_TYPES_MAP: Dict[str, Any] = {
diff --git a/metadata-ingestion/src/datahub/utilities/sqlalchemy_type_converter.py b/metadata-ingestion/src/datahub/utilities/sqlalchemy_type_converter.py
index a431f262a85fd..1d5ec5dae3519 100644
--- a/metadata-ingestion/src/datahub/utilities/sqlalchemy_type_converter.py
+++ b/metadata-ingestion/src/datahub/utilities/sqlalchemy_type_converter.py
@@ -7,13 +7,17 @@
from sqlalchemy_bigquery import STRUCT
from datahub.ingestion.extractor.schema_util import avro_schema_to_mce_fields
-from datahub.ingestion.source.sql.sql_types import MapType
from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
from datahub.metadata.schema_classes import NullTypeClass, SchemaFieldDataTypeClass
logger = logging.getLogger(__name__)
+class MapType(types.TupleType):
+ # Wrapper class around SQLalchemy's TupleType to increase compatibility with DataHub
+ pass
+
+
class SqlAlchemyColumnToAvroConverter:
"""Helper class that collects some methods to convert SQLalchemy columns to Avro schema."""
diff --git a/metadata-ingestion/tests/unit/test_athena_source.py b/metadata-ingestion/tests/unit/test_athena_source.py
index 6d3ed20eafde2..23dd7dd5a6e45 100644
--- a/metadata-ingestion/tests/unit/test_athena_source.py
+++ b/metadata-ingestion/tests/unit/test_athena_source.py
@@ -9,7 +9,7 @@
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.aws.s3_util import make_s3_urn
from datahub.ingestion.source.sql.athena import CustomAthenaRestDialect
-from datahub.ingestion.source.sql.sql_types import MapType
+from datahub.utilities.sqlalchemy_type_converter import MapType
FROZEN_TIME = "2020-04-14 07:00:00"
diff --git a/metadata-ingestion/tests/unit/utilities/test_sqlalchemy_type_converter.py b/metadata-ingestion/tests/unit/utilities/test_sqlalchemy_type_converter.py
index 959da0987a825..6c719d351c4c2 100644
--- a/metadata-ingestion/tests/unit/utilities/test_sqlalchemy_type_converter.py
+++ b/metadata-ingestion/tests/unit/utilities/test_sqlalchemy_type_converter.py
@@ -3,7 +3,6 @@
from sqlalchemy import types
from sqlalchemy_bigquery import STRUCT
-from datahub.ingestion.source.sql.sql_types import MapType
from datahub.metadata.schema_classes import (
ArrayTypeClass,
MapTypeClass,
@@ -12,6 +11,7 @@
RecordTypeClass,
)
from datahub.utilities.sqlalchemy_type_converter import (
+ MapType,
get_schema_fields_for_sqlalchemy_column,
)
From c849246e63284bc73768ed58a22be62b708a6c48 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Tue, 24 Oct 2023 00:09:41 -0700
Subject: [PATCH 05/40] fix(ingest): add releases link (#9014)
---
metadata-ingestion/setup.py | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index c46409ecbf52f..417588a433655 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -282,7 +282,8 @@
# Source plugins
# sqlalchemy-bigquery is included here since it provides an implementation of
# a SQLalchemy-conform STRUCT type definition
- "athena": sql_common | {"PyAthena[SQLAlchemy]>=2.6.0,<3.0.0", "sqlalchemy-bigquery>=1.4.1"},
+ "athena": sql_common
+ | {"PyAthena[SQLAlchemy]>=2.6.0,<3.0.0", "sqlalchemy-bigquery>=1.4.1"},
"azure-ad": set(),
"bigquery": sql_common
| bigquery_common
@@ -354,7 +355,11 @@
| {"psycopg2-binary", "pymysql>=1.0.2"},
"pulsar": {"requests"},
"redash": {"redash-toolbelt", "sql-metadata"} | sqllineage_lib,
- "redshift": sql_common | redshift_common | usage_common | sqlglot_lib | {"redshift-connector"},
+ "redshift": sql_common
+ | redshift_common
+ | usage_common
+ | sqlglot_lib
+ | {"redshift-connector"},
"redshift-legacy": sql_common | redshift_common,
"redshift-usage-legacy": sql_common | usage_common | redshift_common,
"s3": {*s3_base, *data_lake_profiling},
@@ -435,7 +440,9 @@
deepdiff_dep = "deepdiff"
test_api_requirements = {pytest_dep, deepdiff_dep, "PyYAML"}
-debug_requirements = {"memray"}
+debug_requirements = {
+ "memray",
+}
base_dev_requirements = {
*base_requirements,
@@ -668,6 +675,7 @@
"Documentation": "https://datahubproject.io/docs/",
"Source": "https://github.com/datahub-project/datahub",
"Changelog": "https://github.com/datahub-project/datahub/releases",
+ "Releases": "https://github.com/acryldata/datahub/releases",
},
license="Apache License 2.0",
description="A CLI to work with DataHub metadata",
From eb0b03d2f2f2c9ce88562c32d968d095a59f8547 Mon Sep 17 00:00:00 2001
From: Andrew Sikowitz
Date: Tue, 24 Oct 2023 10:45:09 -0400
Subject: [PATCH 06/40] fix(ingest/bigquery): Correctly apply table pattern to
read events; fix end time calculation; deprecate match_fully_qualified_names
(#9077)
---
.../ingestion/source/bigquery_v2/bigquery_config.py | 7 +++----
.../datahub/ingestion/source/bigquery_v2/lineage.py | 2 +-
.../src/datahub/ingestion/source/bigquery_v2/usage.py | 10 +++++++---
3 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
index 944814b6936a4..a6a740385cf5c 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -119,8 +119,8 @@ class BigQueryV2Config(
)
match_fully_qualified_names: bool = Field(
- default=False,
- description="Whether `dataset_pattern` is matched against fully qualified dataset name `<project_id>.<dataset>`.",
+ default=True,
+ description="[deprecated] Whether `dataset_pattern` is matched against fully qualified dataset name `<project_id>.<dataset>`.",
)
include_external_url: bool = Field(
@@ -327,8 +327,7 @@ def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
):
logger.warning(
"Please update `dataset_pattern` to match against fully qualified schema name `<project_id>.<dataset>` and set config `match_fully_qualified_names : True`."
- "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. "
- "The config option `match_fully_qualified_names` will be deprecated in future and the default behavior will assume `match_fully_qualified_names: True`."
+ "The config option `match_fully_qualified_names` is deprecated and will be removed in a future release."
)
return values
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py
index 98c8cbaf85eec..aa462435b8105 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py
@@ -548,7 +548,7 @@ def _get_parsed_audit_log_events(self, project_id: str) -> Iterable[QueryEvent]:
# handle the case where the read happens within our time range but the query
# completion event is delayed and happens after the configured end time.
corrected_start_time = self.start_time - self.config.max_query_duration
- corrected_end_time = self.end_time + -self.config.max_query_duration
+ corrected_end_time = self.end_time + self.config.max_query_duration
self.report.log_entry_start_time = corrected_start_time
self.report.log_entry_end_time = corrected_end_time
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py
index 201567e104a51..7fc38991e5928 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py
@@ -335,8 +335,12 @@ def get_time_window(self) -> Tuple[datetime, datetime]:
def _is_table_allowed(self, table_ref: Optional[BigQueryTableRef]) -> bool:
return (
table_ref is not None
- and self.config.dataset_pattern.allowed(table_ref.table_identifier.dataset)
- and self.config.table_pattern.allowed(table_ref.table_identifier.table)
+ and self.config.dataset_pattern.allowed(
+ f"{table_ref.table_identifier.project_id}.{table_ref.table_identifier.dataset}"
+ if self.config.match_fully_qualified_names
+ else table_ref.table_identifier.dataset
+ )
+ and self.config.table_pattern.allowed(str(table_ref.table_identifier))
)
def _should_ingest_usage(self) -> bool:
@@ -844,7 +848,7 @@ def _get_parsed_bigquery_log_events(
# handle the case where the read happens within our time range but the query
# completion event is delayed and happens after the configured end time.
corrected_start_time = self.start_time - self.config.max_query_duration
- corrected_end_time = self.end_time + -self.config.max_query_duration
+ corrected_end_time = self.end_time + self.config.max_query_duration
self.report.audit_start_time = corrected_start_time
self.report.audit_end_time = corrected_end_time
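
A minimal sketch of the behavioral change above, using the standard `AllowDenyPattern` helper; the project and dataset names are made up. With `match_fully_qualified_names` enabled, `dataset_pattern` is evaluated against the `project.dataset` string rather than the bare dataset name:

```python
# Hypothetical pattern and names, for illustration only.
from datahub.configuration.common import AllowDenyPattern

dataset_pattern = AllowDenyPattern(allow=[r"my-project\.sales_.*"])

print(dataset_pattern.allowed("sales_raw"))             # False: bare dataset name no longer matches
print(dataset_pattern.allowed("my-project.sales_raw"))  # True: fully qualified name matches
```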
From d13553f53ad9e7592256cd88e78eef0ca95832e4 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Tue, 24 Oct 2023 12:24:50 -0700
Subject: [PATCH 07/40] feat(sqlparser): extract CLL from `update`s (#9078)
---
.../src/datahub/utilities/sqlglot_lineage.py | 68 +++++++++++--
.../test_snowflake_update_from_table.json | 56 +++++++++++
.../test_snowflake_update_hardcoded.json | 35 +++++++
.../unit/sql_parsing/test_sqlglot_lineage.py | 96 +++++++++++++++++++
4 files changed, 246 insertions(+), 9 deletions(-)
create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_from_table.json
create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_hardcoded.json
diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
index 97121b368f507..526d90b2a1bfa 100644
--- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
+++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
@@ -745,6 +745,47 @@ def _extract_select_from_create(
return statement
+_UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT: Set[str] = set(
+ sqlglot.exp.Update.arg_types.keys()
+) - set(sqlglot.exp.Select.arg_types.keys())
+
+
+def _extract_select_from_update(
+ statement: sqlglot.exp.Update,
+) -> sqlglot.exp.Select:
+ statement = statement.copy()
+
+ # The "SET" expressions need to be converted.
+ # For the update command, it'll be a list of EQ expressions, but the select
+ # should contain aliased columns.
+ new_expressions = []
+ for expr in statement.expressions:
+ if isinstance(expr, sqlglot.exp.EQ) and isinstance(
+ expr.left, sqlglot.exp.Column
+ ):
+ new_expressions.append(
+ sqlglot.exp.Alias(
+ this=expr.right,
+ alias=expr.left.this,
+ )
+ )
+ else:
+ # If we don't know how to convert it, just leave it as-is. If this causes issues,
+ # they'll get caught later.
+ new_expressions.append(expr)
+
+ return sqlglot.exp.Select(
+ **{
+ **{
+ k: v
+ for k, v in statement.args.items()
+ if k not in _UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT
+ },
+ "expressions": new_expressions,
+ }
+ )
+
+
def _is_create_table_ddl(statement: sqlglot.exp.Expression) -> bool:
return isinstance(statement, sqlglot.exp.Create) and isinstance(
statement.this, sqlglot.exp.Schema
@@ -767,6 +808,9 @@ def _try_extract_select(
elif isinstance(statement, sqlglot.exp.Insert):
# TODO Need to map column renames in the expressions part of the statement.
statement = statement.expression
+ elif isinstance(statement, sqlglot.exp.Update):
+ # Assumption: the output table is already captured in the modified tables list.
+ statement = _extract_select_from_update(statement)
elif isinstance(statement, sqlglot.exp.Create):
# TODO May need to map column renames.
# Assumption: the output table is already captured in the modified tables list.
@@ -942,19 +986,25 @@ def _sqlglot_lineage_inner(
)
# Simplify the input statement for column-level lineage generation.
- select_statement = _try_extract_select(statement)
+ try:
+ select_statement = _try_extract_select(statement)
+ except Exception as e:
+ logger.debug(f"Failed to extract select from statement: {e}", exc_info=True)
+ debug_info.column_error = e
+ select_statement = None
# Generate column-level lineage.
column_lineage: Optional[List[_ColumnLineageInfo]] = None
try:
- column_lineage = _column_level_lineage(
- select_statement,
- dialect=dialect,
- input_tables=table_name_schema_mapping,
- output_table=downstream_table,
- default_db=default_db,
- default_schema=default_schema,
- )
+ if select_statement is not None:
+ column_lineage = _column_level_lineage(
+ select_statement,
+ dialect=dialect,
+ input_tables=table_name_schema_mapping,
+ output_table=downstream_table,
+ default_db=default_db,
+ default_schema=default_schema,
+ )
except UnsupportedStatementTypeError as e:
# Inject details about the outer statement type too.
e.args = (f"{e.args[0]} (outer statement type: {type(statement)})",)
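
The essence of the change is easier to see outside the parser plumbing: each `UPDATE ... SET col = expr` assignment is rewritten into a `SELECT expr AS col` projection, so the existing select-oriented column-lineage walker can be reused. A rough standalone sketch (the table names and printed SQL are illustrative, not taken from the patch):

```python
import sqlglot
import sqlglot.exp

update = sqlglot.parse_one(
    "UPDATE my_table SET col1 = t1.col1 || t1.col2 FROM table1 t1 WHERE my_table.id = t1.id",
    read="snowflake",
)
assert isinstance(update, sqlglot.exp.Update)

# Turn each SET assignment (an EQ node) into an aliased projection.
projections = [
    sqlglot.exp.Alias(this=eq.right, alias=eq.left.this)
    for eq in update.expressions
    if isinstance(eq, sqlglot.exp.EQ) and isinstance(eq.left, sqlglot.exp.Column)
]
select = sqlglot.exp.Select(expressions=projections, **{"from": update.args.get("from")})
print(select.sql(dialect="snowflake"))
# e.g. SELECT t1.col1 || t1.col2 AS col1 FROM table1 AS t1
```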
diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_from_table.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_from_table.json
new file mode 100644
index 0000000000000..e2baa34e7fe28
--- /dev/null
+++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_from_table.json
@@ -0,0 +1,56 @@
+{
+ "query_type": "UPDATE",
+ "in_tables": [
+ "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table1,PROD)",
+ "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table2,PROD)"
+ ],
+ "out_tables": [
+ "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)"
+ ],
+ "column_lineage": [
+ {
+ "downstream": {
+ "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)",
+ "column": "col1",
+ "column_type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.StringType": {}
+ }
+ },
+ "native_column_type": "VARCHAR"
+ },
+ "upstreams": [
+ {
+ "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table1,PROD)",
+ "column": "col1"
+ },
+ {
+ "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table1,PROD)",
+ "column": "col2"
+ }
+ ]
+ },
+ {
+ "downstream": {
+ "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)",
+ "column": "col2",
+ "column_type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.StringType": {}
+ }
+ },
+ "native_column_type": "VARCHAR"
+ },
+ "upstreams": [
+ {
+ "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table1,PROD)",
+ "column": "col1"
+ },
+ {
+ "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table2,PROD)",
+ "column": "col2"
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_hardcoded.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_hardcoded.json
new file mode 100644
index 0000000000000..b41ed61b37cdb
--- /dev/null
+++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_hardcoded.json
@@ -0,0 +1,35 @@
+{
+ "query_type": "UPDATE",
+ "in_tables": [],
+ "out_tables": [
+ "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)"
+ ],
+ "column_lineage": [
+ {
+ "downstream": {
+ "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)",
+ "column": "orderkey",
+ "column_type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.NumberType": {}
+ }
+ },
+ "native_column_type": "INT"
+ },
+ "upstreams": []
+ },
+ {
+ "downstream": {
+ "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)",
+ "column": "totalprice",
+ "column_type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.NumberType": {}
+ }
+ },
+ "native_column_type": "INT"
+ },
+ "upstreams": []
+ }
+ ]
+}
\ No newline at end of file
diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py
index 059add8db67e4..dfc5b486abd35 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py
+++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py
@@ -3,6 +3,7 @@
import pytest
from datahub.testing.check_sql_parser_result import assert_sql_result
+from datahub.utilities.sqlglot_lineage import _UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT
RESOURCE_DIR = pathlib.Path(__file__).parent / "goldens"
@@ -672,3 +673,98 @@ def test_teradata_default_normalization():
},
expected_file=RESOURCE_DIR / "test_teradata_default_normalization.json",
)
+
+
+def test_snowflake_update_hardcoded():
+ assert_sql_result(
+ """
+UPDATE snowflake_sample_data.tpch_sf1.orders
+SET orderkey = 1, totalprice = 2
+WHERE orderkey = 3
+""",
+ dialect="snowflake",
+ schemas={
+ "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)": {
+ "orderkey": "NUMBER(38,0)",
+ "totalprice": "NUMBER(12,2)",
+ },
+ },
+ expected_file=RESOURCE_DIR / "test_snowflake_update_hardcoded.json",
+ )
+
+
+def test_update_from_select():
+ assert _UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT == {"returning", "this"}
+
+
+def test_snowflake_update_from_table():
+ # Can create these tables with the following SQL:
+ """
+ -- Create or replace my_table
+ CREATE OR REPLACE TABLE my_table (
+ id INT IDENTITY PRIMARY KEY,
+ col1 VARCHAR(50),
+ col2 VARCHAR(50)
+ );
+
+ -- Create or replace table1
+ CREATE OR REPLACE TABLE table1 (
+ id INT IDENTITY PRIMARY KEY,
+ col1 VARCHAR(50),
+ col2 VARCHAR(50)
+ );
+
+ -- Create or replace table2
+ CREATE OR REPLACE TABLE table2 (
+ id INT IDENTITY PRIMARY KEY,
+ col2 VARCHAR(50)
+ );
+
+ -- Insert data into my_table
+ INSERT INTO my_table (col1, col2)
+ VALUES ('foo', 'bar'),
+ ('baz', 'qux');
+
+ -- Insert data into table1
+ INSERT INTO table1 (col1, col2)
+ VALUES ('foo', 'bar'),
+ ('baz', 'qux');
+
+ -- Insert data into table2
+ INSERT INTO table2 (col2)
+ VALUES ('bar'),
+ ('qux');
+ """
+
+ assert_sql_result(
+ """
+UPDATE my_table
+SET
+ col1 = t1.col1 || t1.col2,
+ col2 = t1.col1 || t2.col2
+FROM table1 t1
+JOIN table2 t2 ON t1.id = t2.id
+WHERE my_table.id = t1.id;
+""",
+ dialect="snowflake",
+ default_db="my_db",
+ default_schema="my_schema",
+ schemas={
+ "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)": {
+ "id": "NUMBER(38,0)",
+ "col1": "VARCHAR(16777216)",
+ "col2": "VARCHAR(16777216)",
+ },
+ "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table1,PROD)": {
+ "id": "NUMBER(38,0)",
+ "col1": "VARCHAR(16777216)",
+ "col2": "VARCHAR(16777216)",
+ },
+ "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table2,PROD)": {
+ "id": "NUMBER(38,0)",
+ "col1": "VARCHAR(16777216)",
+ "col2": "VARCHAR(16777216)",
+ },
+ },
+ expected_file=RESOURCE_DIR / "test_snowflake_update_from_table.json",
+ )
From 378d84a346cff4061f795dd1b296bde3ea5313c1 Mon Sep 17 00:00:00 2001
From: skrydal
Date: Tue, 24 Oct 2023 22:12:11 +0200
Subject: [PATCH 08/40] fix(ui): Fixes handling of resources filters in UI
(#9087)
---
.../app/permissions/policy/PolicyDetailsModal.tsx | 4 ++--
.../permissions/policy/PolicyPrivilegeForm.tsx | 15 ++++++---------
.../src/app/permissions/policy/policyUtils.ts | 4 ++--
docs/authorization/policies.md | 8 ++++----
metadata-ingestion/tests/unit/serde/test_serde.py | 8 ++++----
5 files changed, 18 insertions(+), 21 deletions(-)
diff --git a/datahub-web-react/src/app/permissions/policy/PolicyDetailsModal.tsx b/datahub-web-react/src/app/permissions/policy/PolicyDetailsModal.tsx
index 68e91983babdb..d3e01df3a66e8 100644
--- a/datahub-web-react/src/app/permissions/policy/PolicyDetailsModal.tsx
+++ b/datahub-web-react/src/app/permissions/policy/PolicyDetailsModal.tsx
@@ -67,8 +67,8 @@ export default function PolicyDetailsModal({ policy, visible, onClose, privilege
const isMetadataPolicy = policy?.type === PolicyType.Metadata;
const resources = convertLegacyResourceFilter(policy?.resources);
- const resourceTypes = getFieldValues(resources?.filter, 'RESOURCE_TYPE') || [];
- const resourceEntities = getFieldValues(resources?.filter, 'RESOURCE_URN') || [];
+ const resourceTypes = getFieldValues(resources?.filter, 'TYPE') || [];
+ const resourceEntities = getFieldValues(resources?.filter, 'URN') || [];
const domains = getFieldValues(resources?.filter, 'DOMAIN') || [];
const {
diff --git a/datahub-web-react/src/app/permissions/policy/PolicyPrivilegeForm.tsx b/datahub-web-react/src/app/permissions/policy/PolicyPrivilegeForm.tsx
index 1520388a5033a..b8e1505fceaec 100644
--- a/datahub-web-react/src/app/permissions/policy/PolicyPrivilegeForm.tsx
+++ b/datahub-web-react/src/app/permissions/policy/PolicyPrivilegeForm.tsx
@@ -67,8 +67,8 @@ export default function PolicyPrivilegeForm({
} = useAppConfig();
const resources: ResourceFilter = convertLegacyResourceFilter(maybeResources) || EMPTY_POLICY.resources;
- const resourceTypes = getFieldValues(resources.filter, 'RESOURCE_TYPE') || [];
- const resourceEntities = getFieldValues(resources.filter, 'RESOURCE_URN') || [];
+ const resourceTypes = getFieldValues(resources.filter, 'TYPE') || [];
+ const resourceEntities = getFieldValues(resources.filter, 'URN') || [];
const getDisplayName = (entity) => {
if (!entity) {
@@ -145,10 +145,7 @@ export default function PolicyPrivilegeForm({
};
setResources({
...resources,
- filter: setFieldValues(filter, 'RESOURCE_TYPE', [
- ...resourceTypes,
- createCriterionValue(selectedResourceType),
- ]),
+ filter: setFieldValues(filter, 'TYPE', [...resourceTypes, createCriterionValue(selectedResourceType)]),
});
};
@@ -160,7 +157,7 @@ export default function PolicyPrivilegeForm({
...resources,
filter: setFieldValues(
filter,
- 'RESOURCE_TYPE',
+ 'TYPE',
resourceTypes?.filter((criterionValue) => criterionValue.value !== deselectedResourceType),
),
});
@@ -173,7 +170,7 @@ export default function PolicyPrivilegeForm({
};
setResources({
...resources,
- filter: setFieldValues(filter, 'RESOURCE_URN', [
+ filter: setFieldValues(filter, 'URN', [
...resourceEntities,
createCriterionValueWithEntity(
resource,
@@ -192,7 +189,7 @@ export default function PolicyPrivilegeForm({
...resources,
filter: setFieldValues(
filter,
- 'RESOURCE_URN',
+ 'URN',
resourceEntities?.filter((criterionValue) => criterionValue.value !== resource),
),
});
diff --git a/datahub-web-react/src/app/permissions/policy/policyUtils.ts b/datahub-web-react/src/app/permissions/policy/policyUtils.ts
index c7af7342f6efa..2f178fcdeb5c3 100644
--- a/datahub-web-react/src/app/permissions/policy/policyUtils.ts
+++ b/datahub-web-react/src/app/permissions/policy/policyUtils.ts
@@ -99,10 +99,10 @@ export const convertLegacyResourceFilter = (resourceFilter: Maybe<ResourceFilter>) => {
     const criteria = new Array<PolicyMatchCriterion>();
if (resourceFilter.type) {
- criteria.push(createCriterion('RESOURCE_TYPE', [createCriterionValue(resourceFilter.type)]));
+ criteria.push(createCriterion('TYPE', [createCriterionValue(resourceFilter.type)]));
}
if (resourceFilter.resources && resourceFilter.resources.length > 0) {
- criteria.push(createCriterion('RESOURCE_URN', resourceFilter.resources.map(createCriterionValue)));
+ criteria.push(createCriterion('URN', resourceFilter.resources.map(createCriterionValue)));
}
return {
filter: {
diff --git a/docs/authorization/policies.md b/docs/authorization/policies.md
index e3606f2a3e48d..63aa6688d3eec 100644
--- a/docs/authorization/policies.md
+++ b/docs/authorization/policies.md
@@ -137,7 +137,7 @@ We currently support the following:
#### Resources
A resource filter defines the set of resources that the policy applies to using a list of criteria. Each
-criterion defines a field type (like resource_type, resource_urn, domain), a list of field values to compare, and a
+criterion defines a field type (like type, urn, domain), a list of field values to compare, and a
condition (like EQUALS). It essentially checks whether the field of a certain resource matches any of the input values.
Note that if there are no criteria or the resource is not set, the policy is applied to ALL resources.
@@ -149,7 +149,7 @@ For example, the following resource filter will apply the policy to datasets, ch
"filter": {
"criteria": [
{
- "field": "RESOURCE_TYPE",
+ "field": "TYPE",
"condition": "EQUALS",
"values": [
"dataset",
@@ -175,8 +175,8 @@ Supported fields are as follows
| Field Type | Description | Example |
|---------------|------------------------|-------------------------|
-| resource_type | Type of the resource | dataset, chart, dataJob |
-| resource_urn | Urn of the resource | urn:li:dataset:... |
+| type | Type of the resource | dataset, chart, dataJob |
+| urn | Urn of the resource | urn:li:dataset:... |
| domain | Domain of the resource | urn:li:domain:domainX |
## Managing Policies
diff --git a/metadata-ingestion/tests/unit/serde/test_serde.py b/metadata-ingestion/tests/unit/serde/test_serde.py
index d116f1f5473fa..d2d6a0bdda5b9 100644
--- a/metadata-ingestion/tests/unit/serde/test_serde.py
+++ b/metadata-ingestion/tests/unit/serde/test_serde.py
@@ -238,7 +238,7 @@ def test_missing_optional_simple() -> None:
"criteria": [
{
"condition": "EQUALS",
- "field": "RESOURCE_TYPE",
+ "field": "TYPE",
"values": ["notebook", "dataset", "dashboard"],
}
]
@@ -252,7 +252,7 @@ def test_missing_optional_simple() -> None:
"criteria": [
{
"condition": "EQUALS",
- "field": "RESOURCE_TYPE",
+ "field": "TYPE",
"values": ["notebook", "dataset", "dashboard"],
}
]
@@ -267,13 +267,13 @@ def test_missing_optional_simple() -> None:
def test_missing_optional_in_union() -> None:
# This one doesn't contain any optional fields and should work fine.
revised_json = json.loads(
- '{"lastUpdatedTimestamp":1662356745807,"actors":{"groups":[],"resourceOwners":false,"allUsers":true,"allGroups":false,"users":[]},"privileges":["EDIT_ENTITY_ASSERTIONS","EDIT_DATASET_COL_GLOSSARY_TERMS","EDIT_DATASET_COL_TAGS","EDIT_DATASET_COL_DESCRIPTION"],"displayName":"customtest","resources":{"filter":{"criteria":[{"field":"RESOURCE_TYPE","condition":"EQUALS","values":["notebook","dataset","dashboard"]}]},"allResources":false},"description":"","state":"ACTIVE","type":"METADATA"}'
+ '{"lastUpdatedTimestamp":1662356745807,"actors":{"groups":[],"resourceOwners":false,"allUsers":true,"allGroups":false,"users":[]},"privileges":["EDIT_ENTITY_ASSERTIONS","EDIT_DATASET_COL_GLOSSARY_TERMS","EDIT_DATASET_COL_TAGS","EDIT_DATASET_COL_DESCRIPTION"],"displayName":"customtest","resources":{"filter":{"criteria":[{"field":"TYPE","condition":"EQUALS","values":["notebook","dataset","dashboard"]}]},"allResources":false},"description":"","state":"ACTIVE","type":"METADATA"}'
)
revised = models.DataHubPolicyInfoClass.from_obj(revised_json)
# This one is missing the optional filters.allResources field.
original_json = json.loads(
- '{"privileges":["EDIT_ENTITY_ASSERTIONS","EDIT_DATASET_COL_GLOSSARY_TERMS","EDIT_DATASET_COL_TAGS","EDIT_DATASET_COL_DESCRIPTION"],"actors":{"resourceOwners":false,"groups":[],"allGroups":false,"allUsers":true,"users":[]},"lastUpdatedTimestamp":1662356745807,"displayName":"customtest","description":"","resources":{"filter":{"criteria":[{"field":"RESOURCE_TYPE","condition":"EQUALS","values":["notebook","dataset","dashboard"]}]}},"state":"ACTIVE","type":"METADATA"}'
+ '{"privileges":["EDIT_ENTITY_ASSERTIONS","EDIT_DATASET_COL_GLOSSARY_TERMS","EDIT_DATASET_COL_TAGS","EDIT_DATASET_COL_DESCRIPTION"],"actors":{"resourceOwners":false,"groups":[],"allGroups":false,"allUsers":true,"users":[]},"lastUpdatedTimestamp":1662356745807,"displayName":"customtest","description":"","resources":{"filter":{"criteria":[{"field":"TYPE","condition":"EQUALS","values":["notebook","dataset","dashboard"]}]}},"state":"ACTIVE","type":"METADATA"}'
)
original = models.DataHubPolicyInfoClass.from_obj(original_json)
From edb82ad91fba8a401c56b82bc4c2916a39a6a6dd Mon Sep 17 00:00:00 2001
From: Andrew Sikowitz
Date: Tue, 24 Oct 2023 18:56:14 -0400
Subject: [PATCH 09/40] docs(ingest/bigquery): Add docs for breaking change:
match_fully_qualified_names (#9094)
---
docs/how/updating-datahub.md | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md
index 3af3b2bdda215..7d8c25b06255a 100644
--- a/docs/how/updating-datahub.md
+++ b/docs/how/updating-datahub.md
@@ -11,11 +11,17 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
by Looker and LookML source connectors.
- #8853 - The Airflow plugin no longer supports Airflow 2.0.x or Python 3.7. See the docs for more details.
- #8853 - Introduced the Airflow plugin v2. If you're using Airflow 2.3+, the v2 plugin will be enabled by default, and so you'll need to switch your requirements to include `pip install 'acryl-datahub-airflow-plugin[plugin-v2]'`. To continue using the v1 plugin, set the `DATAHUB_AIRFLOW_PLUGIN_USE_V1_PLUGIN` environment variable to `true`.
-- #8943 The Unity Catalog ingestion source has a new option `include_metastore`, which will cause all urns to be changed when disabled.
+- #8943 - The Unity Catalog ingestion source has a new option `include_metastore`, which will cause all urns to be changed when disabled.
This is currently enabled by default to preserve compatibility, but will be disabled by default and then removed in the future.
If stateful ingestion is enabled, simply setting `include_metastore: false` will perform all required cleanup.
Otherwise, we recommend soft deleting all databricks data via the DataHub CLI:
`datahub delete --platform databricks --soft` and then reingesting with `include_metastore: false`.
+- #9077 - The BigQuery ingestion source by default sets `match_fully_qualified_names: true`.
+This means that any `dataset_pattern` or `schema_pattern` specified will be matched on the fully
+qualified dataset name, i.e. `<project_id>.<dataset>`. If this is not the case, please
+update your pattern (e.g. prepend your old dataset pattern with `.*\.` which matches the project part),
+or set `match_fully_qualified_names: false` in your recipe. However, note that
+setting this to `false` is deprecated and this flag will be removed entirely in a future release.
### Potential Downtime
From fe18532b29e35af1cd3007e6affc102042b1af61 Mon Sep 17 00:00:00 2001
From: skrydal
Date: Wed, 25 Oct 2023 00:58:56 +0200
Subject: [PATCH 10/40] docs(update): Added info on breaking change for
policies (#9093)
Co-authored-by: Pedro Silva
---
docs/how/updating-datahub.md | 33 +++++++++++++++++++++++++++++++++
1 file changed, 33 insertions(+)
diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md
index 7d8c25b06255a..57193ea69f2be 100644
--- a/docs/how/updating-datahub.md
+++ b/docs/how/updating-datahub.md
@@ -16,6 +16,39 @@ This is currently enabled by default to preserve compatibility, but will be disa
If stateful ingestion is enabled, simply setting `include_metastore: false` will perform all required cleanup.
Otherwise, we recommend soft deleting all databricks data via the DataHub CLI:
`datahub delete --platform databricks --soft` and then reingesting with `include_metastore: false`.
+- #8846 - Changed enum values in resource filters used by policies. `RESOURCE_TYPE` became `TYPE` and `RESOURCE_URN` became `URN`.
+Any existing policies using these filters (i.e. defined for particular `urns` or `types` such as `dataset`) need to be upgraded
+manually, for example by retrieving their respective `dataHubPolicyInfo` aspect and updating the filter section, i.e. changing
+```yaml
+ "resources": {
+ "filter": {
+ "criteria": [
+ {
+ "field": "RESOURCE_TYPE",
+ "condition": "EQUALS",
+ "values": [
+ "dataset"
+ ]
+ }
+ ]
+ }
+```
+into
+```yaml
+ "resources": {
+ "filter": {
+ "criteria": [
+ {
+ "field": "TYPE",
+ "condition": "EQUALS",
+ "values": [
+ "dataset"
+ ]
+ }
+ ]
+ }
+```
+for example by using the `datahub put` command. Policies can also be removed and re-created via the UI.
- #9077 - The BigQuery ingestion source by default sets `match_fully_qualified_names: true`.
This means that any `dataset_pattern` or `schema_pattern` specified will be matched on the fully
qualified dataset name, i.e. `<project_id>.<dataset>`. If this is not the case, please
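
For operators with more than a handful of affected policies, the migration described above can also be scripted rather than done policy by policy. A hedged sketch (not part of the patch; the server URL and policy urn are placeholders, and it assumes the `acryl-datahub` Python client):

```python
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig
from datahub.metadata.schema_classes import DataHubPolicyInfoClass

FIELD_RENAMES = {"RESOURCE_TYPE": "TYPE", "RESOURCE_URN": "URN"}

graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
policy_urn = "urn:li:dataHubPolicy:my-policy"  # placeholder urn

info = graph.get_aspect(entity_urn=policy_urn, aspect_type=DataHubPolicyInfoClass)
if info and info.resources and info.resources.filter:
    for criterion in info.resources.filter.criteria or []:
        criterion.field = FIELD_RENAMES.get(criterion.field, criterion.field)
    graph.emit_mcp(MetadataChangeProposalWrapper(entityUrn=policy_urn, aspect=info))
```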
From ca331f58bd24187f9f0ca317216837178e9f41fa Mon Sep 17 00:00:00 2001
From: Hyejin Yoon <0327jane@gmail.com>
Date: Wed, 25 Oct 2023 09:39:57 +0900
Subject: [PATCH 11/40] docs: add luckyorange script to head (#9080)
---
docs-website/docusaurus.config.js | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/docs-website/docusaurus.config.js b/docs-website/docusaurus.config.js
index 68ea1ebffa6c9..259ef970d818e 100644
--- a/docs-website/docusaurus.config.js
+++ b/docs-website/docusaurus.config.js
@@ -13,6 +13,13 @@ module.exports = {
projectName: "datahub", // Usually your repo name.
staticDirectories: ["static", "genStatic"],
stylesheets: ["https://fonts.googleapis.com/css2?family=Manrope:wght@400;500;700&display=swap"],
+ scripts: [
+ {
+ src: "https://tools.luckyorange.com/core/lo.js?site-id=28ea8a38",
+ async: true,
+ defer: true,
+ },
+ ],
noIndex: isSaas,
customFields: {
isSaas: isSaas,
From 9a59c452bf36d750964f6d7f78df84a8c0c5eb66 Mon Sep 17 00:00:00 2001
From: Hyejin Yoon <0327jane@gmail.com>
Date: Wed, 25 Oct 2023 09:40:28 +0900
Subject: [PATCH 12/40] design: refactor docs navbar (#8975)
Co-authored-by: Jeff Merrick
---
docs-website/docusaurus.config.js | 61 +++++++++----------
docs-website/src/styles/global.scss | 27 +++++---
.../DocsVersionDropdownNavbarItem.js | 4 ++
.../src/theme/NavbarItem/styles.module.scss | 8 +++
4 files changed, 59 insertions(+), 41 deletions(-)
create mode 100644 docs-website/src/theme/NavbarItem/styles.module.scss
diff --git a/docs-website/docusaurus.config.js b/docs-website/docusaurus.config.js
index 259ef970d818e..506e263933394 100644
--- a/docs-website/docusaurus.config.js
+++ b/docs-website/docusaurus.config.js
@@ -57,44 +57,41 @@ module.exports = {
position: "right",
},
{
- to: "https://demo.datahubproject.io/",
- label: "Demo",
- position: "right",
- },
- {
- href: "https://blog.datahubproject.io/",
- label: "Blog",
- position: "right",
- },
- {
- href: "https://feature-requests.datahubproject.io/roadmap",
- label: "Roadmap",
+ type: "dropdown",
+ label: "Resources",
position: "right",
+ items: [
+ {
+ href: "https://demo.datahubproject.io/",
+ label: "Demo",
+ },
+ {
+ href: "https://blog.datahubproject.io/",
+ label: "Blog",
+ },
+ {
+ href: "https://feature-requests.datahubproject.io/roadmap",
+ label: "Roadmap",
+ },
+ {
+ href: "https://slack.datahubproject.io",
+ label: "Slack",
+ },
+ {
+ href: "https://github.com/datahub-project/datahub",
+ label: "GitHub",
+ },
+ {
+ href: "https://www.youtube.com/channel/UC3qFQC5IiwR5fvWEqi_tJ5w",
+ label: "YouTube",
+ },
+ ],
},
{
type: "docsVersionDropdown",
- position: "right",
+ position: "left",
dropdownActiveClassDisabled: true,
},
- {
- href: "https://slack.datahubproject.io",
- "aria-label": "Slack",
- position: "right",
- className: "item__icon item__slack",
- },
- {
- href: "https://github.com/datahub-project/datahub",
- "aria-label": "GitHub",
- position: "right",
- className: "item__icon item__github",
- },
-
- {
- href: "https://www.youtube.com/channel/UC3qFQC5IiwR5fvWEqi_tJ5w",
- "aria-label": "YouTube",
- position: "right",
- className: "item__icon item__youtube",
- },
],
},
footer: {
diff --git a/docs-website/src/styles/global.scss b/docs-website/src/styles/global.scss
index 55a54876b41ac..16e3893ed08b7 100644
--- a/docs-website/src/styles/global.scss
+++ b/docs-website/src/styles/global.scss
@@ -144,20 +144,29 @@ div[class^="announcementBar"] {
/** Navbar */
-@media only screen and (max-width: 1050px) {
- .navbar__toggle {
- display: inherit;
- }
- .navbar__item {
- display: none;
- }
-}
-
.navbar {
.navbar__logo {
height: 3rem;
}
+
+ .navbar__link {
+ align-items: center;
+ margin: 0 1rem 0;
+ padding: 0;
+ border-bottom: 2px solid transparent;
+ }
+
+ .dropdown > .navbar__link:after {
+ top: -1px;
+ border-width: 0.3em 0.3em 0;
+ margin-left: 0.4em;
+ }
+
+ .navbar__link--active {
+ border-bottom-color: var(--ifm-navbar-link-hover-color);
+ }
.navbar__item {
+ padding: 0.25rem 0;
svg[class*="iconExternalLink"] {
display: none;
}
diff --git a/docs-website/src/theme/NavbarItem/DocsVersionDropdownNavbarItem.js b/docs-website/src/theme/NavbarItem/DocsVersionDropdownNavbarItem.js
index cc04ab23d3cf3..661d64392e67f 100644
--- a/docs-website/src/theme/NavbarItem/DocsVersionDropdownNavbarItem.js
+++ b/docs-website/src/theme/NavbarItem/DocsVersionDropdownNavbarItem.js
@@ -6,6 +6,9 @@ import { translate } from "@docusaurus/Translate";
import { useLocation } from "@docusaurus/router";
import DefaultNavbarItem from "@theme/NavbarItem/DefaultNavbarItem";
import DropdownNavbarItem from "@theme/NavbarItem/DropdownNavbarItem";
+
+import styles from "./styles.module.scss";
+
const getVersionMainDoc = (version) => version.docs.find((doc) => doc.id === version.mainDocId);
export default function DocsVersionDropdownNavbarItem({
mobile,
@@ -60,6 +63,7 @@ export default function DocsVersionDropdownNavbarItem({
return (
Date: Tue, 24 Oct 2023 19:59:42 -0700
Subject: [PATCH 13/40] fix(ingest): update athena type mapping (#9061)
---
.../src/datahub/ingestion/source/sql/athena.py | 4 +++-
.../src/datahub/ingestion/source/sql/sql_common.py | 5 +----
.../src/datahub/ingestion/source/sql/sql_types.py | 5 ++---
.../datahub/utilities/sqlalchemy_type_converter.py | 13 ++++++++++---
4 files changed, 16 insertions(+), 11 deletions(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py
index 06b9ad92677a2..75e8fe1d6f7a6 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py
@@ -37,7 +37,7 @@
gen_database_key,
)
from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
-from datahub.metadata.schema_classes import RecordTypeClass
+from datahub.metadata.schema_classes import MapTypeClass, RecordTypeClass
from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column
from datahub.utilities.sqlalchemy_type_converter import (
MapType,
@@ -46,7 +46,9 @@
logger = logging.getLogger(__name__)
+assert STRUCT, "required type modules are not available"
register_custom_type(STRUCT, RecordTypeClass)
+register_custom_type(MapType, MapTypeClass)
class CustomAthenaRestDialect(AthenaRestDialect):
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
index be03858ec3ef9..fad9b9e8018a5 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
@@ -80,7 +80,6 @@
DatasetLineageTypeClass,
DatasetPropertiesClass,
GlobalTagsClass,
- MapTypeClass,
SubTypesClass,
TagAssociationClass,
UpstreamClass,
@@ -90,7 +89,6 @@
from datahub.utilities.lossy_collections import LossyList
from datahub.utilities.registries.domain_registry import DomainRegistry
from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport
-from datahub.utilities.sqlalchemy_type_converter import MapType
if TYPE_CHECKING:
from datahub.ingestion.source.ge_data_profiler import (
@@ -140,6 +138,7 @@ class SqlWorkUnit(MetadataWorkUnit):
_field_type_mapping: Dict[Type[TypeEngine], Type] = {
+ # Note: to add dialect-specific types to this mapping, use the `register_custom_type` function.
types.Integer: NumberTypeClass,
types.Numeric: NumberTypeClass,
types.Boolean: BooleanTypeClass,
@@ -156,8 +155,6 @@ class SqlWorkUnit(MetadataWorkUnit):
types.DATETIME: TimeTypeClass,
types.TIMESTAMP: TimeTypeClass,
types.JSON: RecordTypeClass,
- # additional type definitions that are used by the Athena source
- MapType: MapTypeClass, # type: ignore
# Because the postgresql dialect is used internally by many other dialects,
# we add some postgres types here. This is ok to do because the postgresql
# dialect is built-in to sqlalchemy.
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py
index ae47623188f42..3b4a7e1dc0287 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py
@@ -7,7 +7,7 @@
BytesType,
DateType,
EnumType,
- MapType as MapTypeAvro,
+ MapType,
NullType,
NumberType,
RecordType,
@@ -15,7 +15,6 @@
TimeType,
UnionType,
)
-from datahub.utilities.sqlalchemy_type_converter import MapType
# these can be obtained by running `select format_type(oid, null),* from pg_type;`
# we've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.)
@@ -364,7 +363,7 @@ def resolve_vertica_modified_type(type_string: str) -> Any:
"time": TimeType,
"timestamp": TimeType,
"row": RecordType,
- "map": MapTypeAvro,
+ "map": MapType,
"array": ArrayType,
}
diff --git a/metadata-ingestion/src/datahub/utilities/sqlalchemy_type_converter.py b/metadata-ingestion/src/datahub/utilities/sqlalchemy_type_converter.py
index 1d5ec5dae3519..5d2fc6872c7bd 100644
--- a/metadata-ingestion/src/datahub/utilities/sqlalchemy_type_converter.py
+++ b/metadata-ingestion/src/datahub/utilities/sqlalchemy_type_converter.py
@@ -4,7 +4,6 @@
from typing import Any, Dict, List, Optional, Type, Union
from sqlalchemy import types
-from sqlalchemy_bigquery import STRUCT
from datahub.ingestion.extractor.schema_util import avro_schema_to_mce_fields
from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
@@ -12,6 +11,12 @@
logger = logging.getLogger(__name__)
+try:
+ # This is used for both BigQuery and Athena.
+ from sqlalchemy_bigquery import STRUCT
+except ImportError:
+ STRUCT = None
+
class MapType(types.TupleType):
# Wrapper class around SQLalchemy's TupleType to increase compatibility with DataHub
@@ -42,7 +47,9 @@ def get_avro_type(
) -> Dict[str, Any]:
"""Determines the concrete AVRO schema type for a SQLalchemy-typed column"""
- if type(column_type) in cls.PRIMITIVE_SQL_ALCHEMY_TYPE_TO_AVRO_TYPE.keys():
+ if isinstance(
+ column_type, tuple(cls.PRIMITIVE_SQL_ALCHEMY_TYPE_TO_AVRO_TYPE.keys())
+ ):
return {
"type": cls.PRIMITIVE_SQL_ALCHEMY_TYPE_TO_AVRO_TYPE[type(column_type)],
"native_data_type": str(column_type),
@@ -88,7 +95,7 @@ def get_avro_type(
"key_type": cls.get_avro_type(column_type=key_type, nullable=nullable),
"key_native_data_type": str(key_type),
}
- if isinstance(column_type, STRUCT):
+ if STRUCT and isinstance(column_type, STRUCT):
fields = []
for field_def in column_type._STRUCT_fields:
field_name, field_type = field_def
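
The switch from an exact `type(...)` lookup to `isinstance(...)` in `get_avro_type` is the subtle part of this patch: dialect-specific subclasses of the generic SQLAlchemy types now hit the primitive mapping too. A small self-contained illustration (the subclass and mapping here are made up):

```python
from sqlalchemy import types

PRIMITIVE_MAP = {types.Integer: "int", types.String: "string"}

class DialectInteger(types.Integer):  # hypothetical dialect-specific subclass
    pass

col_type = DialectInteger()
print(type(col_type) in PRIMITIVE_MAP)                    # False: exact-type lookup misses the subclass
print(isinstance(col_type, tuple(PRIMITIVE_MAP.keys())))  # True: isinstance matches the Integer base
```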
From 2d1584b12fe4a40a077457e618f0937132763586 Mon Sep 17 00:00:00 2001
From: Andrew Sikowitz
Date: Tue, 24 Oct 2023 23:08:24 -0400
Subject: [PATCH 14/40] feat(ingest/datahub-source): Allow ingesting aspects
from the entitiesV2 API (#9089)
---
.../ingestion/source/datahub/config.py | 19 ++++++-
.../source/datahub/datahub_api_reader.py | 49 +++++++++++++++++++
.../source/datahub/datahub_source.py | 16 ++++++
3 files changed, 83 insertions(+), 1 deletion(-)
create mode 100644 metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_api_reader.py
diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py
index 053d136305527..83958dc76754f 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py
@@ -1,3 +1,4 @@
+import os
from typing import Optional
from pydantic import Field, root_validator
@@ -67,9 +68,25 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
),
)
+ pull_from_datahub_api: bool = Field(
+ default=False,
+ description="Use the DataHub API to fetch versioned aspects.",
+ hidden_from_docs=True,
+ )
+
+ max_workers: int = Field(
+ default=5 * (os.cpu_count() or 4),
+ description="Number of worker threads to use for datahub api ingestion.",
+ hidden_from_docs=True,
+ )
+
@root_validator
def check_ingesting_data(cls, values):
- if not values.get("database_connection") and not values.get("kafka_connection"):
+ if (
+ not values.get("database_connection")
+ and not values.get("kafka_connection")
+ and not values.get("pull_from_datahub_api")
+ ):
raise ValueError(
"Your current config will not ingest any data."
" Please specify at least one of `database_connection` or `kafka_connection`, ideally both."
diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_api_reader.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_api_reader.py
new file mode 100644
index 0000000000000..7ee36736723b2
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_api_reader.py
@@ -0,0 +1,49 @@
+import logging
+from concurrent import futures
+from typing import Dict, Iterable, List
+
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.graph.client import DataHubGraph
+from datahub.ingestion.graph.filters import RemovedStatusFilter
+from datahub.ingestion.source.datahub.config import DataHubSourceConfig
+from datahub.ingestion.source.datahub.report import DataHubSourceReport
+from datahub.metadata._schema_classes import _Aspect
+
+logger = logging.getLogger(__name__)
+
+# Should work for at least mysql, mariadb, postgres
+DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
+
+
+class DataHubApiReader:
+ def __init__(
+ self,
+ config: DataHubSourceConfig,
+ report: DataHubSourceReport,
+ graph: DataHubGraph,
+ ):
+ self.config = config
+ self.report = report
+ self.graph = graph
+
+ def get_aspects(self) -> Iterable[MetadataChangeProposalWrapper]:
+ urns = self.graph.get_urns_by_filter(
+ status=RemovedStatusFilter.ALL,
+ batch_size=self.config.database_query_batch_size,
+ )
+ tasks: List[futures.Future[Iterable[MetadataChangeProposalWrapper]]] = []
+ with futures.ThreadPoolExecutor(
+ max_workers=self.config.max_workers
+ ) as executor:
+ for urn in urns:
+ tasks.append(executor.submit(self._get_aspects_for_urn, urn))
+ for task in futures.as_completed(tasks):
+ yield from task.result()
+
+ def _get_aspects_for_urn(self, urn: str) -> Iterable[MetadataChangeProposalWrapper]:
+ aspects: Dict[str, _Aspect] = self.graph.get_entity_semityped(urn) # type: ignore
+ for aspect in aspects.values():
+ yield MetadataChangeProposalWrapper(
+ entityUrn=urn,
+ aspect=aspect,
+ )
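
The reader above fans requests out across a thread pool and streams results back as they complete. A stripped-down sketch of that pattern, with a stand-in for `_get_aspects_for_urn` (names here are illustrative):

```python
from concurrent import futures
from typing import Iterable, List

def fetch_aspects(urn: str) -> List[str]:
    # Stand-in for a per-urn API call such as _get_aspects_for_urn.
    return [f"{urn}:status", f"{urn}:ownership"]

def get_all_aspects(urns: List[str], max_workers: int = 4) -> Iterable[str]:
    with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        tasks = [executor.submit(fetch_aspects, urn) for urn in urns]
        for task in futures.as_completed(tasks):
            yield from task.result()

print(list(get_all_aspects(["urn:li:dataset:a", "urn:li:dataset:b"])))
```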
diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py
index 2368febe1ff57..a2f43b8cc62cb 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py
@@ -15,6 +15,7 @@
from datahub.ingestion.api.source_helpers import auto_workunit_reporter
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.datahub.config import DataHubSourceConfig
+from datahub.ingestion.source.datahub.datahub_api_reader import DataHubApiReader
from datahub.ingestion.source.datahub.datahub_database_reader import (
DataHubDatabaseReader,
)
@@ -58,6 +59,9 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
logger.info(f"Ingesting DataHub metadata up until {self.report.stop_time}")
state = self.stateful_ingestion_handler.get_last_run_state()
+ if self.config.pull_from_datahub_api:
+ yield from self._get_api_workunits()
+
if self.config.database_connection is not None:
yield from self._get_database_workunits(
from_createdon=state.database_createdon_datetime
@@ -139,6 +143,18 @@ def _get_kafka_workunits(
)
self._commit_progress(i)
+ def _get_api_workunits(self) -> Iterable[MetadataWorkUnit]:
+ if self.ctx.graph is None:
+ self.report.report_failure(
+ "datahub_api",
+ "Specify datahub_api on your ingestion recipe to ingest from the DataHub API",
+ )
+ return
+
+ reader = DataHubApiReader(self.config, self.report, self.ctx.graph)
+ for mcp in reader.get_aspects():
+ yield mcp.as_workunit()
+
def _commit_progress(self, i: Optional[int] = None) -> None:
"""Commit progress to stateful storage, if there have been no errors.
From b612545220d9329696eaa26d6b42439cdf01fb95 Mon Sep 17 00:00:00 2001
From: siddiquebagwan-gslab
Date: Wed, 25 Oct 2023 15:26:06 +0530
Subject: [PATCH 15/40] feat(ingestion/redshift): support
auto_incremental_lineage (#9010)
---
docs/how/updating-datahub.md | 2 ++
metadata-ingestion/setup.py | 10 +++-------
.../datahub/ingestion/source/redshift/config.py | 8 +++++++-
.../datahub/ingestion/source/redshift/redshift.py | 15 +++++++++++++--
.../tests/unit/test_redshift_config.py | 6 ++++++
5 files changed, 31 insertions(+), 10 deletions(-)
create mode 100644 metadata-ingestion/tests/unit/test_redshift_config.py
diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md
index 57193ea69f2be..8813afee65be9 100644
--- a/docs/how/updating-datahub.md
+++ b/docs/how/updating-datahub.md
@@ -4,6 +4,8 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
## Next
+- #9010 - In the Redshift source, the `incremental_lineage` config option now defaults to off.
+
### Breaking Changes
- #8810 - Removed support for SQLAlchemy 1.3.x. Only SQLAlchemy 1.4.x is supported now.
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index 417588a433655..72b0e776a0da5 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -355,13 +355,9 @@
| {"psycopg2-binary", "pymysql>=1.0.2"},
"pulsar": {"requests"},
"redash": {"redash-toolbelt", "sql-metadata"} | sqllineage_lib,
- "redshift": sql_common
- | redshift_common
- | usage_common
- | sqlglot_lib
- | {"redshift-connector"},
- "redshift-legacy": sql_common | redshift_common,
- "redshift-usage-legacy": sql_common | usage_common | redshift_common,
+ "redshift": sql_common | redshift_common | usage_common | {"redshift-connector"} | sqlglot_lib,
+ "redshift-legacy": sql_common | redshift_common | sqlglot_lib,
+ "redshift-usage-legacy": sql_common | redshift_common | sqlglot_lib | usage_common,
"s3": {*s3_base, *data_lake_profiling},
"gcs": {*s3_base, *data_lake_profiling},
"sagemaker": aws_common,
diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py
index 2789b800940db..79b044841e054 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py
@@ -133,7 +133,13 @@ class RedshiftConfig(
)
extract_column_level_lineage: bool = Field(
- default=True, description="Whether to extract column level lineage."
+ default=True,
+ description="Whether to extract column level lineage. This config works with rest-sink only.",
+ )
+
+ incremental_lineage: bool = Field(
+ default=False,
+ description="When enabled, emits lineage as incremental to existing lineage already in DataHub. When disabled, re-states lineage on each run. This config works with rest-sink only.",
)
@root_validator(pre=True)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py
index a1b6333a3775d..26237a6ce12e0 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py
@@ -1,5 +1,6 @@
import logging
from collections import defaultdict
+from functools import partial
from typing import Dict, Iterable, List, Optional, Type, Union
import humanfriendly
@@ -25,6 +26,7 @@
platform_name,
support_status,
)
+from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage
from datahub.ingestion.api.source import (
CapabilityReport,
MetadataWorkUnitProcessor,
@@ -369,6 +371,11 @@ def gen_database_container(self, database: str) -> Iterable[MetadataWorkUnit]:
def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
return [
*super().get_workunit_processors(),
+ partial(
+ auto_incremental_lineage,
+ self.ctx.graph,
+ self.config.incremental_lineage,
+ ),
StaleEntityRemovalHandler.create(
self, self.config, self.ctx
).workunit_processor,
@@ -942,7 +949,9 @@ def generate_lineage(self, database: str) -> Iterable[MetadataWorkUnit]:
)
if lineage_info:
yield from gen_lineage(
- dataset_urn, lineage_info, self.config.incremental_lineage
+ dataset_urn,
+ lineage_info,
+ incremental_lineage=False, # incremental lineage generation is handled by auto_incremental_lineage
)
for schema in self.db_views[database]:
@@ -956,7 +965,9 @@ def generate_lineage(self, database: str) -> Iterable[MetadataWorkUnit]:
)
if lineage_info:
yield from gen_lineage(
- dataset_urn, lineage_info, self.config.incremental_lineage
+ dataset_urn,
+ lineage_info,
+ incremental_lineage=False, # incremental lineage generation is handled by auto_incremental_lineage
)
def add_config_to_report(self):
diff --git a/metadata-ingestion/tests/unit/test_redshift_config.py b/metadata-ingestion/tests/unit/test_redshift_config.py
new file mode 100644
index 0000000000000..8a165e7f5f3fe
--- /dev/null
+++ b/metadata-ingestion/tests/unit/test_redshift_config.py
@@ -0,0 +1,6 @@
+from datahub.ingestion.source.redshift.config import RedshiftConfig
+
+
+def test_incremental_lineage_default_to_false():
+ config = RedshiftConfig(host_port="localhost:5439", database="test")
+ assert config.incremental_lineage is False
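
The wiring above relies on `functools.partial` to turn a multi-argument helper into a single-argument workunit processor. A rough sketch of that shape with stand-in types (the helper below is a stub, not the real `auto_incremental_lineage`):

```python
from functools import partial
from typing import Callable, Iterable, Optional

def incremental_lineage_stub(
    graph: Optional[object], enabled: bool, stream: Iterable[str]
) -> Iterable[str]:
    # Stub: the real helper rewrites lineage aspects incrementally when enabled.
    for workunit in stream:
        yield f"{workunit} (incremental)" if enabled else workunit

# partial() binds the graph client and the config flag up front, leaving a
# processor that only needs the stream of workunits at call time.
processor: Callable[[Iterable[str]], Iterable[str]] = partial(
    incremental_lineage_stub, None, True
)
print(list(processor(["lineage-workunit"])))
```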
From 9cccd22c04bf357b574f4d9d7dae3aee633bf7d3 Mon Sep 17 00:00:00 2001
From: Pedro Silva
Date: Wed, 25 Oct 2023 11:01:49 +0100
Subject: [PATCH 16/40] feat(auth): Add backwards compatible field resolver
(#9096)
---
.../com/datahub/authorization/EntityFieldType.java | 13 +++++++++++++
.../authorization/DefaultEntitySpecResolver.java | 13 +++++++------
.../DataPlatformInstanceFieldResolverProvider.java | 10 +++++-----
.../DomainFieldResolverProvider.java | 5 +++--
.../EntityFieldResolverProvider.java | 6 ++++--
.../EntityTypeFieldResolverProvider.java | 7 +++++--
.../EntityUrnFieldResolverProvider.java | 7 +++++--
.../GroupMembershipFieldResolverProvider.java | 5 +++--
.../OwnerFieldResolverProvider.java | 5 +++--
...taPlatformInstanceFieldResolverProviderTest.java | 2 +-
.../GroupMembershipFieldResolverProviderTest.java | 2 +-
11 files changed, 50 insertions(+), 25 deletions(-)
diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/EntityFieldType.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/EntityFieldType.java
index 46763f29a7040..1258d958f2092 100644
--- a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/EntityFieldType.java
+++ b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/EntityFieldType.java
@@ -4,6 +4,19 @@
* List of entity field types to fetch for a given entity
*/
public enum EntityFieldType {
+
+ /**
+ * Urn of the entity
+ * @deprecated
+ */
+ @Deprecated
+ RESOURCE_URN,
+ /**
+ * Type of the entity (e.g. dataset, chart)
+ * @deprecated
+ */
+ @Deprecated
+ RESOURCE_TYPE,
/**
* Type of the entity (e.g. dataset, chart)
*/
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DefaultEntitySpecResolver.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DefaultEntitySpecResolver.java
index 4ad14ed59c9c0..65b0329a9c4f2 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DefaultEntitySpecResolver.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DefaultEntitySpecResolver.java
@@ -1,15 +1,16 @@
package com.datahub.authorization;
-import com.datahub.authorization.fieldresolverprovider.DataPlatformInstanceFieldResolverProvider;
-import com.datahub.authorization.fieldresolverprovider.EntityTypeFieldResolverProvider;
-import com.datahub.authorization.fieldresolverprovider.OwnerFieldResolverProvider;
import com.datahub.authentication.Authentication;
+import com.datahub.authorization.fieldresolverprovider.DataPlatformInstanceFieldResolverProvider;
import com.datahub.authorization.fieldresolverprovider.DomainFieldResolverProvider;
-import com.datahub.authorization.fieldresolverprovider.EntityUrnFieldResolverProvider;
import com.datahub.authorization.fieldresolverprovider.EntityFieldResolverProvider;
+import com.datahub.authorization.fieldresolverprovider.EntityTypeFieldResolverProvider;
+import com.datahub.authorization.fieldresolverprovider.EntityUrnFieldResolverProvider;
import com.datahub.authorization.fieldresolverprovider.GroupMembershipFieldResolverProvider;
+import com.datahub.authorization.fieldresolverprovider.OwnerFieldResolverProvider;
import com.google.common.collect.ImmutableList;
import com.linkedin.entity.client.EntityClient;
+import com.linkedin.util.Pair;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@@ -34,7 +35,7 @@ public ResolvedEntitySpec resolve(EntitySpec entitySpec) {
private Map<EntityFieldType, FieldResolver> getFieldResolvers(EntitySpec entitySpec) {
return _entityFieldResolverProviders.stream()
- .collect(Collectors.toMap(EntityFieldResolverProvider::getFieldType,
- hydrator -> hydrator.getFieldResolver(entitySpec)));
+ .flatMap(resolver -> resolver.getFieldTypes().stream().map(fieldType -> Pair.of(fieldType, resolver)))
+ .collect(Collectors.toMap(Pair::getKey, pair -> pair.getValue().getFieldResolver(entitySpec)));
}
}
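
The resolver-map change in `DefaultEntitySpecResolver` is the backward-compatibility core: each provider now advertises several field types, and the map is built by flattening (field type, resolver) pairs, so legacy `RESOURCE_TYPE`/`RESOURCE_URN` policies resolve through the same providers as `TYPE`/`URN`. The patch is Java; a Python sketch of the same flattening (names are illustrative):

```python
# Each provider declares the field types it can resolve.
providers = {
    "entityTypeResolver": ["TYPE", "RESOURCE_TYPE"],
    "entityUrnResolver": ["URN", "RESOURCE_URN"],
    "domainResolver": ["DOMAIN"],
}

# Flatten into a field-type -> resolver map, mirroring the flatMap/collect above.
field_resolvers = {
    field_type: provider
    for provider, field_types in providers.items()
    for field_type in field_types
}
print(field_resolvers["RESOURCE_TYPE"])  # entityTypeResolver, same resolver as TYPE
```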
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProvider.java
index 27cb8fcee8138..cbb237654e969 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProvider.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProvider.java
@@ -1,8 +1,5 @@
package com.datahub.authorization.fieldresolverprovider;
-import static com.linkedin.metadata.Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME;
-import static com.linkedin.metadata.Constants.DATA_PLATFORM_INSTANCE_ENTITY_NAME;
-
import com.datahub.authentication.Authentication;
import com.datahub.authorization.EntityFieldType;
import com.datahub.authorization.EntitySpec;
@@ -14,10 +11,13 @@
import com.linkedin.entity.EnvelopedAspect;
import com.linkedin.entity.client.EntityClient;
import java.util.Collections;
+import java.util.List;
import java.util.Objects;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
+import static com.linkedin.metadata.Constants.*;
+
/**
* Provides field resolver for domain given resourceSpec
*/
@@ -29,8 +29,8 @@ public class DataPlatformInstanceFieldResolverProvider implements EntityFieldRes
private final Authentication _systemAuthentication;
@Override
- public EntityFieldType getFieldType() {
- return EntityFieldType.DATA_PLATFORM_INSTANCE;
+ public List<EntityFieldType> getFieldTypes() {
+ return Collections.singletonList(EntityFieldType.DATA_PLATFORM_INSTANCE);
}
@Override
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DomainFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DomainFieldResolverProvider.java
index 25c2165f02b94..15d821b75c0bd 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DomainFieldResolverProvider.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DomainFieldResolverProvider.java
@@ -14,6 +14,7 @@
import java.util.Collections;
import java.util.HashSet;
+import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
@@ -37,8 +38,8 @@ public class DomainFieldResolverProvider implements EntityFieldResolverProvider
private final Authentication _systemAuthentication;
@Override
- public EntityFieldType getFieldType() {
- return EntityFieldType.DOMAIN;
+ public List<EntityFieldType> getFieldTypes() {
+ return Collections.singletonList(EntityFieldType.DOMAIN);
}
@Override
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityFieldResolverProvider.java
index a76db0ecb5102..227d403a9cd1d 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityFieldResolverProvider.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityFieldResolverProvider.java
@@ -3,6 +3,7 @@
import com.datahub.authorization.FieldResolver;
import com.datahub.authorization.EntityFieldType;
import com.datahub.authorization.EntitySpec;
+import java.util.List;
/**
@@ -11,9 +12,10 @@
public interface EntityFieldResolverProvider {
/**
- * Field that this hydrator is hydrating
+ * List of fields that this hydrator is hydrating.
+ * @return the field types this provider can resolve
*/
- EntityFieldType getFieldType();
+ List<EntityFieldType> getFieldTypes();
/**
* Return resolver for fetching the field values given the entity
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityTypeFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityTypeFieldResolverProvider.java
index 187f696904947..addac84c68b18 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityTypeFieldResolverProvider.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityTypeFieldResolverProvider.java
@@ -3,16 +3,19 @@
import com.datahub.authorization.FieldResolver;
import com.datahub.authorization.EntityFieldType;
import com.datahub.authorization.EntitySpec;
+import com.datastax.oss.driver.shaded.guava.common.collect.ImmutableList;
import java.util.Collections;
+import java.util.List;
/**
* Provides field resolver for entity type given entitySpec
*/
public class EntityTypeFieldResolverProvider implements EntityFieldResolverProvider {
+
@Override
- public EntityFieldType getFieldType() {
- return EntityFieldType.TYPE;
+ public List<EntityFieldType> getFieldTypes() {
+ return ImmutableList.of(EntityFieldType.TYPE, EntityFieldType.RESOURCE_TYPE);
}
@Override
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityUrnFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityUrnFieldResolverProvider.java
index 2f5c4a7c6c961..32960de687839 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityUrnFieldResolverProvider.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityUrnFieldResolverProvider.java
@@ -3,16 +3,19 @@
import com.datahub.authorization.FieldResolver;
import com.datahub.authorization.EntityFieldType;
import com.datahub.authorization.EntitySpec;
+import com.datastax.oss.driver.shaded.guava.common.collect.ImmutableList;
import java.util.Collections;
+import java.util.List;
/**
* Provides field resolver for entity urn given entitySpec
*/
public class EntityUrnFieldResolverProvider implements EntityFieldResolverProvider {
+
@Override
- public EntityFieldType getFieldType() {
- return EntityFieldType.URN;
+ public List<EntityFieldType> getFieldTypes() {
+ return ImmutableList.of(EntityFieldType.URN, EntityFieldType.RESOURCE_URN);
}
@Override
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/GroupMembershipFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/GroupMembershipFieldResolverProvider.java
index 8db029632d7e2..b1202d9f4bbd3 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/GroupMembershipFieldResolverProvider.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/GroupMembershipFieldResolverProvider.java
@@ -13,6 +13,7 @@
import com.linkedin.identity.NativeGroupMembership;
import com.linkedin.metadata.Constants;
import com.linkedin.identity.GroupMembership;
+import java.util.Collections;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@@ -35,8 +36,8 @@ public class GroupMembershipFieldResolverProvider implements EntityFieldResolver
private final Authentication _systemAuthentication;
@Override
- public EntityFieldType getFieldType() {
- return EntityFieldType.GROUP_MEMBERSHIP;
+ public List<EntityFieldType> getFieldTypes() {
+ return Collections.singletonList(EntityFieldType.GROUP_MEMBERSHIP);
}
@Override
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/OwnerFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/OwnerFieldResolverProvider.java
index bdd652d1d3871..3c27f9e6ce8d7 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/OwnerFieldResolverProvider.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/OwnerFieldResolverProvider.java
@@ -12,6 +12,7 @@
import com.linkedin.entity.client.EntityClient;
import com.linkedin.metadata.Constants;
import java.util.Collections;
+import java.util.List;
import java.util.stream.Collectors;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@@ -28,8 +29,8 @@ public class OwnerFieldResolverProvider implements EntityFieldResolverProvider {
private final Authentication _systemAuthentication;
@Override
- public EntityFieldType getFieldType() {
- return EntityFieldType.OWNER;
+ public List<EntityFieldType> getFieldTypes() {
+ return Collections.singletonList(EntityFieldType.OWNER);
}
@Override
diff --git a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProviderTest.java b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProviderTest.java
index b2343bbb01509..5c7d87f1c05a9 100644
--- a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProviderTest.java
+++ b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProviderTest.java
@@ -56,7 +56,7 @@ public void setup() {
@Test
public void shouldReturnDataPlatformInstanceType() {
- assertEquals(EntityFieldType.DATA_PLATFORM_INSTANCE, dataPlatformInstanceFieldResolverProvider.getFieldType());
+ assertEquals(EntityFieldType.DATA_PLATFORM_INSTANCE, dataPlatformInstanceFieldResolverProvider.getFieldTypes().get(0));
}
@Test
diff --git a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/GroupMembershipFieldResolverProviderTest.java b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/GroupMembershipFieldResolverProviderTest.java
index 54675045b4413..af547f14cd3fc 100644
--- a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/GroupMembershipFieldResolverProviderTest.java
+++ b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/GroupMembershipFieldResolverProviderTest.java
@@ -53,7 +53,7 @@ public void setup() {
@Test
public void shouldReturnGroupsMembershipType() {
- assertEquals(EntityFieldType.GROUP_MEMBERSHIP, groupMembershipFieldResolverProvider.getFieldType());
+ assertEquals(EntityFieldType.GROUP_MEMBERSHIP, groupMembershipFieldResolverProvider.getFieldTypes().get(0));
}
@Test
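The refactor in the patch above lets a single field resolver provider serve several field types: `getFieldTypes()` now returns a list, and `getFieldResolvers` flattens each provider's types into (type, provider) pairs before collecting them into a map keyed by field type. The patch itself is Java; the following is only a small Python sketch of that flatten-then-collect idea, with hypothetical provider objects standing in for the Java classes.

```python
# Hypothetical stand-ins for the EntityFieldResolverProvider instances above.
providers = [
    {"name": "urn-provider", "field_types": ["URN", "RESOURCE_URN"]},
    {"name": "type-provider", "field_types": ["TYPE", "RESOURCE_TYPE"]},
    {"name": "domain-provider", "field_types": ["DOMAIN"]},
]

# Flatten each provider's field types into (field_type, provider) pairs, then
# collect them into a map keyed by field type -- the Python analogue of the
# flatMap + Collectors.toMap chain in getFieldResolvers.
field_resolvers = {
    field_type: provider["name"]
    for provider in providers
    for field_type in provider["field_types"]
}

print(field_resolvers["RESOURCE_URN"])  # urn-provider
```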
From dd5d997390d489c777aac25dbbd3f47c4bab8340 Mon Sep 17 00:00:00 2001
From: Andrew Sikowitz
Date: Wed, 25 Oct 2023 10:54:55 -0400
Subject: [PATCH 17/40] build(gradle): Support IntelliJ 2023.2.3 (#9034)
---
metadata-models/build.gradle | 1 +
1 file changed, 1 insertion(+)
diff --git a/metadata-models/build.gradle b/metadata-models/build.gradle
index 53e7765152aef..bd8052283e168 100644
--- a/metadata-models/build.gradle
+++ b/metadata-models/build.gradle
@@ -23,6 +23,7 @@ dependencies {
}
}
api project(':li-utils')
+ api project(path: ':li-utils', configuration: "dataTemplate")
dataModel project(':li-utils')
compileOnly externalDependency.lombok
From 8a80e858a7b6bf67105e082475ada57a27c37c67 Mon Sep 17 00:00:00 2001
From: Andrew Sikowitz
Date: Wed, 25 Oct 2023 13:06:12 -0400
Subject: [PATCH 18/40] build(ingest): Bump avro pin: security vulnerability
(#9042)
---
metadata-ingestion/scripts/avro_codegen.py | 3 +-
metadata-ingestion/scripts/modeldocgen.py | 4 +-
metadata-ingestion/setup.py | 8 +-
.../ingestion/extractor/schema_util.py | 109 ++++++++++++------
.../src/datahub/ingestion/source/kafka.py | 19 ++-
.../src/datahub/utilities/mapping.py | 4 +-
.../unit/data_lake/test_schema_inference.py | 6 +-
7 files changed, 99 insertions(+), 54 deletions(-)
diff --git a/metadata-ingestion/scripts/avro_codegen.py b/metadata-ingestion/scripts/avro_codegen.py
index a9b9b4b20f5ac..021ebd4a31eb3 100644
--- a/metadata-ingestion/scripts/avro_codegen.py
+++ b/metadata-ingestion/scripts/avro_codegen.py
@@ -152,7 +152,8 @@ def add_name(self, name_attr, space_attr, new_schema):
return encoded
-autogen_header = """# flake8: noqa
+autogen_header = """# mypy: ignore-errors
+# flake8: noqa
# This file is autogenerated by /metadata-ingestion/scripts/avro_codegen.py
# Do not modify manually!
diff --git a/metadata-ingestion/scripts/modeldocgen.py b/metadata-ingestion/scripts/modeldocgen.py
index ffa80515dbafd..81b26145e620c 100644
--- a/metadata-ingestion/scripts/modeldocgen.py
+++ b/metadata-ingestion/scripts/modeldocgen.py
@@ -351,8 +351,8 @@ def strip_types(field_path: str) -> str:
field_objects = []
for f in entity_fields:
field = avro.schema.Field(
- type=f["type"],
- name=f["name"],
+ f["type"],
+ f["name"],
has_default=False,
)
field_objects.append(field)
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index 72b0e776a0da5..0b8661b0df5f5 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -32,7 +32,7 @@
"expandvars>=0.6.5",
"avro-gen3==0.7.11",
# "avro-gen3 @ git+https://github.com/acryldata/avro_gen@master#egg=avro-gen3",
- "avro>=1.10.2,<1.11",
+ "avro>=1.11.3,<1.12",
"python-dateutil>=2.8.0",
"tabulate",
"progressbar2",
@@ -355,7 +355,11 @@
| {"psycopg2-binary", "pymysql>=1.0.2"},
"pulsar": {"requests"},
"redash": {"redash-toolbelt", "sql-metadata"} | sqllineage_lib,
- "redshift": sql_common | redshift_common | usage_common | {"redshift-connector"} | sqlglot_lib,
+ "redshift": sql_common
+ | redshift_common
+ | usage_common
+ | {"redshift-connector"}
+ | sqlglot_lib,
"redshift-legacy": sql_common | redshift_common | sqlglot_lib,
"redshift-usage-legacy": sql_common | redshift_common | sqlglot_lib | usage_common,
"s3": {*s3_base, *data_lake_profiling},
diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py
index 4acf99a50e50e..df0b732833fbe 100644
--- a/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py
+++ b/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py
@@ -1,6 +1,18 @@
import json
import logging
-from typing import Any, Callable, Dict, Generator, List, Optional, Type, Union
+from typing import (
+ Any,
+ Callable,
+ Dict,
+ Iterable,
+ List,
+ Mapping,
+ Optional,
+ Type,
+ Union,
+ cast,
+ overload,
+)
import avro.schema
@@ -54,6 +66,8 @@
avro.schema.PrimitiveSchema,
]
+SchemaOrField = Union[avro.schema.Schema, avro.schema.Field]
+
FieldStack = List[avro.schema.Field]
# The latest avro code contains this type definition in a compatibility module,
@@ -124,16 +138,22 @@ def __init__(
self._meta_mapping_processor = meta_mapping_processor
self._schema_tags_field = schema_tags_field
self._tag_prefix = tag_prefix
+
# Map of avro schema type to the conversion handler
- self._avro_type_to_mce_converter_map: Dict[
- avro.schema.Schema,
- Callable[[ExtendedAvroNestedSchemas], Generator[SchemaField, None, None]],
+ # TODO: Clean up this type... perhaps refactor
+ self._avro_type_to_mce_converter_map: Mapping[
+ Union[
+ Type[avro.schema.Schema],
+ Type[avro.schema.Field],
+ Type[avro.schema.LogicalSchema],
+ ],
+ Callable[[SchemaOrField], Iterable[SchemaField]],
] = {
avro.schema.RecordSchema: self._gen_from_non_field_nested_schemas,
avro.schema.UnionSchema: self._gen_from_non_field_nested_schemas,
avro.schema.ArraySchema: self._gen_from_non_field_nested_schemas,
avro.schema.MapSchema: self._gen_from_non_field_nested_schemas,
- avro.schema.Field: self._gen_nested_schema_from_field,
+ avro.schema.Field: self._gen_nested_schema_from_field, # type: ignore
avro.schema.PrimitiveSchema: self._gen_non_nested_to_mce_fields,
avro.schema.FixedSchema: self._gen_non_nested_to_mce_fields,
avro.schema.EnumSchema: self._gen_non_nested_to_mce_fields,
@@ -142,20 +162,22 @@ def __init__(
@staticmethod
def _get_type_name(
- avro_schema: avro.schema.Schema, logical_if_present: bool = False
+ avro_schema: SchemaOrField, logical_if_present: bool = False
) -> str:
logical_type_name: Optional[str] = None
if logical_if_present:
- logical_type_name = getattr(
- avro_schema, "logical_type", None
- ) or avro_schema.props.get("logicalType")
+ logical_type_name = cast(
+ Optional[str],
+ getattr(avro_schema, "logical_type", None)
+ or avro_schema.props.get("logicalType"),
+ )
return logical_type_name or str(
getattr(avro_schema.type, "type", avro_schema.type)
)
@staticmethod
def _get_column_type(
- avro_schema: avro.schema.Schema, logical_type: Optional[str]
+ avro_schema: SchemaOrField, logical_type: Optional[str]
) -> SchemaFieldDataType:
type_name: str = AvroToMceSchemaConverter._get_type_name(avro_schema)
TypeClass: Optional[Type] = AvroToMceSchemaConverter.field_type_mapping.get(
@@ -186,7 +208,7 @@ def _get_column_type(
)
return dt
- def _is_nullable(self, schema: avro.schema.Schema) -> bool:
+ def _is_nullable(self, schema: SchemaOrField) -> bool:
if isinstance(schema, avro.schema.Field):
return self._is_nullable(schema.type)
if isinstance(schema, avro.schema.UnionSchema):
@@ -208,7 +230,7 @@ def _strip_namespace(name_or_fullname: str) -> str:
return name_or_fullname.rsplit(".", maxsplit=1)[-1]
@staticmethod
- def _get_simple_native_type(schema: ExtendedAvroNestedSchemas) -> str:
+ def _get_simple_native_type(schema: SchemaOrField) -> str:
if isinstance(schema, (avro.schema.RecordSchema, avro.schema.Field)):
# For Records, fields, always return the name.
return AvroToMceSchemaConverter._strip_namespace(schema.name)
@@ -226,7 +248,7 @@ def _get_simple_native_type(schema: ExtendedAvroNestedSchemas) -> str:
return schema.type
@staticmethod
- def _get_type_annotation(schema: ExtendedAvroNestedSchemas) -> str:
+ def _get_type_annotation(schema: SchemaOrField) -> str:
simple_native_type = AvroToMceSchemaConverter._get_simple_native_type(schema)
if simple_native_type.startswith("__struct_"):
simple_native_type = "struct"
@@ -237,10 +259,24 @@ def _get_type_annotation(schema: ExtendedAvroNestedSchemas) -> str:
else:
return f"[type={simple_native_type}]"
+ @staticmethod
+ @overload
+ def _get_underlying_type_if_option_as_union(
+ schema: SchemaOrField, default: SchemaOrField
+ ) -> SchemaOrField:
+ ...
+
+ @staticmethod
+ @overload
+ def _get_underlying_type_if_option_as_union(
+ schema: SchemaOrField, default: Optional[SchemaOrField] = None
+ ) -> Optional[SchemaOrField]:
+ ...
+
@staticmethod
def _get_underlying_type_if_option_as_union(
- schema: AvroNestedSchemas, default: Optional[AvroNestedSchemas] = None
- ) -> AvroNestedSchemas:
+ schema: SchemaOrField, default: Optional[SchemaOrField] = None
+ ) -> Optional[SchemaOrField]:
if isinstance(schema, avro.schema.UnionSchema) and len(schema.schemas) == 2:
(first, second) = schema.schemas
if first.type == AVRO_TYPE_NULL:
@@ -258,8 +294,8 @@ class SchemaFieldEmissionContextManager:
def __init__(
self,
- schema: avro.schema.Schema,
- actual_schema: avro.schema.Schema,
+ schema: SchemaOrField,
+ actual_schema: SchemaOrField,
converter: "AvroToMceSchemaConverter",
description: Optional[str] = None,
default_value: Optional[str] = None,
@@ -275,7 +311,7 @@ def __enter__(self):
self._converter._prefix_name_stack.append(type_annotation)
return self
- def emit(self) -> Generator[SchemaField, None, None]:
+ def emit(self) -> Iterable[SchemaField]:
if (
not isinstance(
self._actual_schema,
@@ -307,7 +343,7 @@ def emit(self) -> Generator[SchemaField, None, None]:
description = self._description
if not description and actual_schema.props.get("doc"):
- description = actual_schema.props.get("doc")
+ description = cast(Optional[str], actual_schema.props.get("doc"))
if self._default_value is not None:
description = f"{description if description else ''}\nField default value: {self._default_value}"
@@ -320,12 +356,12 @@ def emit(self) -> Generator[SchemaField, None, None]:
native_data_type = native_data_type[
slice(len(type_prefix), len(native_data_type) - 1)
]
- native_data_type = actual_schema.props.get(
- "native_data_type", native_data_type
+ native_data_type = cast(
+ str, actual_schema.props.get("native_data_type", native_data_type)
)
field_path = self._converter._get_cur_field_path()
- merged_props = {}
+ merged_props: Dict[str, Any] = {}
merged_props.update(self._schema.other_props)
merged_props.update(schema.other_props)
@@ -363,12 +399,13 @@ def emit(self) -> Generator[SchemaField, None, None]:
meta_terms_aspect = meta_aspects.get(Constants.ADD_TERM_OPERATION)
- logical_type_name: Optional[str] = (
+ logical_type_name: Optional[str] = cast(
+ Optional[str],
# logicalType nested inside type
getattr(actual_schema, "logical_type", None)
or actual_schema.props.get("logicalType")
# bare logicalType
- or self._actual_schema.props.get("logicalType")
+ or self._actual_schema.props.get("logicalType"),
)
field = SchemaField(
@@ -392,14 +429,12 @@ def emit(self) -> Generator[SchemaField, None, None]:
def __exit__(self, exc_type, exc_val, exc_tb):
self._converter._prefix_name_stack.pop()
- def _get_sub_schemas(
- self, schema: ExtendedAvroNestedSchemas
- ) -> Generator[avro.schema.Schema, None, None]:
+ def _get_sub_schemas(self, schema: SchemaOrField) -> Iterable[SchemaOrField]:
"""Responsible for generation for appropriate sub-schemas for every nested AVRO type."""
def gen_items_from_list_tuple_or_scalar(
val: Any,
- ) -> Generator[avro.schema.Schema, None, None]:
+ ) -> Iterable[avro.schema.Schema]:
if isinstance(val, (list, tuple)):
for i in val:
yield i
@@ -433,7 +468,7 @@ def gen_items_from_list_tuple_or_scalar(
def _gen_nested_schema_from_field(
self,
field: avro.schema.Field,
- ) -> Generator[SchemaField, None, None]:
+ ) -> Iterable[SchemaField]:
"""Handles generation of MCE SchemaFields for an AVRO Field type."""
# NOTE: Here we only manage the field stack and trigger MCE Field generation from this field's type.
# The actual emitting of a field happens when
@@ -447,7 +482,7 @@ def _gen_nested_schema_from_field(
def _gen_from_last_field(
self, schema_to_recurse: Optional[AvroNestedSchemas] = None
- ) -> Generator[SchemaField, None, None]:
+ ) -> Iterable[SchemaField]:
"""Emits the field most-recent field, optionally triggering sub-schema generation under the field."""
last_field_schema = self._fields_stack[-1]
# Generate the custom-description for the field.
@@ -467,8 +502,8 @@ def _gen_from_last_field(
yield from self._to_mce_fields(sub_schema)
def _gen_from_non_field_nested_schemas(
- self, schema: AvroNestedSchemas
- ) -> Generator[SchemaField, None, None]:
+ self, schema: SchemaOrField
+ ) -> Iterable[SchemaField]:
"""Handles generation of MCE SchemaFields for all standard AVRO nested types."""
# Handle recursive record definitions
recurse: bool = True
@@ -511,8 +546,8 @@ def _gen_from_non_field_nested_schemas(
yield from self._to_mce_fields(sub_schema)
def _gen_non_nested_to_mce_fields(
- self, schema: AvroNonNestedSchemas
- ) -> Generator[SchemaField, None, None]:
+ self, schema: SchemaOrField
+ ) -> Iterable[SchemaField]:
"""Handles generation of MCE SchemaFields for non-nested AVRO types."""
with AvroToMceSchemaConverter.SchemaFieldEmissionContextManager(
schema,
@@ -521,9 +556,7 @@ def _gen_non_nested_to_mce_fields(
) as non_nested_emitter:
yield from non_nested_emitter.emit()
- def _to_mce_fields(
- self, avro_schema: avro.schema.Schema
- ) -> Generator[SchemaField, None, None]:
+ def _to_mce_fields(self, avro_schema: SchemaOrField) -> Iterable[SchemaField]:
# Invoke the relevant conversion handler for the schema element type.
schema_type = (
type(avro_schema)
@@ -541,7 +574,7 @@ def to_mce_fields(
meta_mapping_processor: Optional[OperationProcessor] = None,
schema_tags_field: Optional[str] = None,
tag_prefix: Optional[str] = None,
- ) -> Generator[SchemaField, None, None]:
+ ) -> Iterable[SchemaField]:
"""
Converts a key or value type AVRO schema string to appropriate MCE SchemaFields.
:param avro_schema_string: String representation of the AVRO schema.
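The `@overload` declarations added for `_get_underlying_type_if_option_as_union` above tell the type checker that the return value is only `Optional` when no default is supplied. Below is a self-contained sketch of the same pattern, assuming a hypothetical `unwrap` helper instead of the avro types.

```python
from typing import Optional, overload


@overload
def unwrap(value: Optional[str], default: str) -> str:
    ...


@overload
def unwrap(value: Optional[str], default: Optional[str] = None) -> Optional[str]:
    ...


def unwrap(value: Optional[str], default: Optional[str] = None) -> Optional[str]:
    # Single runtime implementation; the overloads above only narrow the
    # return type for callers that pass a non-None default.
    return value if value is not None else default


narrowed: str = unwrap(None, "fallback")   # type checker sees str
maybe: Optional[str] = unwrap(None)        # type checker sees Optional[str]
print(narrowed, maybe)
```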
diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka.py b/metadata-ingestion/src/datahub/ingestion/source/kafka.py
index d5039360da567..23770ff3cf812 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/kafka.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/kafka.py
@@ -3,7 +3,7 @@
import logging
from dataclasses import dataclass, field
from enum import Enum
-from typing import Any, Dict, Iterable, List, Optional, Type
+from typing import Any, Dict, Iterable, List, Optional, Type, cast
import avro.schema
import confluent_kafka
@@ -316,13 +316,20 @@ def _extract_record(
avro_schema = avro.schema.parse(
schema_metadata.platformSchema.documentSchema
)
- description = avro_schema.doc
+ description = getattr(avro_schema, "doc", None)
# set the tags
all_tags: List[str] = []
- for tag in avro_schema.other_props.get(
- self.source_config.schema_tags_field, []
- ):
- all_tags.append(self.source_config.tag_prefix + tag)
+ try:
+ schema_tags = cast(
+ Iterable[str],
+ avro_schema.other_props.get(
+ self.source_config.schema_tags_field, []
+ ),
+ )
+ for tag in schema_tags:
+ all_tags.append(self.source_config.tag_prefix + tag)
+ except TypeError:
+ pass
if self.source_config.enable_meta_mapping:
meta_aspects = self.meta_processor.process(avro_schema.other_props)
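The kafka source change above guards against `other_props` values that are not iterable, since the looser typing in newer avro releases means the property can hold any JSON value. A standalone sketch of the same guard follows; `extract_schema_tags` is a hypothetical helper, not part of the source.

```python
from typing import Any, Iterable, List, Mapping, cast


def extract_schema_tags(
    other_props: Mapping[str, Any], tags_field: str, tag_prefix: str
) -> List[str]:
    """Hypothetical helper mirroring the guarded tag extraction above."""
    all_tags: List[str] = []
    try:
        schema_tags = cast(Iterable[str], other_props.get(tags_field, []))
        for tag in schema_tags:
            all_tags.append(tag_prefix + tag)
    except TypeError:
        # The schema property may hold a non-iterable JSON value (e.g. a number);
        # in that case we skip tag extraction rather than failing the source.
        pass
    return all_tags


print(extract_schema_tags({"tags": ["pii", "gold"]}, "tags", "urn:li:tag:"))
print(extract_schema_tags({"tags": 42}, "tags", "urn:li:tag:"))  # -> []
```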
diff --git a/metadata-ingestion/src/datahub/utilities/mapping.py b/metadata-ingestion/src/datahub/utilities/mapping.py
index eb2d975ee607f..f91c01d901ac1 100644
--- a/metadata-ingestion/src/datahub/utilities/mapping.py
+++ b/metadata-ingestion/src/datahub/utilities/mapping.py
@@ -4,7 +4,7 @@
import re
import time
from functools import reduce
-from typing import Any, Dict, List, Match, Optional, Union, cast
+from typing import Any, Dict, List, Mapping, Match, Optional, Union, cast
from datahub.emitter import mce_builder
from datahub.emitter.mce_builder import OwnerType
@@ -111,7 +111,7 @@ def __init__(
self.owner_source_type = owner_source_type
self.match_nested_props = match_nested_props
- def process(self, raw_props: Dict[str, Any]) -> Dict[str, Any]:
+ def process(self, raw_props: Mapping[str, Any]) -> Dict[str, Any]:
# Defining the following local variables -
# operations_map - the final resulting map when operations are processed.
# Against each operation the values to be applied are stored.
diff --git a/metadata-ingestion/tests/unit/data_lake/test_schema_inference.py b/metadata-ingestion/tests/unit/data_lake/test_schema_inference.py
index cbd5be9e7d832..4a69deb572fbd 100644
--- a/metadata-ingestion/tests/unit/data_lake/test_schema_inference.py
+++ b/metadata-ingestion/tests/unit/data_lake/test_schema_inference.py
@@ -1,14 +1,14 @@
import tempfile
from typing import List, Type
-import avro.schema
import pandas as pd
import ujson
from avro import schema as avro_schema
from avro.datafile import DataFileWriter
from avro.io import DatumWriter
-from datahub.ingestion.source.schema_inference import avro, csv_tsv, json, parquet
+from datahub.ingestion.source.schema_inference import csv_tsv, json, parquet
+from datahub.ingestion.source.schema_inference.avro import AvroInferrer
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
BooleanTypeClass,
NumberTypeClass,
@@ -123,7 +123,7 @@ def test_infer_schema_avro():
file.seek(0)
- fields = avro.AvroInferrer().infer_schema(file)
+ fields = AvroInferrer().infer_schema(file)
fields.sort(key=lambda x: x.fieldPath)
assert_field_paths_match(fields, expected_field_paths_avro)
From b9508e6dd50c5d0eaf8eddb21c5bdf55bec1646a Mon Sep 17 00:00:00 2001
From: siddiquebagwan-gslab
Date: Wed, 25 Oct 2023 23:48:15 +0530
Subject: [PATCH 19/40] fix(ingestion/redshift): fix schema field data type
mappings
---
.../src/datahub/ingestion/source/redshift/redshift.py | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py
index 26237a6ce12e0..c7d01021773b1 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py
@@ -218,6 +218,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
] = {
"BYTES": BytesType,
"BOOL": BooleanType,
+ "BOOLEAN": BooleanType,
+ "DOUBLE": NumberType,
+ "DOUBLE PRECISION": NumberType,
"DECIMAL": NumberType,
"NUMERIC": NumberType,
"BIGNUMERIC": NumberType,
@@ -244,6 +247,13 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
"CHARACTER": StringType,
"CHAR": StringType,
"TIMESTAMP WITHOUT TIME ZONE": TimeType,
+ "REAL": NumberType,
+ "VARCHAR": StringType,
+ "TIMESTAMPTZ": TimeType,
+ "GEOMETRY": NullType,
+ "HLLSKETCH": NullType,
+ "TIMETZ": TimeType,
+ "VARBYTE": StringType,
}
def get_platform_instance_id(self) -> str:
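The mapping extended above is a plain dictionary from Redshift's native type names to DataHub type classes. The sketch below is a simplified illustration only: strings stand in for the actual type classes, and the fallback to `"NullType"` for unmapped names is an assumption made for the example, not taken from the source.

```python
# Simplified sketch of the type lookup; names and fallback are illustrative.
REDSHIFT_FIELD_TYPE_MAPPING = {
    "BOOLEAN": "BooleanType",
    "DOUBLE PRECISION": "NumberType",
    "REAL": "NumberType",
    "VARCHAR": "StringType",
    "TIMESTAMPTZ": "TimeType",
    "TIMETZ": "TimeType",
    "GEOMETRY": "NullType",
    "HLLSKETCH": "NullType",
    "VARBYTE": "StringType",
}


def resolve_column_type(native_type: str) -> str:
    return REDSHIFT_FIELD_TYPE_MAPPING.get(native_type.upper(), "NullType")


print(resolve_column_type("varchar"))  # StringType
print(resolve_column_type("super"))    # NullType (unmapped)
```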
From 1c77bca0c68878ca5cb86f741ca77ce0aa497272 Mon Sep 17 00:00:00 2001
From: Younghoon YUN
Date: Thu, 26 Oct 2023 05:01:47 +0900
Subject: [PATCH 20/40] fix(datahub-protobuf): add check if nested field is
reserved (#9058)
---
.../src/main/java/datahub/protobuf/model/ProtobufField.java | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/metadata-integration/java/datahub-protobuf/src/main/java/datahub/protobuf/model/ProtobufField.java b/metadata-integration/java/datahub-protobuf/src/main/java/datahub/protobuf/model/ProtobufField.java
index 42884241d9f7c..d890c373f1299 100644
--- a/metadata-integration/java/datahub-protobuf/src/main/java/datahub/protobuf/model/ProtobufField.java
+++ b/metadata-integration/java/datahub-protobuf/src/main/java/datahub/protobuf/model/ProtobufField.java
@@ -259,7 +259,9 @@ private FieldDescriptorProto getNestedTypeFields(List pathList, Descrip
messageType = messageType.getNestedType(value);
}
- if (pathList.get(pathSize - 2) == DescriptorProto.FIELD_FIELD_NUMBER) {
+ if (pathList.get(pathSize - 2) == DescriptorProto.FIELD_FIELD_NUMBER
+ && pathList.get(pathSize - 1) != DescriptorProto.RESERVED_RANGE_FIELD_NUMBER
+ && pathList.get(pathSize - 1) != DescriptorProto.RESERVED_NAME_FIELD_NUMBER) {
return messageType.getField(pathList.get(pathSize - 1));
} else {
return null;
From 32f5dcb1544e5a47efbb48d39b215d3bdc33535b Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Wed, 25 Oct 2023 13:16:49 -0700
Subject: [PATCH 21/40] fix(ingest): better handling around sink errors (#9003)
---
.../src/datahub/ingestion/run/pipeline.py | 10 +++++-
.../datahub/ingestion/sink/datahub_kafka.py | 33 ++++++++-----------
2 files changed, 22 insertions(+), 21 deletions(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py
index 07b55e0e25a89..f2735c24ca19d 100644
--- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py
+++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py
@@ -390,7 +390,15 @@ def run(self) -> None:
record_envelopes = self.extractor.get_records(wu)
for record_envelope in self.transform(record_envelopes):
if not self.dry_run:
- self.sink.write_record_async(record_envelope, callback)
+ try:
+ self.sink.write_record_async(
+ record_envelope, callback
+ )
+ except Exception as e:
+ # In case the sink's error handling is bad, we still want to report the error.
+ self.sink.report.report_failure(
+ f"Failed to write record: {e}"
+ )
except RuntimeError:
raise
diff --git a/metadata-ingestion/src/datahub/ingestion/sink/datahub_kafka.py b/metadata-ingestion/src/datahub/ingestion/sink/datahub_kafka.py
index 39054c256a7fd..38ddadaafc862 100644
--- a/metadata-ingestion/src/datahub/ingestion/sink/datahub_kafka.py
+++ b/metadata-ingestion/src/datahub/ingestion/sink/datahub_kafka.py
@@ -9,7 +9,6 @@
MetadataChangeEvent,
MetadataChangeProposal,
)
-from datahub.metadata.schema_classes import MetadataChangeProposalClass
class KafkaSinkConfig(KafkaEmitterConfig):
@@ -58,27 +57,21 @@ def write_record_async(
],
write_callback: WriteCallback,
) -> None:
- record = record_envelope.record
- if isinstance(record, MetadataChangeEvent):
- self.emitter.emit_mce_async(
+ callback = _KafkaCallback(
+ self.report, record_envelope, write_callback
+ ).kafka_callback
+ try:
+ record = record_envelope.record
+ self.emitter.emit(
record,
- callback=_KafkaCallback(
- self.report, record_envelope, write_callback
- ).kafka_callback,
- )
- elif isinstance(
- record, (MetadataChangeProposalWrapper, MetadataChangeProposalClass)
- ):
- self.emitter.emit_mcp_async(
- record,
- callback=_KafkaCallback(
- self.report, record_envelope, write_callback
- ).kafka_callback,
- )
- else:
- raise ValueError(
- f"The datahub-kafka sink only supports MetadataChangeEvent/MetadataChangeProposal[Wrapper] classes, not {type(record)}"
+ callback=callback,
)
+ except Exception as err:
+ # In case we throw an exception while trying to emit the record,
+ # catch it and report the failure. This might happen if the schema
+ # registry is down or otherwise misconfigured, in which case we'd
+ # fail when serializing the record.
+ callback(err, f"Failed to write record: {err}")
def close(self) -> None:
self.emitter.flush()
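Both hunks in this patch follow the same principle: a failure while handing a record to the sink should be reported through the normal callback/report path instead of aborting the run. The following is a minimal sketch of that pattern with hypothetical names; it is not the pipeline code itself.

```python
from typing import Callable, Iterable, List


def flaky_sink(record: str) -> None:
    # Stand-in for a sink whose emit can raise, e.g. when the schema registry
    # is down or the record cannot be serialized.
    if record == "boom":
        raise ValueError("schema registry unreachable")


def write_all(
    records: Iterable[str],
    sink: Callable[[str], None],
    report_failure: Callable[[str], None],
) -> None:
    for record in records:
        try:
            sink(record)
        except Exception as e:
            # Report the failure and keep processing the remaining records.
            report_failure(f"Failed to write record: {e}")


failures: List[str] = []
write_all(["a", "boom", "b"], flaky_sink, failures.append)
print(failures)  # ['Failed to write record: schema registry unreachable']
```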
From 6c932e8afeb4ac71c9b6b31e9fde3876c9e947cf Mon Sep 17 00:00:00 2001
From: Andrew Sikowitz
Date: Wed, 25 Oct 2023 16:17:09 -0400
Subject: [PATCH 22/40] feat(ingest/bigquery): Attempt to support raw dataset
pattern (#9109)
---
docs/how/updating-datahub.md | 8 +--
.../source/bigquery_v2/bigquery_config.py | 18 ++++++-
.../tests/unit/test_bigquery_source.py | 53 +++++++++++++++++++
3 files changed, 74 insertions(+), 5 deletions(-)
diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md
index 8813afee65be9..4d1535f28fa0a 100644
--- a/docs/how/updating-datahub.md
+++ b/docs/how/updating-datahub.md
@@ -53,10 +53,10 @@ into
for example, using `datahub put` command. Policies can be also removed and re-created via UI.
- #9077 - The BigQuery ingestion source by default sets `match_fully_qualified_names: true`.
This means that any `dataset_pattern` or `schema_pattern` specified will be matched on the fully
-qualified dataset name, i.e. `.`. If this is not the case, please
-update your pattern (e.g. prepend your old dataset pattern with `.*\.` which matches the project part),
-or set `match_fully_qualified_names: false` in your recipe. However, note that
-setting this to `false` is deprecated and this flag will be removed entirely in a future release.
+qualified dataset name, i.e. `.`. We attempt to support the old
+pattern format by prepending `.*\\.` to dataset patterns lacking a period, so in most cases this
+should not cause any issues. However, if you have a complex dataset pattern, we recommend you
+manually convert it to the fully qualified format to avoid any potential issues.
### Potential Downtime
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
index a6a740385cf5c..6203192769750 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -299,7 +299,7 @@ def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
"use project_id_pattern whenever possible. project_id will be deprecated, please use project_id_pattern only if possible."
)
- dataset_pattern = values.get("dataset_pattern")
+ dataset_pattern: Optional[AllowDenyPattern] = values.get("dataset_pattern")
schema_pattern = values.get("schema_pattern")
if (
dataset_pattern == AllowDenyPattern.allow_all()
@@ -329,6 +329,22 @@ def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
"Please update `dataset_pattern` to match against fully qualified schema name `.` and set config `match_fully_qualified_names : True`."
"The config option `match_fully_qualified_names` is deprecated and will be removed in a future release."
)
+ elif match_fully_qualified_names and dataset_pattern is not None:
+ adjusted = False
+ for lst in [dataset_pattern.allow, dataset_pattern.deny]:
+ for i, pattern in enumerate(lst):
+ if "." not in pattern:
+ if pattern.startswith("^"):
+ lst[i] = r"^.*\." + pattern[1:]
+ else:
+ lst[i] = r".*\." + pattern
+ adjusted = True
+ if adjusted:
+ logger.warning(
+ "`dataset_pattern` was adjusted to match against fully qualified schema names,"
+ " of the form `.`."
+ )
+
return values
def get_table_pattern(self, pattern: List[str]) -> str:
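The adjustment logic above rewrites any allow/deny pattern that contains no `.`, on the assumption that it was written against the bare dataset name, and preserves a leading `^` anchor. A standalone sketch of just that rewrite rule, using a hypothetical helper name:

```python
from typing import List


def adjust_dataset_patterns(patterns: List[str]) -> List[str]:
    # Hypothetical helper mirroring the rewrite above: patterns without a '.'
    # get a project-matching prefix, keeping any leading '^' anchor.
    adjusted: List[str] = []
    for pattern in patterns:
        if "." not in pattern:
            if pattern.startswith("^"):
                adjusted.append(r"^.*\." + pattern[1:])
            else:
                adjusted.append(r".*\." + pattern)
        else:
            adjusted.append(pattern)
    return adjusted


print(adjust_dataset_patterns(["test-dataset", "^test-dataset-2$", r"project\.second_dataset"]))
# ['.*\\.test-dataset', '^.*\\.test-dataset-2$', 'project\\.second_dataset']
```

The unit test added later in this patch exercises the same behavior through `BigQueryV2Config.parse_obj`.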
diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py
index 5a11a933c8595..4cfa5c48d2377 100644
--- a/metadata-ingestion/tests/unit/test_bigquery_source.py
+++ b/metadata-ingestion/tests/unit/test_bigquery_source.py
@@ -53,6 +53,59 @@ def test_bigquery_uri_on_behalf():
assert config.get_sql_alchemy_url() == "bigquery://test-project-on-behalf"
+def test_bigquery_dataset_pattern():
+ config = BigQueryV2Config.parse_obj(
+ {
+ "dataset_pattern": {
+ "allow": [
+ "test-dataset",
+ "test-project.test-dataset",
+ ".*test-dataset",
+ ],
+ "deny": [
+ "^test-dataset-2$",
+ "project\\.second_dataset",
+ ],
+ },
+ }
+ )
+ assert config.dataset_pattern.allow == [
+ r".*\.test-dataset",
+ r"test-project.test-dataset",
+ r".*test-dataset",
+ ]
+ assert config.dataset_pattern.deny == [
+ r"^.*\.test-dataset-2$",
+ r"project\.second_dataset",
+ ]
+
+ config = BigQueryV2Config.parse_obj(
+ {
+ "dataset_pattern": {
+ "allow": [
+ "test-dataset",
+ "test-project.test-dataset",
+ ".*test-dataset",
+ ],
+ "deny": [
+ "^test-dataset-2$",
+ "project\\.second_dataset",
+ ],
+ },
+ "match_fully_qualified_names": False,
+ }
+ )
+ assert config.dataset_pattern.allow == [
+ r"test-dataset",
+ r"test-project.test-dataset",
+ r".*test-dataset",
+ ]
+ assert config.dataset_pattern.deny == [
+ r"^test-dataset-2$",
+ r"project\.second_dataset",
+ ]
+
+
def test_bigquery_uri_with_credential():
expected_credential_json = {
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
From 2ebf33eb13d14c17bc6cb0eaee3a97dba33ea338 Mon Sep 17 00:00:00 2001
From: Zachary McNellis
Date: Wed, 25 Oct 2023 16:25:41 -0400
Subject: [PATCH 23/40] docs(observability): Column Assertion user guide
(#9106)
Co-authored-by: John Joyce
---
docs-website/sidebars.js | 1 +
.../observe/column-assertions.md | 358 ++++++++++++++++++
2 files changed, 359 insertions(+)
create mode 100644 docs/managed-datahub/observe/column-assertions.md
diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js
index b2b3df4dfb33c..31d69aec46d8b 100644
--- a/docs-website/sidebars.js
+++ b/docs-website/sidebars.js
@@ -446,6 +446,7 @@ module.exports = {
"docs/managed-datahub/observe/freshness-assertions",
"docs/managed-datahub/observe/volume-assertions",
"docs/managed-datahub/observe/custom-sql-assertions",
+ "docs/managed-datahub/observe/column-assertions",
],
},
{
diff --git a/docs/managed-datahub/observe/column-assertions.md b/docs/managed-datahub/observe/column-assertions.md
new file mode 100644
index 0000000000000..99a764f771676
--- /dev/null
+++ b/docs/managed-datahub/observe/column-assertions.md
@@ -0,0 +1,358 @@
+---
+description: This page provides an overview of working with DataHub Column Assertions
+---
+import FeatureAvailability from '@site/src/components/FeatureAvailability';
+
+
+# Column Assertions
+
+
+
+> ⚠️ The **Column Assertions** feature is currently in private beta, part of the **Acryl Observe** module, and may only
+> be available to a limited set of design partners.
+>
+> If you are interested in trying it and providing feedback, please reach out to your Acryl Customer Success
+> representative.
+
+## Introduction
+
+Can you remember a time when an important warehouse table column changed dramatically, with little or no notice? Perhaps the number of null values suddenly spiked, or a new value was added to a fixed set of possible values. If the answer is yes, how did you initially find out? We'll take a guess - someone looking at an internal reporting dashboard or, worse, a user of your product, sounded an alarm when a number looked a bit out of the ordinary.
+
+There are many reasons why important columns in your Snowflake, Redshift, or BigQuery tables may change - application code bugs, new feature rollouts, etc. Oftentimes, these changes break important assumptions made about the data used in building key downstream data products like reporting dashboards or data-driven product features.
+
+What if you could reduce the time to detect these incidents, so that the people responsible for the data were made aware of data issues before anyone else? With Acryl DataHub Column Assertions, you can.
+
+With Acryl DataHub, you can define **Column Value** assertions to ensure each value in a column matches specific constraints, and **Column Metric** assertions to ensure that computed metrics from columns align with your expectations. As soon as things go wrong, your team will be the first to know, before the data issue becomes a larger data incident.
+
+In this guide, we'll cover the basics of Column Assertions - what they are, how to configure them, and more - so that you and your team can start building trust in your most important data assets.
+
+Let's dive in!
+
+## Support
+
+Column Assertions are currently supported for:
+
+1. Snowflake
+2. Redshift
+3. BigQuery
+
+Note that an Ingestion Source _must_ be configured with the data platform of your choice in
+Acryl DataHub's **Ingestion** tab.
+
+> Note that Column Assertions are not yet supported if you are connecting to your warehouse
+> using the DataHub CLI or a Remote Ingestion Executor.
+
+## What is a Column Assertion?
+
+A **Column Assertion** is a highly configurable Data Quality rule used to monitor specific columns of a Data Warehouse table for unexpected changes.
+
+Column Assertions are defined to validate a specific column, and can be used to
+
+1. Validate that the values of the column match some constraints (regex, allowed values, max, min, etc) across rows OR
+2. Validate that specific column aggregation metrics match some expectations across rows.
+
+Column Assertions can be particularly useful for documenting and enforcing column-level "contracts", i.e. formal specifications about the expected contents of a particular column that can be used for coordinating among producers and consumers of the data.
+
+### Anatomy of a Column Assertion
+
+Column Assertions can be divided into two main types: **Column Value** and **Column Metric** Assertions.
+
+A **Column Value Assertion** is used to monitor the value of a specific column in a table, and ensure that every row
+adheres to a specific condition. In comparison, a **Column Metric Assertion** is used to compute a metric for that column,
+and ensure that the value of that metric adheres to a specific condition.
+
+At the most basic level, both types consist of a few important parts:
+
+1. An **Evaluation Schedule**
+2. A **Column Selection**
+3. An **Evaluation Criteria**
+4. A **Row Evaluation Type**
+
+In this section, we'll give an overview of each.
+
+#### 1. Evaluation Schedule
+
+The **Evaluation Schedule**: This defines how often to evaluate the Column Assertion against the given warehouse table.
+This should usually be configured to match the expected change frequency of the table, although it can also be less
+frequently depending on your requirements. You can also specify specific days of the week, hours in the day, or even
+minutes in an hour.
+
+#### 2. Column Selection
+
+The **Column Selection**: This defines the column that should be monitored by the Column Assertion. You can choose from
+any of the columns from the table listed in the dropdown. Note that columns of struct / object type are not currently supported.
+
+#### 3. Evaluation Criteria
+
+The **Evaluation Criteria**: This defines the condition that must be satisfied in order for the Column
+Assertion to pass.
+
+For **Column Value Assertions**, you will be able to choose from a set of operators that can be applied to the column
+value. The options presented will vary based on the data type of the selected column. For example, if you've selected a numeric column, you
+can verify that the column value is greater than a particular value. For string types, you can check that the column value
+matches a particular regex pattern. Additionally, you are able to control the behavior of the check in the presence of NULL values. If the
+**Allow Nulls** option is _disabled_, then any null values encountered will be reported as a failure when evaluating the
+assertion. If **Allow Nulls** is enabled, then nulls will be ignored; the condition will be evaluated for rows where the column value is non-null.
+
+For **Column Metric Assertions**, you will be able to choose from a list of common column metrics - MAX, MIN, MEAN, NULL COUNT, etc - and then compare these metric values to an expected value. The list of metrics will vary based on the type of the selected column. For example
+if you've selected a numeric column, you can choose to compute the MEAN value of the column, and then assert that it is greater than a
+specific number. For string types, you can choose to compute the MAX LENGTH of the string across all column values, and then assert that it
+is less than a specific number.
+
+#### 4. Row Selection Set
+
+The **Row Selection Set**: This defines which rows in the table the Column Assertion will be evaluated across. You can choose
+from the following options:
+
+- **All Table Rows**: Evaluate the Column Assertion across all rows in the table. This is the default option. Note that
+this may not be desirable for large tables.
+
+- **Only Rows That Have Changed**: Evaluate the Column Assertion only against rows that have changed since the last
+evaluation of the assertion. If you choose this option, you will need to specify a **High Watermark Column** to help determine which rows
+have changed. A **High Watermark Column** is a column that contains a constantly incrementing value - a date, a time, or
+another always-increasing number - that can be used to find the "new rows" that were added since the previous evaluation. When selected, a query will be issued to the table to find only the rows that have changed since the previous assertion evaluation.
+
+## Creating a Column Assertion
+
+### Prerequisites
+
+1. **Permissions**: To create or delete Column Assertions for a specific entity on DataHub, you'll need to be granted the
+ `Edit Assertions` and `Edit Monitors` privileges for the entity. This is granted to Entity owners by default.
+
+2. **Data Platform Connection**: In order to create a Column Assertion, you'll need to have an **Ingestion Source**
+ configured to your Data Platform: Snowflake, BigQuery, or Redshift under the **Ingestion** tab.
+
+Once these are in place, you're ready to create your Column Assertions!
+
+### Steps
+
+1. Navigate to the Table that you want to monitor
+2. Click the **Validations** tab
+
+
+
+
+
+3. Click **+ Create Assertion**
+
+
+
+
+
+4. Choose **Column**
+
+5. Configure the evaluation **schedule**. This is the frequency at which the assertion will be evaluated to produce a
+ pass or fail result, and the times when the column values will be checked.
+
+6. Configure the **column assertion type**. You can choose from **Column Value** or **Column Metric**.
+ **Column Value** assertions are used to monitor the value of a specific column in a table, and ensure that every row
+ adheres to a specific condition. **Column Metric** assertions are used to compute a metric for that column, and then compare the value of that metric to your expectations.
+
+
+
+
+
+7. Configure the **column selection**. This defines the column that should be monitored by the Column Assertion.
+ You can choose from any of the columns from the table listed in the dropdown.
+
+
+
+
+
+8. Configure the **evaluation criteria**. This step varies based on the type of assertion you chose in the previous step.
+
+ - **Column Value Assertions**: You will be able to choose from a set of operators that can be applied to the column
+ value. The options presented will vary based on the data type of the selected column. For example with numeric types, you
+ can check that the column value is greater than a specific value. For string types, you can check that the column value
+ matches a particular regex pattern. You will also be able to control the behavior of null values in the column. If the
+ **Allow Nulls** option is _disabled_, any null values encountered will be reported as a failure when evaluating the
+ assertion.
+
+ - **Column Metric Assertions**: You will be able to choose from a list of common metrics and then specify the operator
+ and value to compare against. The list of metrics will vary based on the data type of the selected column. For example
+ with numeric types, you can choose to compute the average value of the column, and then assert that it is greater than a
+ specific number. For string types, you can choose to compute the max length of all column values, and then assert that it
+ is less than a specific number.
+
+9. Configure the **row evaluation type**. This defines which rows in the table the Column Assertion should evaluate. You can choose
+ from the following options:
+
+ - **All Table Rows**: Evaluate the Column Assertion against all rows in the table. This is the default option. Note that
+ this may not be desirable for large tables.
+
+ - **Only Rows That Have Changed**: Evaluate the Column Assertion only against rows that have changed since the last
+ evaluation. If you choose this option, you will need to specify a **High Watermark Column** to help determine which rows
+ have changed. A **High Watermark Column** is a column that contains a constantly-incrementing value - a date, a time, or
+ another always-increasing number. When selected, a query will be issued to the table to find only the rows which have changed since the last assertion run.
+
+
+
+
+
+10. (Optional) Click **Advanced** to further customize the Column Assertion. The options listed here will vary based on the
+ type of assertion you chose in the previous step.
+
+ - **Invalid Values Threshold**: For **Column Value** assertions, you can configure the number of invalid values
+ (i.e. rows) that are allowed to fail before the assertion is marked as failing. This is useful if you want to allow a limited number
+ of invalid values in the column. By default this is 0, meaning the assertion will fail if any rows have an invalid column value.
+
+ - **Source**: For **Column Metric** assertions, you can choose the mechanism that will be used to obtain the column
+ metric. **Query** will issue a query to the dataset to compute the metric. **DataHub Dataset Profile** will use the
+ DataHub Dataset Profile metadata to compute the metric. Note that this option requires that dataset profiling
+ statistics are up-to-date as of the assertion run time.
+
+ - **Additional Filters**: You can choose to add additional filters to the query that will be used to evaluate the
+ assertion. This is useful if you want to limit the assertion to a subset of rows in the table. Note this option will not
+ be available if you choose **DataHub Dataset Profile** as the **source**.
+
+11. Click **Next**
+12. Configure actions that should be taken when the Column Assertion passes or fails
+
+
+
+
+
+- **Raise incident**: Automatically raise a new DataHub `Column` Incident for the Table whenever the Column Assertion is failing. This
+ may indicate that the Table is unfit for consumption. Configure Slack Notifications under **Settings** to be notified when
+ an incident is created due to an Assertion failure.
+- **Resolve incident**: Automatically resolve any incidents that were raised due to failures in this Column Assertion. Note that
+ any other incidents will not be impacted.
+
+13. Click **Save**.
+
+And that's it! DataHub will now begin to monitor your Column Assertion for the table.
+
+To view the time of the next Column Assertion evaluation, simply click **Column** and then click on your
+new Assertion:
+
+
+
+
+
+Once your assertion has run, you will begin to see a Success or Failure status for the Table.
+
+
+
+
+
+## Stopping a Column Assertion
+
+In order to temporarily stop the evaluation of a Column Assertion:
+
+1. Navigate to the **Validations** tab of the table with the assertion
+2. Click **Column** to open the Column Assertions list
+3. Click the three-dot menu on the right side of the assertion you want to disable
+4. Click **Stop**
+
+
+
+## Creating Column Assertions via API
+
+Under the hood, Acryl DataHub implements Column Assertion Monitoring using two "entity" concepts:
+
+- **Assertion**: The specific expectation for the column metric. e.g. "The value of an integer column is greater than 10 for all rows in the table." This is the "what".
+
+- **Monitor**: The process responsible for evaluating the Assertion on a given evaluation schedule and using specific
+ mechanisms. This is the "how".
+
+Note that to create or delete Assertions and Monitors for a specific entity on DataHub, you'll need the
+`Edit Assertions` and `Edit Monitors` privileges for it.
+
+#### GraphQL
+
+In order to create a Column Assertion that is being monitored on a specific **Evaluation Schedule**, you'll need to use 2
+GraphQL mutation queries to create a Column Assertion entity and create an Assertion Monitor entity responsible for evaluating it.
+
+Start by creating the Column Assertion entity using the `createFieldAssertion` query and hang on to the 'urn' field of the Assertion entity
+you get back. Then continue by creating a Monitor entity using the `createAssertionMonitor`.
+
+##### Examples
+
+To create a Column Assertion Entity that checks that the value of an integer column is greater than 10:
+
+```graphql
+mutation createFieldAssertion {
+ createFieldAssertion(
+ input: {
+ entityUrn: "",
+ type: FIELD_VALUES,
+ fieldValuesAssertion: {
+ field: {
+ path: "",
+ type: "NUMBER",
+ nativeType: "NUMBER(38,0)"
+ },
+ operator: GREATER_THAN,
+ parameters: {
+ value: {
+ type: NUMBER,
+ value: "10"
+ }
+ },
+ failThreshold: {
+ type: COUNT,
+ value: 0
+ },
+ excludeNulls: true
+ }
+ }
+ ) {
+ urn
+}
+}
+```
+
+To create an Assertion Monitor Entity that evaluates the column assertion every 8 hours using all rows in the table:
+
+```graphql
+mutation createAssertionMonitor {
+ createAssertionMonitor(
+ input: {
+ entityUrn: "",
+ assertionUrn: "",
+ schedule: {
+ cron: "0 */8 * * *",
+ timezone: "America/Los_Angeles"
+ },
+ parameters: {
+ type: DATASET_FIELD,
+ datasetFieldParameters: {
+ sourceType: ALL_ROWS_QUERY
+ }
+ }
+ }
+ ) {
+ urn
+ }
+}
+```
+
+This entity defines _when_ to run the check (using CRON format - every 8th hour) and _how_ to run the check (using a query against all rows of the table).
+
+After creating the monitor, the new assertion will start to be evaluated every 8 hours in your selected timezone.
+
+You can delete assertions along with their monitors using GraphQL mutations: `deleteAssertion` and `deleteMonitor`.
+
+### Tips
+
+:::info
+**Authorization**
+
+Remember to always provide a DataHub Personal Access Token when calling the GraphQL API. To do so, just add the 'Authorization' header as follows:
+
+```
+Authorization: Bearer
+```
+
+**Exploring GraphQL API**
+
+Also, remember that you can play with an interactive version of the Acryl GraphQL API at `https://your-account-id.acryl.io/api/graphiql`
+:::
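
Since the guide above describes the GraphQL mutations and the `Authorization: Bearer` header, here is a minimal Python sketch of issuing one of those mutations over HTTP. The endpoint path and the placeholder URNs/token are assumptions for illustration; substitute your own account URL, Personal Access Token, and entity URNs.

```python
import requests

GRAPHQL_URL = "https://your-account-id.acryl.io/api/graphql"  # assumed endpoint path
TOKEN = "<personal-access-token>"  # DataHub Personal Access Token (placeholder)

MUTATION = """
mutation createAssertionMonitor {
  createAssertionMonitor(
    input: {
      entityUrn: "<urn of the monitored table>",
      assertionUrn: "<urn returned by createFieldAssertion>",
      schedule: { cron: "0 */8 * * *", timezone: "America/Los_Angeles" },
      parameters: {
        type: DATASET_FIELD,
        datasetFieldParameters: { sourceType: ALL_ROWS_QUERY }
      }
    }
  ) {
    urn
  }
}
"""

response = requests.post(
    GRAPHQL_URL,
    json={"query": MUTATION},
    headers={"Authorization": f"Bearer {TOKEN}"},
)
response.raise_for_status()
print(response.json())
```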
From f402090c1ebec9601e5fef6e45879d3a0a015dbd Mon Sep 17 00:00:00 2001
From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com>
Date: Thu, 26 Oct 2023 21:44:32 +0530
Subject: [PATCH 24/40] feat(ingest): support view lineage for all sqlalchemy
sources (#9039)
---
metadata-ingestion/setup.py | 52 +-
.../src/datahub/configuration/common.py | 2 +-
.../datahub/emitter/sql_parsing_builder.py | 5 +-
.../api/incremental_lineage_helper.py | 13 +-
.../src/datahub/ingestion/api/source.py | 1 +
.../ingestion/source/dbt/dbt_common.py | 5 +
.../source/snowflake/snowflake_lineage_v2.py | 14 +-
.../src/datahub/ingestion/source/sql/hive.py | 83 ++-
.../datahub/ingestion/source/sql/postgres.py | 20 +-
.../ingestion/source/sql/sql_common.py | 126 +++-
.../ingestion/source/sql/sql_config.py | 19 +-
.../datahub/ingestion/source/sql/teradata.py | 54 +-
.../source/sql/two_tier_sql_source.py | 6 +-
.../datahub/ingestion/source/sql/vertica.py | 2 +-
.../source/state/stateful_ingestion_base.py | 3 +-
.../ingestion/source_config/sql/snowflake.py | 12 +-
.../src/datahub/utilities/sqlglot_lineage.py | 21 +-
.../hive/hive_mces_all_db_golden.json | 581 +++++++++++++++---
.../integration/hive/hive_mces_golden.json | 530 ++++++++++++++--
.../tests/integration/hive/hive_setup.sql | 22 +-
.../mysql/mysql_mces_no_db_golden.json | 272 ++++++--
.../postgres_all_db_mces_with_db_golden.json | 324 ++++++++--
..._db_to_file_with_db_estimate_row_count.yml | 2 +-
.../postgres_mces_with_db_golden.json | 264 +++++++-
...res_to_file_with_db_estimate_row_count.yml | 2 +-
.../snowflake/test_snowflake_failures.py | 3 +-
.../trino/trino_hive_mces_golden.json | 211 +++++--
.../test_incremental_lineage_helper.py | 21 +
28 files changed, 2193 insertions(+), 477 deletions(-)
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index 0b8661b0df5f5..7f7826abe2095 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -101,22 +101,36 @@
"grpcio-tools>=1.44.0,<2",
}
-sql_common = {
- # Required for all SQL sources.
- # This is temporary lower bound that we're open to loosening/tightening as requirements show up
- "sqlalchemy>=1.4.39, <2",
- # Required for SQL profiling.
- "great-expectations>=0.15.12, <=0.15.50",
- # scipy version restricted to reduce backtracking, used by great-expectations,
- "scipy>=1.7.2",
- # GE added handling for higher version of jinja2
- # https://github.com/great-expectations/great_expectations/pull/5382/files
- # datahub does not depend on traitlets directly but great expectations does.
- # https://github.com/ipython/traitlets/issues/741
- "traitlets<5.2.2",
- "greenlet",
+usage_common = {
+ "sqlparse",
+}
+
+sqlglot_lib = {
+ # Using an Acryl fork of sqlglot.
+ # https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:hsheth?expand=1
+ "acryl-sqlglot==18.5.2.dev45",
}
+sql_common = (
+ {
+ # Required for all SQL sources.
+ # This is temporary lower bound that we're open to loosening/tightening as requirements show up
+ "sqlalchemy>=1.4.39, <2",
+ # Required for SQL profiling.
+ "great-expectations>=0.15.12, <=0.15.50",
+ # scipy version restricted to reduce backtracking, used by great-expectations,
+ "scipy>=1.7.2",
+ # GE added handling for higher version of jinja2
+ # https://github.com/great-expectations/great_expectations/pull/5382/files
+ # datahub does not depend on traitlets directly but great expectations does.
+ # https://github.com/ipython/traitlets/issues/741
+ "traitlets<5.2.2",
+ "greenlet",
+ }
+ | usage_common
+ | sqlglot_lib
+)
+
sqllineage_lib = {
"sqllineage==1.3.8",
# We don't have a direct dependency on sqlparse but it is a dependency of sqllineage.
@@ -125,12 +139,6 @@
"sqlparse==0.4.4",
}
-sqlglot_lib = {
- # Using an Acryl fork of sqlglot.
- # https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:hsheth?expand=1
- "acryl-sqlglot==18.5.2.dev45",
-}
-
aws_common = {
# AWS Python SDK
"boto3",
@@ -243,10 +251,6 @@
powerbi_report_server = {"requests", "requests_ntlm"}
-usage_common = {
- "sqlparse",
-}
-
databricks = {
# 0.1.11 appears to have authentication issues with azure databricks
"databricks-sdk>=0.9.0",
diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py
index c909b89eb0c2d..73ac4baac48c0 100644
--- a/metadata-ingestion/src/datahub/configuration/common.py
+++ b/metadata-ingestion/src/datahub/configuration/common.py
@@ -283,7 +283,7 @@ class VersionedConfig(ConfigModel):
class LineageConfig(ConfigModel):
incremental_lineage: bool = Field(
- default=True,
+ default=False,
description="When enabled, emits lineage as incremental to existing lineage already in DataHub. When disabled, re-states lineage on each run.",
)
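A quick check of the flipped default, assuming `LineageConfig` remains importable from `datahub.configuration.common` and directly instantiable; note that dbt re-overrides this default back to `True` later in this patch.

```python
# With this change, incremental lineage is opt-in for SQL sources.
from datahub.configuration.common import LineageConfig

assert LineageConfig().incremental_lineage is False  # previously defaulted to True
```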
diff --git a/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py b/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py
index dedcfa0385f75..cedaa4fbbd7f6 100644
--- a/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py
+++ b/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py
@@ -106,6 +106,7 @@ def process_sql_parsing_result(
user: Optional[UserUrn] = None,
custom_operation_type: Optional[str] = None,
include_urns: Optional[Set[DatasetUrn]] = None,
+ include_column_lineage: bool = True,
) -> Iterable[MetadataWorkUnit]:
"""Process a single query and yield any generated workunits.
@@ -130,7 +131,9 @@ def process_sql_parsing_result(
_merge_lineage_data(
downstream_urn=downstream_urn,
upstream_urns=result.in_tables,
- column_lineage=result.column_lineage,
+ column_lineage=result.column_lineage
+ if include_column_lineage
+ else None,
upstream_edges=self._lineage_map[downstream_urn],
query_timestamp=query_timestamp,
is_view_ddl=is_view_ddl,
diff --git a/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py b/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py
index 9478c5cf7efa2..945b201ca5758 100644
--- a/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py
+++ b/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py
@@ -130,10 +130,13 @@ def auto_incremental_lineage(
if len(wu.metadata.proposedSnapshot.aspects) > 0:
yield wu
- yield _lineage_wu_via_read_modify_write(
- graph, urn, lineage_aspect, wu.metadata.systemMetadata
- ) if lineage_aspect.fineGrainedLineages else _convert_upstream_lineage_to_patch(
- urn, lineage_aspect, wu.metadata.systemMetadata
- )
+ if lineage_aspect.fineGrainedLineages:
+ yield _lineage_wu_via_read_modify_write(
+ graph, urn, lineage_aspect, wu.metadata.systemMetadata
+ )
+ elif lineage_aspect.upstreams:
+ yield _convert_upstream_lineage_to_patch(
+ urn, lineage_aspect, wu.metadata.systemMetadata
+ )
else:
yield wu
diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py
index b86844b1c4c83..8940642f7008a 100644
--- a/metadata-ingestion/src/datahub/ingestion/api/source.py
+++ b/metadata-ingestion/src/datahub/ingestion/api/source.py
@@ -215,6 +215,7 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
)
):
auto_lowercase_dataset_urns = auto_lowercase_urns
+
return [
auto_lowercase_dataset_urns,
auto_status_aspect,
diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py
index 48d2118a9b091..c4de24bf192f1 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py
@@ -280,6 +280,11 @@ class DBTCommonConfig(
default=False,
description="When enabled, dbt test warnings will be treated as failures.",
)
+ # override the default value to True.
+ incremental_lineage: bool = Field(
+ default=True,
+ description="When enabled, emits lineage as incremental to existing lineage already in DataHub. When disabled, re-states lineage on each run.",
+ )
@validator("target_platform")
def validate_target_platform_value(cls, target_platform: str) -> str:
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
index 0a15c352fc842..9649054dbe6cb 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
@@ -136,7 +136,6 @@ def get_workunits(
return
self._populate_external_lineage_map(discovered_tables)
-
if self.config.include_view_lineage:
if len(discovered_views) > 0:
yield from self.get_view_upstream_workunits(
@@ -200,14 +199,15 @@ def _gen_workunit_from_sql_parsing_result(
self,
dataset_identifier: str,
result: SqlParsingResult,
- ) -> MetadataWorkUnit:
+ ) -> Iterable[MetadataWorkUnit]:
upstreams, fine_upstreams = self.get_upstreams_from_sql_parsing_result(
self.dataset_urn_builder(dataset_identifier), result
)
- self.report.num_views_with_upstreams += 1
- return self._create_upstream_lineage_workunit(
- dataset_identifier, upstreams, fine_upstreams
- )
+ if upstreams:
+ self.report.num_views_with_upstreams += 1
+ yield self._create_upstream_lineage_workunit(
+ dataset_identifier, upstreams, fine_upstreams
+ )
def _gen_workunits_from_query_result(
self,
@@ -251,7 +251,7 @@ def get_view_upstream_workunits(
)
if result:
views_processed.add(view_identifier)
- yield self._gen_workunit_from_sql_parsing_result(
+ yield from self._gen_workunit_from_sql_parsing_result(
view_identifier, result
)
self.report.view_lineage_parse_secs = timer.elapsed_seconds()
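A standalone illustration (names are made up) of why the call site above switches from `yield` to `yield from` once `_gen_workunit_from_sql_parsing_result` becomes a generator.

```python
# If the helper is a generator, plain `yield helper()` would emit the generator
# object itself instead of its workunits.
def helper(emit: bool):
    if emit:
        yield "workunit"

def caller():
    yield from helper(emit=True)  # correct: re-emits the inner workunits
    # yield helper(emit=True)     # wrong: would emit a generator object

print(list(caller()))  # ['workunit']
```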
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py
index 63b21bc82eddd..d081acb6c1eff 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py
@@ -1,15 +1,18 @@
import json
import logging
import re
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Union
from pydantic.class_validators import validator
from pydantic.fields import Field
# This import verifies that the dependencies are available.
from pyhive import hive # noqa: F401
-from pyhive.sqlalchemy_hive import HiveDate, HiveDecimal, HiveTimestamp
+from pyhive.sqlalchemy_hive import HiveDate, HiveDecimal, HiveDialect, HiveTimestamp
+from sqlalchemy.engine.reflection import Inspector
+from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.decorators import (
SourceCapability,
SupportStatus,
@@ -18,8 +21,10 @@
platform_name,
support_status,
)
+from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.extractor import schema_util
-from datahub.ingestion.source.sql.sql_common import register_custom_type
+from datahub.ingestion.source.sql.sql_common import SqlWorkUnit, register_custom_type
+from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
from datahub.ingestion.source.sql.two_tier_sql_source import (
TwoTierSQLAlchemyConfig,
TwoTierSQLAlchemySource,
@@ -31,6 +36,7 @@
SchemaField,
TimeTypeClass,
)
+from datahub.metadata.schema_classes import ViewPropertiesClass
from datahub.utilities import config_clean
from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column
@@ -90,19 +96,34 @@ def dbapi_get_columns_patched(self, connection, table_name, schema=None, **kw):
logger.warning(f"Failed to patch method due to {e}")
+@reflection.cache # type: ignore
+def get_view_names_patched(self, connection, schema=None, **kw):
+ query = "SHOW VIEWS"
+ if schema:
+ query += " IN " + self.identifier_preparer.quote_identifier(schema)
+ return [row[0] for row in connection.execute(query)]
+
+
+@reflection.cache # type: ignore
+def get_view_definition_patched(self, connection, view_name, schema=None, **kw):
+ full_table = self.identifier_preparer.quote_identifier(view_name)
+ if schema:
+ full_table = "{}.{}".format(
+ self.identifier_preparer.quote_identifier(schema),
+ self.identifier_preparer.quote_identifier(view_name),
+ )
+ row = connection.execute("SHOW CREATE TABLE {}".format(full_table)).fetchone()
+ return row[0]
+
+
+HiveDialect.get_view_names = get_view_names_patched
+HiveDialect.get_view_definition = get_view_definition_patched
+
+
class HiveConfig(TwoTierSQLAlchemyConfig):
# defaults
scheme = Field(default="hive", hidden_from_docs=True)
- # Hive SQLAlchemy connector returns views as tables.
- # See https://github.com/dropbox/PyHive/blob/b21c507a24ed2f2b0cf15b0b6abb1c43f31d3ee0/pyhive/sqlalchemy_hive.py#L270-L273.
- # Disabling views helps us prevent this duplication.
- include_views = Field(
- default=False,
- hidden_from_docs=True,
- description="Hive SQLAlchemy connector returns views as tables. See https://github.com/dropbox/PyHive/blob/b21c507a24ed2f2b0cf15b0b6abb1c43f31d3ee0/pyhive/sqlalchemy_hive.py#L270-L273. Disabling views helps us prevent this duplication.",
- )
-
@validator("host_port")
def clean_host_port(cls, v):
return config_clean.remove_protocol(v)
@@ -174,3 +195,41 @@ def get_schema_fields_for_column(
return new_fields
return fields
+
+ # Hive SQLAlchemy connector returns views as tables in get_table_names.
+ # See https://github.com/dropbox/PyHive/blob/b21c507a24ed2f2b0cf15b0b6abb1c43f31d3ee0/pyhive/sqlalchemy_hive.py#L270-L273.
+ # This override ensures that we still ingest view definitions for views.
+ def _process_view(
+ self,
+ dataset_name: str,
+ inspector: Inspector,
+ schema: str,
+ view: str,
+ sql_config: SQLCommonConfig,
+ ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
+ dataset_urn = make_dataset_urn_with_platform_instance(
+ self.platform,
+ dataset_name,
+ self.config.platform_instance,
+ self.config.env,
+ )
+
+ try:
+ view_definition = inspector.get_view_definition(view, schema)
+ if view_definition is None:
+ view_definition = ""
+ else:
+ # Some dialects return a TextClause instead of a raw string,
+ # so we need to convert them to a string.
+ view_definition = str(view_definition)
+ except NotImplementedError:
+ view_definition = ""
+
+ if view_definition:
+ view_properties_aspect = ViewPropertiesClass(
+ materialized=False, viewLanguage="SQL", viewLogic=view_definition
+ )
+ yield MetadataChangeProposalWrapper(
+ entityUrn=dataset_urn,
+ aspect=view_properties_aspect,
+ ).as_workunit()
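A hedged sketch of what the monkey-patched `HiveDialect` methods above enable; it assumes a reachable HiveServer2 at `localhost:10000` and the pyhive SQLAlchemy dialect installed, and simply imports the source module so the patches are applied.

```python
# SQLAlchemy's Inspector dispatches to the patched dialect methods:
# SHOW VIEWS for names and SHOW CREATE TABLE for definitions.
from sqlalchemy import create_engine, inspect

import datahub.ingestion.source.sql.hive  # noqa: F401  # applies the HiveDialect patches

engine = create_engine("hive://localhost:10000/default")
inspector = inspect(engine)

for view in inspector.get_view_names(schema="default"):
    ddl = inspector.get_view_definition(view, schema="default")
    print(view, str(ddl)[:80])
```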
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py b/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py
index a6a9d8e2c8597..4f133c6459a0f 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py
@@ -103,10 +103,6 @@ class BasePostgresConfig(BasicSQLAlchemyConfig):
class PostgresConfig(BasePostgresConfig):
- include_view_lineage = Field(
- default=False, description="Include table lineage for views"
- )
-
database_pattern: AllowDenyPattern = Field(
default=AllowDenyPattern.allow_all(),
description=(
@@ -183,9 +179,10 @@ def get_inspectors(self) -> Iterable[Inspector]:
def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
yield from super().get_workunits_internal()
- for inspector in self.get_inspectors():
- if self.config.include_view_lineage:
- yield from self._get_view_lineage_workunits(inspector)
+ if self.views_failed_parsing:
+ for inspector in self.get_inspectors():
+ if self.config.include_view_lineage:
+ yield from self._get_view_lineage_workunits(inspector)
def _get_view_lineage_elements(
self, inspector: Inspector
@@ -245,11 +242,14 @@ def _get_view_lineage_workunits(
dependent_view, dependent_schema = key
# Construct a lineage object.
+ view_identifier = self.get_identifier(
+ schema=dependent_schema, entity=dependent_view, inspector=inspector
+ )
+ if view_identifier not in self.views_failed_parsing:
+ return
urn = mce_builder.make_dataset_urn_with_platform_instance(
platform=self.platform,
- name=self.get_identifier(
- schema=dependent_schema, entity=dependent_view, inspector=inspector
- ),
+ name=view_identifier,
platform_instance=self.config.platform_instance,
env=self.config.env,
)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
index fad9b9e8018a5..51909eaf4ed55 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
@@ -2,12 +2,14 @@
import logging
import traceback
from dataclasses import dataclass, field
+from functools import partial
from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterable,
List,
+ MutableMapping,
Optional,
Set,
Tuple,
@@ -29,7 +31,9 @@
make_tag_urn,
)
from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage
from datahub.ingestion.api.source import MetadataWorkUnitProcessor
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.common.subtypes import (
@@ -86,9 +90,16 @@
ViewPropertiesClass,
)
from datahub.telemetry import telemetry
+from datahub.utilities.file_backed_collections import FileBackedDict
from datahub.utilities.lossy_collections import LossyList
from datahub.utilities.registries.domain_registry import DomainRegistry
from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport
+from datahub.utilities.sqlglot_lineage import (
+ SchemaResolver,
+ SqlParsingResult,
+ sqlglot_lineage,
+ view_definition_lineage_helper,
+)
if TYPE_CHECKING:
from datahub.ingestion.source.ge_data_profiler import (
@@ -110,6 +121,11 @@ class SQLSourceReport(StaleEntityRemovalSourceReport):
query_combiner: Optional[SQLAlchemyQueryCombinerReport] = None
+ num_view_definitions_parsed: int = 0
+ num_view_definitions_failed_parsing: int = 0
+ num_view_definitions_failed_column_parsing: int = 0
+ view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList)
+
def report_entity_scanned(self, name: str, ent_type: str = "table") -> None:
"""
Entity could be a view or a table
@@ -319,6 +335,18 @@ def __init__(self, config: SQLCommonConfig, ctx: PipelineContext, platform: str)
cached_domains=[k for k in self.config.domain], graph=self.ctx.graph
)
+ self.views_failed_parsing: Set[str] = set()
+ self.schema_resolver: SchemaResolver = SchemaResolver(
+ platform=self.platform,
+ platform_instance=self.config.platform_instance,
+ env=self.config.env,
+ )
+ self._view_definition_cache: MutableMapping[str, str]
+ if self.config.use_file_backed_cache:
+ self._view_definition_cache = FileBackedDict[str]()
+ else:
+ self._view_definition_cache = {}
+
def warn(self, log: logging.Logger, key: str, reason: str) -> None:
self.report.report_warning(key, reason[:100])
log.warning(f"{key} => {reason}")
@@ -455,6 +483,11 @@ def get_schema_level_workunits(
def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
return [
*super().get_workunit_processors(),
+ partial(
+ auto_incremental_lineage,
+ self.ctx.graph,
+ self.config.incremental_lineage,
+ ),
StaleEntityRemovalHandler.create(
self, self.config, self.ctx
).workunit_processor,
@@ -512,6 +545,35 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit
profile_requests, profiler, platform=self.platform
)
+ if self.config.include_view_lineage:
+ yield from self.get_view_lineage()
+
+ def get_view_lineage(self) -> Iterable[MetadataWorkUnit]:
+ builder = SqlParsingBuilder(
+ generate_lineage=True,
+ generate_usage_statistics=False,
+ generate_operations=False,
+ )
+ for dataset_name in self._view_definition_cache.keys():
+ view_definition = self._view_definition_cache[dataset_name]
+ result = self._run_sql_parser(
+ dataset_name,
+ view_definition,
+ self.schema_resolver,
+ )
+ if result and result.out_tables:
+ # With usage statistics and operations disabled above, this call yields no
+ # workunits; `yield from` is still needed to drive the generator so that
+ # it records the lineage results on the builder.
+ yield from builder.process_sql_parsing_result(
+ result=result,
+ query=view_definition,
+ is_view_ddl=True,
+ include_column_lineage=self.config.include_view_column_lineage,
+ )
+ else:
+ self.views_failed_parsing.add(dataset_name)
+ yield from builder.gen_workunits()
+
def get_identifier(
self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any
) -> str:
@@ -658,6 +720,8 @@ def _process_table(
schema_fields,
)
dataset_snapshot.aspects.append(schema_metadata)
+ if self.config.include_view_lineage:
+ self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
db_name = self.get_db_name(inspector)
yield from self.add_table_to_schema_container(
@@ -862,6 +926,12 @@ def _process_view(
view: str,
sql_config: SQLCommonConfig,
) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
+ dataset_urn = make_dataset_urn_with_platform_instance(
+ self.platform,
+ dataset_name,
+ self.config.platform_instance,
+ self.config.env,
+ )
try:
columns = inspector.get_columns(view, schema)
except KeyError:
@@ -877,6 +947,8 @@ def _process_view(
columns,
canonical_schema=schema_fields,
)
+ if self.config.include_view_lineage:
+ self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
description, properties, _ = self.get_table_properties(inspector, schema, view)
try:
view_definition = inspector.get_view_definition(view, schema)
@@ -890,12 +962,9 @@ def _process_view(
view_definition = ""
properties["view_definition"] = view_definition
properties["is_view"] = "True"
- dataset_urn = make_dataset_urn_with_platform_instance(
- self.platform,
- dataset_name,
- self.config.platform_instance,
- self.config.env,
- )
+ if view_definition and self.config.include_view_lineage:
+ self._view_definition_cache[dataset_name] = view_definition
+
dataset_snapshot = DatasetSnapshot(
urn=dataset_urn,
aspects=[StatusClass(removed=False)],
@@ -942,6 +1011,51 @@ def _process_view(
domain_registry=self.domain_registry,
)
+ def _run_sql_parser(
+ self, view_identifier: str, query: str, schema_resolver: SchemaResolver
+ ) -> Optional[SqlParsingResult]:
+ try:
+ database, schema = self.get_db_schema(view_identifier)
+ except ValueError:
+ logger.warning(f"Invalid view identifier: {view_identifier}")
+ return None
+ raw_lineage = sqlglot_lineage(
+ query,
+ schema_resolver=schema_resolver,
+ default_db=database,
+ default_schema=schema,
+ )
+ view_urn = make_dataset_urn_with_platform_instance(
+ self.platform,
+ view_identifier,
+ self.config.platform_instance,
+ self.config.env,
+ )
+
+ if raw_lineage.debug_info.table_error:
+ logger.debug(
+ f"Failed to parse lineage for view {view_identifier}: "
+ f"{raw_lineage.debug_info.table_error}"
+ )
+ self.report.num_view_definitions_failed_parsing += 1
+ self.report.view_definitions_parsing_failures.append(
+ f"Table-level sql parsing error for view {view_identifier}: {raw_lineage.debug_info.table_error}"
+ )
+ return None
+
+ elif raw_lineage.debug_info.column_error:
+ self.report.num_view_definitions_failed_column_parsing += 1
+ self.report.view_definitions_parsing_failures.append(
+ f"Column-level sql parsing error for view {view_identifier}: {raw_lineage.debug_info.column_error}"
+ )
+ else:
+ self.report.num_view_definitions_parsed += 1
+ return view_definition_lineage_helper(raw_lineage, view_urn)
+
+ def get_db_schema(self, dataset_identifier: str) -> Tuple[Optional[str], str]:
+ database, schema, _view = dataset_identifier.split(".")
+ return database, schema
+
def get_profiler_instance(self, inspector: Inspector) -> "DatahubGEProfiler":
from datahub.ingestion.source.ge_data_profiler import DatahubGEProfiler
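A hedged sketch of the parsing path added to sql_common.py above: a cached view definition is fed through `sqlglot_lineage` with a `SchemaResolver`, much like `_run_sql_parser` does. The platform, database, schema, and SQL below are made up for illustration.

```python
# Parse a view definition into table- and column-level lineage.
from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage

schema_resolver = SchemaResolver(platform="postgres", env="PROD")
result = sqlglot_lineage(
    "CREATE VIEW public.active_users AS SELECT id, name FROM public.users WHERE active",
    schema_resolver=schema_resolver,
    default_db="analytics",
    default_schema="public",
)
print(result.in_tables, result.out_tables)  # upstream tables and the view itself
```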
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py
index 57aae32b361cf..095b8e6443171 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py
@@ -6,7 +6,7 @@
from pydantic import Field
from sqlalchemy.engine import URL
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, LineageConfig
from datahub.configuration.source_common import (
DatasetSourceConfigMixin,
LowerCaseDatasetUrnConfigMixin,
@@ -28,6 +28,7 @@ class SQLCommonConfig(
StatefulIngestionConfigBase,
DatasetSourceConfigMixin,
LowerCaseDatasetUrnConfigMixin,
+ LineageConfig,
):
options: dict = pydantic.Field(
default_factory=dict,
@@ -70,6 +71,22 @@ class SQLCommonConfig(
description="If the source supports it, include table lineage to the underlying storage location.",
)
+ include_view_lineage: bool = Field(
+ default=True,
+ description="Populates view->view and table->view lineage using DataHub's sql parser.",
+ )
+
+ include_view_column_lineage: bool = Field(
+ default=True,
+ description="Populates column-level lineage for view->view and table->view lineage using DataHub's sql parser."
+ " Requires `include_view_lineage` to be enabled.",
+ )
+
+ use_file_backed_cache: bool = Field(
+ default=True,
+ description="Whether to use a file backed cache for the view definitions.",
+ )
+
profiling: GEProfilingConfig = GEProfilingConfig()
# Custom Stateful Ingestion settings
stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py
index e628e4dbd3446..899a7b6697c0a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py
@@ -1,7 +1,7 @@
import logging
from dataclasses import dataclass
from datetime import datetime
-from typing import Iterable, MutableMapping, Optional, Union
+from typing import Iterable, Optional, Union
# This import verifies that the dependencies are available.
import teradatasqlalchemy # noqa: F401
@@ -33,14 +33,11 @@
from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
-from datahub.metadata._schema_classes import SchemaMetadataClass, ViewPropertiesClass
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
BytesTypeClass,
TimeTypeClass,
)
-from datahub.utilities.file_backed_collections import FileBackedDict
from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage
-from datahub.utilities.urns.dataset_urn import DatasetUrn
logger: logging.Logger = logging.getLogger(__name__)
@@ -87,11 +84,6 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
"This requires to have the table lineage feature enabled.",
)
- include_view_lineage = Field(
- default=True,
- description="Whether to include view lineage in the ingestion. "
- "This requires to have the view lineage feature enabled.",
- )
usage: BaseUsageConfig = Field(
description="The usage config to use when generating usage statistics",
default=BaseUsageConfig(),
@@ -107,11 +99,6 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
description="Generate usage statistic.",
)
- use_file_backed_cache: bool = Field(
- default=True,
- description="Whether to use a file backed cache for the view definitions.",
- )
-
@platform_name("Teradata")
@config_class(TeradataConfig)
@@ -143,8 +130,6 @@ class TeradataSource(TwoTierSQLAlchemySource):
and "timestamp" < TIMESTAMP '{end_time}'
"""
- _view_definition_cache: MutableMapping[str, str]
-
def __init__(self, config: TeradataConfig, ctx: PipelineContext):
super().__init__(config, ctx, "teradata")
@@ -167,34 +152,11 @@ def __init__(self, config: TeradataConfig, ctx: PipelineContext):
env=self.config.env,
)
- if self.config.use_file_backed_cache:
- self._view_definition_cache = FileBackedDict[str]()
- else:
- self._view_definition_cache = {}
-
@classmethod
def create(cls, config_dict, ctx):
config = TeradataConfig.parse_obj(config_dict)
return cls(config, ctx)
- def get_view_lineage(self) -> Iterable[MetadataWorkUnit]:
- for key in self._view_definition_cache.keys():
- view_definition = self._view_definition_cache[key]
- dataset_urn = DatasetUrn.create_from_string(key)
-
- db_name: Optional[str] = None
- # We need to get the default db from the dataset urn otherwise the builder generates the wrong urns
- if "." in dataset_urn.get_dataset_name():
- db_name = dataset_urn.get_dataset_name().split(".", 1)[0]
-
- self.report.num_view_ddl_parsed += 1
- if self.report.num_view_ddl_parsed % 1000 == 0:
- logger.info(f"Parsed {self.report.num_queries_parsed} view ddl")
-
- yield from self.gen_lineage_from_query(
- query=view_definition, default_database=db_name, is_view_ddl=True
- )
-
def get_audit_log_mcps(self) -> Iterable[MetadataWorkUnit]:
engine = self.get_metadata_engine()
for entry in engine.execute(
@@ -252,19 +214,7 @@ def get_metadata_engine(self) -> Engine:
def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
# Add all schemas to the schema resolver
- for wu in super().get_workunits_internal():
- urn = wu.get_urn()
- schema_metadata = wu.get_aspect_of_type(SchemaMetadataClass)
- if schema_metadata:
- self.schema_resolver.add_schema_metadata(urn, schema_metadata)
- view_properties = wu.get_aspect_of_type(ViewPropertiesClass)
- if view_properties and self.config.include_view_lineage:
- self._view_definition_cache[urn] = view_properties.viewLogic
- yield wu
-
- if self.config.include_view_lineage:
- self.report.report_ingestion_stage_start("view lineage extraction")
- yield from self.get_view_lineage()
+ yield from super().get_workunits_internal()
if self.config.include_table_lineage or self.config.include_usage_statistics:
self.report.report_ingestion_stage_start("audit log extraction")
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/two_tier_sql_source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/two_tier_sql_source.py
index 7a49551dc1235..efb1d3ffe119f 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/two_tier_sql_source.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/two_tier_sql_source.py
@@ -1,6 +1,6 @@
import typing
import urllib.parse
-from typing import Any, Dict, Iterable, Optional
+from typing import Any, Dict, Iterable, Optional, Tuple
from pydantic.fields import Field
from sqlalchemy import create_engine, inspect
@@ -71,6 +71,10 @@ def __init__(self, config, ctx, platform):
super().__init__(config, ctx, platform)
self.config: TwoTierSQLAlchemyConfig = config
+ def get_db_schema(self, dataset_identifier: str) -> Tuple[Optional[str], str]:
+ schema, _view = dataset_identifier.split(".", 1)
+ return None, schema
+
def get_database_container_key(self, db_name: str, schema: str) -> ContainerKey:
# Because our overridden get_allowed_schemas method returns db_name as the schema name,
# the db_name and schema here will be the same. Hence, we just ignore the schema parameter.
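An illustration of the identifier convention behind the two `get_db_schema` implementations in this patch (the inputs are hypothetical): three-tier sources expect `database.schema.view`, while two-tier sources only carry `schema.view`.

```python
from typing import Optional, Tuple

def three_tier_db_schema(dataset_identifier: str) -> Tuple[Optional[str], str]:
    # Mirrors SQLAlchemySource.get_db_schema in sql_common.py.
    database, schema, _view = dataset_identifier.split(".")
    return database, schema

def two_tier_db_schema(dataset_identifier: str) -> Tuple[Optional[str], str]:
    # Mirrors TwoTierSQLAlchemySource.get_db_schema above: no database component.
    schema, _view = dataset_identifier.split(".", 1)
    return None, schema

assert three_tier_db_schema("analytics.public.active_users") == ("analytics", "public")
assert two_tier_db_schema("default.active_users") == (None, "default")
```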
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py b/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py
index a417cae2b1ab0..b89db755853bc 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py
@@ -86,7 +86,7 @@ class VerticaConfig(BasicSQLAlchemyConfig):
default=True, description="Whether Models should be ingested."
)
- include_view_lineage: Optional[bool] = pydantic.Field(
+ include_view_lineage: bool = pydantic.Field(
default=True,
description="If the source supports it, include view lineage to the underlying storage location.",
)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py
index be97e9380f1f5..7fb2cf9813cab 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py
@@ -11,7 +11,6 @@
ConfigModel,
ConfigurationError,
DynamicTypedConfig,
- LineageConfig,
)
from datahub.configuration.time_window_config import BaseTimeWindowConfig
from datahub.configuration.validate_field_rename import pydantic_renamed_field
@@ -100,7 +99,7 @@ class StatefulIngestionConfigBase(GenericModel, Generic[CustomConfig]):
)
-class StatefulLineageConfigMixin(LineageConfig):
+class StatefulLineageConfigMixin:
enable_stateful_lineage_ingestion: bool = Field(
default=True,
description="Enable stateful lineage ingestion."
diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py
index 0d72fc52da0ca..c3e8c175f1de5 100644
--- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py
+++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py
@@ -166,13 +166,17 @@ def _check_oauth_config(oauth_config: Optional[OAuthConfiguration]) -> None:
"but should be set when using use_certificate false for oauth_config"
)
- @pydantic.validator("include_view_lineage")
- def validate_include_view_lineage(cls, v, values):
- if not values.get("include_table_lineage") and v:
+ @pydantic.root_validator()
+ def validate_include_view_lineage(cls, values):
+ if (
+ "include_table_lineage" in values
+ and not values.get("include_table_lineage")
+ and values.get("include_view_lineage")
+ ):
raise ValueError(
"include_table_lineage must be True for include_view_lineage to be set."
)
- return v
+ return values
def get_sql_alchemy_url(
self,
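A self-contained miniature of the root-validator pattern adopted in the Snowflake config change above; the class and field subset here are illustrative, not the real `SnowflakeConfig`.

```python
# A root validator sees all fields at once, so the cross-field check no longer
# depends on field declaration order the way a per-field validator does.
import pydantic

class LineageFlags(pydantic.BaseModel):
    include_table_lineage: bool = True
    include_view_lineage: bool = True

    @pydantic.root_validator()
    def validate_include_view_lineage(cls, values):
        if (
            "include_table_lineage" in values
            and not values.get("include_table_lineage")
            and values.get("include_view_lineage")
        ):
            raise ValueError(
                "include_table_lineage must be True for include_view_lineage to be set."
            )
        return values

LineageFlags(include_table_lineage=False, include_view_lineage=False)  # fine
# LineageFlags(include_table_lineage=False, include_view_lineage=True)  # raises ValidationError
```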
diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
index 526d90b2a1bfa..1d74b20569814 100644
--- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
+++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
@@ -623,9 +623,9 @@ def _schema_aware_fuzzy_column_resolve(
statement = sqlglot.optimizer.annotate_types.annotate_types(
statement, schema=sqlglot_db_schema
)
- except sqlglot.errors.OptimizeError as e:
+ except (sqlglot.errors.OptimizeError, sqlglot.errors.ParseError) as e:
# This is not a fatal error, so we can continue.
- logger.debug("sqlglot failed to annotate types: %s", e)
+ logger.debug("sqlglot failed to annotate or parse types: %s", e)
try:
assert isinstance(statement, _SupportedColumnLineageTypesTuple)
@@ -1156,3 +1156,20 @@ def create_lineage_sql_parsed_result(
finally:
if needs_close:
schema_resolver.close()
+
+
+def view_definition_lineage_helper(
+ result: SqlParsingResult, view_urn: str
+) -> SqlParsingResult:
+ if result.query_type is QueryType.SELECT:
+ # Some platforms (e.g. postgres) store only
-# Future Work
+## Future Work
- Supporting versions as start and end parameters as part of the call to the timeline API
- Supporting entities beyond Datasets
From 9ae0e93d82eac2040af2c3d23d52878e57e19df1 Mon Sep 17 00:00:00 2001
From: Ellie O'Neil <110510035+eboneil@users.noreply.github.com>
Date: Fri, 27 Oct 2023 20:18:31 -0700
Subject: [PATCH 40/40] docs(graphql): Correct mutation -> query for
searchAcrossLineage examples (#9134)
---
docs/api/tutorials/lineage.md | 8 ++------
metadata-ingestion/examples/library/read_lineage_rest.py | 2 +-
2 files changed, 3 insertions(+), 7 deletions(-)
diff --git a/docs/api/tutorials/lineage.md b/docs/api/tutorials/lineage.md
index 4baad09099d07..13ec716b7870b 100644
--- a/docs/api/tutorials/lineage.md
+++ b/docs/api/tutorials/lineage.md
@@ -113,12 +113,10 @@ Expected Response:
You can now see the lineage between `fct_users_deleted` and `logging_events`.
-
-
## Add Column-level Lineage
@@ -135,12 +133,10 @@ You can now see the lineage between `fct_users_deleted` and `logging_events`.
You can now see the column-level lineage between datasets. Note that you have to enable `Show Columns` to be able to see the column-level lineage.
-
-
## Read Lineage
@@ -180,7 +176,7 @@ query searchAcrossLineage {
}
```
-This example shows using lineage degrees as a filter, but additional search filters can be included here as well.
+This example shows using lineage degrees as a filter, but additional search filters can be included here as well.
@@ -188,7 +184,7 @@ This example shows using lineage degrees as a filter, but additional search filt
```shell
curl --location --request POST 'http://localhost:8080/api/graphql' \
--header 'Authorization: Bearer ' \
---header 'Content-Type: application/json' --data-raw '{ { "query": "mutation searchAcrossLineage { searchAcrossLineage( input: { query: \"*\" urn: \"urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD)\" start: 0 count: 10 direction: DOWNSTREAM orFilters: [ { and: [ { condition: EQUAL negated: false field: \"degree\" values: [\"1\", \"2\", \"3+\"] } ] } ] } ) { searchResults { degree entity { urn type } } }}"
+--header 'Content-Type: application/json' --data-raw '{ { "query": "query searchAcrossLineage { searchAcrossLineage( input: { query: \"*\" urn: \"urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD)\" start: 0 count: 10 direction: DOWNSTREAM orFilters: [ { and: [ { condition: EQUAL negated: false field: \"degree\" values: [\"1\", \"2\", \"3+\"] } ] } ] } ) { searchResults { degree entity { urn type } } }}"
}}'
```
diff --git a/metadata-ingestion/examples/library/read_lineage_rest.py b/metadata-ingestion/examples/library/read_lineage_rest.py
index 34437ed86280d..bd9b4e8651dba 100644
--- a/metadata-ingestion/examples/library/read_lineage_rest.py
+++ b/metadata-ingestion/examples/library/read_lineage_rest.py
@@ -6,7 +6,7 @@
# Query multiple aspects from entity
query = """
-mutation searchAcrossLineage {
+query searchAcrossLineage {
searchAcrossLineage(
input: {
query: "*"