Merge branch 'master' into master
milindgupta9 authored Jul 26, 2024
2 parents 1d281b7 + 304fc4e commit 18f8481
Showing 180 changed files with 2,081 additions and 890 deletions.
3 changes: 2 additions & 1 deletion build.gradle
@@ -54,7 +54,7 @@ buildscript {
ext.hazelcastVersion = '5.3.6'
ext.ebeanVersion = '12.16.1'
ext.googleJavaFormatVersion = '1.18.1'
ext.openLineageVersion = '1.16.0'
ext.openLineageVersion = '1.19.0'
ext.logbackClassicJava8 = '1.2.12'

ext.docker_registry = 'acryldata'
@@ -111,6 +111,7 @@ project.ext.externalDependency = [
'avroCompiler': 'org.apache.avro:avro-compiler:1.11.3',
'awsGlueSchemaRegistrySerde': 'software.amazon.glue:schema-registry-serde:1.1.17',
'awsMskIamAuth': 'software.amazon.msk:aws-msk-iam-auth:2.0.3',
'awsS3': 'software.amazon.awssdk:s3:2.26.21',
'awsSecretsManagerJdbc': 'com.amazonaws.secretsmanager:aws-secretsmanager-jdbc:1.0.13',
'awsPostgresIamAuth': 'software.amazon.jdbc:aws-advanced-jdbc-wrapper:1.0.2',
'awsRds':'software.amazon.awssdk:rds:2.18.24',
2 changes: 1 addition & 1 deletion docs-website/filterTagIndexes.json
@@ -562,7 +562,7 @@
}
},
{
"Path": "docs/metadata-integration/java/spark-lineage-beta",
"Path": "docs/metadata-integration/java/acryl-spark-lineage",
"imgPath": "img/logos/platforms/spark.svg",
"Title": "Spark",
"Description": "Spark is a data processing tool that enables fast and efficient processing of large-scale data sets using distributed computing.",
10 changes: 3 additions & 7 deletions docs-website/sidebars.js
@@ -419,17 +419,13 @@ module.exports = {
},
{
type: "doc",
id: "metadata-integration/java/spark-lineage/README",
label: "Spark (Legacy)",
},
{
type: "doc",
id: "metadata-integration/java/spark-lineage-beta/README",
id: "metadata-integration/java/acryl-spark-lineage/README",
label: "Spark",
},
//"docker/airflow/local_airflow",
"metadata-ingestion/integration_docs/great-expectations",
"metadata-integration/java/datahub-protobuf/README",
//"metadata-integration/java/spark-lineage-legacy/README",
//"metadata-ingestion/source-docs-template",
{
type: "autogenerated",
@@ -886,7 +882,7 @@ module.exports = {
//"docs/how/graph-onboarding",
//"docs/demo/graph-onboarding",
//"metadata-integration/java/spark-lineage/README",
// "metadata-integration/java/spark-lineage-beta/README.md
// "metadata-integration/java/acryl-spark-lineage/README.md
// "metadata-integration/java/openlineage-converter/README"
//"metadata-ingestion-modules/airflow-plugin/README"
//"metadata-ingestion-modules/dagster-plugin/README"
78 changes: 52 additions & 26 deletions docs/cli.md
@@ -102,6 +102,7 @@ Command Options:
--test-source-connection When set, ingestion will only test the source connection details from the recipe
--no-progress If enabled, mute intermediate progress ingestion reports
```

#### ingest --dry-run

The `--dry-run` option of the `ingest` command performs all of the ingestion steps, except writing to the sink. This is useful to validate that the
@@ -133,23 +134,8 @@ By default `--preview` creates 10 workunits. But if you wish to try producing mo
datahub ingest -c ./examples/recipes/example_to_datahub_rest.dhub.yaml -n --preview --preview-workunits=20
```

#### ingest deploy

The `ingest deploy` command instructs the cli to upload an ingestion recipe to DataHub to be run by DataHub's [UI Ingestion](./ui-ingestion.md).
This command can also be used to schedule the ingestion while uploading or even to update existing sources. It will upload to the remote instance the
CLI is connected to, not the sink of the recipe. Use `datahub init` to set the remote if not already set.

To schedule a recipe called "test", to run at 5am everyday, London time with the recipe configured in a local `recipe.yaml` file:
````shell
datahub ingest deploy --name "test" --schedule "5 * * * *" --time-zone "Europe/London" -c recipe.yaml
````

To update an existing recipe please use the `--urn` parameter to specify the id of the recipe to update.

**Note:** Updating a recipe will result in a replacement of the existing options with what was specified in the cli command.
I.e: Not specifying a schedule in the cli update command will remove the schedule from the recipe to be updated.

#### ingest --no-default-report

By default, the CLI sends an ingestion report to DataHub, which allows you to see the result of all CLI-based ingestion in the UI. This can be turned off with the `--no-default-report` flag.

```shell
@@ -180,6 +166,52 @@ failure_log:
filename: ./path/to/failure.json
```

### ingest deploy

The `ingest deploy` command instructs the CLI to upload an ingestion recipe to DataHub to be run by DataHub's [UI Ingestion](./ui-ingestion.md).
This command can also be used to schedule the ingestion while uploading, or to update existing sources. It will upload to the remote instance the
CLI is connected to, not the sink of the recipe. Use `datahub init` to set the remote if not already set.

This command will automatically create a new recipe if it doesn't exist, or update it if it does.
Note that this is a complete update and will remove any options that were previously set.
For example, not specifying a schedule in the update command will remove the schedule from the deployed recipe.

**Basic example**

To schedule a recipe called "Snowflake Integration" to run at 5 am every day, London time, with the recipe configured in a local `recipe.yaml` file:

```shell
datahub ingest deploy --name "Snowflake Integration" --schedule "0 5 * * *" --time-zone "Europe/London" -c recipe.yaml
```

By default, the ingestion recipe's identifier is generated by hashing the name.
You can override the generated URN by passing the `--urn` flag to the CLI.
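
For example, a sketch of updating a specific source by URN (the URN value here is illustrative and assumes the `dataHubIngestionSource` entity type used for UI ingestion sources):

```shell
datahub ingest deploy --urn "urn:li:dataHubIngestionSource:my-snowflake-recipe" -c recipe.yaml
```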

**Using `deployment` to avoid CLI args**

As an alternative to configuring settings from the CLI, all of these settings can also be set in the `deployment` field of the recipe.

```yml
# deployment_recipe.yml
deployment:
  name: "Snowflake Integration"
  schedule: "0 5 * * *"
  time_zone: "Europe/London"

source: ...
```

```shell
datahub ingest deploy -c deployment_recipe.yml
```

This is particularly useful when you want all recipes to be stored in version control.

```shell
# Deploy every yml recipe in a directory
ls recipe_directory/*.yml | xargs -n 1 -I {} datahub ingest deploy -c {}
```

### init

The init command is used to tell `datahub` about where your DataHub instance is located. The CLI will point to localhost DataHub by default.
@@ -242,8 +274,6 @@ The [metadata deletion guide](./how/delete-metadata.md) covers the various optio

### exists

**🤝 Version compatibility** : `acryl-datahub>=0.10.2.4`

The exists command can be used to check if an entity exists in DataHub.

```shell
@@ -253,7 +283,6 @@
false
```


### get

The `get` command allows you to easily retrieve metadata from DataHub via the REST API. This works for both versioned aspects and timeseries aspects. For timeseries aspects, it fetches the latest value.
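
For example, a minimal sketch (the aspect name is illustrative, flag names assume the current `get` interface, and the URN reuses the sample Hive dataset that appears elsewhere in this guide):

```shell
datahub get --urn "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)" --aspect ownership
```
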
@@ -314,6 +343,7 @@ Update succeeded with status 200
```

#### put platform

**🤝 Version Compatibility:** `acryl-datahub>0.8.44.4`

The **put platform** command instructs `datahub` to create or update metadata about a data platform. This is very useful if you are using a custom data platform and want to set up its logo and display name for a native UI experience.
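
A sketch of such a call (the platform name and logo URL are made up for illustration; flag names assume the current `put platform` interface):

```shell
datahub put platform --name my_custom_platform --display_name "My Custom Platform" --logo "https://example.com/my_platform_logo.png"
```
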
@@ -346,6 +376,7 @@ datahub timeline --urn "urn:li:dataset:(urn:li:dataPlatform:mysql,User.UserAccou
The `dataset` command allows you to interact with the dataset entity.

The `get` operation can be used to read a dataset into a YAML file.

```shell
datahub dataset get --urn "$URN" --to-file "$FILE_NAME"
```
@@ -358,7 +389,6 @@ datahub dataset upsert -f dataset.yaml

An example `dataset.yaml` can be found at [dataset.yaml](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/examples/cli_usage/dataset/dataset.yaml).


### user (User Entity)

The `user` command allows you to interact with the User entity.
@@ -411,7 +441,6 @@ members:
display_name: "Joe's Hub"
```


### dataproduct (Data Product Entity)

**🤝 Version Compatibility:** `acryl-datahub>=0.10.2.4`
@@ -566,14 +595,12 @@ Use this to delete a Data Product from DataHub. Default to `--soft` which preser
# > datahub dataproduct delete --urn "urn:li:dataProduct:pet_of_the_week" --hard
```


## Miscellaneous Admin Commands

### lite (experimental)

The lite group of commands allows you to run an embedded, lightweight DataHub instance for command-line exploration of your metadata. It is intended for developer-tool-oriented usage rather than as a production DataHub server instance. See [DataHub Lite](./datahub_lite.md) for more information about how you can ingest metadata into DataHub Lite and explore your metadata easily.


### telemetry

To help us understand how people are using DataHub, we collect anonymous usage statistics on actions such as command invocations via Mixpanel.
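
Collection can be toggled from the CLI; a minimal sketch, assuming the `telemetry` subcommands below exist in your CLI version:

```shell
# Opt out of anonymous usage statistics
datahub telemetry disable

# Opt back in
datahub telemetry enable
```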
@@ -640,7 +667,6 @@ External Entities Affected: None
Old Entities Migrated = {'urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)', 'urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)'}
```
## Alternate Installation Options
### Using docker
@@ -673,7 +699,7 @@ We use a plugin architecture so that you can install only the dependencies you a
Please see our [Integrations page](https://datahubproject.io/integrations) if you want to filter on the features offered by each source.

| Plugin Name | Install Command | Provides |
|------------------------------------------------------------------------------------------------| ---------------------------------------------------------- | --------------------------------------- |
| ---------------------------------------------------------------------------------------------- | ---------------------------------------------------------- | --------------------------------------- |
| [metadata-file](./generated/ingestion/sources/metadata-file.md) | _included by default_ | File source and sink |
| [athena](./generated/ingestion/sources/athena.md) | `pip install 'acryl-datahub[athena]'` | AWS Athena source |
| [bigquery](./generated/ingestion/sources/bigquery.md) | `pip install 'acryl-datahub[bigquery]'` | BigQuery source |
@@ -715,7 +741,7 @@ Please see our [Integrations page](https://datahubproject.io/integrations) if yo
### Sinks

| Plugin Name | Install Command | Provides |
|-------------------------------------------------------------------| -------------------------------------------- | -------------------------- |
| ----------------------------------------------------------------- | -------------------------------------------- | -------------------------- |
| [metadata-file](../metadata-ingestion/sink_docs/metadata-file.md) | _included by default_ | File source and sink |
| [console](../metadata-ingestion/sink_docs/console.md) | _included by default_ | Console sink |
| [datahub-rest](../metadata-ingestion/sink_docs/datahub.md) | `pip install 'acryl-datahub[datahub-rest]'` | DataHub sink over REST API |
1 change: 1 addition & 0 deletions docs/how/updating-datahub.md
@@ -80,6 +80,7 @@ New (optional fields `systemMetadata` and `headers`):
### Deprecations

### Other Notable Changes
- #10466 - Extends configuration in `~/.datahubenv` to match the `DatahubClientConfig` object definition. See the full configuration at https://datahubproject.io/docs/python-sdk/clients/. The CLI should now respect the updated configurations specified in `~/.datahubenv` across its functions and utilities. This means that for systems where SSL certificate verification is disabled, setting `disable_ssl_verification: true` in `~/.datahubenv` will apply to all CLI calls.
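
A minimal sketch of such a `~/.datahubenv`, assuming the top-level `gms` key and `DatahubClientConfig`-style field names (adjust to your deployment):

```yml
gms:
  server: http://localhost:8080
  token: <your-personal-access-token>
  disable_ssl_verification: true
```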

## 0.13.1

8 changes: 4 additions & 4 deletions docs/lineage/openlineage.md
@@ -6,7 +6,7 @@ DataHub now supports [OpenLineage](https://openlineage.io/) integration. With t

- **REST Endpoint Support**: DataHub now includes a REST endpoint that can understand OpenLineage events. This allows users to send lineage information directly to DataHub, enabling easy integration with various data processing frameworks.

- **[Spark Event Listener Plugin](https://datahubproject.io/docs/metadata-integration/java/spark-lineage-beta)**: DataHub provides a Spark Event Listener plugin that seamlessly integrates OpenLineage's Spark plugin. This plugin enhances DataHub's OpenLineage support by offering additional features such as PathSpec support, column-level lineage, patch support and more.
- **[Spark Event Listener Plugin](https://datahubproject.io/docs/metadata-integration/java/acryl-spark-lineage)**: DataHub provides a Spark Event Listener plugin that seamlessly integrates OpenLineage's Spark plugin. This plugin enhances DataHub's OpenLineage support by offering additional features such as PathSpec support, column-level lineage, patch support and more.
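
As a sketch of how a producer could send events to the REST endpoint mentioned above, an OpenLineage HTTP transport configuration might look roughly like this (the endpoint path and auth keys are assumptions; check the endpoint documented for your DataHub version):

```yml
transport:
  type: http
  url: http://your-datahub-gms-host:8080
  endpoint: /openapi/openlineage/api/v1/lineage
  auth:
    type: api_key
    apiKey: <your-datahub-access-token>
```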

## OpenLineage Support with DataHub

@@ -73,7 +73,7 @@ The transport should look like this:
#### Known Limitations
With Spark and Airflow, we recommend using the Spark Lineage plugin or DataHub's Airflow plugin for tighter integration with DataHub.

- **[PathSpec](https://datahubproject.io/docs/metadata-integration/java/spark-lineage-beta/#configuring-hdfs-based-dataset-urns) Support**: While the REST endpoint supports OpenLineage messages, full [PathSpec](https://datahubproject.io/docs/metadata-integration/java/spark-lineage-beta/#configuring-hdfs-based-dataset-urns) support is not yet available.
- **[PathSpec](https://datahubproject.io/docs/metadata-integration/java/acryl-spark-lineage/#configuring-hdfs-based-dataset-urns) Support**: While the REST endpoint supports OpenLineage messages, full [PathSpec](https://datahubproject.io/docs/metadata-integration/java/acryl-spark-lineage/#configuring-hdfs-based-dataset-urns) support is not yet available.

- **Column-level Lineage**: DataHub's current OpenLineage support does not provide full column-level lineage tracking.
- etc...
@@ -83,10 +83,10 @@ DataHub's Spark Event Listener plugin enhances OpenLineage support by providing

#### How to Use

Follow the guides of the Spark Lineage plugin page for more information on how to set up the Spark Lineage plugin. The guide can be found [here](https://datahubproject.io/docs/metadata-integration/java/spark-lineage-beta)
Follow the guide on the Spark Lineage plugin page for more information on how to set up the plugin. The guide can be found [here](https://datahubproject.io/docs/metadata-integration/java/acryl-spark-lineage).
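
As a rough sketch, wiring the listener into a Spark job typically looks something like the following (the package version and server URL are placeholders; property names assume the acryl-spark-lineage plugin):

```shell
spark-submit \
  --packages io.acryl:acryl-spark-lineage:<version> \
  --conf "spark.extraListeners=datahub.spark.DatahubSparkListener" \
  --conf "spark.datahub.rest.server=http://your-datahub-gms-host:8080" \
  your_spark_job.py
```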

## References

- [OpenLineage](https://openlineage.io/)
- [DataHub OpenAPI Guide](../api/openapi/openapi-usage-guide.md)
- [DataHub Spark Lineage Plugin](https://datahubproject.io/docs/metadata-integration/java/spark-lineage-beta)
- [DataHub Spark Lineage Plugin](https://datahubproject.io/docs/metadata-integration/java/acryl-spark-lineage)
2 changes: 1 addition & 1 deletion metadata-ingestion/README.md
@@ -19,7 +19,7 @@ Integration can be divided into two concepts based on the method:
### Push-based Integration

Push-based integrations allow you to emit metadata directly from your data systems when metadata changes.
Examples of push-based integrations include [Airflow](../docs/lineage/airflow.md), [Spark](../metadata-integration/java/spark-lineage/README.md), [Great Expectations](./integration_docs/great-expectations.md) and [Protobuf Schemas](../metadata-integration/java/datahub-protobuf/README.md). This allows you to get low-latency metadata integration from the "active" agents in your data ecosystem.
Examples of push-based integrations include [Airflow](../docs/lineage/airflow.md), [Spark](../metadata-integration/java/acryl-spark-lineage/README.md), [Great Expectations](./integration_docs/great-expectations.md) and [Protobuf Schemas](../metadata-integration/java/datahub-protobuf/README.md). This allows you to get low-latency metadata integration from the "active" agents in your data ecosystem.

### Pull-based Integration

2 changes: 1 addition & 1 deletion metadata-ingestion/docs/sources/databricks/README.md
@@ -11,7 +11,7 @@ The alternative way to integrate is via the Hive connector. The [Hive starter re

## Databricks Spark

To complete the picture, we recommend adding push-based ingestion from your Spark jobs to see real-time activity and lineage between your Databricks tables and your Spark jobs. Use the Spark agent to push metadata to DataHub using the instructions [here](../../../../metadata-integration/java/spark-lineage-beta/README.md#configuration-instructions-databricks).
To complete the picture, we recommend adding push-based ingestion from your Spark jobs to see real-time activity and lineage between your Databricks tables and your Spark jobs. Use the Spark agent to push metadata to DataHub using the instructions [here](../../../../metadata-integration/java/acryl-spark-lineage/README.md#configuration-instructions-databricks).

## Watch the DataHub Talk at the Data and AI Summit 2022

9 changes: 7 additions & 2 deletions metadata-ingestion/setup.py
@@ -113,6 +113,11 @@
"numpy<2",
}

dbt_common = {
*sqlglot_lib,
"more_itertools",
}

sql_common = (
    {
        # Required for all SQL sources.
@@ -352,8 +357,8 @@
"datahub-lineage-file": set(),
"datahub-business-glossary": set(),
"delta-lake": {*data_lake_profiling, *delta_lake},
"dbt": {"requests"} | sqlglot_lib | aws_common,
"dbt-cloud": {"requests"} | sqlglot_lib,
"dbt": {"requests"} | dbt_common | aws_common,
"dbt-cloud": {"requests"} | dbt_common,
"druid": sql_common | {"pydruid>=0.6.2"},
"dynamodb": aws_common | classification_lib,
# Starting with 7.14.0 python client is checking if it is connected to elasticsearch client. If its not it throws
10 changes: 10 additions & 0 deletions metadata-ingestion/src/datahub/cli/check_cli.py
@@ -389,3 +389,13 @@ def extract_sql_agg_log(query_log_file: str, output: Optional[str]) -> None:
logger.info(f"Extracted {len(queries)} queries to {output}")
else:
click.echo(json.dumps(queries, indent=2))


@check.command()
def server_config() -> None:
    """Print the server config."""
    graph = get_default_graph()

    server_config = graph.get_server_config()

    click.echo(pprint.pformat(server_config))
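
Once merged, the new subcommand can be invoked roughly as follows (the hyphenated command name assumes Click's default naming for `server_config`):

```shell
datahub check server-config
```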
