From 0f7744784d663b377f1743db188d8632b9f6a86c Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Sat, 16 Sep 2023 03:55:10 +0900 Subject: [PATCH 01/37] fix: fix quickstart page (#8784) --- docs/quickstart.md | 336 +++++++++++++++++++++++++-------------------- 1 file changed, 184 insertions(+), 152 deletions(-) diff --git a/docs/quickstart.md b/docs/quickstart.md index cd91dc8d1ac84..29b22b54dc87a 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -1,219 +1,218 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + # DataHub Quickstart Guide +:::tip Managed DataHub + This guide provides instructions on deploying the open source DataHub locally. -If you're interested in a managed version, [Acryl Data](https://www.acryldata.io/product) provides a fully managed, premium version of DataHub. +If you're interested in a managed version, [Acryl Data](https://www.acryldata.io/product) provides a fully managed, premium version of DataHub.
+**[Get Started with Managed DataHub](./managed-datahub/welcome-acryl.md)** - -Get Started with Managed DataHub - +::: -## Deploying DataHub +## Prerequisites -To deploy a new instance of DataHub, perform the following steps. +- Install **Docker** and **Docker Compose** v2 for your platform. -1. Install Docker and Docker Compose v2 for your platform. + | Platform | Application | + | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | + | Window | [Docker Desktop](https://www.docker.com/products/docker-desktop/) | + | Mac | [Docker Desktop](https://www.docker.com/products/docker-desktop/) | + | Linux | [Docker for Linux](https://docs.docker.com/desktop/install/linux-install/) and [Docker Compose](https://docs.docker.com/compose/install/linux/) | -- On Windows or Mac, install [Docker Desktop](https://www.docker.com/products/docker-desktop/). -- On Linux, install [Docker for Linux](https://docs.docker.com/desktop/install/linux-install/) and [Docker Compose](https://docs.docker.com/compose/install/linux/). +- **Launch the Docker engine** from command line or the desktop app. +- Ensure you have **Python 3.7+** installed & configured. (Check using `python3 --version`). -:::note +:::note Docker Resource Allocation -Make sure to allocate enough hardware resources for Docker engine. +Make sure to allocate enough hardware resources for Docker engine.
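For example, you can quickly check what the engine currently has available with a command like the one below (the exact output labels can vary between Docker versions, so treat this as a rough sketch):

```bash
# Show the CPU count and memory currently available to the Docker engine.
docker info | grep -E "CPUs|Total Memory"
```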
Tested & confirmed config: 2 CPUs, 8GB RAM, 2GB Swap area, and 10GB disk space. ::: -2. Launch the Docker Engine from command line or the desktop app. - -3. Install the DataHub CLI - - a. Ensure you have Python 3.7+ installed & configured. (Check using `python3 --version`). - - b. Run the following commands in your terminal +## Install the DataHub CLI - ```sh - python3 -m pip install --upgrade pip wheel setuptools - python3 -m pip install --upgrade acryl-datahub - datahub version - ``` + + - If you're using poetry, run the following command. - - ```sh - poetry add acryl-datahub - datahub version - ``` +```bash +python3 -m pip install --upgrade pip wheel setuptools +python3 -m pip install --upgrade acryl-datahub +datahub version +``` -:::note +:::note Command Not Found -If you see "command not found", try running cli commands with the prefix 'python3 -m' instead like `python3 -m datahub version` +If you see `command not found`, try running cli commands like `python3 -m datahub version`.
Note that DataHub CLI does not support Python 2.x. ::: -4. To deploy a DataHub instance locally, run the following CLI command from your terminal - - ``` - datahub docker quickstart - ``` - - This will deploy a DataHub instance using [docker-compose](https://docs.docker.com/compose/). - If you are curious, the `docker-compose.yaml` file is downloaded to your home directory under the `.datahub/quickstart` directory. - - If things go well, you should see messages like the ones below: - - ``` - Fetching docker-compose file https://raw.githubusercontent.com/datahub-project/datahub/master/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml from GitHub - Pulling docker images... - Finished pulling docker images! - - [+] Running 11/11 - ⠿ Container zookeeper Running 0.0s - ⠿ Container elasticsearch Running 0.0s - ⠿ Container broker Running 0.0s - ⠿ Container schema-registry Running 0.0s - ⠿ Container elasticsearch-setup Started 0.7s - ⠿ Container kafka-setup Started 0.7s - ⠿ Container mysql Running 0.0s - ⠿ Container datahub-gms Running 0.0s - ⠿ Container mysql-setup Started 0.7s - ⠿ Container datahub-datahub-actions-1 Running 0.0s - ⠿ Container datahub-frontend-react Running 0.0s - ....... - ✔ DataHub is now running - Ingest some demo data using `datahub docker ingest-sample-data`, - or head to http://localhost:9002 (username: datahub, password: datahub) to play around with the frontend. - Need support? Get in touch on Slack: https://slack.datahubproject.io/ - ``` - - Upon completion of this step, you should be able to navigate to the DataHub UI - at [http://localhost:9002](http://localhost:9002) in your browser. You can sign in using `datahub` as both the - username and password. - -:::note - -On Mac computers with Apple Silicon (M1, M2 etc.), you might see an error like `no matching manifest for linux/arm64/v8 in the manifest list entries`, this typically means that the datahub cli was not able to detect that you are running it on Apple Silicon. To resolve this issue, override the default architecture detection by issuing `datahub docker quickstart --arch m1` +
+ -::: +```bash +poetry add acryl-datahub +poetry shell +datahub version +``` -5. To ingest the sample metadata, run the following CLI command from your terminal + +
- ```bash - datahub docker ingest-sample-data - ``` +## Start DataHub -:::note +Run the following CLI command from your terminal. -If you've enabled [Metadata Service Authentication](authentication/introducing-metadata-service-authentication.md), you'll need to provide a Personal Access Token -using the `--token ` parameter in the command. +```bash +datahub docker quickstart +``` -::: +This will deploy a DataHub instance using [docker-compose](https://docs.docker.com/compose/). +If you are curious, the `docker-compose.yaml` file is downloaded to your home directory under the `.datahub/quickstart` directory. + +If things go well, you should see messages like the ones below: + +```shell-session +Fetching docker-compose file https://raw.githubusercontent.com/datahub-project/datahub/master/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml from GitHub +Pulling docker images... +Finished pulling docker images! + +[+] Running 11/11 +⠿ Container zookeeper Running 0.0s +⠿ Container elasticsearch Running 0.0s +⠿ Container broker Running 0.0s +⠿ Container schema-registry Running 0.0s +⠿ Container elasticsearch-setup Started 0.7s +⠿ Container kafka-setup Started 0.7s +⠿ Container mysql Running 0.0s +⠿ Container datahub-gms Running 0.0s +⠿ Container mysql-setup Started 0.7s +⠿ Container datahub-datahub-actions-1 Running 0.0s +⠿ Container datahub-frontend-react Running 0.0s +....... +✔ DataHub is now running +Ingest some demo data using `datahub docker ingest-sample-data`, +or head to http://localhost:9002 (username: datahub, password: datahub) to play around with the frontend. +Need support? Get in touch on Slack: https://slack.datahubproject.io/ +``` -That's it! Now feel free to play around with DataHub! +:::note Mac M1/M2 -## Troubleshooting Issues +On Mac computers with Apple Silicon (M1, M2 etc.), you might see an error like `no matching manifest for linux/arm64/v8 in the manifest list entries`. +This typically means that the datahub cli was not able to detect that you are running it on Apple Silicon. +To resolve this issue, override the default architecture detection by issuing `datahub docker quickstart --arch m1` -Please refer to [Quickstart Debugging Guide](./troubleshooting/quickstart.md). +::: -## Next Steps +### Sign In -### Ingest Metadata +Upon completion of this step, you should be able to navigate to the DataHub UI at [http://localhost:9002](http://localhost:9002) in your browser. +You can sign in using the default credentials below. -To start pushing your company's metadata into DataHub, take a look at [UI-based Ingestion Guide](./ui-ingestion.md), or to run ingestion using the cli, look at the [Metadata Ingestion Guide](../metadata-ingestion/README.md). +```json +username: datahub +password: datahub +``` -### Invite Users +To change the default credentials, please refer to [Change the default user datahub in quickstart](authentication/changing-default-credentials.md#quickstart). -To add users to your deployment to share with your team check out our [Adding Users to DataHub](authentication/guides/add-users.md) +### Ingest Sample Data -### Enable Authentication +To ingest the sample metadata, run the following CLI command from your terminal -To enable SSO, check out [Configuring OIDC Authentication](authentication/guides/sso/configure-oidc-react.md) or [Configuring JaaS Authentication](authentication/guides/jaas.md). 
+```bash +datahub docker ingest-sample-data +``` -To enable backend Authentication, check out [authentication in DataHub's backend](authentication/introducing-metadata-service-authentication.md#configuring-metadata-service-authentication). +:::note Token Authentication -### Change the Default `datahub` User Credentials +If you've enabled [Metadata Service Authentication](authentication/introducing-metadata-service-authentication.md), you'll need to provide a Personal Access Token +using the `--token ` parameter in the command. -:::note -Please note that deleting the `Data Hub` user in the UI **WILL NOT** disable the default user. You will still be able to log in using the default 'datahub:datahub' credentials. To safely delete the default credentials, please follow the guide provided below. ::: -Please refer to [Change the default user datahub in quickstart](authentication/changing-default-credentials.md#quickstart). - -### Move to Production +That's it! Now feel free to play around with DataHub! -We recommend deploying DataHub to production using Kubernetes. We provide helpful [Helm Charts](https://artifacthub.io/packages/helm/datahub/datahub) to help you quickly get up and running. Check out [Deploying DataHub to Kubernetes](./deploy/kubernetes.md) for a step-by-step walkthrough. +--- -The `quickstart` method of running DataHub is intended for local development and a quick way to experience the features that DataHub has to offer. It is not -intended for a production environment. This recommendation is based on the following points. +## Common Operations -#### Default Credentials +### Stop DataHub -`quickstart` uses docker-compose configuration which includes default credentials for both DataHub, and it's underlying -prerequisite data stores, such as MySQL. Additionally, other components are unauthenticated out of the box. This is a -design choice to make development easier and is not best practice for a production environment. - -#### Exposed Ports +To stop DataHub's quickstart, you can issue the following command. -DataHub's services, and it's backend data stores use the docker default behavior of binding to all interface addresses. -This makes it useful for development but is not recommended in a production environment. +```bash +datahub docker quickstart --stop +``` -#### Performance & Management +### Reset DataHub -* `quickstart` is limited by the resources available on a single host, there is no ability to scale horizontally. -* Rollout of new versions requires downtime. -* The configuration is largely pre-determined and not easily managed. -* `quickstart`, by default, follows the most recent builds forcing updates to the latest released and unreleased builds. +To cleanse DataHub of all of its state (e.g. before ingesting your own), you can use the CLI `nuke` command. -## Other Common Operations +```bash +datahub docker nuke +``` -### Stopping DataHub +### Upgrade DataHub -To stop DataHub's quickstart, you can issue the following command. +If you have been testing DataHub locally, a new version of DataHub got released and you want to try the new version then you can just issue the quickstart command again. It will pull down newer images and restart your instance without losing any data. -``` -datahub docker quickstart --stop +```bash +datahub docker quickstart ``` -### Resetting DataHub (a.k.a factory reset) +### Customize installation -To cleanse DataHub of all of its state (e.g. before ingesting your own), you can use the CLI `nuke` command. 
+If you would like to customize the DataHub installation further, please download the [docker-compose.yaml](https://raw.githubusercontent.com/datahub-project/datahub/master/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml) used by the cli tool, modify it as necessary and deploy DataHub by passing the downloaded docker-compose file: -``` -datahub docker nuke +```bash +datahub docker quickstart --quickstart-compose-file ``` -### Backing up your DataHub Quickstart (experimental) +### Back up DataHub -The quickstart image is not recommended for use as a production instance. See [Moving to production](#move-to-production) for recommendations on setting up your production cluster. However, in case you want to take a backup of your current quickstart state (e.g. you have a demo to your company coming up and you want to create a copy of the quickstart data so you can restore it at a future date), you can supply the `--backup` flag to quickstart. +The quickstart image is not recommended for use as a production instance.
+However, in case you want to take a backup of your current quickstart state (e.g. you have a demo to your company coming up and you want to create a copy of the quickstart data so you can restore it at a future date), you can supply the `--backup` flag to quickstart. -``` + + + +```bash datahub docker quickstart --backup ``` -will take a backup of your MySQL image and write it by default to your `~/.datahub/quickstart/` directory as the file `backup.sql`. You can customize this by passing a `--backup-file` argument. -e.g. +This will take a backup of your MySQL image and write it by default to your `~/.datahub/quickstart/` directory as the file `backup.sql`. + + + +```bash +datahub docker quickstart --backup --backup-file ``` -datahub docker quickstart --backup --backup-file /home/my_user/datahub_backups/quickstart_backup_2002_22_01.sql -``` -:::note +You can customize the backup file path by passing a `--backup-file` argument. + + + + +:::caution Note that the Quickstart backup does not include any timeseries data (dataset statistics, profiles, etc.), so you will lose that information if you delete all your indexes and restore from this backup. ::: -### Restoring your DataHub Quickstart (experimental) +### Restore DataHub As you might imagine, these backups are restore-able. The following section describes a few different options you have to restore your backup. -#### Restoring a backup (primary + index) [most common] + + To restore a previous backup, run the following command: -``` +```bash datahub docker quickstart --restore ``` @@ -221,38 +220,71 @@ This command will pick up the `backup.sql` file located under `~/.datahub/quicks To supply a specific backup file, use the `--restore-file` option. -``` +```bash datahub docker quickstart --restore --restore-file /home/my_user/datahub_backups/quickstart_backup_2002_22_01.sql ``` -#### Restoring only the index [to deal with index out of sync / corruption issues] + + Another situation that can come up is the index can get corrupt, or be missing some update. In order to re-bootstrap the index from the primary store, you can run this command to sync the index with the primary store. -``` +```bash datahub docker quickstart --restore-indices ``` -#### Restoring a backup (primary but NO index) [rarely used] + + + Sometimes, you might want to just restore the state of your primary database (MySQL), but not re-index the data. To do this, you have to explicitly disable the restore-indices capability. -``` +```bash datahub docker quickstart --restore --no-restore-indices ``` -### Upgrading your local DataHub + + -If you have been testing DataHub locally, a new version of DataHub got released and you want to try the new version then you can just issue the quickstart command again. It will pull down newer images and restart your instance without losing any data. +--- -``` -datahub docker quickstart -``` +## Next Steps -### Customization +- [Quickstart Debugging Guide](./troubleshooting/quickstart.md) +- [Ingest metadata through the UI](./ui-ingestion.md) +- [Ingest metadata through the CLI](../metadata-ingestion/README.md) +- [Add Users to DataHub](authentication/guides/add-users.md) +- [Configure OIDC Authentication](authentication/guides/sso/configure-oidc-react.md) +- [Configure JaaS Authentication](authentication/guides/jaas.md) +- [Configure authentication in DataHub's backend](authentication/introducing-metadata-service-authentication.md#configuring-metadata-service-authentication). 
+- [Change the default user datahub in quickstart](authentication/changing-default-credentials.md#quickstart) -If you would like to customize the DataHub installation further, please download the [docker-compose.yaml](https://raw.githubusercontent.com/datahub-project/datahub/master/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml) used by the cli tool, modify it as necessary and deploy DataHub by passing the downloaded docker-compose file: +### Move To Production -``` -datahub docker quickstart --quickstart-compose-file -``` +:::caution + +Quickstart is not intended for a production environment. We recommend deploying DataHub to production using Kubernetes. +We provide helpful [Helm Charts](https://artifacthub.io/packages/helm/datahub/datahub) to help you quickly get up and running. +Check out [Deploying DataHub to Kubernetes](./deploy/kubernetes.md) for a step-by-step walkthrough. + +::: + +The `quickstart` method of running DataHub is intended for local development and a quick way to experience the features that DataHub has to offer. +It is not intended for a production environment. This recommendation is based on the following points. + +#### Default Credentials + +`quickstart` uses docker-compose configuration which includes default credentials for both DataHub, and it's underlying +prerequisite data stores, such as MySQL. Additionally, other components are unauthenticated out of the box. This is a +design choice to make development easier and is not best practice for a production environment. + +#### Exposed Ports + +DataHub's services, and it's backend data stores use the docker default behavior of binding to all interface addresses. +This makes it useful for development but is not recommended in a production environment. + +#### Performance & Management + +`quickstart` is limited by the resources available on a single host, there is no ability to scale horizontally. +Rollout of new versions often requires downtime and the configuration is largely pre-determined and not easily managed. +Lastly, by default, `quickstart` follows the most recent builds forcing updates to the latest released and unreleased builds. 
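If you prefer to stay on a known release rather than following the most recent builds, the quickstart command also accepts a version flag (run `datahub docker quickstart --help` to confirm the exact option name for your CLI version); for example:

```bash
# Pin the quickstart deployment to a specific DataHub release
# instead of the latest build (replace the version with the one you want).
datahub docker quickstart --version v0.10.5
```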
From cdb9f5ba620956346479bdbf68920dbdd3f6e0cc Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Sat, 16 Sep 2023 00:25:39 +0530 Subject: [PATCH 02/37] feat(bigquery): add better timers around every API call (#8626) --- .../ingestion/source/bigquery_v2/bigquery.py | 241 ++------ .../source/bigquery_v2/bigquery_audit.py | 43 -- .../bigquery_v2/bigquery_audit_log_api.py | 139 +++++ .../source/bigquery_v2/bigquery_config.py | 86 ++- .../source/bigquery_v2/bigquery_report.py | 53 +- .../source/bigquery_v2/bigquery_schema.py | 530 ++++++----------- .../ingestion/source/bigquery_v2/common.py | 34 -- .../ingestion/source/bigquery_v2/lineage.py | 545 +++++++++--------- .../ingestion/source/bigquery_v2/queries.py | 426 ++++++++++++++ .../ingestion/source/bigquery_v2/usage.py | 240 ++------ .../ingestion/source/redshift/lineage.py | 4 +- .../source/snowflake/snowflake_v2.py | 21 +- .../src/datahub/utilities/perf_timer.py | 69 ++- .../integration/bigquery_v2/test_bigquery.py | 14 +- .../tests/unit/test_bigquery_lineage.py | 11 +- .../tests/unit/test_bigquery_source.py | 141 +++-- .../unit/test_bigqueryv2_usage_source.py | 11 +- .../tests/unit/utilities/test_perf_timer.py | 46 ++ 18 files changed, 1450 insertions(+), 1204 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py create mode 100644 metadata-ingestion/tests/unit/utilities/test_perf_timer.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 1107a54a1896b..ae49a4ba17c11 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -4,7 +4,7 @@ import re import traceback from collections import defaultdict -from datetime import datetime, timedelta, timezone +from datetime import datetime, timedelta from typing import Dict, Iterable, List, Optional, Set, Type, Union, cast from google.cloud import bigquery @@ -44,21 +44,17 @@ from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( BigqueryColumn, - BigQueryDataDictionary, BigqueryDataset, BigqueryProject, + BigQuerySchemaApi, BigqueryTable, BigqueryView, ) from datahub.ingestion.source.bigquery_v2.common import ( BQ_EXTERNAL_DATASET_URL_TEMPLATE, BQ_EXTERNAL_TABLE_URL_TEMPLATE, - get_bigquery_client, -) -from datahub.ingestion.source.bigquery_v2.lineage import ( - BigqueryLineageExtractor, - make_lineage_edges_from_parsing_result, ) +from datahub.ingestion.source.bigquery_v2.lineage import BigqueryLineageExtractor from datahub.ingestion.source.bigquery_v2.profiler import BigqueryProfiler from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor from datahub.ingestion.source.common.subtypes import ( @@ -83,7 +79,6 @@ StatefulIngestionSourceBase, ) from datahub.ingestion.source_report.ingestion_stage import ( - LINEAGE_EXTRACTION, METADATA_EXTRACTION, PROFILING, ) @@ -94,7 +89,6 @@ ) from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( DatasetProperties, - UpstreamLineage, ViewProperties, ) from datahub.metadata.com.linkedin.pegasus2avro.schema import ( @@ -113,11 +107,9 @@ ) from datahub.metadata.schema_classes import ( DataPlatformInstanceClass, - 
DatasetLineageTypeClass, GlobalTagsClass, TagAssociationClass, ) -from datahub.specific.dataset import DatasetPatchBuilder from datahub.utilities.file_backed_collections import FileBackedDict from datahub.utilities.hive_schema_to_avro import ( HiveColumnToAvroConverter, @@ -126,7 +118,7 @@ from datahub.utilities.mapping import Constants from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.registries.domain_registry import DomainRegistry -from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage +from datahub.utilities.sqlglot_lineage import SchemaResolver logger: logging.Logger = logging.getLogger(__name__) @@ -228,11 +220,15 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): set_dataset_urn_to_lower(self.config.convert_urns_to_lowercase) - self.redundant_lineage_run_skip_handler: Optional[ + self.bigquery_data_dictionary = BigQuerySchemaApi( + self.report.schema_api_perf, self.config.get_bigquery_client() + ) + + redundant_lineage_run_skip_handler: Optional[ RedundantLineageRunSkipHandler ] = None if self.config.enable_stateful_lineage_ingestion: - self.redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler( + redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler( source=self, config=self.config, pipeline_name=self.ctx.pipeline_name, @@ -241,7 +237,10 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): # For database, schema, tables, views, etc self.lineage_extractor = BigqueryLineageExtractor( - config, self.report, self.redundant_lineage_run_skip_handler + config, + self.report, + dataset_urn_builder=self.gen_dataset_urn_from_ref, + redundant_run_skip_handler=redundant_lineage_run_skip_handler, ) redundant_usage_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = None @@ -289,6 +288,7 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): self.sql_parser_schema_resolver = SchemaResolver( platform=self.platform, env=self.config.env ) + self.add_config_to_report() atexit.register(cleanup, config) @@ -314,18 +314,20 @@ def metadata_read_capability_test( for project_id in project_ids: try: logger.info((f"Metadata read capability test for project {project_id}")) - client: bigquery.Client = get_bigquery_client(config) + client: bigquery.Client = config.get_bigquery_client() assert client - result = BigQueryDataDictionary.get_datasets_for_project_id( - client, project_id, 10 + bigquery_data_dictionary = BigQuerySchemaApi( + BigQueryV2Report().schema_api_perf, client + ) + result = bigquery_data_dictionary.get_datasets_for_project_id( + project_id, 10 ) if len(result) == 0: return CapabilityReport( capable=False, failure_reason=f"Dataset query returned empty dataset. 
It is either empty or no dataset in project {project_id}", ) - tables = BigQueryDataDictionary.get_tables_for_dataset( - conn=client, + tables = bigquery_data_dictionary.get_tables_for_dataset( project_id=project_id, dataset_name=result[0].name, tables={}, @@ -351,7 +353,9 @@ def lineage_capability_test( project_ids: List[str], report: BigQueryV2Report, ) -> CapabilityReport: - lineage_extractor = BigqueryLineageExtractor(connection_conf, report) + lineage_extractor = BigqueryLineageExtractor( + connection_conf, report, lambda ref: "" + ) for project_id in project_ids: try: logger.info(f"Lineage capability test for project {project_id}") @@ -397,7 +401,7 @@ def test_connection(config_dict: dict) -> TestConnectionReport: try: connection_conf = BigQueryV2Config.parse_obj_allow_extras(config_dict) - client: bigquery.Client = get_bigquery_client(connection_conf) + client: bigquery.Client = connection_conf.get_bigquery_client() assert client test_report.basic_connectivity = BigqueryV2Source.connectivity_test(client) @@ -519,54 +523,30 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: - conn: bigquery.Client = get_bigquery_client(self.config) - - projects = self._get_projects(conn) + projects = self._get_projects() if not projects: return for project_id in projects: self.report.set_ingestion_stage(project_id.id, METADATA_EXTRACTION) logger.info(f"Processing project: {project_id.id}") - yield from self._process_project(conn, project_id) + yield from self._process_project(project_id) if self.config.include_usage_statistics: yield from self.usage_extractor.get_usage_workunits( [p.id for p in projects], self.table_refs ) - if self._should_ingest_lineage(): - for project in projects: - self.report.set_ingestion_stage(project.id, LINEAGE_EXTRACTION) - yield from self.generate_lineage(project.id) - - if self.redundant_lineage_run_skip_handler: - # Update the checkpoint state for this run. - self.redundant_lineage_run_skip_handler.update_state( - self.config.start_time, self.config.end_time - ) - - def _should_ingest_lineage(self) -> bool: - if not self.config.include_table_lineage: - return False - - if ( - self.redundant_lineage_run_skip_handler - and self.redundant_lineage_run_skip_handler.should_skip_this_run( - cur_start_time=self.config.start_time, - cur_end_time=self.config.end_time, + if self.config.include_table_lineage: + yield from self.lineage_extractor.get_lineage_workunits( + [p.id for p in projects], + self.sql_parser_schema_resolver, + self.view_refs_by_project, + self.view_definitions, + self.table_refs, ) - ): - # Skip this run - self.report.report_warning( - "lineage-extraction", - "Skip this run as there was already a run for current ingestion window.", - ) - return False - - return True - def _get_projects(self, conn: bigquery.Client) -> List[BigqueryProject]: + def _get_projects(self) -> List[BigqueryProject]: logger.info("Getting projects") if self.config.project_ids or self.config.project_id: project_ids = self.config.project_ids or [self.config.project_id] # type: ignore @@ -575,15 +555,10 @@ def _get_projects(self, conn: bigquery.Client) -> List[BigqueryProject]: for project_id in project_ids ] else: - return list(self._get_project_list(conn)) - - def _get_project_list(self, conn: bigquery.Client) -> Iterable[BigqueryProject]: - try: - projects = BigQueryDataDictionary.get_projects(conn) - except Exception as e: - logger.error(f"Error getting projects. 
{e}", exc_info=True) - projects = [] + return list(self._query_project_list()) + def _query_project_list(self) -> Iterable[BigqueryProject]: + projects = self.bigquery_data_dictionary.get_projects() if not projects: # Report failure on exception and if empty list is returned self.report.report_failure( "metadata-extraction", @@ -600,7 +575,7 @@ def _get_project_list(self, conn: bigquery.Client) -> Iterable[BigqueryProject]: self.report.report_dropped(project.id) def _process_project( - self, conn: bigquery.Client, bigquery_project: BigqueryProject + self, bigquery_project: BigqueryProject ) -> Iterable[MetadataWorkUnit]: db_tables: Dict[str, List[BigqueryTable]] = {} db_views: Dict[str, List[BigqueryView]] = {} @@ -611,7 +586,7 @@ def _process_project( try: bigquery_project.datasets = ( - BigQueryDataDictionary.get_datasets_for_project_id(conn, project_id) + self.bigquery_data_dictionary.get_datasets_for_project_id(project_id) ) except Exception as e: error_message = f"Unable to get datasets for project {project_id}, skipping. The error was: {e}" @@ -645,7 +620,7 @@ def _process_project( try: # db_tables and db_views are populated in the this method yield from self._process_schema( - conn, project_id, bigquery_dataset, db_tables, db_views + project_id, bigquery_dataset, db_tables, db_views ) except Exception as e: @@ -670,73 +645,8 @@ def _process_project( tables=db_tables, ) - def generate_lineage(self, project_id: str) -> Iterable[MetadataWorkUnit]: - logger.info(f"Generate lineage for {project_id}") - lineage = self.lineage_extractor.calculate_lineage_for_project( - project_id, - sql_parser_schema_resolver=self.sql_parser_schema_resolver, - ) - - if self.config.lineage_parse_view_ddl: - for view in self.view_refs_by_project[project_id]: - view_definition = self.view_definitions[view] - raw_view_lineage = sqlglot_lineage( - view_definition, - schema_resolver=self.sql_parser_schema_resolver, - default_db=project_id, - ) - if raw_view_lineage.debug_info.table_error: - logger.debug( - f"Failed to parse lineage for view {view}: {raw_view_lineage.debug_info.table_error}" - ) - self.report.num_view_definitions_failed_parsing += 1 - self.report.view_definitions_parsing_failures.append( - f"Table-level sql parsing error for view {view}: {raw_view_lineage.debug_info.table_error}" - ) - continue - elif raw_view_lineage.debug_info.column_error: - self.report.num_view_definitions_failed_column_parsing += 1 - self.report.view_definitions_parsing_failures.append( - f"Column-level sql parsing error for view {view}: {raw_view_lineage.debug_info.column_error}" - ) - else: - self.report.num_view_definitions_parsed += 1 - - # For views, we override the upstreams obtained by parsing audit logs - # as they may contain indirectly referenced tables. 
- ts = datetime.now(timezone.utc) - lineage[view] = set( - make_lineage_edges_from_parsing_result( - raw_view_lineage, - audit_stamp=ts, - lineage_type=DatasetLineageTypeClass.VIEW, - ) - ) - - for lineage_key in lineage.keys(): - if lineage_key not in self.table_refs: - continue - - table_ref = BigQueryTableRef.from_string_name(lineage_key) - dataset_urn = self.gen_dataset_urn( - project_id=table_ref.table_identifier.project_id, - dataset_name=table_ref.table_identifier.dataset, - table=table_ref.table_identifier.get_table_display_name(), - ) - - lineage_info = self.lineage_extractor.get_lineage_for_table( - bq_table=table_ref, - bq_table_urn=dataset_urn, - platform=self.platform, - lineage_metadata=lineage, - ) - - if lineage_info: - yield from self.gen_lineage(dataset_urn, lineage_info) - def _process_schema( self, - conn: bigquery.Client, project_id: str, bigquery_dataset: BigqueryDataset, db_tables: Dict[str, List[BigqueryTable]], @@ -750,8 +660,7 @@ def _process_schema( columns = None if self.config.include_tables or self.config.include_views: - columns = BigQueryDataDictionary.get_columns_for_dataset( - conn, + columns = self.bigquery_data_dictionary.get_columns_for_dataset( project_id=project_id, dataset_name=dataset_name, column_limit=self.config.column_limit, @@ -760,7 +669,7 @@ def _process_schema( if self.config.include_tables: db_tables[dataset_name] = list( - self.get_tables_for_dataset(conn, project_id, dataset_name) + self.get_tables_for_dataset(project_id, dataset_name) ) for table in db_tables[dataset_name]: @@ -773,7 +682,9 @@ def _process_schema( ) elif self.config.include_table_lineage or self.config.include_usage_statistics: # Need table_refs to calculate lineage and usage - for table_item in conn.list_tables(f"{project_id}.{dataset_name}"): + for table_item in self.bigquery_data_dictionary.list_tables( + dataset_name, project_id + ): identifier = BigqueryTableIdentifier( project_id=project_id, dataset=dataset_name, @@ -793,8 +704,8 @@ def _process_schema( if self.config.include_views: db_views[dataset_name] = list( - BigQueryDataDictionary.get_views_for_dataset( - conn, project_id, dataset_name, self.config.is_profiling_enabled() + self.bigquery_data_dictionary.get_views_for_dataset( + project_id, dataset_name, self.config.is_profiling_enabled() ) ) @@ -1065,39 +976,6 @@ def gen_dataset_workunits( domain_config=self.config.domain, ) - def gen_lineage( - self, - dataset_urn: str, - upstream_lineage: Optional[UpstreamLineage] = None, - ) -> Iterable[MetadataWorkUnit]: - if upstream_lineage is None: - return - - if upstream_lineage is not None: - if self.config.incremental_lineage: - patch_builder: DatasetPatchBuilder = DatasetPatchBuilder( - urn=dataset_urn - ) - for upstream in upstream_lineage.upstreams: - patch_builder.add_upstream_lineage(upstream) - - yield from [ - MetadataWorkUnit( - id=f"upstreamLineage-for-{dataset_urn}", - mcp_raw=mcp, - ) - for mcp in patch_builder.build() - ] - else: - if not self.config.extract_column_lineage: - upstream_lineage.fineGrainedLineages = None - - yield from [ - MetadataChangeProposalWrapper( - entityUrn=dataset_urn, aspect=upstream_lineage - ).as_workunit() - ] - def gen_tags_aspect_workunit( self, dataset_urn: str, tags_to_add: List[str] ) -> MetadataWorkUnit: @@ -1212,7 +1090,6 @@ def get_report(self) -> BigQueryV2Report: def get_tables_for_dataset( self, - conn: bigquery.Client, project_id: str, dataset_name: str, ) -> Iterable[BigqueryTable]: @@ -1231,14 +1108,15 @@ def get_tables_for_dataset( # We get the list of tables 
in the dataset to get core table properties and to be able to process the tables in batches # We collect only the latest shards from sharded tables (tables with _YYYYMMDD suffix) and ignore temporary tables - table_items = self.get_core_table_details(conn, dataset_name, project_id) + table_items = self.get_core_table_details( + dataset_name, project_id, self.config.temp_table_dataset_prefix + ) items_to_get: Dict[str, TableListItem] = {} for table_item in table_items.keys(): items_to_get[table_item] = table_items[table_item] if len(items_to_get) % max_batch_size == 0: - yield from BigQueryDataDictionary.get_tables_for_dataset( - conn, + yield from self.bigquery_data_dictionary.get_tables_for_dataset( project_id, dataset_name, items_to_get, @@ -1247,8 +1125,7 @@ def get_tables_for_dataset( items_to_get.clear() if items_to_get: - yield from BigQueryDataDictionary.get_tables_for_dataset( - conn, + yield from self.bigquery_data_dictionary.get_tables_for_dataset( project_id, dataset_name, items_to_get, @@ -1260,13 +1137,15 @@ def get_tables_for_dataset( ) def get_core_table_details( - self, conn: bigquery.Client, dataset_name: str, project_id: str + self, dataset_name: str, project_id: str, temp_table_dataset_prefix: str ) -> Dict[str, TableListItem]: table_items: Dict[str, TableListItem] = {} # Dict to store sharded table and the last seen max shard id sharded_tables: Dict[str, TableListItem] = {} - for table in conn.list_tables(f"{project_id}.{dataset_name}"): + for table in self.bigquery_data_dictionary.list_tables( + dataset_name, project_id + ): table_identifier = BigqueryTableIdentifier( project_id=project_id, dataset=dataset_name, @@ -1303,9 +1182,7 @@ def get_core_table_details( if stored_shard < shard: sharded_tables[table_name] = table continue - elif str(table_identifier).startswith( - self.config.temp_table_dataset_prefix - ): + elif str(table_identifier).startswith(temp_table_dataset_prefix): logger.debug(f"Dropping temporary table {table_identifier.table}") self.report.report_dropped(table_identifier.raw_table_name()) continue diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py index 0f9b37c93feaa..b0ac77201b415 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py @@ -13,48 +13,6 @@ get_first_missing_key_any, ) -BQ_FILTER_RULE_TEMPLATE = "BQ_FILTER_RULE_TEMPLATE" - -BQ_AUDIT_V2 = { - BQ_FILTER_RULE_TEMPLATE: """ -resource.type=("bigquery_project" OR "bigquery_dataset") -AND -timestamp >= "{start_time}" -AND -timestamp < "{end_time}" -AND protoPayload.serviceName="bigquery.googleapis.com" -AND -( - ( - protoPayload.methodName= - ( - "google.cloud.bigquery.v2.JobService.Query" - OR - "google.cloud.bigquery.v2.JobService.InsertJob" - ) - AND protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE" - AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:* - AND protoPayload.metadata.jobChange.job.jobConfig.queryConfig:* - AND - ( - ( - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:* - AND NOT protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables =~ "projects/.*/datasets/.*/tables/__TABLES__|__TABLES_SUMMARY__|INFORMATION_SCHEMA.*" - ) - OR - ( - protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable:* - ) - ) - ) - OR - protoPayload.metadata.tableDataRead.reason = 
"JOB" -) -""".strip( - "\t \n" - ), -} - AuditLogEntry = Any # BigQueryAuditMetadata is the v2 format in which audit logs are exported to BigQuery @@ -606,7 +564,6 @@ def from_query_event( query_event: QueryEvent, debug_include_full_payloads: bool = False, ) -> "ReadEvent": - readEvent = ReadEvent( actor_email=query_event.actor_email, timestamp=query_event.timestamp, diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py new file mode 100644 index 0000000000000..03b12c61ee5c6 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py @@ -0,0 +1,139 @@ +import logging +from datetime import datetime +from typing import Callable, Iterable, List, Optional + +from google.cloud import bigquery +from google.cloud.logging_v2.client import Client as GCPLoggingClient +from ratelimiter import RateLimiter + +from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( + AuditLogEntry, + BigQueryAuditMetadata, +) +from datahub.ingestion.source.bigquery_v2.bigquery_report import ( + BigQueryAuditLogApiPerfReport, +) +from datahub.ingestion.source.bigquery_v2.common import ( + BQ_DATE_SHARD_FORMAT, + BQ_DATETIME_FORMAT, +) + +logger: logging.Logger = logging.getLogger(__name__) + + +# Api interfaces are separated based on functionality they provide +# rather than the underlying bigquery client that is used to +# provide the functionality. +class BigQueryAuditLogApi: + def __init__( + self, + report: BigQueryAuditLogApiPerfReport, + rate_limit: bool, + requests_per_min: int, + ) -> None: + self.report = report + self.rate_limit = rate_limit + self.requests_per_min = requests_per_min + + def get_exported_bigquery_audit_metadata( + self, + bigquery_client: bigquery.Client, + bigquery_audit_metadata_query_template: Callable[ + [ + str, # dataset: str + bool, # use_date_sharded_tables: bool + Optional[int], # limit: Optional[int] = None + ], + str, + ], + bigquery_audit_metadata_datasets: Optional[List[str]], + use_date_sharded_audit_log_tables: bool, + start_time: datetime, + end_time: datetime, + limit: Optional[int] = None, + ) -> Iterable[BigQueryAuditMetadata]: + if bigquery_audit_metadata_datasets is None: + return + + audit_start_time = start_time.strftime(BQ_DATETIME_FORMAT) + audit_start_date = start_time.strftime(BQ_DATE_SHARD_FORMAT) + + audit_end_time = end_time.strftime(BQ_DATETIME_FORMAT) + audit_end_date = end_time.strftime(BQ_DATE_SHARD_FORMAT) + + rate_limiter: Optional[RateLimiter] = None + if self.rate_limit: + rate_limiter = RateLimiter(max_calls=self.requests_per_min, period=60) + + with self.report.get_exported_log_entries as current_timer: + for dataset in bigquery_audit_metadata_datasets: + logger.info( + f"Start loading log entries from BigQueryAuditMetadata in {dataset}" + ) + + query = bigquery_audit_metadata_query_template( + dataset, + use_date_sharded_audit_log_tables, + limit, + ).format( + start_time=audit_start_time, + end_time=audit_end_time, + start_date=audit_start_date, + end_date=audit_end_date, + ) + + query_job = bigquery_client.query(query) + logger.info( + f"Finished loading log entries from BigQueryAuditMetadata in {dataset}" + ) + + for entry in query_job: + with current_timer.pause(): + if rate_limiter: + with rate_limiter: + yield entry + else: + yield entry + + def get_bigquery_log_entries_via_gcp_logging( + self, + client: GCPLoggingClient, + filter: str, + log_page_size: int, + 
limit: Optional[int] = None, + ) -> Iterable[AuditLogEntry]: + logger.debug(filter) + + list_entries: Iterable[AuditLogEntry] + rate_limiter: Optional[RateLimiter] = None + if self.rate_limit: + # client.list_entries is a generator, does api calls to GCP Logging when it runs out of entries and needs to fetch more from GCP Logging + # to properly ratelimit we multiply the page size by the number of requests per minute + rate_limiter = RateLimiter( + max_calls=self.requests_per_min * log_page_size, + period=60, + ) + + with self.report.list_log_entries as current_timer: + list_entries = client.list_entries( + filter_=filter, + page_size=log_page_size, + max_results=limit, + ) + + for i, entry in enumerate(list_entries): + if i % 1000 == 0: + logger.info( + f"Loaded {i} log entries from GCP Log for {client.project}" + ) + + with current_timer.pause(): + if rate_limiter: + with rate_limiter: + yield entry + else: + yield entry + + logger.info( + f"Finished loading log entries from GCP Log for {client.project}" + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index 0f2082c5e53bf..3b06a4699c566 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -4,9 +4,11 @@ from typing import Any, Dict, List, Optional import pydantic -from pydantic import Field, PositiveInt, PrivateAttr, root_validator +from google.cloud import bigquery +from google.cloud.logging_v2.client import Client as GCPLoggingClient +from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator -from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.ingestion.source.sql.sql_config import SQLCommonConfig from datahub.ingestion.source.state.stateful_ingestion_base import ( @@ -35,7 +37,52 @@ class BigQueryUsageConfig(BaseUsageConfig): ) +class BigQueryConnectionConfig(ConfigModel): + credential: Optional[BigQueryCredential] = Field( + default=None, description="BigQuery credential informations" + ) + + _credentials_path: Optional[str] = PrivateAttr(None) + + extra_client_options: Dict[str, Any] = Field( + default={}, + description="Additional options to pass to google.cloud.logging_v2.client.Client.", + ) + + project_on_behalf: Optional[str] = Field( + default=None, + description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. If not passed, falls back to the project associated with the service account.", + ) + + def __init__(self, **data: Any): + super().__init__(**data) + + if self.credential: + self._credentials_path = self.credential.create_credential_temp_file() + logger.debug( + f"Creating temporary credential file at {self._credentials_path}" + ) + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path + + def get_bigquery_client(config) -> bigquery.Client: + client_options = config.extra_client_options + return bigquery.Client(config.project_on_behalf, **client_options) + + def make_gcp_logging_client( + self, project_id: Optional[str] = None + ) -> GCPLoggingClient: + # See https://github.com/googleapis/google-cloud-python/issues/2674 for + # why we disable gRPC here. 
+ client_options = self.extra_client_options.copy() + client_options["_use_grpc"] = False + if project_id is not None: + return GCPLoggingClient(**client_options, project=project_id) + else: + return GCPLoggingClient(**client_options) + + class BigQueryV2Config( + BigQueryConnectionConfig, BigQueryBaseConfig, SQLCommonConfig, StatefulUsageConfigMixin, @@ -122,11 +169,6 @@ class BigQueryV2Config( ), ) - project_on_behalf: Optional[str] = Field( - default=None, - description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. If not passed, falls back to the project associated with the service account.", - ) - storage_project_id: None = Field(default=None, hidden_from_docs=True) lineage_use_sql_parser: bool = Field( @@ -180,14 +222,8 @@ def validate_column_lineage(cls, v: bool, values: Dict[str, Any]) -> bool: default=1000, description="The number of log item will be queried per page for lineage collection", ) - credential: Optional[BigQueryCredential] = Field( - description="BigQuery credential informations" - ) + # extra_client_options, include_table_lineage and max_query_duration are relevant only when computing the lineage. - extra_client_options: Dict[str, Any] = Field( - default={}, - description="Additional options to pass to google.cloud.logging_v2.client.Client.", - ) include_table_lineage: Optional[bool] = Field( default=True, description="Option to enable/disable lineage generation. Is enabled by default.", @@ -209,7 +245,6 @@ def validate_column_lineage(cls, v: bool, values: Dict[str, Any]) -> bool: default=False, description="Whether to read date sharded tables or time partitioned tables when extracting usage from exported audit logs.", ) - _credentials_path: Optional[str] = PrivateAttr(None) _cache_path: Optional[str] = PrivateAttr(None) @@ -230,16 +265,6 @@ def validate_column_lineage(cls, v: bool, values: Dict[str, Any]) -> bool: description="Maximum number of entries for the in-memory caches of FileBacked data structures.", ) - def __init__(self, **data: Any): - super().__init__(**data) - - if self.credential: - self._credentials_path = self.credential.create_credential_temp_file() - logger.debug( - f"Creating temporary credential file at {self._credentials_path}" - ) - os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path - @root_validator(pre=False) def profile_default_settings(cls, values: Dict) -> Dict: # Extra default SQLAlchemy option for better connection pooling and threading. @@ -248,6 +273,17 @@ def profile_default_settings(cls, values: Dict) -> Dict: return values + @validator("bigquery_audit_metadata_datasets") + def validate_bigquery_audit_metadata_datasets( + cls, v: Optional[List[str]], values: Dict + ) -> Optional[List[str]]: + if values.get("use_exported_bigquery_audit_metadata"): + assert ( + v and len(v) > 0 + ), "`bigquery_audit_metadata_datasets` should be set if using `use_exported_bigquery_audit_metadata: True`." 
+ + return v + @root_validator(pre=False) def backward_compatibility_configs_set(cls, values: Dict) -> Dict: project_id = values.get("project_id") diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py index b2251fbb8ab1f..2d6882caa38ef 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py @@ -1,5 +1,4 @@ import collections -import dataclasses import logging from dataclasses import dataclass, field from datetime import datetime @@ -11,11 +10,26 @@ from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport from datahub.ingestion.source_report.time_window import BaseTimeWindowReport from datahub.utilities.lossy_collections import LossyDict, LossyList +from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.stats_collections import TopKDict, int_top_k_dict logger: logging.Logger = logging.getLogger(__name__) +class BigQuerySchemaApiPerfReport: + list_projects = PerfTimer() + list_datasets = PerfTimer() + get_columns_for_dataset = PerfTimer() + get_tables_for_dataset = PerfTimer() + list_tables = PerfTimer() + get_views_for_dataset = PerfTimer() + + +class BigQueryAuditLogApiPerfReport: + get_exported_log_entries = PerfTimer() + list_log_entries = PerfTimer() + + @dataclass class BigQueryV2Report(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowReport): num_total_lineage_entries: TopKDict[str, int] = field(default_factory=TopKDict) @@ -31,8 +45,12 @@ class BigQueryV2Report(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowR num_skipped_lineage_entries_other: TopKDict[str, int] = field( default_factory=int_top_k_dict ) - num_total_log_entries: TopKDict[str, int] = field(default_factory=int_top_k_dict) - num_parsed_log_entries: TopKDict[str, int] = field(default_factory=int_top_k_dict) + num_lineage_total_log_entries: TopKDict[str, int] = field( + default_factory=int_top_k_dict + ) + num_lineage_parsed_log_entries: TopKDict[str, int] = field( + default_factory=int_top_k_dict + ) num_lineage_log_parse_failures: TopKDict[str, int] = field( default_factory=int_top_k_dict ) @@ -42,7 +60,14 @@ class BigQueryV2Report(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowR lineage_mem_size: Dict[str, str] = field(default_factory=TopKDict) lineage_extraction_sec: Dict[str, float] = field(default_factory=TopKDict) usage_extraction_sec: Dict[str, float] = field(default_factory=TopKDict) + num_usage_total_log_entries: TopKDict[str, int] = field( + default_factory=int_top_k_dict + ) + num_usage_parsed_log_entries: TopKDict[str, int] = field( + default_factory=int_top_k_dict + ) usage_error_count: Dict[str, int] = field(default_factory=int_top_k_dict) + num_usage_resources_dropped: int = 0 num_usage_operations_dropped: int = 0 operation_dropped: LossyList[str] = field(default_factory=LossyList) @@ -53,10 +78,10 @@ class BigQueryV2Report(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowR use_date_sharded_audit_log_tables: Optional[bool] = None log_page_size: Optional[pydantic.PositiveInt] = None use_exported_bigquery_audit_metadata: Optional[bool] = None - log_entry_start_time: Optional[str] = None - log_entry_end_time: Optional[str] = None - audit_start_time: Optional[str] = None - audit_end_time: Optional[str] = None + log_entry_start_time: Optional[datetime] = None + log_entry_end_time: Optional[datetime] = 
None + audit_start_time: Optional[datetime] = None + audit_end_time: Optional[datetime] = None upstream_lineage: LossyDict = field(default_factory=LossyDict) partition_info: Dict[str, str] = field(default_factory=TopKDict) profile_table_selection_criteria: Dict[str, str] = field(default_factory=TopKDict) @@ -89,13 +114,17 @@ class BigQueryV2Report(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowR num_view_definitions_failed_column_parsing: int = 0 view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList) - read_reasons_stat: Counter[str] = dataclasses.field( - default_factory=collections.Counter + read_reasons_stat: Counter[str] = field(default_factory=collections.Counter) + operation_types_stat: Counter[str] = field(default_factory=collections.Counter) + + usage_state_size: Optional[str] = None + + schema_api_perf: BigQuerySchemaApiPerfReport = field( + default_factory=BigQuerySchemaApiPerfReport ) - operation_types_stat: Counter[str] = dataclasses.field( - default_factory=collections.Counter + audit_log_api_perf: BigQueryAuditLogApiPerfReport = field( + default_factory=BigQueryAuditLogApiPerfReport ) - usage_state_size: Optional[str] = None lineage_start_time: Optional[datetime] = None lineage_end_time: Optional[datetime] = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py index 47a04c545231b..7edc8656360bb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py @@ -13,22 +13,19 @@ ) from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier -from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report +from datahub.ingestion.source.bigquery_v2.bigquery_report import ( + BigQuerySchemaApiPerfReport, + BigQueryV2Report, +) +from datahub.ingestion.source.bigquery_v2.queries import ( + BigqueryQuery, + BigqueryTableType, +) from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable, BaseView logger: logging.Logger = logging.getLogger(__name__) -class BigqueryTableType: - # See https://cloud.google.com/bigquery/docs/information-schema-tables#schema - BASE_TABLE = "BASE TABLE" - EXTERNAL = "EXTERNAL" - VIEW = "VIEW" - MATERIALIZED_VIEW = "MATERIALIZED VIEW" - CLONE = "CLONE" - SNAPSHOT = "SNAPSHOT" - - @dataclass class BigqueryColumn(BaseColumn): field_path: str @@ -129,253 +126,43 @@ class BigqueryProject: datasets: List[BigqueryDataset] = field(default_factory=list) -class BigqueryQuery: - show_datasets: str = ( - "select schema_name from `{project_id}`.INFORMATION_SCHEMA.SCHEMATA" - ) - - datasets_for_project_id: str = """ -select - s.CATALOG_NAME as catalog_name, - s.schema_name as table_schema, - s.location as location, - s.CREATION_TIME as created, - s.LAST_MODIFIED_TIME as last_altered, - o.OPTION_VALUE as comment -from - `{project_id}`.INFORMATION_SCHEMA.SCHEMATA as s - left join `{project_id}`.INFORMATION_SCHEMA.SCHEMATA_OPTIONS as o on o.schema_name = s.schema_name - and o.option_name = "description" -order by - s.schema_name -""" - - # https://cloud.google.com/bigquery/docs/information-schema-table-storage?hl=en - # Note for max_partition_id - - # should we instead pick the partition with latest LAST_MODIFIED_TIME ? 
- # for range partitioning max may not be latest partition - tables_for_dataset = f""" -SELECT - t.table_catalog as table_catalog, - t.table_schema as table_schema, - t.table_name as table_name, - t.table_type as table_type, - t.creation_time as created, - ts.last_modified_time as last_altered, - tos.OPTION_VALUE as comment, - is_insertable_into, - ddl, - row_count, - size_bytes as bytes, - num_partitions, - max_partition_id, - active_billable_bytes, - long_term_billable_bytes, - REGEXP_EXTRACT(t.table_name, r".*_(\\d+)$") as table_suffix, - REGEXP_REPLACE(t.table_name, r"_(\\d+)$", "") as table_base - -FROM - `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t - join `{{project_id}}`.`{{dataset_name}}`.__TABLES__ as ts on ts.table_id = t.TABLE_NAME - left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema - and t.TABLE_NAME = tos.TABLE_NAME - and tos.OPTION_NAME = "description" - left join ( - select - table_name, - sum(case when partition_id not in ('__NULL__', '__UNPARTITIONED__', '__STREAMING_UNPARTITIONED__') then 1 else 0 END) as num_partitions, - max(case when partition_id not in ('__NULL__', '__UNPARTITIONED__', '__STREAMING_UNPARTITIONED__') then partition_id else NULL END) as max_partition_id, - sum(total_rows) as total_rows, - sum(case when storage_tier = 'LONG_TERM' then total_billable_bytes else 0 end) as long_term_billable_bytes, - sum(case when storage_tier = 'ACTIVE' then total_billable_bytes else 0 end) as active_billable_bytes, - from - `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.PARTITIONS - group by - table_name) as p on - t.table_name = p.table_name -WHERE - table_type in ('{BigqueryTableType.BASE_TABLE}', '{BigqueryTableType.EXTERNAL}') -{{table_filter}} -order by - table_schema ASC, - table_base ASC, - table_suffix DESC -""" - - tables_for_dataset_without_partition_data = f""" -SELECT - t.table_catalog as table_catalog, - t.table_schema as table_schema, - t.table_name as table_name, - t.table_type as table_type, - t.creation_time as created, - tos.OPTION_VALUE as comment, - is_insertable_into, - ddl, - REGEXP_EXTRACT(t.table_name, r".*_(\\d+)$") as table_suffix, - REGEXP_REPLACE(t.table_name, r"_(\\d+)$", "") as table_base - -FROM - `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t - left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema - and t.TABLE_NAME = tos.TABLE_NAME - and tos.OPTION_NAME = "description" -WHERE - table_type in ('{BigqueryTableType.BASE_TABLE}', '{BigqueryTableType.EXTERNAL}') -{{table_filter}} -order by - table_schema ASC, - table_base ASC, - table_suffix DESC -""" - - views_for_dataset: str = f""" -SELECT - t.table_catalog as table_catalog, - t.table_schema as table_schema, - t.table_name as table_name, - t.table_type as table_type, - t.creation_time as created, - ts.last_modified_time as last_altered, - tos.OPTION_VALUE as comment, - is_insertable_into, - ddl as view_definition, - row_count, - size_bytes -FROM - `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t - join `{{project_id}}`.`{{dataset_name}}`.__TABLES__ as ts on ts.table_id = t.TABLE_NAME - left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema - and t.TABLE_NAME = tos.TABLE_NAME - and tos.OPTION_NAME = "description" -WHERE - table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}') -order by - table_schema 
ASC, - table_name ASC -""" - - views_for_dataset_without_data_read: str = f""" -SELECT - t.table_catalog as table_catalog, - t.table_schema as table_schema, - t.table_name as table_name, - t.table_type as table_type, - t.creation_time as created, - tos.OPTION_VALUE as comment, - is_insertable_into, - ddl as view_definition -FROM - `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t - left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema - and t.TABLE_NAME = tos.TABLE_NAME - and tos.OPTION_NAME = "description" -WHERE - table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}') -order by - table_schema ASC, - table_name ASC -""" - - columns_for_dataset: str = """ -select - c.table_catalog as table_catalog, - c.table_schema as table_schema, - c.table_name as table_name, - c.column_name as column_name, - c.ordinal_position as ordinal_position, - cfp.field_path as field_path, - c.is_nullable as is_nullable, - CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, - description as comment, - c.is_hidden as is_hidden, - c.is_partitioning_column as is_partitioning_column, - c.clustering_ordinal_position as clustering_ordinal_position, -from - `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMNS c - join `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name - and cfp.column_name = c.column_name -ORDER BY - table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC""" - - optimized_columns_for_dataset: str = """ -select * from -(select - c.table_catalog as table_catalog, - c.table_schema as table_schema, - c.table_name as table_name, - c.column_name as column_name, - c.ordinal_position as ordinal_position, - cfp.field_path as field_path, - c.is_nullable as is_nullable, - CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, - description as comment, - c.is_hidden as is_hidden, - c.is_partitioning_column as is_partitioning_column, - c.clustering_ordinal_position as clustering_ordinal_position, - -- We count the columns to be able limit it later - row_number() over (partition by c.table_catalog, c.table_schema, c.table_name order by c.ordinal_position asc, c.data_type DESC) as column_num, - -- Getting the maximum shard for each table - row_number() over (partition by c.table_catalog, c.table_schema, ifnull(REGEXP_EXTRACT(c.table_name, r'(.*)_\\d{{8}}$'), c.table_name), cfp.field_path order by c.table_catalog, c.table_schema asc, c.table_name desc) as shard_num -from - `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMNS c - join `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name - and cfp.column_name = c.column_name - ) --- We filter column limit + 1 to make sure we warn about the limit being reached but not reading too much data -where column_num <= {column_limit} and shard_num = 1 -ORDER BY - table_catalog, table_schema, table_name, ordinal_position, column_num ASC, data_type DESC""" - - columns_for_table: str = """ -select - c.table_catalog as table_catalog, - c.table_schema as table_schema, - c.table_name as table_name, - c.column_name as column_name, - c.ordinal_position as ordinal_position, - cfp.field_path as field_path, - c.is_nullable as is_nullable, - CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, - c.is_hidden as is_hidden, - 
c.is_partitioning_column as is_partitioning_column, - c.clustering_ordinal_position as clustering_ordinal_position, - description as comment -from - `{table_identifier.project_id}`.`{table_identifier.dataset}`.INFORMATION_SCHEMA.COLUMNS as c - join `{table_identifier.project_id}`.`{table_identifier.dataset}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name - and cfp.column_name = c.column_name -where - c.table_name = '{table_identifier.table}' -ORDER BY - table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC""" - - -class BigQueryDataDictionary: - @staticmethod - def get_query_result(conn: bigquery.Client, query: str) -> RowIterator: +class BigQuerySchemaApi: + def __init__( + self, report: BigQuerySchemaApiPerfReport, client: bigquery.Client + ) -> None: + self.bq_client = client + self.report = report + + def get_query_result(self, query: str) -> RowIterator: logger.debug(f"Query : {query}") - resp = conn.query(query) + resp = self.bq_client.query(query) return resp.result() - @staticmethod - def get_projects(conn: bigquery.Client) -> List[BigqueryProject]: - projects = conn.list_projects() + def get_projects(self) -> List[BigqueryProject]: + with self.report.list_projects: + try: + projects = self.bq_client.list_projects() - return [ - BigqueryProject(id=p.project_id, name=p.friendly_name) for p in projects - ] + return [ + BigqueryProject(id=p.project_id, name=p.friendly_name) + for p in projects + ] + except Exception as e: + logger.error(f"Error getting projects. {e}", exc_info=True) + return [] - @staticmethod def get_datasets_for_project_id( - conn: bigquery.Client, project_id: str, maxResults: Optional[int] = None + self, project_id: str, maxResults: Optional[int] = None ) -> List[BigqueryDataset]: - datasets = conn.list_datasets(project_id, max_results=maxResults) - return [BigqueryDataset(name=d.dataset_id, labels=d.labels) for d in datasets] + with self.report.list_datasets: + datasets = self.bq_client.list_datasets(project_id, max_results=maxResults) + return [ + BigqueryDataset(name=d.dataset_id, labels=d.labels) for d in datasets + ] - @staticmethod + # This is not used anywhere def get_datasets_for_project_id_with_information_schema( - conn: bigquery.Client, project_id: str + self, project_id: str ) -> List[BigqueryDataset]: """ This method is not used as of now, due to below limitation. 
@@ -383,8 +170,7 @@ def get_datasets_for_project_id_with_information_schema( We'll need Region wise separate queries to fetch all datasets https://cloud.google.com/bigquery/docs/information-schema-datasets-schemata """ - schemas = BigQueryDataDictionary.get_query_result( - conn, + schemas = self.get_query_result( BigqueryQuery.datasets_for_project_id.format(project_id=project_id), ) return [ @@ -398,56 +184,67 @@ def get_datasets_for_project_id_with_information_schema( for s in schemas ] - @staticmethod + def list_tables( + self, dataset_name: str, project_id: str + ) -> Iterator[TableListItem]: + with self.report.list_tables as current_timer: + for table in self.bq_client.list_tables(f"{project_id}.{dataset_name}"): + with current_timer.pause(): + yield table + def get_tables_for_dataset( - conn: bigquery.Client, + self, project_id: str, dataset_name: str, tables: Dict[str, TableListItem], with_data_read_permission: bool = False, report: Optional[BigQueryV2Report] = None, ) -> Iterator[BigqueryTable]: - filter: str = ", ".join(f"'{table}'" for table in tables.keys()) - - if with_data_read_permission: - # Tables are ordered by name and table suffix to make sure we always process the latest sharded table - # and skip the others. Sharded tables are tables with suffix _20220102 - cur = BigQueryDataDictionary.get_query_result( - conn, - BigqueryQuery.tables_for_dataset.format( - project_id=project_id, - dataset_name=dataset_name, - table_filter=f" and t.table_name in ({filter})" if filter else "", - ), - ) - else: - # Tables are ordered by name and table suffix to make sure we always process the latest sharded table - # and skip the others. Sharded tables are tables with suffix _20220102 - cur = BigQueryDataDictionary.get_query_result( - conn, - BigqueryQuery.tables_for_dataset_without_partition_data.format( - project_id=project_id, - dataset_name=dataset_name, - table_filter=f" and t.table_name in ({filter})" if filter else "", - ), - ) - - for table in cur: - try: - yield BigQueryDataDictionary._make_bigquery_table( - table, tables.get(table.table_name) + with self.report.get_tables_for_dataset as current_timer: + filter_clause: str = ", ".join(f"'{table}'" for table in tables.keys()) + + if with_data_read_permission: + # Tables are ordered by name and table suffix to make sure we always process the latest sharded table + # and skip the others. Sharded tables are tables with suffix _20220102 + cur = self.get_query_result( + BigqueryQuery.tables_for_dataset.format( + project_id=project_id, + dataset_name=dataset_name, + table_filter=f" and t.table_name in ({filter_clause})" + if filter_clause + else "", + ), ) - except Exception as e: - table_name = f"{project_id}.{dataset_name}.{table.table_name}" - logger.warning( - f"Error while processing table {table_name}", - exc_info=True, + else: + # Tables are ordered by name and table suffix to make sure we always process the latest sharded table + # and skip the others. 
Sharded tables are tables with suffix _20220102 + cur = self.get_query_result( + BigqueryQuery.tables_for_dataset_without_partition_data.format( + project_id=project_id, + dataset_name=dataset_name, + table_filter=f" and t.table_name in ({filter_clause})" + if filter_clause + else "", + ), ) - if report: - report.report_warning( - "metadata-extraction", - f"Failed to get table {table_name}: {e}", + + for table in cur: + try: + with current_timer.pause(): + yield BigQuerySchemaApi._make_bigquery_table( + table, tables.get(table.table_name) + ) + except Exception as e: + table_name = f"{project_id}.{dataset_name}.{table.table_name}" + logger.warning( + f"Error while processing table {table_name}", + exc_info=True, ) + if report: + report.report_warning( + "metadata-extraction", + f"Failed to get table {table_name}: {e}", + ) @staticmethod def _make_bigquery_table( @@ -487,43 +284,42 @@ def _make_bigquery_table( long_term_billable_bytes=table.get("long_term_billable_bytes"), ) - @staticmethod def get_views_for_dataset( - conn: bigquery.Client, + self, project_id: str, dataset_name: str, has_data_read: bool, report: Optional[BigQueryV2Report] = None, ) -> Iterator[BigqueryView]: - if has_data_read: - cur = BigQueryDataDictionary.get_query_result( - conn, - BigqueryQuery.views_for_dataset.format( - project_id=project_id, dataset_name=dataset_name - ), - ) - else: - cur = BigQueryDataDictionary.get_query_result( - conn, - BigqueryQuery.views_for_dataset_without_data_read.format( - project_id=project_id, dataset_name=dataset_name - ), - ) - - for table in cur: - try: - yield BigQueryDataDictionary._make_bigquery_view(table) - except Exception as e: - view_name = f"{project_id}.{dataset_name}.{table.table_name}" - logger.warning( - f"Error while processing view {view_name}", - exc_info=True, + with self.report.get_views_for_dataset as current_timer: + if has_data_read: + cur = self.get_query_result( + BigqueryQuery.views_for_dataset.format( + project_id=project_id, dataset_name=dataset_name + ), + ) + else: + cur = self.get_query_result( + BigqueryQuery.views_for_dataset_without_data_read.format( + project_id=project_id, dataset_name=dataset_name + ), ) - if report: - report.report_warning( - "metadata-extraction", - f"Failed to get view {view_name}: {e}", + + for table in cur: + try: + with current_timer.pause(): + yield BigQuerySchemaApi._make_bigquery_view(table) + except Exception as e: + view_name = f"{project_id}.{dataset_name}.{table.table_name}" + logger.warning( + f"Error while processing view {view_name}", + exc_info=True, ) + if report: + report.report_warning( + "metadata-extraction", + f"Failed to get view {view_name}: {e}", + ) @staticmethod def _make_bigquery_view(view: bigquery.Row) -> BigqueryView: @@ -540,70 +336,68 @@ def _make_bigquery_view(view: bigquery.Row) -> BigqueryView: materialized=view.table_type == BigqueryTableType.MATERIALIZED_VIEW, ) - @staticmethod def get_columns_for_dataset( - conn: bigquery.Client, + self, project_id: str, dataset_name: str, column_limit: int, run_optimized_column_query: bool = False, ) -> Optional[Dict[str, List[BigqueryColumn]]]: columns: Dict[str, List[BigqueryColumn]] = defaultdict(list) - try: - cur = BigQueryDataDictionary.get_query_result( - conn, - BigqueryQuery.columns_for_dataset.format( - project_id=project_id, dataset_name=dataset_name - ) - if not run_optimized_column_query - else BigqueryQuery.optimized_columns_for_dataset.format( - project_id=project_id, - dataset_name=dataset_name, - column_limit=column_limit, - ), - ) - except 
Exception as e: - logger.warning(f"Columns for dataset query failed with exception: {e}") - # Error - Information schema query returned too much data. - # Please repeat query with more selective predicates. - return None - - last_seen_table: str = "" - for column in cur: - if ( - column_limit - and column.table_name in columns - and len(columns[column.table_name]) >= column_limit - ): - if last_seen_table != column.table_name: - logger.warning( - f"{project_id}.{dataset_name}.{column.table_name} contains more than {column_limit} columns, only processing {column_limit} columns" - ) - last_seen_table = column.table_name - else: - columns[column.table_name].append( - BigqueryColumn( - name=column.column_name, - ordinal_position=column.ordinal_position, - field_path=column.field_path, - is_nullable=column.is_nullable == "YES", - data_type=column.data_type, - comment=column.comment, - is_partition_column=column.is_partitioning_column == "YES", - cluster_column_position=column.clustering_ordinal_position, + with self.report.get_columns_for_dataset: + try: + cur = self.get_query_result( + BigqueryQuery.columns_for_dataset.format( + project_id=project_id, dataset_name=dataset_name ) + if not run_optimized_column_query + else BigqueryQuery.optimized_columns_for_dataset.format( + project_id=project_id, + dataset_name=dataset_name, + column_limit=column_limit, + ), ) + except Exception as e: + logger.warning(f"Columns for dataset query failed with exception: {e}") + # Error - Information schema query returned too much data. + # Please repeat query with more selective predicates. + return None + + last_seen_table: str = "" + for column in cur: + if ( + column_limit + and column.table_name in columns + and len(columns[column.table_name]) >= column_limit + ): + if last_seen_table != column.table_name: + logger.warning( + f"{project_id}.{dataset_name}.{column.table_name} contains more than {column_limit} columns, only processing {column_limit} columns" + ) + last_seen_table = column.table_name + else: + columns[column.table_name].append( + BigqueryColumn( + name=column.column_name, + ordinal_position=column.ordinal_position, + field_path=column.field_path, + is_nullable=column.is_nullable == "YES", + data_type=column.data_type, + comment=column.comment, + is_partition_column=column.is_partitioning_column == "YES", + cluster_column_position=column.clustering_ordinal_position, + ) + ) return columns - @staticmethod + # This is not used anywhere def get_columns_for_table( - conn: bigquery.Client, + self, table_identifier: BigqueryTableIdentifier, column_limit: Optional[int], ) -> List[BigqueryColumn]: - cur = BigQueryDataDictionary.get_query_result( - conn, + cur = self.get_query_result( BigqueryQuery.columns_for_table.format(table_identifier=table_identifier), ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/common.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/common.py index 4ff509858b87d..e38ab07855b8b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/common.py @@ -1,39 +1,5 @@ -from typing import Any, Dict, Optional - -from google.cloud import bigquery -from google.cloud.logging_v2.client import Client as GCPLoggingClient - -from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config - BQ_DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ" BQ_DATE_SHARD_FORMAT = "%Y%m%d" BQ_EXTERNAL_TABLE_URL_TEMPLATE = 
"https://console.cloud.google.com/bigquery?project={project}&ws=!1m5!1m4!4m3!1s{project}!2s{dataset}!3s{table}" BQ_EXTERNAL_DATASET_URL_TEMPLATE = "https://console.cloud.google.com/bigquery?project={project}&ws=!1m4!1m3!3m2!1s{project}!2s{dataset}" - - -def _make_gcp_logging_client( - project_id: Optional[str] = None, extra_client_options: Dict[str, Any] = {} -) -> GCPLoggingClient: - # See https://github.com/googleapis/google-cloud-python/issues/2674 for - # why we disable gRPC here. - client_options = extra_client_options.copy() - client_options["_use_grpc"] = False - if project_id is not None: - return GCPLoggingClient(**client_options, project=project_id) - else: - return GCPLoggingClient(**client_options) - - -def get_bigquery_client(config: BigQueryV2Config) -> bigquery.Client: - client_options = config.extra_client_options - return bigquery.Client(config.project_on_behalf, **client_options) - - -def get_sql_alchemy_url(config: BigQueryV2Config) -> str: - if config.project_on_behalf: - return f"bigquery://{config.project_on_behalf}" - # When project_id is not set, we will attempt to detect the project ID - # based on the credentials or environment variables. - # See https://github.com/mxmzdlv/pybigquery#authentication. - return "bigquery://" diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 341952d95e7d7..98c8cbaf85eec 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -1,7 +1,6 @@ import collections import itertools import logging -import textwrap from dataclasses import dataclass from datetime import datetime, timezone from typing import ( @@ -18,12 +17,12 @@ ) import humanfriendly -from google.cloud.bigquery import Client as BigQueryClient from google.cloud.datacatalog import lineage_v1 from google.cloud.logging_v2.client import Client as GCPLoggingClient -from ratelimiter import RateLimiter from datahub.emitter import mce_builder +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( AuditLogEntry, BigQueryAuditMetadata, @@ -32,13 +31,16 @@ QueryEvent, ReadEvent, ) +from datahub.ingestion.source.bigquery_v2.bigquery_audit_log_api import ( + BigQueryAuditLogApi, +) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.common import ( - BQ_DATE_SHARD_FORMAT, - BQ_DATETIME_FORMAT, - _make_gcp_logging_client, - get_bigquery_client, +from datahub.ingestion.source.bigquery_v2.bigquery_schema import BigQuerySchemaApi +from datahub.ingestion.source.bigquery_v2.common import BQ_DATETIME_FORMAT +from datahub.ingestion.source.bigquery_v2.queries import ( + BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE, + bigquery_audit_metadata_query_template_lineage, ) from datahub.ingestion.source.state.redundant_run_skip_handler import ( RedundantLineageRunSkipHandler, @@ -52,7 +54,9 @@ UpstreamClass, UpstreamLineageClass, ) +from datahub.specific.dataset import DatasetPatchBuilder from datahub.utilities import memory_footprint +from datahub.utilities.file_backed_collections import FileBackedDict from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.sqlglot_lineage import ( SchemaResolver, 
@@ -194,49 +198,21 @@ def make_lineage_edges_from_parsing_result( class BigqueryLineageExtractor: - BQ_FILTER_RULE_TEMPLATE_V2 = """ -resource.type=("bigquery_project") -AND -( - protoPayload.methodName= - ( - "google.cloud.bigquery.v2.JobService.Query" - OR - "google.cloud.bigquery.v2.JobService.InsertJob" - ) - AND - protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE" - AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:* - AND ( - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:* - OR - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedViews:* - ) - AND ( - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/_.*/tables/anon.*" - AND - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/INFORMATION_SCHEMA.*" - AND - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/__TABLES__" - AND - protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable !~ "projects/.*/datasets/_.*/tables/anon.*" - ) - -) -AND -timestamp >= "{start_time}" -AND -timestamp < "{end_time}" -""".strip() - def __init__( self, config: BigQueryV2Config, report: BigQueryV2Report, + dataset_urn_builder: Callable[[BigQueryTableRef], str], redundant_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = None, ): self.config = config self.report = report + self.dataset_urn_builder = dataset_urn_builder + self.audit_log_api = BigQueryAuditLogApi( + report.audit_log_api_perf, + self.config.rate_limit, + self.config.requests_per_min, + ) self.redundant_run_skip_handler = redundant_run_skip_handler self.start_time, self.end_time = ( @@ -256,55 +232,205 @@ def error(self, log: logging.Logger, key: str, reason: str) -> None: self.report.report_warning(key, reason) log.error(f"{key} => {reason}") - @staticmethod - def bigquery_audit_metadata_query_template( - dataset: str, use_date_sharded_tables: bool, limit: Optional[int] = None - ) -> str: - """ - Receives a dataset (with project specified) and returns a query template that is used to query exported - AuditLogs containing protoPayloads of type BigQueryAuditMetadata. - Include only those that: - - have been completed (jobStatus.jobState = "DONE") - - do not contain errors (jobStatus.errorResults is none) - :param dataset: the dataset to query against in the form of $PROJECT.$DATASET - :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log - tables - :param limit: set a limit for the maximum event to return. 
It is used for connection testing currently - :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery - """ - limit_text = f"limit {limit}" if limit else "" + def _should_ingest_lineage(self) -> bool: + if ( + self.redundant_run_skip_handler + and self.redundant_run_skip_handler.should_skip_this_run( + cur_start_time=self.config.start_time, + cur_end_time=self.config.end_time, + ) + ): + # Skip this run + self.report.report_warning( + "lineage-extraction", + "Skip this run as there was already a run for current ingestion window.", + ) + return False + + return True + + def get_lineage_workunits( + self, + projects: List[str], + sql_parser_schema_resolver: SchemaResolver, + view_refs_by_project: Dict[str, Set[str]], + view_definitions: FileBackedDict[str], + table_refs: Set[str], + ) -> Iterable[MetadataWorkUnit]: + if not self._should_ingest_lineage(): + return + views_skip_audit_log_lineage: Set[str] = set() + if self.config.lineage_parse_view_ddl: + view_lineage: Dict[str, Set[LineageEdge]] = {} + for project in projects: + self.populate_view_lineage_with_sql_parsing( + view_lineage, + view_refs_by_project[project], + view_definitions, + sql_parser_schema_resolver, + project, + ) - shard_condition = "" - if use_date_sharded_tables: - from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`" - shard_condition = ( - """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """ + views_skip_audit_log_lineage.update(view_lineage.keys()) + for lineage_key in view_lineage.keys(): + yield from self.gen_lineage_workunits_for_table( + view_lineage, BigQueryTableRef.from_string_name(lineage_key) + ) + + if self.config.use_exported_bigquery_audit_metadata: + projects = ["*"] # project_id not used when using exported metadata + + for project in projects: + self.report.set_ingestion_stage(project, "Lineage Extraction") + yield from self.generate_lineage( + project, + sql_parser_schema_resolver, + views_skip_audit_log_lineage, + table_refs, ) - else: - from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`" - - query = f""" - SELECT - timestamp, - logName, - insertId, - protopayload_auditlog AS protoPayload, - protopayload_auditlog.metadataJson AS metadata - FROM - {from_table} - WHERE ( - timestamp >= "{{start_time}}" - AND timestamp < "{{end_time}}" + + if self.redundant_run_skip_handler: + # Update the checkpoint state for this run. 
+ self.redundant_run_skip_handler.update_state( + self.config.start_time, self.config.end_time ) - {shard_condition} - AND protopayload_auditlog.serviceName="bigquery.googleapis.com" - AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE" - AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL - AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL - {limit_text}; - """ - return textwrap.dedent(query) + def generate_lineage( + self, + project_id: str, + sql_parser_schema_resolver: SchemaResolver, + views_skip_audit_log_lineage: Set[str], + table_refs: Set[str], + ) -> Iterable[MetadataWorkUnit]: + logger.info(f"Generate lineage for {project_id}") + with PerfTimer() as timer: + try: + if self.config.extract_lineage_from_catalog: + lineage = self.lineage_via_catalog_lineage_api(project_id) + else: + events = self._get_parsed_audit_log_events(project_id) + lineage = self._create_lineage_map( + events, sql_parser_schema_resolver + ) + except Exception as e: + if project_id: + self.report.lineage_failed_extraction.append(project_id) + self.error( + logger, + "lineage", + f"{project_id}: {e}", + ) + lineage = {} + + self.report.lineage_metadata_entries[project_id] = len(lineage) + logger.info(f"Built lineage map containing {len(lineage)} entries.") + logger.debug(f"lineage metadata is {lineage}") + self.report.lineage_extraction_sec[project_id] = round( + timer.elapsed_seconds(), 2 + ) + self.report.lineage_mem_size[project_id] = humanfriendly.format_size( + memory_footprint.total_size(lineage) + ) + + for lineage_key in lineage.keys(): + # For views, we do not use the upstreams obtained by parsing audit logs + # as they may contain indirectly referenced tables. 
+ if ( + lineage_key not in table_refs + or lineage_key in views_skip_audit_log_lineage + ): + continue + + yield from self.gen_lineage_workunits_for_table( + lineage, BigQueryTableRef.from_string_name(lineage_key) + ) + + def populate_view_lineage_with_sql_parsing( + self, + view_lineage: Dict[str, Set[LineageEdge]], + view_refs: Set[str], + view_definitions: FileBackedDict[str], + sql_parser_schema_resolver: SchemaResolver, + default_project: str, + ) -> None: + for view in view_refs: + view_definition = view_definitions[view] + raw_view_lineage = sqlglot_lineage( + view_definition, + schema_resolver=sql_parser_schema_resolver, + default_db=default_project, + ) + if raw_view_lineage.debug_info.table_error: + logger.debug( + f"Failed to parse lineage for view {view}: {raw_view_lineage.debug_info.table_error}" + ) + self.report.num_view_definitions_failed_parsing += 1 + self.report.view_definitions_parsing_failures.append( + f"Table-level sql parsing error for view {view}: {raw_view_lineage.debug_info.table_error}" + ) + continue + elif raw_view_lineage.debug_info.column_error: + self.report.num_view_definitions_failed_column_parsing += 1 + self.report.view_definitions_parsing_failures.append( + f"Column-level sql parsing error for view {view}: {raw_view_lineage.debug_info.column_error}" + ) + else: + self.report.num_view_definitions_parsed += 1 + + ts = datetime.now(timezone.utc) + view_lineage[view] = set( + make_lineage_edges_from_parsing_result( + raw_view_lineage, + audit_stamp=ts, + lineage_type=DatasetLineageTypeClass.VIEW, + ) + ) + + def gen_lineage_workunits_for_table( + self, lineage: Dict[str, Set[LineageEdge]], table_ref: BigQueryTableRef + ) -> Iterable[MetadataWorkUnit]: + dataset_urn = self.dataset_urn_builder(table_ref) + + lineage_info = self.get_lineage_for_table( + bq_table=table_ref, + bq_table_urn=dataset_urn, + lineage_metadata=lineage, + ) + if lineage_info: + yield from self.gen_lineage(dataset_urn, lineage_info) + + def gen_lineage( + self, + dataset_urn: str, + upstream_lineage: Optional[UpstreamLineageClass] = None, + ) -> Iterable[MetadataWorkUnit]: + if upstream_lineage is None: + return + + if upstream_lineage is not None: + if self.config.incremental_lineage: + patch_builder: DatasetPatchBuilder = DatasetPatchBuilder( + urn=dataset_urn + ) + for upstream in upstream_lineage.upstreams: + patch_builder.add_upstream_lineage(upstream) + + yield from [ + MetadataWorkUnit( + id=f"upstreamLineage-for-{dataset_urn}", + mcp_raw=mcp, + ) + for mcp in patch_builder.build() + ] + else: + if not self.config.extract_column_lineage: + upstream_lineage.fineGrainedLineages = None + + yield from [ + MetadataChangeProposalWrapper( + entityUrn=dataset_urn, aspect=upstream_lineage + ).as_workunit() + ] def lineage_via_catalog_lineage_api( self, project_id: str @@ -328,22 +454,28 @@ def lineage_via_catalog_lineage_api( try: lineage_client: lineage_v1.LineageClient = lineage_v1.LineageClient() - bigquery_client: BigQueryClient = get_bigquery_client(self.config) + + data_dictionary = BigQuerySchemaApi( + self.report.schema_api_perf, self.config.get_bigquery_client() + ) + # Filtering datasets - datasets = list(bigquery_client.list_datasets(project_id)) + datasets = list(data_dictionary.get_datasets_for_project_id(project_id)) project_tables = [] for dataset in datasets: # Enables only tables where type is TABLE, VIEW or MATERIALIZED_VIEW (not EXTERNAL) project_tables.extend( [ table - for table in bigquery_client.list_tables(dataset.dataset_id) + for table in 
data_dictionary.list_tables( + dataset.name, project_id + ) if table.table_type in ["TABLE", "VIEW", "MATERIALIZED_VIEW"] ] ) # Convert project tables to .. format - project_tables = list( + project_table_names = list( map( lambda table: "{}.{}.{}".format( table.project, table.dataset_id, table.table_id @@ -354,7 +486,7 @@ def lineage_via_catalog_lineage_api( lineage_map: Dict[str, Set[LineageEdge]] = {} curr_date = datetime.now() - for table in project_tables: + for table in project_table_names: logger.info("Creating lineage map for table %s", table) upstreams = set() downstream_table = lineage_v1.EntityReference() @@ -411,127 +543,73 @@ def lineage_via_catalog_lineage_api( raise e def _get_parsed_audit_log_events(self, project_id: str) -> Iterable[QueryEvent]: + # We adjust the filter values a bit, since we need to make sure that the join + # between query events and read events is complete. For example, this helps us + # handle the case where the read happens within our time range but the query + # completion event is delayed and happens after the configured end time. + corrected_start_time = self.start_time - self.config.max_query_duration + corrected_end_time = self.end_time + -self.config.max_query_duration + self.report.log_entry_start_time = corrected_start_time + self.report.log_entry_end_time = corrected_end_time + parse_fn: Callable[[Any], Optional[Union[ReadEvent, QueryEvent]]] if self.config.use_exported_bigquery_audit_metadata: - logger.info("Populating lineage info via exported GCP audit logs") - bq_client = get_bigquery_client(self.config) - entries = self._get_exported_bigquery_audit_metadata(bq_client) + entries = self.get_exported_log_entries( + corrected_start_time, corrected_end_time + ) parse_fn = self._parse_exported_bigquery_audit_metadata else: - logger.info("Populating lineage info via exported GCP audit logs") - logging_client = _make_gcp_logging_client(project_id) - entries = self._get_bigquery_log_entries(logging_client) + entries = self.get_log_entries_via_gcp_logging( + project_id, corrected_start_time, corrected_end_time + ) parse_fn = self._parse_bigquery_log_entries for entry in entries: - self.report.num_total_log_entries[project_id] += 1 + self.report.num_lineage_total_log_entries[project_id] += 1 try: event = parse_fn(entry) if event: - self.report.num_parsed_log_entries[project_id] += 1 + self.report.num_lineage_parsed_log_entries[project_id] += 1 yield event except Exception as e: logger.warning(f"Unable to parse log entry `{entry}`: {e}") self.report.num_lineage_log_parse_failures[project_id] += 1 - def _get_bigquery_log_entries( - self, client: GCPLoggingClient, limit: Optional[int] = None - ) -> Iterable[AuditLogEntry]: - self.report.num_total_log_entries[client.project] = 0 - # Add a buffer to start and end time to account for delays in logging events. 
- start_time = (self.start_time - self.config.max_query_duration).strftime( - BQ_DATETIME_FORMAT - ) - self.report.log_entry_start_time = start_time - - end_time = (self.config.end_time + self.config.max_query_duration).strftime( - BQ_DATETIME_FORMAT - ) - self.report.log_entry_end_time = end_time - - filter = self.BQ_FILTER_RULE_TEMPLATE_V2.format( - start_time=start_time, - end_time=end_time, - ) - - logger.info( - f"Start loading log entries from BigQuery for {client.project} with start_time={start_time} and end_time={end_time}" + def get_exported_log_entries( + self, corrected_start_time, corrected_end_time, limit=None + ): + logger.info("Populating lineage info via exported GCP audit logs") + bq_client = self.config.get_bigquery_client() + entries = self.audit_log_api.get_exported_bigquery_audit_metadata( + bigquery_client=bq_client, + bigquery_audit_metadata_query_template=bigquery_audit_metadata_query_template_lineage, + bigquery_audit_metadata_datasets=self.config.bigquery_audit_metadata_datasets, + use_date_sharded_audit_log_tables=self.config.use_date_sharded_audit_log_tables, + start_time=corrected_start_time, + end_time=corrected_end_time, + limit=limit, ) + return entries - if self.config.rate_limit: - with RateLimiter(max_calls=self.config.requests_per_min, period=60): - entries = client.list_entries( - filter_=filter, - page_size=self.config.log_page_size, - max_results=limit, - ) - else: - entries = client.list_entries( - filter_=filter, page_size=self.config.log_page_size, max_results=limit - ) + def get_log_entries_via_gcp_logging( + self, project_id, corrected_start_time, corrected_end_time + ): + logger.info("Populating lineage info via exported GCP audit logs") + logging_client = self.config.make_gcp_logging_client(project_id) logger.info( - f"Start iterating over log entries from BigQuery for {client.project}" + f"Start loading log entries from BigQuery for {project_id} " + f"with start_time={corrected_start_time} and end_time={corrected_end_time}" ) - for entry in entries: - self.report.num_total_log_entries[client.project] += 1 - if self.report.num_total_log_entries[client.project] % 1000 == 0: - logger.info( - f"{self.report.num_total_log_entries[client.project]} log entries loaded for project {client.project} so far..." 
- ) - yield entry - - logger.info( - f"Finished loading {self.report.num_total_log_entries[client.project]} log entries from BigQuery project {client.project} so far" + entries = self.audit_log_api.get_bigquery_log_entries_via_gcp_logging( + logging_client, + BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE.format( + start_time=corrected_start_time.strftime(BQ_DATETIME_FORMAT), + end_time=corrected_end_time.strftime(BQ_DATETIME_FORMAT), + ), + self.config.log_page_size, ) - - def _get_exported_bigquery_audit_metadata( - self, bigquery_client: BigQueryClient, limit: Optional[int] = None - ) -> Iterable[BigQueryAuditMetadata]: - if self.config.bigquery_audit_metadata_datasets is None: - self.error( - logger, "audit-metadata", "bigquery_audit_metadata_datasets not set" - ) - self.report.bigquery_audit_metadata_datasets_missing = True - return - - corrected_start_time = self.start_time - self.config.max_query_duration - start_time = corrected_start_time.strftime(BQ_DATETIME_FORMAT) - start_date = corrected_start_time.strftime(BQ_DATE_SHARD_FORMAT) - self.report.audit_start_time = start_time - - corrected_end_time = self.end_time + self.config.max_query_duration - end_time = corrected_end_time.strftime(BQ_DATETIME_FORMAT) - end_date = corrected_end_time.strftime(BQ_DATE_SHARD_FORMAT) - self.report.audit_end_time = end_time - - for dataset in self.config.bigquery_audit_metadata_datasets: - logger.info( - f"Start loading log entries from BigQueryAuditMetadata in {dataset}" - ) - - query: str = self.bigquery_audit_metadata_query_template( - dataset=dataset, - use_date_sharded_tables=self.config.use_date_sharded_audit_log_tables, - limit=limit, - ).format( - start_time=start_time, - end_time=end_time, - start_date=start_date, - end_date=end_date, - ) - - query_job = bigquery_client.query(query) - - logger.info( - f"Finished loading log entries from BigQueryAuditMetadata in {dataset}" - ) - - if self.config.rate_limit: - with RateLimiter(max_calls=self.config.requests_per_min, period=60): - yield from query_job - else: - yield from query_job + return entries # Currently we only parse JobCompleted events but in future we would want to parse other # events to also create field level lineage. 
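The removed `_get_bigquery_log_entries` and `_get_exported_bigquery_audit_metadata` methods above hand their paging and rate limiting over to the shared `BigQueryAuditLogApi` (defined in `bigquery_audit_log_api.py`, which is not part of this diff). Below is a rough sketch of the rate-limited iteration pattern the removed code used and that the new helper presumably centralizes; the function name and signature are illustrative, not the real API.

```python
# Illustrative sketch of the rate-limited GCP Logging iteration removed in this diff.
# The actual implementation lives in BigQueryAuditLogApi; names here are assumptions.
from typing import Any, Iterable, Optional

from google.cloud.logging_v2.client import Client as GCPLoggingClient
from ratelimiter import RateLimiter


def iterate_gcp_log_entries(
    client: GCPLoggingClient,
    filter_expr: str,
    log_page_size: int,
    limit: Optional[int] = None,
    requests_per_min: Optional[int] = None,
) -> Iterable[Any]:
    rate_limiter: Optional[RateLimiter] = None
    if requests_per_min:
        # list_entries pages lazily, so the page size is multiplied into the rate
        # limit, mirroring the removed usage-extractor implementation.
        rate_limiter = RateLimiter(
            max_calls=requests_per_min * log_page_size, period=60
        )

    entries = client.list_entries(
        filter_=filter_expr, page_size=log_page_size, max_results=limit
    )
    for entry in entries:
        if rate_limiter:
            with rate_limiter:
                yield entry
        else:
            yield entry
```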
@@ -674,39 +752,6 @@ def _create_lineage_map( logger.info("Exiting create lineage map function") return lineage_map - def _compute_bigquery_lineage( - self, - project_id: str, - sql_parser_schema_resolver: SchemaResolver, - ) -> Dict[str, Set[LineageEdge]]: - lineage_metadata: Dict[str, Set[LineageEdge]] - try: - if self.config.extract_lineage_from_catalog: - lineage_metadata = self.lineage_via_catalog_lineage_api(project_id) - else: - events = self._get_parsed_audit_log_events(project_id) - lineage_metadata = self._create_lineage_map( - events, sql_parser_schema_resolver - ) - except Exception as e: - if project_id: - self.report.lineage_failed_extraction.append(project_id) - self.error( - logger, - "lineage", - f"{project_id}: {e}", - ) - self.report_status(f"{project_id}-lineage", False) - lineage_metadata = {} - - self.report.lineage_mem_size[project_id] = humanfriendly.format_size( - memory_footprint.total_size(lineage_metadata) - ) - self.report.lineage_metadata_entries[project_id] = len(lineage_metadata) - logger.info(f"Built lineage map containing {len(lineage_metadata)} entries.") - logger.debug(f"lineage metadata is {lineage_metadata}") - return lineage_metadata - def get_upstream_tables( self, bq_table: BigQueryTableRef, @@ -767,28 +812,11 @@ def get_upstream_tables( return set(upstreams.values()) - def calculate_lineage_for_project( - self, - project_id: str, - sql_parser_schema_resolver: SchemaResolver, - ) -> Dict[str, Set[LineageEdge]]: - with PerfTimer() as timer: - lineage = self._compute_bigquery_lineage( - project_id, sql_parser_schema_resolver - ) - - self.report.lineage_extraction_sec[project_id] = round( - timer.elapsed_seconds(), 2 - ) - - return lineage - def get_lineage_for_table( self, bq_table: BigQueryTableRef, bq_table_urn: str, lineage_metadata: Dict[str, Set[LineageEdge]], - platform: str, ) -> Optional[UpstreamLineageClass]: upstream_list: List[UpstreamClass] = [] fine_grained_lineages: List[FineGrainedLineageClass] = [] @@ -796,12 +824,7 @@ def get_lineage_for_table( # even if the lineage is same but the order is different. for upstream in sorted(self.get_upstream_tables(bq_table, lineage_metadata)): upstream_table = BigQueryTableRef.from_string_name(upstream.table) - upstream_table_urn = mce_builder.make_dataset_urn_with_platform_instance( - platform, - upstream_table.table_identifier.get_table_name(), - self.config.platform_instance, - self.config.env, - ) + upstream_table_urn = self.dataset_urn_builder(upstream_table) # Generate table-level lineage. 
upstream_table_class = UpstreamClass( @@ -852,19 +875,27 @@ def get_lineage_for_table( def test_capability(self, project_id: str) -> None: if self.config.use_exported_bigquery_audit_metadata: - bigquery_client: BigQueryClient = BigQueryClient(project=project_id) - entries = self._get_exported_bigquery_audit_metadata( - bigquery_client=bigquery_client, limit=1 - ) - for entry in entries: + for entry in self.get_exported_log_entries( + self.start_time, + self.end_time, + limit=1, + ): logger.debug( f"Connection test got one exported_bigquery_audit_metadata {entry}" ) else: - gcp_logging_client: GCPLoggingClient = _make_gcp_logging_client( - project_id, self.config.extra_client_options + gcp_logging_client: GCPLoggingClient = self.config.make_gcp_logging_client( + project_id ) - for entry in self._get_bigquery_log_entries(gcp_logging_client, limit=1): + for entry in self.audit_log_api.get_bigquery_log_entries_via_gcp_logging( + gcp_logging_client, + filter=BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE.format( + self.start_time.strftime(BQ_DATETIME_FORMAT), + self.end_time.strftime(BQ_DATETIME_FORMAT), + ), + log_page_size=self.config.log_page_size, + limit=1, + ): logger.debug(f"Connection test got one audit metadata entry {entry}") def report_status(self, step: str, status: bool) -> None: diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py new file mode 100644 index 0000000000000..5be7a0a7f6b2f --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py @@ -0,0 +1,426 @@ +import textwrap +from typing import Optional + + +class BigqueryTableType: + # See https://cloud.google.com/bigquery/docs/information-schema-tables#schema + BASE_TABLE = "BASE TABLE" + EXTERNAL = "EXTERNAL" + VIEW = "VIEW" + MATERIALIZED_VIEW = "MATERIALIZED VIEW" + CLONE = "CLONE" + SNAPSHOT = "SNAPSHOT" + + +class BigqueryQuery: + show_datasets: str = ( + "select schema_name from `{project_id}`.INFORMATION_SCHEMA.SCHEMATA" + ) + + datasets_for_project_id: str = """ +select + s.CATALOG_NAME as catalog_name, + s.schema_name as table_schema, + s.location as location, + s.CREATION_TIME as created, + s.LAST_MODIFIED_TIME as last_altered, + o.OPTION_VALUE as comment +from + `{project_id}`.INFORMATION_SCHEMA.SCHEMATA as s + left join `{project_id}`.INFORMATION_SCHEMA.SCHEMATA_OPTIONS as o on o.schema_name = s.schema_name + and o.option_name = "description" +order by + s.schema_name +""" + + # https://cloud.google.com/bigquery/docs/information-schema-table-storage?hl=en + tables_for_dataset = f""" +SELECT + t.table_catalog as table_catalog, + t.table_schema as table_schema, + t.table_name as table_name, + t.table_type as table_type, + t.creation_time as created, + ts.last_modified_time as last_altered, + tos.OPTION_VALUE as comment, + is_insertable_into, + ddl, + row_count, + size_bytes as bytes, + num_partitions, + max_partition_id, + active_billable_bytes, + long_term_billable_bytes, + REGEXP_EXTRACT(t.table_name, r".*_(\\d+)$") as table_suffix, + REGEXP_REPLACE(t.table_name, r"_(\\d+)$", "") as table_base + +FROM + `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t + join `{{project_id}}`.`{{dataset_name}}`.__TABLES__ as ts on ts.table_id = t.TABLE_NAME + left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema + and t.TABLE_NAME = tos.TABLE_NAME + and tos.OPTION_NAME = "description" + left join ( + select + table_name, + 
sum(case when partition_id not in ('__NULL__', '__UNPARTITIONED__', '__STREAMING_UNPARTITIONED__') then 1 else 0 END) as num_partitions, + max(case when partition_id not in ('__NULL__', '__UNPARTITIONED__', '__STREAMING_UNPARTITIONED__') then partition_id else NULL END) as max_partition_id, + sum(total_rows) as total_rows, + sum(case when storage_tier = 'LONG_TERM' then total_billable_bytes else 0 end) as long_term_billable_bytes, + sum(case when storage_tier = 'ACTIVE' then total_billable_bytes else 0 end) as active_billable_bytes, + from + `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.PARTITIONS + group by + table_name) as p on + t.table_name = p.table_name +WHERE + table_type in ('{BigqueryTableType.BASE_TABLE}', '{BigqueryTableType.EXTERNAL}') +{{table_filter}} +order by + table_schema ASC, + table_base ASC, + table_suffix DESC +""" + + tables_for_dataset_without_partition_data = f""" +SELECT + t.table_catalog as table_catalog, + t.table_schema as table_schema, + t.table_name as table_name, + t.table_type as table_type, + t.creation_time as created, + tos.OPTION_VALUE as comment, + is_insertable_into, + ddl, + REGEXP_EXTRACT(t.table_name, r".*_(\\d+)$") as table_suffix, + REGEXP_REPLACE(t.table_name, r"_(\\d+)$", "") as table_base + +FROM + `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t + left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema + and t.TABLE_NAME = tos.TABLE_NAME + and tos.OPTION_NAME = "description" +WHERE + table_type in ('{BigqueryTableType.BASE_TABLE}', '{BigqueryTableType.EXTERNAL}') +{{table_filter}} +order by + table_schema ASC, + table_base ASC, + table_suffix DESC +""" + + views_for_dataset: str = f""" +SELECT + t.table_catalog as table_catalog, + t.table_schema as table_schema, + t.table_name as table_name, + t.table_type as table_type, + t.creation_time as created, + ts.last_modified_time as last_altered, + tos.OPTION_VALUE as comment, + is_insertable_into, + ddl as view_definition, + row_count, + size_bytes +FROM + `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t + join `{{project_id}}`.`{{dataset_name}}`.__TABLES__ as ts on ts.table_id = t.TABLE_NAME + left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema + and t.TABLE_NAME = tos.TABLE_NAME + and tos.OPTION_NAME = "description" +WHERE + table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}') +order by + table_schema ASC, + table_name ASC +""" + + views_for_dataset_without_data_read: str = f""" +SELECT + t.table_catalog as table_catalog, + t.table_schema as table_schema, + t.table_name as table_name, + t.table_type as table_type, + t.creation_time as created, + tos.OPTION_VALUE as comment, + is_insertable_into, + ddl as view_definition +FROM + `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t + left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema + and t.TABLE_NAME = tos.TABLE_NAME + and tos.OPTION_NAME = "description" +WHERE + table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}') +order by + table_schema ASC, + table_name ASC +""" + + columns_for_dataset: str = """ +select + c.table_catalog as table_catalog, + c.table_schema as table_schema, + c.table_name as table_name, + c.column_name as column_name, + c.ordinal_position as ordinal_position, + cfp.field_path as field_path, + c.is_nullable as 
is_nullable, + CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, + description as comment, + c.is_hidden as is_hidden, + c.is_partitioning_column as is_partitioning_column, + c.clustering_ordinal_position as clustering_ordinal_position, +from + `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMNS c + join `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name + and cfp.column_name = c.column_name +ORDER BY + table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC""" + + optimized_columns_for_dataset: str = """ +select * from +(select + c.table_catalog as table_catalog, + c.table_schema as table_schema, + c.table_name as table_name, + c.column_name as column_name, + c.ordinal_position as ordinal_position, + cfp.field_path as field_path, + c.is_nullable as is_nullable, + CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, + description as comment, + c.is_hidden as is_hidden, + c.is_partitioning_column as is_partitioning_column, + c.clustering_ordinal_position as clustering_ordinal_position, + -- We count the columns to be able limit it later + row_number() over (partition by c.table_catalog, c.table_schema, c.table_name order by c.ordinal_position asc, c.data_type DESC) as column_num, + -- Getting the maximum shard for each table + row_number() over (partition by c.table_catalog, c.table_schema, ifnull(REGEXP_EXTRACT(c.table_name, r'(.*)_\\d{{8}}$'), c.table_name), cfp.field_path order by c.table_catalog, c.table_schema asc, c.table_name desc) as shard_num +from + `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMNS c + join `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name + and cfp.column_name = c.column_name + ) +-- We filter column limit + 1 to make sure we warn about the limit being reached but not reading too much data +where column_num <= {column_limit} and shard_num = 1 +ORDER BY + table_catalog, table_schema, table_name, ordinal_position, column_num ASC, data_type DESC""" + + columns_for_table: str = """ +select + c.table_catalog as table_catalog, + c.table_schema as table_schema, + c.table_name as table_name, + c.column_name as column_name, + c.ordinal_position as ordinal_position, + cfp.field_path as field_path, + c.is_nullable as is_nullable, + CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, + c.is_hidden as is_hidden, + c.is_partitioning_column as is_partitioning_column, + c.clustering_ordinal_position as clustering_ordinal_position, + description as comment +from + `{table_identifier.project_id}`.`{table_identifier.dataset}`.INFORMATION_SCHEMA.COLUMNS as c + join `{table_identifier.project_id}`.`{table_identifier.dataset}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name + and cfp.column_name = c.column_name +where + c.table_name = '{table_identifier.table}' +ORDER BY + table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC""" + + +BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE = """ +resource.type=("bigquery_project") +AND +( + protoPayload.methodName= + ( + "google.cloud.bigquery.v2.JobService.Query" + OR + "google.cloud.bigquery.v2.JobService.InsertJob" + ) + AND + protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE" + AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:* + AND ( + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:* 
+ OR + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedViews:* + ) + AND ( + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/_.*/tables/anon.*" + AND + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/INFORMATION_SCHEMA.*" + AND + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/__TABLES__" + AND + protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable !~ "projects/.*/datasets/_.*/tables/anon.*" + ) + +) +AND +timestamp >= "{start_time}" +AND +timestamp < "{end_time}" +""".strip() +BQ_FILTER_RULE_TEMPLATE_V2_USAGE = """ +resource.type=("bigquery_project" OR "bigquery_dataset") +AND +timestamp >= "{start_time}" +AND +timestamp < "{end_time}" +AND protoPayload.serviceName="bigquery.googleapis.com" +AND +( + ( + protoPayload.methodName= + ( + "google.cloud.bigquery.v2.JobService.Query" + OR + "google.cloud.bigquery.v2.JobService.InsertJob" + ) + AND protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE" + AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:* + AND protoPayload.metadata.jobChange.job.jobConfig.queryConfig:* + AND + ( + ( + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:* + AND NOT protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables =~ "projects/.*/datasets/.*/tables/__TABLES__|__TABLES_SUMMARY__|INFORMATION_SCHEMA.*" + ) + OR + ( + protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable:* + ) + ) + ) + OR + protoPayload.metadata.tableDataRead.reason = "JOB" +) +""".strip( + "\t \n" +) + + +def bigquery_audit_metadata_query_template_lineage( + dataset: str, use_date_sharded_tables: bool, limit: Optional[int] = None +) -> str: + """ + Receives a dataset (with project specified) and returns a query template that is used to query exported + AuditLogs containing protoPayloads of type BigQueryAuditMetadata. + Include only those that: + - have been completed (jobStatus.jobState = "DONE") + - do not contain errors (jobStatus.errorResults is none) + :param dataset: the dataset to query against in the form of $PROJECT.$DATASET + :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log + tables + :param limit: set a limit for the maximum event to return. 
It is used for connection testing currently + :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery + """ + limit_text = f"limit {limit}" if limit else "" + + shard_condition = "" + if use_date_sharded_tables: + from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`" + shard_condition = ( + """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """ + ) + else: + from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`" + + query = f""" + SELECT + timestamp, + logName, + insertId, + protopayload_auditlog AS protoPayload, + protopayload_auditlog.metadataJson AS metadata + FROM + {from_table} + WHERE ( + timestamp >= "{{start_time}}" + AND timestamp < "{{end_time}}" + ) + {shard_condition} + AND protopayload_auditlog.serviceName="bigquery.googleapis.com" + AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE" + AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL + AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL + QUALIFY ROW_NUMBER() OVER (PARTITION BY insertId, timestamp, logName) = 1 + {limit_text}; + """ + + return textwrap.dedent(query) + + +def bigquery_audit_metadata_query_template_usage( + dataset: str, + use_date_sharded_tables: bool, + limit: Optional[int] = None, +) -> str: + """ + Receives a dataset (with project specified) and returns a query template that is used to query exported + v2 AuditLogs containing protoPayloads of type BigQueryAuditMetadata. + :param dataset: the dataset to query against in the form of $PROJECT.$DATASET + :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log + tables + :param limit: maximum number of events to query for + :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery + """ + + limit_text = f"limit {limit}" if limit else "" + + shard_condition = "" + if use_date_sharded_tables: + from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`" + shard_condition = ( + """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """ + ) + else: + from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`" + + # Deduplicates insertId via QUALIFY, see: + # https://cloud.google.com/logging/docs/reference/v2/rest/v2/LogEntry, insertId field + query = f""" + SELECT + timestamp, + logName, + insertId, + protopayload_auditlog AS protoPayload, + protopayload_auditlog.metadataJson AS metadata + FROM + {from_table} + WHERE ( + timestamp >= "{{start_time}}" + AND timestamp < "{{end_time}}" + ) + {shard_condition} + AND protopayload_auditlog.serviceName="bigquery.googleapis.com" + AND + ( + ( + protopayload_auditlog.methodName IN + ( + "google.cloud.bigquery.v2.JobService.Query", + "google.cloud.bigquery.v2.JobService.InsertJob" + ) + AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE" + AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL + AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL + AND ( + JSON_EXTRACT_ARRAY(protopayload_auditlog.metadataJson, + "$.jobChange.job.jobStats.queryStats.referencedTables") IS NOT NULL + OR + JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, 
"$.jobChange.job.jobConfig.queryConfig.destinationTable") IS NOT NULL + ) + ) + OR + JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.tableDataRead.reason") = "JOB" + ) + QUALIFY ROW_NUMBER() OVER (PARTITION BY insertId, timestamp, logName) = 1 + {limit_text}; + """ + + return textwrap.dedent(query) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py index e112db31c5c63..201567e104a51 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py @@ -2,7 +2,6 @@ import json import logging import os -import textwrap import time import uuid from dataclasses import dataclass @@ -21,9 +20,6 @@ ) import humanfriendly -from google.cloud.bigquery import Client as BigQueryClient -from google.cloud.logging_v2.client import Client as GCPLoggingClient -from ratelimiter import RateLimiter from datahub.configuration.time_window_config import ( BaseTimeWindowConfig, @@ -35,8 +31,6 @@ from datahub.ingestion.api.source_helpers import auto_empty_dataset_usage_statistics from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( - BQ_AUDIT_V2, - BQ_FILTER_RULE_TEMPLATE, AuditEvent, AuditLogEntry, BigQueryAuditMetadata, @@ -45,13 +39,15 @@ QueryEvent, ReadEvent, ) +from datahub.ingestion.source.bigquery_v2.bigquery_audit_log_api import ( + BigQueryAuditLogApi, +) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.common import ( - BQ_DATE_SHARD_FORMAT, - BQ_DATETIME_FORMAT, - _make_gcp_logging_client, - get_bigquery_client, +from datahub.ingestion.source.bigquery_v2.common import BQ_DATETIME_FORMAT +from datahub.ingestion.source.bigquery_v2.queries import ( + BQ_FILTER_RULE_TEMPLATE_V2_USAGE, + bigquery_audit_metadata_query_template_usage, ) from datahub.ingestion.source.state.redundant_run_skip_handler import ( RedundantUsageRunSkipHandler, @@ -108,77 +104,6 @@ class OperationalDataMeta: custom_type: Optional[str] = None -def bigquery_audit_metadata_query_template( - dataset: str, - use_date_sharded_tables: bool, - limit: Optional[int] = None, -) -> str: - """ - Receives a dataset (with project specified) and returns a query template that is used to query exported - v2 AuditLogs containing protoPayloads of type BigQueryAuditMetadata. 
- :param dataset: the dataset to query against in the form of $PROJECT.$DATASET - :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log - tables - :param limit: maximum number of events to query for - :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery - """ - - limit_text = f"limit {limit}" if limit else "" - - shard_condition = "" - if use_date_sharded_tables: - from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`" - shard_condition = ( - """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """ - ) - else: - from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`" - - # Deduplicates insertId via QUALIFY, see: - # https://cloud.google.com/logging/docs/reference/v2/rest/v2/LogEntry, insertId field - query = f""" - SELECT - timestamp, - logName, - insertId, - protopayload_auditlog AS protoPayload, - protopayload_auditlog.metadataJson AS metadata - FROM - {from_table} - WHERE ( - timestamp >= "{{start_time}}" - AND timestamp < "{{end_time}}" - ) - {shard_condition} - AND protopayload_auditlog.serviceName="bigquery.googleapis.com" - AND - ( - ( - protopayload_auditlog.methodName IN - ( - "google.cloud.bigquery.v2.JobService.Query", - "google.cloud.bigquery.v2.JobService.InsertJob" - ) - AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE" - AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL - AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL - AND ( - JSON_EXTRACT_ARRAY(protopayload_auditlog.metadataJson, - "$.jobChange.job.jobStats.queryStats.referencedTables") IS NOT NULL - OR - JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig.destinationTable") IS NOT NULL - ) - ) - OR - JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.tableDataRead.reason") = "JOB" - ) - QUALIFY ROW_NUMBER() OVER (PARTITION BY insertId, timestamp, logName) = 1 - {limit_text}; - """ - - return textwrap.dedent(query) - - class BigQueryUsageState(Closeable): read_events: FileBackedDict[ReadEvent] query_events: FileBackedDict[QueryEvent] @@ -375,7 +300,8 @@ class BigQueryUsageExtractor: * Aggregation of these statistics into buckets, by day or hour granularity :::note - 1. Depending on the compliance policies setup for the bigquery instance, sometimes logging.read permission is not sufficient. In that case, use either admin or private log viewer permission. + 1. Depending on the compliance policies setup for the bigquery instance, sometimes logging.read permission is not sufficient. + In that case, use either admin or private log viewer permission. 
::: """ @@ -674,109 +600,6 @@ def _store_usage_event( return True return False - def _get_exported_bigquery_audit_metadata( - self, - bigquery_client: BigQueryClient, - limit: Optional[int] = None, - ) -> Iterable[BigQueryAuditMetadata]: - if self.config.bigquery_audit_metadata_datasets is None: - self.report.bigquery_audit_metadata_datasets_missing = True - return - - corrected_start_time = self.start_time - self.config.max_query_duration - start_time = corrected_start_time.strftime(BQ_DATETIME_FORMAT) - start_date = corrected_start_time.strftime(BQ_DATE_SHARD_FORMAT) - self.report.audit_start_time = start_time - - corrected_end_time = self.end_time + self.config.max_query_duration - end_time = corrected_end_time.strftime(BQ_DATETIME_FORMAT) - end_date = corrected_end_time.strftime(BQ_DATE_SHARD_FORMAT) - self.report.audit_end_time = end_time - - for dataset in self.config.bigquery_audit_metadata_datasets: - logger.info( - f"Start loading log entries from BigQueryAuditMetadata in {dataset}" - ) - - query = bigquery_audit_metadata_query_template( - dataset, - self.config.use_date_sharded_audit_log_tables, - limit=limit, - ).format( - start_time=start_time, - end_time=end_time, - start_date=start_date, - end_date=end_date, - ) - - query_job = bigquery_client.query(query) - logger.info( - f"Finished loading log entries from BigQueryAuditMetadata in {dataset}" - ) - if self.config.rate_limit: - with RateLimiter(max_calls=self.config.requests_per_min, period=60): - yield from query_job - else: - yield from query_job - - def _get_bigquery_log_entries_via_gcp_logging( - self, client: GCPLoggingClient, limit: Optional[int] = None - ) -> Iterable[AuditLogEntry]: - filter = self._generate_filter(BQ_AUDIT_V2) - logger.debug(filter) - - list_entries: Iterable[AuditLogEntry] - rate_limiter: Optional[RateLimiter] = None - if self.config.rate_limit: - # client.list_entries is a generator, does api calls to GCP Logging when it runs out of entries and needs to fetch more from GCP Logging - # to properly ratelimit we multiply the page size by the number of requests per minute - rate_limiter = RateLimiter( - max_calls=self.config.requests_per_min * self.config.log_page_size, - period=60, - ) - - list_entries = client.list_entries( - filter_=filter, - page_size=self.config.log_page_size, - max_results=limit, - ) - - for i, entry in enumerate(list_entries): - if i == 0: - logger.info(f"Starting log load from GCP Logging for {client.project}") - if i % 1000 == 0: - logger.info(f"Loaded {i} log entries from GCP Log for {client.project}") - self.report.total_query_log_entries += 1 - - if rate_limiter: - with rate_limiter: - yield entry - else: - yield entry - - logger.info( - f"Finished loading {self.report.total_query_log_entries} log entries from GCP Logging for {client.project}" - ) - - def _generate_filter(self, audit_templates: Dict[str, str]) -> str: - # We adjust the filter values a bit, since we need to make sure that the join - # between query events and read events is complete. For example, this helps us - # handle the case where the read happens within our time range but the query - # completion event is delayed and happens after the configured end time. 
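The comment above (and the corrected_start_time / corrected_end_time values computed by the refactored code further down) describe the same window adjustment. A tiny sketch of the idea, with made-up values:

from datetime import datetime, timedelta, timezone

# Pad the configured range by max_query_duration on both sides so that a
# query-completion event landing after end_time can still be joined with read
# events that happened inside the range (values below are illustrative).
start_time = datetime(2023, 9, 1, tzinfo=timezone.utc)
end_time = datetime(2023, 9, 2, tzinfo=timezone.utc)
max_query_duration = timedelta(minutes=15)

corrected_start_time = start_time - max_query_duration
corrected_end_time = end_time + max_query_duration
print(corrected_start_time.isoformat(), corrected_end_time.isoformat())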
- - start_time = (self.start_time - self.config.max_query_duration).strftime( - BQ_DATETIME_FORMAT - ) - self.report.log_entry_start_time = start_time - end_time = (self.end_time + self.config.max_query_duration).strftime( - BQ_DATETIME_FORMAT - ) - self.report.log_entry_end_time = end_time - filter = audit_templates[BQ_FILTER_RULE_TEMPLATE].format( - start_time=start_time, end_time=end_time - ) - return filter - @staticmethod def _get_destination_table(event: AuditEvent) -> Optional[BigQueryTableRef]: if ( @@ -1011,27 +834,54 @@ def _parse_exported_bigquery_audit_metadata( def _get_parsed_bigquery_log_events( self, project_id: str, limit: Optional[int] = None ) -> Iterable[AuditEvent]: + audit_log_api = BigQueryAuditLogApi( + self.report.audit_log_api_perf, + self.config.rate_limit, + self.config.requests_per_min, + ) + # We adjust the filter values a bit, since we need to make sure that the join + # between query events and read events is complete. For example, this helps us + # handle the case where the read happens within our time range but the query + # completion event is delayed and happens after the configured end time. + corrected_start_time = self.start_time - self.config.max_query_duration + corrected_end_time = self.end_time + -self.config.max_query_duration + self.report.audit_start_time = corrected_start_time + self.report.audit_end_time = corrected_end_time + parse_fn: Callable[[Any], Optional[AuditEvent]] if self.config.use_exported_bigquery_audit_metadata: - bq_client = get_bigquery_client(self.config) - entries = self._get_exported_bigquery_audit_metadata( + bq_client = self.config.get_bigquery_client() + + entries = audit_log_api.get_exported_bigquery_audit_metadata( bigquery_client=bq_client, + bigquery_audit_metadata_datasets=self.config.bigquery_audit_metadata_datasets, + bigquery_audit_metadata_query_template=bigquery_audit_metadata_query_template_usage, + use_date_sharded_audit_log_tables=self.config.use_date_sharded_audit_log_tables, + start_time=corrected_start_time, + end_time=corrected_end_time, limit=limit, ) parse_fn = self._parse_exported_bigquery_audit_metadata else: - logging_client = _make_gcp_logging_client( - project_id, self.config.extra_client_options + logging_client = self.config.make_gcp_logging_client(project_id) + logger.info( + f"Start loading log entries from BigQuery for {project_id} " + f"with start_time={corrected_start_time} and end_time={corrected_end_time}" ) - entries = self._get_bigquery_log_entries_via_gcp_logging( - logging_client, limit=limit + entries = audit_log_api.get_bigquery_log_entries_via_gcp_logging( + logging_client, + filter=self._generate_filter(corrected_start_time, corrected_end_time), + log_page_size=self.config.log_page_size, + limit=limit, ) parse_fn = self._parse_bigquery_log_entry for entry in entries: try: + self.report.num_usage_total_log_entries[project_id] += 1 event = parse_fn(entry) if event: + self.report.num_usage_parsed_log_entries[project_id] += 1 yield event except Exception as e: logger.warning( @@ -1042,6 +892,12 @@ def _get_parsed_bigquery_log_events( f"log-parse-{project_id}", e, group="usage-log-parse" ) + def _generate_filter(self, corrected_start_time, corrected_end_time): + return BQ_FILTER_RULE_TEMPLATE_V2_USAGE.format( + start_time=corrected_start_time.strftime(BQ_DATETIME_FORMAT), + end_time=corrected_end_time.strftime(BQ_DATETIME_FORMAT), + ) + def get_tables_from_query( self, default_project: str, query: str ) -> Optional[List[BigQueryTableRef]]: diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py index c8623798f6937..bbe52b5d98ba3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py @@ -365,8 +365,8 @@ def populate_lineage( # Populate table level lineage by getting upstream tables from stl_scan redshift table query = RedshiftQuery.stl_scan_based_lineage_query( self.config.database, - self.config.start_time, - self.config.end_time, + self.start_time, + self.end_time, ) populate_calls.append((query, LineageCollectorType.QUERY_SCAN)) elif self.config.table_lineage_mode == LineageMode.SQL_BASED: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 811ea67981e18..240e0ffa1a0b6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -543,15 +543,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.connection.close() - lru_cache_functions: List[Callable] = [ - self.data_dictionary.get_tables_for_database, - self.data_dictionary.get_views_for_database, - self.data_dictionary.get_columns_for_schema, - self.data_dictionary.get_pk_constraints_for_schema, - self.data_dictionary.get_fk_constraints_for_schema, - ] - for func in lru_cache_functions: - self.report.lru_cache_info[func.__name__] = func.cache_info()._asdict() # type: ignore + self.report_cache_info() # TODO: The checkpoint state for stale entity detection can be committed here. @@ -596,6 +588,17 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ) and self.usage_extractor: yield from self.usage_extractor.get_usage_workunits(discovered_datasets) + def report_cache_info(self): + lru_cache_functions: List[Callable] = [ + self.data_dictionary.get_tables_for_database, + self.data_dictionary.get_views_for_database, + self.data_dictionary.get_columns_for_schema, + self.data_dictionary.get_pk_constraints_for_schema, + self.data_dictionary.get_fk_constraints_for_schema, + ] + for func in lru_cache_functions: + self.report.lru_cache_info[func.__name__] = func.cache_info()._asdict() # type: ignore + def report_warehouse_failure(self): if self.config.warehouse is not None: self.report_error( diff --git a/metadata-ingestion/src/datahub/utilities/perf_timer.py b/metadata-ingestion/src/datahub/utilities/perf_timer.py index 3fac1d68c3a9e..18384420bfefb 100644 --- a/metadata-ingestion/src/datahub/utilities/perf_timer.py +++ b/metadata-ingestion/src/datahub/utilities/perf_timer.py @@ -1,26 +1,49 @@ +import logging import time from contextlib import AbstractContextManager from typing import Any, Optional +logger: logging.Logger = logging.getLogger(__name__) + class PerfTimer(AbstractContextManager): """ A context manager that gives easy access to elapsed time for performance measurement. 
+ """ - start_time: Optional[float] = None - end_time: Optional[float] = None + def __init__(self) -> None: + self.start_time: Optional[float] = None + self.end_time: Optional[float] = None + self._past_active_time: float = 0 + self.paused: bool = False + self._error_state = False def start(self) -> None: + if self.end_time is not None: + self._past_active_time = self.elapsed_seconds() + self.start_time = time.perf_counter() self.end_time = None + self.paused = False + + def pause(self) -> "PerfTimer": + self.assert_timer_is_running() + self._past_active_time = self.elapsed_seconds() + self.start_time = None + self.end_time = None + self.paused = True + return self def finish(self) -> None: - assert self.start_time is not None + self.assert_timer_is_running() self.end_time = time.perf_counter() def __enter__(self) -> "PerfTimer": - self.start() + if self.paused: # Entering paused timer context, NO OP + pass + else: + self.start() return self def __exit__( @@ -29,16 +52,46 @@ def __exit__( exc: Any, traceback: Any, ) -> Optional[bool]: - self.finish() + if self.paused: # Exiting paused timer context, resume timer + self.start() + else: + self.finish() return None def elapsed_seconds(self) -> float: """ Returns the elapsed time in seconds. """ + if self.paused or not self.start_time: + return self._past_active_time - assert self.start_time is not None if self.end_time is None: - return time.perf_counter() - self.start_time + return (time.perf_counter() - self.start_time) + (self._past_active_time) + else: + return (self.end_time - self.start_time) + self._past_active_time + + def assert_timer_is_running(self) -> None: + """ + Returns true if timer is in running state. + Timer is in NOT in running state if + 1. it has never been started. + 2. it is in paused state. + 3. it had been started and finished in the past but not started again. 
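The pause/resume semantics are easiest to see with a small usage sketch (it mirrors the new tests in tests/unit/utilities/test_perf_timer.py later in this patch); the sleep durations are illustrative. Only time spent outside a paused block counts toward elapsed_seconds().

import time

from datahub.utilities.perf_timer import PerfTimer

with PerfTimer() as timer:
    time.sleep(0.5)              # counted
    with timer.pause():          # pause() returns the timer; __enter__ is a no-op
        time.sleep(0.5)          # not counted while paused
    # __exit__ of the paused block calls start() again, resuming the timer
    time.sleep(0.5)              # counted

print(f"{timer.elapsed_seconds():.1f}s of active time")  # ~1.0s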
+ """ + if self.start_time is None or self.paused or self.end_time: + self._error_state = True + logger.warning("Did you forget to start the timer ?") + + def __repr__(self) -> str: + return repr(self.as_obj()) + + def __str__(self) -> str: + return self.__repr__() + + def as_obj(self) -> Optional[str]: + if self.start_time is None: + return None else: - return self.end_time - self.start_time + time_taken = self.elapsed_seconds() + state = " (error)" if self._error_state else "" + return f"{time_taken:.3f} seconds{state}" diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py index cc3ee1f6ceaa4..602401134dcd3 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py @@ -4,8 +4,10 @@ from freezegun import freeze_time from google.cloud.bigquery.table import TableListItem +from datahub.ingestion.source.bigquery_v2.bigquery import BigqueryV2Source from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( BigqueryDataset, + BigQuerySchemaApi, BigqueryTable, ) from tests.test_helpers import mce_helpers @@ -15,15 +17,9 @@ @freeze_time(FROZEN_TIME) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_tables_for_dataset" -) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery.BigqueryV2Source.get_core_table_details" -) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_datasets_for_project_id" -) +@patch.object(BigQuerySchemaApi, "get_tables_for_dataset") +@patch.object(BigqueryV2Source, "get_core_table_details") +@patch.object(BigQuerySchemaApi, "get_datasets_for_project_id") @patch("google.cloud.bigquery.Client") def test_bigquery_v2_ingest( client, diff --git a/metadata-ingestion/tests/unit/test_bigquery_lineage.py b/metadata-ingestion/tests/unit/test_bigquery_lineage.py index 9b09fa36ba586..e23494963e475 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_lineage.py +++ b/metadata-ingestion/tests/unit/test_bigquery_lineage.py @@ -3,6 +3,7 @@ import pytest +import datahub.emitter.mce_builder as builder from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( BigQueryTableRef, QueryEvent, @@ -81,7 +82,9 @@ def lineage_entries() -> List[QueryEvent]: def test_lineage_with_timestamps(lineage_entries: List[QueryEvent]) -> None: config = BigQueryV2Config() report = BigQueryV2Report() - extractor: BigqueryLineageExtractor = BigqueryLineageExtractor(config, report) + extractor: BigqueryLineageExtractor = BigqueryLineageExtractor( + config, report, lambda x: builder.make_dataset_urn("bigquery", str(x)) + ) bq_table = BigQueryTableRef.from_string_name( "projects/my_project/datasets/my_dataset/tables/my_table" @@ -96,7 +99,6 @@ def test_lineage_with_timestamps(lineage_entries: List[QueryEvent]) -> None: bq_table=bq_table, bq_table_urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,my_project.my_dataset.my_table,PROD)", lineage_metadata=lineage_map, - platform="bigquery", ) assert upstream_lineage assert len(upstream_lineage.upstreams) == 4 @@ -105,7 +107,9 @@ def test_lineage_with_timestamps(lineage_entries: List[QueryEvent]) -> None: def test_column_level_lineage(lineage_entries: List[QueryEvent]) -> None: config = BigQueryV2Config(extract_column_lineage=True, incremental_lineage=False) report = BigQueryV2Report() - extractor: BigqueryLineageExtractor = BigqueryLineageExtractor(config, report) + extractor: 
BigqueryLineageExtractor = BigqueryLineageExtractor( + config, report, lambda x: builder.make_dataset_urn("bigquery", str(x)) + ) bq_table = BigQueryTableRef.from_string_name( "projects/my_project/datasets/my_dataset/tables/my_table" @@ -120,7 +124,6 @@ def test_column_level_lineage(lineage_entries: List[QueryEvent]) -> None: bq_table=bq_table, bq_table_urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,my_project.my_dataset.my_table,PROD)", lineage_metadata=lineage_map, - platform="bigquery", ) assert upstream_lineage assert len(upstream_lineage.upstreams) == 2 diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py index 6907f926249f5..4fc6c31626ba8 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -18,9 +18,10 @@ BigQueryTableRef, ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config +from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( - BigQueryDataDictionary, BigqueryProject, + BigQuerySchemaApi, BigqueryView, ) from datahub.ingestion.source.bigquery_v2.lineage import ( @@ -92,15 +93,17 @@ def test_bigquery_uri_with_credential(): raise e -@patch("google.cloud.bigquery.client.Client") -def test_get_projects_with_project_ids(client_mock): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_projects_with_project_ids(get_bq_client_mock): + client_mock = MagicMock() + get_bq_client_mock.return_value = client_mock config = BigQueryV2Config.parse_obj( { "project_ids": ["test-1", "test-2"], } ) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test1")) - assert source._get_projects(client_mock) == [ + assert source._get_projects() == [ BigqueryProject("test-1", "test-1"), BigqueryProject("test-2", "test-2"), ] @@ -110,14 +113,17 @@ def test_get_projects_with_project_ids(client_mock): {"project_ids": ["test-1", "test-2"], "project_id": "test-3"} ) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test2")) - assert source._get_projects(client_mock) == [ + assert source._get_projects() == [ BigqueryProject("test-1", "test-1"), BigqueryProject("test-2", "test-2"), ] assert client_mock.list_projects.call_count == 0 -def test_get_projects_with_project_ids_overrides_project_id_pattern(): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_projects_with_project_ids_overrides_project_id_pattern( + get_bq_client_mock, +): config = BigQueryV2Config.parse_obj( { "project_ids": ["test-project", "test-project-2"], @@ -125,7 +131,7 @@ def test_get_projects_with_project_ids_overrides_project_id_pattern(): } ) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) - projects = source._get_projects(MagicMock()) + projects = source._get_projects() assert projects == [ BigqueryProject(id="test-project", name="test-project"), BigqueryProject(id="test-project-2", name="test-project-2"), @@ -143,7 +149,8 @@ def test_platform_instance_config_always_none(): assert config.platform_instance is None -def test_get_dataplatform_instance_aspect_returns_project_id(): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_dataplatform_instance_aspect_returns_project_id(get_bq_client_mock): project_id = "project_id" expected_instance = ( f"urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,{project_id})" @@ -162,7 +169,8 @@ def 
test_get_dataplatform_instance_aspect_returns_project_id(): assert metadata.aspect.instance == expected_instance -def test_get_dataplatform_instance_default_no_instance(): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_dataplatform_instance_default_no_instance(get_bq_client_mock): config = BigQueryV2Config.parse_obj({}) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) @@ -176,18 +184,22 @@ def test_get_dataplatform_instance_default_no_instance(): assert metadata.aspect.instance is None -@patch("google.cloud.bigquery.client.Client") -def test_get_projects_with_single_project_id(client_mock): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_projects_with_single_project_id(get_bq_client_mock): + client_mock = MagicMock() + get_bq_client_mock.return_value = client_mock config = BigQueryV2Config.parse_obj({"project_id": "test-3"}) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test1")) - assert source._get_projects(client_mock) == [ + assert source._get_projects() == [ BigqueryProject("test-3", "test-3"), ] assert client_mock.list_projects.call_count == 0 -@patch("google.cloud.bigquery.client.Client") -def test_get_projects_by_list(client_mock): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_projects_by_list(get_bq_client_mock): + client_mock = MagicMock() + get_bq_client_mock.return_value = client_mock client_mock.list_projects.return_value = [ SimpleNamespace( project_id="test-1", @@ -201,15 +213,16 @@ def test_get_projects_by_list(client_mock): config = BigQueryV2Config.parse_obj({}) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test1")) - assert source._get_projects(client_mock) == [ + assert source._get_projects() == [ BigqueryProject("test-1", "one"), BigqueryProject("test-2", "two"), ] assert client_mock.list_projects.call_count == 1 -@patch.object(BigQueryDataDictionary, "get_projects") -def test_get_projects_filter_by_pattern(get_projects_mock): +@patch.object(BigQuerySchemaApi, "get_projects") +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_projects_filter_by_pattern(get_bq_client_mock, get_projects_mock): get_projects_mock.return_value = [ BigqueryProject("test-project", "Test Project"), BigqueryProject("test-project-2", "Test Project 2"), @@ -219,31 +232,35 @@ def test_get_projects_filter_by_pattern(get_projects_mock): {"project_id_pattern": {"deny": ["^test-project$"]}} ) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) - projects = source._get_projects(MagicMock()) + projects = source._get_projects() assert projects == [ BigqueryProject(id="test-project-2", name="Test Project 2"), ] -@patch.object(BigQueryDataDictionary, "get_projects") -def test_get_projects_list_empty(get_projects_mock): +@patch.object(BigQuerySchemaApi, "get_projects") +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_projects_list_empty(get_bq_client_mock, get_projects_mock): get_projects_mock.return_value = [] config = BigQueryV2Config.parse_obj( {"project_id_pattern": {"deny": ["^test-project$"]}} ) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) - projects = source._get_projects(MagicMock()) + projects = source._get_projects() assert len(source.report.failures) == 1 assert projects == [] -@patch.object(BigQueryDataDictionary, "get_projects") +@patch.object(BigQueryV2Config, "get_bigquery_client") def test_get_projects_list_failure( - get_projects_mock: MagicMock, caplog: 
pytest.LogCaptureFixture + get_bq_client_mock: MagicMock, + caplog: pytest.LogCaptureFixture, ) -> None: error_str = "my error" - get_projects_mock.side_effect = GoogleAPICallError(error_str) + bq_client_mock = MagicMock() + get_bq_client_mock.return_value = bq_client_mock + bq_client_mock.list_projects.side_effect = GoogleAPICallError(error_str) config = BigQueryV2Config.parse_obj( {"project_id_pattern": {"deny": ["^test-project$"]}} @@ -251,27 +268,29 @@ def test_get_projects_list_failure( source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) caplog.records.clear() with caplog.at_level(logging.ERROR): - projects = source._get_projects(MagicMock()) + projects = source._get_projects() assert len(caplog.records) == 1 assert error_str in caplog.records[0].msg assert len(source.report.failures) == 1 assert projects == [] -@patch.object(BigQueryDataDictionary, "get_projects") -def test_get_projects_list_fully_filtered(get_projects_mock): +@patch.object(BigQuerySchemaApi, "get_projects") +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_projects_list_fully_filtered(get_projects_mock, get_bq_client_mock): get_projects_mock.return_value = [BigqueryProject("test-project", "Test Project")] config = BigQueryV2Config.parse_obj( {"project_id_pattern": {"deny": ["^test-project$"]}} ) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) - projects = source._get_projects(MagicMock()) + projects = source._get_projects() assert len(source.report.failures) == 0 assert projects == [] -def test_simple_upstream_table_generation(): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_simple_upstream_table_generation(get_bq_client_mock): a: BigQueryTableRef = BigQueryTableRef( BigqueryTableIdentifier( project_id="test-project", dataset="test-dataset", table="a" @@ -302,7 +321,10 @@ def test_simple_upstream_table_generation(): assert list(upstreams)[0].table == str(b) -def test_upstream_table_generation_with_temporary_table_without_temp_upstream(): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_upstream_table_generation_with_temporary_table_without_temp_upstream( + get_bq_client_mock, +): a: BigQueryTableRef = BigQueryTableRef( BigqueryTableIdentifier( project_id="test-project", dataset="test-dataset", table="a" @@ -332,7 +354,8 @@ def test_upstream_table_generation_with_temporary_table_without_temp_upstream(): assert list(upstreams) == [] -def test_upstream_table_column_lineage_with_temp_table(): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_upstream_table_column_lineage_with_temp_table(get_bq_client_mock): from datahub.ingestion.api.common import PipelineContext a: BigQueryTableRef = BigQueryTableRef( @@ -406,7 +429,10 @@ def test_upstream_table_column_lineage_with_temp_table(): assert upstream.column_confidence == 0.7 -def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstream(): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstream( + get_bq_client_mock, +): a: BigQueryTableRef = BigQueryTableRef( BigqueryTableIdentifier( project_id="test-project", dataset="test-dataset", table="a" @@ -466,11 +492,11 @@ def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstr assert sorted_list[1].table == str(e) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_tables_for_dataset" -) -@patch("google.cloud.bigquery.client.Client") 
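These test hunks all apply the same refactor: patch the method on the class with @patch.object instead of a dotted string path, and return a MagicMock from BigQueryV2Config.get_bigquery_client so no real GCP client is built. A condensed sketch of the pattern, mirroring test_get_projects_with_single_project_id above (the test name here is hypothetical):

from unittest.mock import MagicMock, patch

from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.bigquery_v2.bigquery import BigqueryV2Source
from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config
from datahub.ingestion.source.bigquery_v2.bigquery_schema import BigqueryProject


@patch.object(BigQueryV2Config, "get_bigquery_client")
def test_single_project_id_with_patched_client(get_bq_client_mock):
    get_bq_client_mock.return_value = MagicMock()
    config = BigQueryV2Config.parse_obj({"project_id": "test-project"})
    source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test"))
    # With an explicit project_id, no list_projects() call is needed.
    assert source._get_projects() == [BigqueryProject("test-project", "test-project")]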
-def test_table_processing_logic(client_mock, data_dictionary_mock): +@patch.object(BigQuerySchemaApi, "get_tables_for_dataset") +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_table_processing_logic(get_bq_client_mock, data_dictionary_mock): + client_mock = MagicMock() + get_bq_client_mock.return_value = client_mock config = BigQueryV2Config.parse_obj( { "project_id": "test-project", @@ -523,7 +549,7 @@ def test_table_processing_logic(client_mock, data_dictionary_mock): _ = list( source.get_tables_for_dataset( - conn=client_mock, project_id="test-project", dataset_name="test-dataset" + project_id="test-project", dataset_name="test-dataset" ) ) @@ -531,17 +557,19 @@ def test_table_processing_logic(client_mock, data_dictionary_mock): # args only available from python 3.8 and that's why call_args_list is sooo ugly tables: Dict[str, TableListItem] = data_dictionary_mock.call_args_list[0][0][ - 3 + 2 ] # alternatively for table in tables.keys(): assert table in ["test-table", "test-sharded-table_20220102"] -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_tables_for_dataset" -) -@patch("google.cloud.bigquery.client.Client") -def test_table_processing_logic_date_named_tables(client_mock, data_dictionary_mock): +@patch.object(BigQuerySchemaApi, "get_tables_for_dataset") +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_table_processing_logic_date_named_tables( + get_bq_client_mock, data_dictionary_mock +): + client_mock = MagicMock() + get_bq_client_mock.return_value = client_mock # test that tables with date names are processed correctly config = BigQueryV2Config.parse_obj( { @@ -595,7 +623,7 @@ def test_table_processing_logic_date_named_tables(client_mock, data_dictionary_m _ = list( source.get_tables_for_dataset( - conn=client_mock, project_id="test-project", dataset_name="test-dataset" + project_id="test-project", dataset_name="test-dataset" ) ) @@ -603,7 +631,7 @@ def test_table_processing_logic_date_named_tables(client_mock, data_dictionary_m # args only available from python 3.8 and that's why call_args_list is sooo ugly tables: Dict[str, TableListItem] = data_dictionary_mock.call_args_list[0][0][ - 3 + 2 ] # alternatively for table in tables.keys(): assert tables[table].table_id in ["test-table", "20220103"] @@ -644,16 +672,16 @@ def bigquery_view_2() -> BigqueryView: ) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_query_result" -) -@patch("google.cloud.bigquery.client.Client") +@patch.object(BigQuerySchemaApi, "get_query_result") +@patch.object(BigQueryV2Config, "get_bigquery_client") def test_get_views_for_dataset( - client_mock: Mock, + get_bq_client_mock: Mock, query_mock: Mock, bigquery_view_1: BigqueryView, bigquery_view_2: BigqueryView, ) -> None: + client_mock = MagicMock() + get_bq_client_mock.return_value = client_mock assert bigquery_view_1.last_altered row1 = create_row( dict( @@ -675,9 +703,11 @@ def test_get_views_for_dataset( ) ) query_mock.return_value = [row1, row2] + bigquery_data_dictionary = BigQuerySchemaApi( + BigQueryV2Report().schema_api_perf, client_mock + ) - views = BigQueryDataDictionary.get_views_for_dataset( - conn=client_mock, + views = bigquery_data_dictionary.get_views_for_dataset( project_id="test-project", dataset_name="test-dataset", has_data_read=False, @@ -686,7 +716,10 @@ def test_get_views_for_dataset( @patch.object(BigqueryV2Source, "gen_dataset_workunits", lambda *args, **kwargs: []) -def 
test_gen_view_dataset_workunits(bigquery_view_1, bigquery_view_2): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_gen_view_dataset_workunits( + get_bq_client_mock, bigquery_view_1, bigquery_view_2 +): project_id = "test-project" dataset_name = "test-dataset" config = BigQueryV2Config.parse_obj( diff --git a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py index 6ee1f05f0582c..4cf42da4395f9 100644 --- a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py +++ b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py @@ -4,7 +4,6 @@ from freezegun import freeze_time from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( - BQ_AUDIT_V2, BigqueryTableIdentifier, BigQueryTableRef, ) @@ -111,10 +110,12 @@ def test_bigqueryv2_filters(): OR protoPayload.metadata.tableDataRead.reason = "JOB" )""" # noqa: W293 - source = BigQueryUsageExtractor( - config, BigQueryV2Report(), dataset_urn_builder=lambda _: "" - ) - filter: str = source._generate_filter(BQ_AUDIT_V2) + + corrected_start_time = config.start_time - config.max_query_duration + corrected_end_time = config.end_time + config.max_query_duration + filter: str = BigQueryUsageExtractor( + config, BigQueryV2Report(), lambda x: "" + )._generate_filter(corrected_start_time, corrected_end_time) assert filter == expected_filter diff --git a/metadata-ingestion/tests/unit/utilities/test_perf_timer.py b/metadata-ingestion/tests/unit/utilities/test_perf_timer.py new file mode 100644 index 0000000000000..d5fde314c2b57 --- /dev/null +++ b/metadata-ingestion/tests/unit/utilities/test_perf_timer.py @@ -0,0 +1,46 @@ +import time +from functools import partial + +import pytest + +from datahub.utilities.perf_timer import PerfTimer + +approx = partial(pytest.approx, rel=1e-2) + + +def test_perf_timer_simple(): + with PerfTimer() as timer: + time.sleep(1) + assert approx(timer.elapsed_seconds()) == 1 + + assert approx(timer.elapsed_seconds()) == 1 + + +def test_perf_timer_paused_timer(): + with PerfTimer() as current_timer: + time.sleep(1) + assert approx(current_timer.elapsed_seconds()) == 1 + with current_timer.pause(): + time.sleep(2) + assert approx(current_timer.elapsed_seconds()) == 1 + assert approx(current_timer.elapsed_seconds()) == 1 + time.sleep(1) + + assert approx(current_timer.elapsed_seconds()) == 2 + + +def test_generator_with_paused_timer(): + def generator_function(): + with PerfTimer() as inner_timer: + time.sleep(1) + for i in range(10): + time.sleep(0.2) + with inner_timer.pause(): + time.sleep(0.2) + yield i + assert approx(inner_timer.elapsed_seconds()) == 1 + 0.2 * 10 + + with PerfTimer() as outer_timer: + seq = generator_function() + list([i for i in seq]) + assert approx(outer_timer.elapsed_seconds()) == 1 + 0.2 * 10 + 0.2 * 10 From f4da93988e8cbb14c74946ddc72fdbd4205a015e Mon Sep 17 00:00:00 2001 From: Tony Ouyang Date: Fri, 15 Sep 2023 13:26:17 -0700 Subject: [PATCH 03/37] feat(ingestion/dynamodb): Add DynamoDB as new metadata ingestion source (#8768) Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> --- .../app/ingest/source/builder/constants.ts | 4 + .../app/ingest/source/builder/sources.json | 7 + datahub-web-react/src/images/dynamodblogo.png | Bin 0 -> 60888 bytes .../docs/sources/dynamodb/dynamodb_post.md | 29 ++ .../docs/sources/dynamodb/dynamodb_pre.md | 26 + .../docs/sources/dynamodb/dynamodb_recipe.yml | 25 + metadata-ingestion/setup.py | 2 + 
.../ingestion/source/dynamodb/__init__.py | 0 .../ingestion/source/dynamodb/dynamodb.py | 469 ++++++++++++++++++ ...default_platform_instance_mces_golden.json | 132 +++++ ...ynamodb_platform_instance_mces_golden.json | 132 +++++ .../integration/dynamodb/test_dynamodb.py | 95 ++++ .../main/resources/boot/data_platforms.json | 10 + 13 files changed, 931 insertions(+) create mode 100644 datahub-web-react/src/images/dynamodblogo.png create mode 100644 metadata-ingestion/docs/sources/dynamodb/dynamodb_post.md create mode 100644 metadata-ingestion/docs/sources/dynamodb/dynamodb_pre.md create mode 100644 metadata-ingestion/docs/sources/dynamodb/dynamodb_recipe.yml create mode 100644 metadata-ingestion/src/datahub/ingestion/source/dynamodb/__init__.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py create mode 100644 metadata-ingestion/tests/integration/dynamodb/dynamodb_default_platform_instance_mces_golden.json create mode 100644 metadata-ingestion/tests/integration/dynamodb/dynamodb_platform_instance_mces_golden.json create mode 100644 metadata-ingestion/tests/integration/dynamodb/test_dynamodb.py diff --git a/datahub-web-react/src/app/ingest/source/builder/constants.ts b/datahub-web-react/src/app/ingest/source/builder/constants.ts index 8d41c3533575a..61667a941765c 100644 --- a/datahub-web-react/src/app/ingest/source/builder/constants.ts +++ b/datahub-web-react/src/app/ingest/source/builder/constants.ts @@ -27,6 +27,7 @@ import powerbiLogo from '../../../../images/powerbilogo.png'; import modeLogo from '../../../../images/modelogo.png'; import databricksLogo from '../../../../images/databrickslogo.png'; import verticaLogo from '../../../../images/verticalogo.png'; +import dynamodbLogo from '../../../../images/dynamodblogo.png'; export const ATHENA = 'athena'; export const ATHENA_URN = `urn:li:dataPlatform:${ATHENA}`; @@ -43,6 +44,8 @@ export const DBT = 'dbt'; export const DBT_URN = `urn:li:dataPlatform:${DBT}`; export const DRUID = 'druid'; export const DRUID_URN = `urn:li:dataPlatform:${DRUID}`; +export const DYNAMODB = 'dynamodb'; +export const DYNAMODB_URN = `urn:li:dataPlatform:${DYNAMODB}`; export const ELASTICSEARCH = 'elasticsearch'; export const ELASTICSEARCH_URN = `urn:li:dataPlatform:${ELASTICSEARCH}`; export const FEAST = 'feast'; @@ -107,6 +110,7 @@ export const PLATFORM_URN_TO_LOGO = { [CLICKHOUSE_URN]: clickhouseLogo, [DBT_URN]: dbtLogo, [DRUID_URN]: druidLogo, + [DYNAMODB_URN]: dynamodbLogo, [ELASTICSEARCH_URN]: elasticsearchLogo, [FEAST_URN]: feastLogo, [GLUE_URN]: glueLogo, diff --git a/datahub-web-react/src/app/ingest/source/builder/sources.json b/datahub-web-react/src/app/ingest/source/builder/sources.json index 13643c58f72e1..b4ea2db018bd8 100644 --- a/datahub-web-react/src/app/ingest/source/builder/sources.json +++ b/datahub-web-react/src/app/ingest/source/builder/sources.json @@ -125,6 +125,13 @@ "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/mongodb/", "recipe": "source:\n type: mongodb\n config:\n # Coordinates\n connect_uri: # Your MongoDB connect URI, e.g. \"mongodb://localhost\"\n\n # Credentials\n # Add secret in Secrets Tab with relevant names for each variable\n username: \"${MONGO_USERNAME}\" # Your MongoDB username, e.g. admin\n password: \"${MONGO_PASSWORD}\" # Your MongoDB password, e.g. 
password_01\n\n # Options (recommended)\n enableSchemaInference: True\n useRandomSampling: True\n maxSchemaSize: 300" }, + { + "urn": "urn:li:dataPlatform:dynamodb", + "name": "dynamodb", + "displayName": "DynamoDB", + "docsUrl": "https://datahubproject.io/docs/metadata-ingestion/", + "recipe": "source:\n type: dynamodb\n config:\n platform_instance: \"AWS_ACCOUNT_ID\"\n aws_access_key_id : '${AWS_ACCESS_KEY_ID}'\n aws_secret_access_key : '${AWS_SECRET_ACCESS_KEY}'\n # User could use the below option to provide a list of primary keys of a table in dynamodb format,\n # those items from given primary keys will be included when we scan the table.\n # For each table we can retrieve up to 16 MB of data, which can contain as many as 100 items.\n # We'll enforce the the primary keys list size not to exceed 100\n # The total items we'll try to retrieve in these two scenarios:\n # 1. If user don't specify include_table_item: we'll retrieve up to 100 items\n # 2. If user specifies include_table_item: we'll retrieve up to 100 items plus user specified items in\n # the table, with a total not more than 200 items\n # include_table_item:\n # table_name:\n # [\n # {\n # 'partition_key_name': { 'attribute_type': 'attribute_value' },\n # 'sort_key_name': { 'attribute_type': 'attribute_value' },\n # },\n # ]" + }, { "urn": "urn:li:dataPlatform:glue", "name": "glue", diff --git a/datahub-web-react/src/images/dynamodblogo.png b/datahub-web-react/src/images/dynamodblogo.png new file mode 100644 index 0000000000000000000000000000000000000000..f5beafb035772cacfc5fbe5da985ae8da1c74973 GIT binary patch literal 60888 zcmeEvc|29y8~5(kO#^AB2&JefM+s$2rAg7K!R&_07#T91QmK$`10qAIR5B%;2q!{i zER~sqL}iLN#NmC`S_gH1_w)Y$zMp&kh-2-wp85Mc>)C7XF6!*u%r|BJ6pEtwwr<(D zo1&&ND2k_d(nR=+SM!e<@ME6CrUMRpY|I>-j87b)*8gQ=azt#am9hDe-A9c7ay+%S4$r~>H6#*>fVqP@I%Vvd7j}ip!MJ9KMnk+f&Vn{p9cQZz<(O}|3?E%eGk(= zkT`;Q;X_@;YM#zRKRNtrLq+qcD?K$%nJYX8vpa@+>IeJ{J%_%i%^e4q(uXFrqbW3c zU_8f0w7v0g#&G@SNJ_nYJ8S6sfUNcKy-79b!U(0)O6Wp|AL}e_V9pixCRQzXXf=1p zb{Tg_b987H8M@VSsJwPk^*FJPP-gm!)bX$o^EIJtd**OOl zEf4&lN4Ro_oJNPT$gJs|7Tg6-k1VKufr%$i`(eo)x;Qe#e8HtfkIW*pct#hT#-$}G zydYWO4=3uC63y1;ELKYAV#2+$*c>u6b##`4M2R~zI;-;%8G55=>?&@{Lw`Q*54L8@G^9fr z!iJ^Q)A;lmjPdHV>Ze`B9gtd$TtQ6uB+DHmin3vjFdx5yNPKvn3L&LnJ7UW6%a{~> zMZrv>j1aLA1}!&$`M{#@%%)Ze!2+Ll%w{NPU>X6ESWiY<6VJE#!2(?pywyUtu_xuv zWFS9?zh1#r;Vd4m{pn5MDll{)YdKe}KS#u3t|DS-StW7B(ij!%6cOwG(-E;oZ9?OU zwdWvLtdvo)g1BNWYvqbnH7b@CSF938u2@#1Vx8xTwIz)!mi(w#*+eYb^%|~NLZf2U z6R`x(#B+`#Pn*T?lpq!nB&?^-OwQbaLg5d#4T-ImomjQ%0$IK6$q+xcOF_J7{Y}yh_we!e)6WXAQ0`o0*(^fn+l!OME|SHJvm8YhV(lwAkqEn`ju)E_Q|6H=-C}r(I?+DMD9iCO zXcY{mq)nzwAXKKolr3B;(+QQ+go^gefy9Sct_>Ce{xCBAQ$x z^RSTde`3DkrHz$VgHs{CwqoF;-^2b2X{G8Q5*JWRYbO>(8aWn4NpC1G+6MFh5zS{{ z>5Z1b(vT`oWjXGLq>U`;Ljr$0xDSk!&P-0^igJ*M5=2A^0#T4pAyFELDA$Q7a%bCG zxuVDuQSK8_&Vwji-%li>(1<8B5JibA$_^sRZ6Zqjq$*@b^XZJlwODu>sa0sa#!7?X zM8?jGSjB>*MrhetKCKR*5!A9EgE-Q8Vy@H~$=OGUCbL6SbK82dIbGk1)i_ncASbhe zF#qLjn{6mr*LEMr)>8mOmpQ@OC|IG5=+8Uk16iC~!!^J>HDdC5>m=+ZbQ-qPba6}X zJmPZG_@g&rdz{5SN6(&IRSyP788W-{7|}Zq!3l35Wyd9z{hpBGmT+Mbmfnrr1--co zDiZ@MYn4x1OBURqwL$}#mbD(f!w>5$M<9dM(!eF;Y0z&0S>aeUb#iwEPFezWG>3%sAkZVh8ie8&2-(S8q2jSnG!Tjltd?;^ zvAn2~qNwpRlE>pRc0xPThiH$+i?zoKgr1YkrqsEOD1uwOEtp(&dOKm9XC+q=HJI^; z@y1HFBkRo}ieNclXHon+egWeKF%;8}%idAUo-JUttqf7$X(g^ENI5nENNwhZfbs*v z`b$ho4ER$fC6p6uE0>flA$28wfg%D4z(wYIE-7O|iULw{TvA?KQlyMyP0n%@+eExs z>@FdttcA72KRtQ;QQ|J+jk%<(F)4-@km8mM%&lBfg@hE+g*=zkH7==WJk&YOSgC|N zOI?z?ASvdauP>kWgRAzEgGB9ps8Xe<3^7X{O>W|*T_DElBvs?3_yvO8`haOfxS*ps 
z^mNeQF0TG~9}%I1F!01LF;=3X${GUk^b5kC(FQD3@*+zfvk^iU2_aI`G3TCbV_Jd? zkXU#mi2Kp`usse0$1eyPvF}~NLp%Y-?K6|dCvqVvNRP-g7w;U6mSs5_&Eaa+hqw~A zu#*NwJjUDTKb(wYiH;5sp&o)z`?x|$bA_6Pg`$B_@gqXzaD~#qLfr?U9JxZpbA=++ z9dkAag?1lED0!|>ld({OAXNE?Q1^*YG$Pan5Gsu;R5@3uiA1QSS&sB!Zj7?75&iMv zLXtWCY#U|BoyC$OQV9~NVqwT;WM~H&B2^x2mK^Ea-O{wqSiIn1VEVupzeRVUkx*PgyDHO*I^pNZ(^n=RE=QJ_TgPIt8qQRuMJ)VsfU z>fMhz7w#RJx>_r(-?aK}>CJ@i8!JETJvmOk$U$?Xhw8gX$*YYW8x=EuE@95bs!-em zOf-U2OmLWSmzxZ5foyvoJM^u|#!AfBaGeo^93^=|2IoaZQtnH9eN#5v{ZnRcCaCNd zswgD@U^@@CbE+CWhkJT|R&I|3_cog8VLSnTYHhn}7M(p*Yc@Rv)g^M9(i9KJ>~yMF z3SWXIfWuPMtiSIb9K32Tb+$S%6uwf0PY&JMVG6$(&BW~Z)`&LtidmyB3PrtzttzEI?1aN00O|%CRHw6Y7|5ze{R!+>HiS)1&ZJa5l`7Er4&RGhFLt|f| zm@Wo*fIKhP5O^ax7foQgR>tg1vEqea-KJp~! z7Z*0<#_YI;E?@=4&Nbiu3319D4Om?G#_zE4QQ@> zi^jQV^w!?A4__{!Mf(E4~Rl52uMXpih8L&Tj?ZA#I@E2l)t z2q4oNy#r&gFC(Y2B{9zrY=>0kw83AfTlZ`Zg<4mknHTWN*RRT@?a{Nsx-^<1e<(CI z2k8x4VrgTqqn*1cb~mlJ0m%3i5|!z@{hhSOQHC)YsW>5Mb;3{{ZpVUKAw>lld)o~J zU`cdl3>o|h{@93#tp@i#A--SGDi^g?>5mRrolNzxn+{BCV#}K8R-a8ehKxn+)d9=Y z%0igXE3o#1)+6^lfZdmR)Ecrss!AS@;*vi|c586W%GhK=bi5Pz>Lxx|W;Ip;ZNpW$ z;{s*+Sjq?q5*u;zfib2f4xWMw;8ORc>`-u1CufPX3-?E*S5!UGGGe<}6u9ESS`q;txG4o;Ph zL!}Vw$|4>pYes*=VXN*{4H~h+GbQlM4NH#BRXCWm1M6$zrN7e*SAYXq;GMks_$5aj z6B+Hn*roWA^lrs~OMXB&6^!Z#s)PnAE8IsQK|MR?m3!icFDr#WY;}8V;Q5Qe9`^eL z-JgNgC~Aj!1$eu9HW|~Xh9$t(so=?*+Tm9|Tt8b}?psp(2)P$U^BRr&9;_w{`r<;9 zY7Y|jLB=@jU%`dPaMMl4{ulVt0rYADT}I?$&sYP9P`xFIc$Chel|}obOZBl!oxji<1va<%A?=}0?c4C%a=^J+++q?$0*Q) z-Q_`e-@R%g1XHrcnyd2%{8;CHi;vE2>?N+mOt5;ouI3wpIURmXFL=wl$NG9H*do+U zOVH@geH#%U*pc3=Uo|W9Ii9^3@xn@!_TKd=1N+ej44MxZU~RL91iI*Zo*lZi;&0?2 zN!b0DOM^KQ_X=7C;kynyC0AY}(Debk%7X{l40kDC`Ryposm8a!_9CQ#OIX&ux`S7( zh?5z$T94$Fy{IaqBF_ARTt+4k_zp19N-vjf03kuF6R%Y&!Qg7kMPb zvW!Vyr=R&4tfMmyPds)(=+1sWEX@7;HD4V%1i^w#2ds+wpbEykZP*2=QuYom5;K2b zmhLazny8fXL4Gn?_<=7V@$rQyn6HyjV0aX_bk-imO8SD$qqQDzG(Uk40|9;8p;1|E zld&6`x6-WZR*_}pJoA8`Ak?a%nDfBPst0Fl3Pm82^kat+a$TGOV)0^ri$0nOT6th4 ze7y~_*r2?@tiVKXK|*V|A4Ni(O0rM+@Ff|EjcXN(G8Y3;>mQP=u5(!S1pAvW(3piC zCJ5l6GUsF&_NR!CzarNsEUW~mg7J1MQB>jdUy$n>4%eV7kWJ^7?H6?a0vdvHW`o|b zgrf*zx#JMBZ+7flPZrqWkpAI|HMT<^mPTnc9VIaZb_irH$18p7N6 z_T+UnAt+F2s@S};+uliATOK1xz5p5NTbVG3ol78D0+lN}oJbs*y*Xi*3|@&d;w@;X zfOdGnXN7|0iD#KzMcKzl3lollOxxcCuyqlOR3NdP24r(_d(UYY%oEC816)b{Lg~VrfF{zPgRx-2TKJXV0%tnpY$qF3%#^fXk`a0D3n24pYgCU!N9Kz;L24&D|* zG@-@(&|rBH6;DIgk;Pb&DBzc#dn>U7i9_FdVY__$d=v-%*q3VU1ygL!y`9J%Jq8%1 zgrx06tvZS>>LYZG3kQngb%46KF!sd;ng(fb1d=Ip%g7I34m}6D(=pTceidn1#mr|w z63;TJ7Ri(w5#~}rw#7A4awXzjb-`1B(|qi_^eLU}ZQx#?5ufy%H|`#MHe;Euyf}~m zG<$I@y0pg!AcI+ekv37Rw8bQ7utzsDi<(WH==%z`ZtPwTkiL}&sO08rxs zKM1$Oj3gExbxdL;lYn_}oAKPV6lxCoLEMy_F9W$HdzYZqIt)?|B4JxnCbJj5_~F!T zw%)f&mw@quXCpO=`ZcT&O(^%Q^C zMxos9t7dHtsmICG@hb$(qd$BxSOfkZfqO6NjgV{=1TIz~RnsMW{vLo#jT3*wRwT80 zBoQjBB_BKtUy`pXkNC8hiGaC8aBJ`lA)hlOx`Gyz9S&_jkBXBW82y45kHS|$&`3F7 zG6DHp#0-QlmtqM%(IQb&#Cy1}xLA2_fEeeZ7p>9cQ@id;@Qp(_q?2&9@|097l<~9C z4I(`idq@&^Q1!;ecmqiFiKCF$K{qI=hyVzY%)65#WsW0hU@U(<2vtdDiE}614T;&Q zi%OjLxbgQOvhQ92DOM;qv|+)}^#Gp)(2;_OWN^r~ffCLi33qjU6an-~T&$4lo>Kf9 zs@iY5vPcY!BS`=i)s-1sF(k+KB>S*SEC-ii9Na=OLudcUK5H-PT2l|6Lf$CUFn($b z2^~Zz76`mpjo>xn4Yu2n1-Wgo2|GweOJSp{kJWGxtKm&UpfoqF(DtVsCOZ=Pa;%0M zfbOFJMw~cPfa{M7vat zL*&5}JdE&b$9gV)*_we!`6xslVRhU9bv#?P)K`8Yj(c0j{jiHL{0P3h9NikAv}%m2 zMHD9JuR$9NivG}I1i-XHutv_L zHz2bS;&~7D{30lrn?8rrDk&fXadZ+_UoQdJZ!JSKZ9g{ThVL^)W7LClBLUAmh#L*& z5@7i+n~CLhWMDUiw~X{e3?)UdRdu?=Z9`B=v&EgJYJom3;?S5Ch;n6yEY2YGN>a(E zAA_wFb1?>!i>NI4Q9)7+z4DicH`gp26e%)We|&rmrz4h`z{#SL>Z=S^TvMok`t33ptpz8z2T1;dgq z3~dIEo7Al!kz$xzIloKC4#D=!$u6G7>J-bjpit)QJsBBqpTJ>>$FrODzR#ok&ti0k 
z8QR3faUzj7HsV%W?#iu+eh4L+e4>~OSYBOTPR8Y~`zE%1pGO5{2tE`2v!huclQx|m zzR;$a^Z@MN($M}*?YZ-m>Tw>QEH=1`bgZvary{aeO2?e_$rQG7 zG?We-Cpd%AJxj*N-qh8-?JzQaE_O)Y9SEZ!BY6C1SZ2#(rms3Fn^SXzU3cEtCS{P9 zdaqsmd1%PdiGEPRw%PpUD~?S{S(yRa_Szo8&Lw?S?lA0X8>B-fs-5~in=yX%QqkTG zwL(_M{DrE<(amm#7%mlON3uAe0Hc6O)%|qF3(k2|i7Cdc6$d_cN^MPyhT4(oo6leC zcc!;Cx@#U~5$iSN;ryKw*|kyDA=6nfEpJB$MT=~hdA)CHxFOnp+Rm3p{j8{~5b2es z)fva;54QS5@wKaaEwHjEZk+dnZvr(*AYgyiJ*l!cEdADLayR(e-Llf#dXf&KiotCq zcH!k*8+N`B&_1nHLfgE4AU;J%LzOn2VtJXcL{if7SnhU5bJXd%(;4=>DamDJ?el0O~uXx)o~kKfj(}I5ZtP0B-bX3#({y4qv;%MkKyn;XDWjTRl+X1O4i0>h4WmwCXxO6bQZS zH^l(BUA;X!UBXm64p`g!htxElXESORP{)?9+)=T#s7w5BzHPu$97ym}e;RIDv|s0WeH`p>O8;h*5@TU$IQeBAwXK8K0)LIom+wZ zaDS_i3E@3xYSET?iEOq8i6Z(+92d!-oM%fYW-##lgqcwwRIyr&(RllvoV#3;%%zI*VfONLaK44}z&-Pjv1 z2KsYkczNhYFKpl6KDRjlIb%oS3kc}Gzq1)C*4Bn=_;R}%j(RECYGn&1cI547OI!G9 zJh-ExxU8-J;TqG<48~IvF{We%t0;)m2;LZniZMmpRq=q0H%3EKR|F=aSNXGSJrzEs zD;$AL!u*j_?*1h||41jq<@iD2(@L{!j{}~CAvFCPCD1;!Id3+y37KH(j^=l~_U4G5 z>3yCF;Y7`mhj7eomv!)eG4QNmwfbUPa}AFq6i{eBE-G;URbJ~%LMCX@_;V9aM^-%h z5pQMVYKrQajc`oG`V@~@ATxBb%fVYhiy*z?E#pfD2O6_CC(LpIs@YoWDoeuhTuT(D zQ}pxC;-Ai3IdtM_g?obYL^B_ldCAVzx7rY4XhAH3Kzc>M-HKz#`~YV}UQUWJ@tqPq z8^ez4%_*7v$pnZg#dCe^uMFQNU1a~n{E5Iqknz^w6#`|MbDJL>r6mYx3sv7v=rUIO zl>YX9FCzie^7*sCn^&Br&T6`D6NB?6Iu$WTaWE0r*sH(Nto=i5_wVq!m;sXC-UV4G zG9sP*$ZaA;_rG|1ASb&VNglkORWz?(6avTz1&|qT4Vi)S3!zqu#1O=fccl`Neszlw zcvg55;0MBvdCz@nZr6hE-yV%V&;_t&rh_U0C|p=t2Fic7>umo8fvMc(6Wi_5TzP8+ z0{8PyIHn222R?wqq#y`Q#0SJyL!IG>dgA82i1?{jJjRSnp|0AHRZt1j6IYEQ@aeC! z81}gabDMGUF~gmFGmL#baS|_XVzHle8GFti3mcuun{T^5FD{f3oV51}Gj!R7QNzjA z-I@WYWl8#0M#(u>(s?V{b!Xj>&8g+)&HbfaZo0}c?k@e_EY4tq!1?jCw1&DA1t0tD zrBLAZTSAkYJ`c9aCj@iaqVK=Bndw}~nzl#5SJqXTqp}NNDd{8t4+0$wK68tiKgwA} zvummh4AYLlX1v6*8Pzhu9Wrs88Z{m}r9EBd&7ZSnhTJN|Skg^itB#4bbIb)>Rry`2 z%RNe@hTJXWVK=ZplkFWpFd$H0H{4;9=nBl1i@we!#FO5oZYagB~ZQK(_vKD$go;>BJ=usKb(|3%uBj*fu! 
zuHbHz#=K{FIs6o1#>!|C@MhW%|M80fzBvUawB+qAG=(_a?{=+gIA3z*y-F} zp1XI$_RE{3_#Ypd;twc~Z(L{_waDbt5Q0rf6gGtjYv>cVy|aYP?xjS_`C1$V+fmo} zvpQ)OqP>p!kTjUDbF9Go!1eh$n9Z{odluSxxmJG~dWQ_S_A}N#+@nIZi_Y=>c(uxR zwXSjga9dH!TIzr5n!=UsQMzN*=n!46;8k;!N5Yy|x++uWh}f#rik{iU75B36M; zs={acyFGV7To1i3>ZqIl$G*g$WuC+QaYK#a!)2L*fKczlQF<c6!}vI;6^C9!j*fuHhgB$i zgC$L3{SsvMg4AD&B`PA!q%{NlW|ed10pTDx`?S3Eh#kta?#H3>*FmR3s0Ytbdjneh zdwaoNMZ9l*LH81xefQKOts)!Pg3rUHRI{aLF!tEVo*1qg$(22*+@{apK5BHbi0>=@ z+H=Mo`{q5(Gl#O;5o+9}&i)~_x4%qif3IH6^4Ng3(LHkJunE%9hP@$QgCqUn7{XIQ zyOfH<1DaoZ4WMF)(Ub?}5FXUH>q?EB+IyJ{`= zxK@_~i=AtKf%)^$McA`?v|N3U;qDkp*oI@IK`lWJYxrRHH;_;`?9cx9UWrN_mal)7vWk<{0i+1$qY#O%9CJlk##(P&z4Q z8+9IFHW%SE3bqfZ6F`T19;C!cLWzBk@vks%^!3+D`vKfWUDI<{F%v+;^(a!FmAxiF zU6QfA67%S2g*gx1Q{KD7#zsNebueBTjNh~$OrIwN_eADvdYTVcqByNYj;AAI*Wso2utEl{VnT-#xGYTiKg;F)pbZr#dEM0UcxKYH%P-%25M@m6KeLt!14 znqglzJpFmN;b4|6Fow1UzzgL^8$<6B=QvOHb7gIjYJRi{d8Zqgjbg*(JAfdiV~(5| zJicqaV0*{;=WGey0oNvEgo8i+tzL#p$;oncE5KWD)D>-ZrhtN!+Q+J@6&u*j*zhV) z+MGfe9tIwe^X587e?wN5^R%A>^*L<+H{JT*`#FW}cBN<)^*Ok=N`JHLR}^ImVreD$ z+r6i5m#Tkvnl}oaa5Ytm zce>R5MWVf%4_CU{yzxVKlC?JfV!R_Q9a5NZ|m@m%gj!)8lD-4V^LI{?rrL3ErSm^uaWvYj>n(B~ov2gOf)lk{Drtv=Fz$h7WZFF9EbdOHa-&D8^Jym|o z&DpD*1=7F`xHr|+vlsl?mR)N-sh_D1^B?#DyIEB{?Cfg|FPX5Xf=V9JO+5^$Z@^9* zadaf6?)##KaKz+Ou4(lZjowYD5s?D=R;fi-lbhEO<9;vrs1gev4yUtWU~|*O%NlIw z@DM|q7@|wgoV@>slNp({{VO0um|PfB0* z8z4|xuZy~)MkPSpBPoakEK96@*5glJG-cjO@AMkXzRn6@|7;3>dtk9$h--Db$9LOo z6i765k{BW)0vaAxN1{{Vy%>R)Xo zNFHzVUH>~E(9c&PH>;adJlYf@bvaOo!aYfPv7E2$fJgQF4ATEW0n6pyTQcY3Y1itn zB3|sAdvmgp{%2wRGwpuQN7Ufne@6r+_kgB&m+`Q7gi1%pJU|@E*7E;?2z}&KPv6Bs zmcC%G!3|O!L&!pFtenUaZu$vO6ZOVGP2^F#DVv2VyPf%eqNt;dV4@uiGKdV?O@+ z3)|SF&ft3L)zT#XFJ;0$oJ5R%jv(4dl>kYv8CeO{J=LE@ z6w>neI_%Pra9Z#@6l*xqNkPpj>~N%aU?wf$Mbx-wJu`S_H=Pt;GIPOSKZkl2(6;c* z+q>?Kn9$^=rMq8Q>zTxl_h_|c*jUy+tvS~H!ke#b$%=j#Wxd4fhJoWK#*8RYAK|A_ zQf*&M5DiM9@h|7=8`yFmDtBM5-qRX(hyA7k4d_YYyixYq!A&?SG7_dB;GsU}GX!%o zhwC!CTEo(#Ub)YG97mD*}Qq>*nMz2N(qE;v|ccqg^`(r}OAu+>8NtKP*YbKnR5gdcaS z_RrQW<{U!n>S~}HF!T+x6 zlzbRsCE}<+Q|MBw*H!!184d~OM3^H|gW}j8Xw3HByFb}OwTW(T5zj;9SpYP^mY`DwDj`zi{qFU_vuv_VkWH%oo)ByXPKf~k1}*P;fU9$ z$c!10*&Me*75$SazHxUHFIOGppX2SEnKnywh$Dm6Bac#{vFhjbRe^8oTm!sNQfSAW zdGvRRCU9xJ%rr+Q69BQux1hA4R1F5Xj!yLc83AH*FqYv6zzV!W@j z!}B0mk@?5iw3OV=wtV5#C>B_rp0g(jg$@-Ydhcsj#e36dJ?Td%=A7vL`MWrM3ny|6 zZt?m{PhibmHZfhXyyoH!zqSL1I4?9{#$4`$+}1h|KVCQyGjCT+xVwB&#)D5C9Z8}G zrcrtcuIzw@;lYl9{y(;-%uw$RO8WjauQJ)Ss=l9d z9rp$vo(pTxnu!W*H@!Q_6izG)HZSfGhFZ(#_U^-%20~+$zqbsgRh(p3++*9xK3Pit zQ&zP-Z=fdB?X|lL=kz&{s({pWl-^ZsnZ&5=7~yPPpMF$wS;rs#iG8j9?WWeSEVe`2 z0(!r^bN3=m?}3EWez{@o*Jv_&q6j{2e!G`!ZcU6(;iGI*-U~^+wRVV1uPE%eW^&z> zJv)`{sl;D5E=Ub9A~SPv?dJBR&%v1XJnr=K4iz2Bf7yhol-a{JzVqQWt7Yv@$z3la zyE|0VJHu4-a$en`J~O}EMJ7IA;5ogp$aE@rK#(|b-9CQR^n$RJNl`wqB{6)IRQr1M z>Q=T1&99~unR7C0;)2X7pYIU@g??=hR;EZpAH!%GfsKg07k4L#G&zL2&j-EfozpzN zLfbVlq)sxYWszg}xeiJ%?z^4ZPuI+iku`+^sP1Ioo9_^9w|3ui6nUb1U$-(k9}HT3 zfy;J5!5MbFoU^EaxX&L96Nu4?HU=AITXNpyIu1FBgSr&B&-KOx{?S`;q<2tn?sAIi zj^?txGBz!0?L3i#GnsSdEojesVXd}b-s93C`$FV|*7zv{ie7b(@O`Lz5!|CX@R#4m zLt(vU3Yk$%l}#OMorl|Y@D_!QV<-cd1FFpR>+tk-ZHq-p*IP}LK6M4h((Pea-c(Ro z%E9!6tIZzf-x`p!%zhr#=dR}~J%uI2SC`?t%TsNCN|@#|$6~}lEJ1eqxvPKl9()z1 zmcIrLQno+;cuG=Rd7WpmKy6f6&7Qun*o^U=O}TlTf5cEMd90KEEMQUZiI}y1mEXSn zm39w++6(kvtTmOou=LkVr`Fk$aTC0(lB!5%CmjttsW^3tfk# z)6Vcvw)%&@g`yBqF?0^kZU2(xn)-7ZQ`B8r_2q9dqBoAHtW084Yty0CcMhoySl+$X z1~#Wj&W}k_4)xaVa-9N&fa=t`JrMBEAVWbW^#4#8)nE1sl*Yg((Li~c53h>%YRfzw zPsJ;eFc&~UBF>(jUc54WeDx7-i0qe7Ha*(IgE?PJIA6Kxa(Q&m(htGSyId>&R00>c z3o%agK5Xq+qmt9U%tKqZZe07-(ycSR8Fa36|QMCt&&YqR5vHF^gRI^N`if_&@o 
z0CIH*B}rPf<+vQKMSykjA2p1Sh5I~7QixxjYWiDS%R^M02Yxk zEGig_2*FYfSmv;?WDzV&FcuNOk~M}!17i^(SgHXFYHorE%OY4rF_!0mC2I_e2FCK7 zU}*v@3)on;6D*<_%X7f8eGJPMjO97O(gav|*;uw?ER+rwa=&uJ%%@w3Sd^Dwv2>8; z6^e_`OxpvG#L=L=o5{g4@Y#4|F&cF`m)TNtIFFQ;;=_TD z<_^NCXOHx|;gLD`Swz|vkaP&x3lBWcC{IXJfv;dx^Md4}APGVucVO0XOv zAb_QSjb%H*f;EO}2MF}CZiK`XBcaX0VxezyYr8d-=u&z$R$00k8HsUgixOv#93mqg zWMq?yy&f_waB~Okur)FFVf;)H6?W)Leavs?BAy%`pcRC?Q76dt3itEm*H(E=-KRRBC*<2G34=c51zzyq|S)*{ABhFupC&PD&U3Jh~tqwvhWPD z@MRno-;r`67!r7tS7XjA?{RDE0Jn=4z6m>?r{o#uWoiw((CdqU#$6d$R?5)?qQI@~ zD{AusK}>r)23bZPH8$PUu=azkz~ zL6bWLGmY9r(42Q`!)Yy`c|Y`(jd7H?2&Ek5dsqhyoqEJ zFb!J<#Yegs>D|ir^C5x9&x}1n9u3`2%lAX?O^2++PneK)mH578TN-jl*&|-q@P#&D z`=(8vUO})p7O^3K9ALmQjRqt1iZ2kft{OIgTkEl)m8ZJ3^{_$q5s>=?iRqRGqgiYa zvi@6t6Oc=8ZFPjrj?ktGW~c5GET(`(l7R5Dv5T>4i@RG} zBO6Nr8-!rd1}yBg2(z(}wNNU3lL_w?R$_O|E>tN<=6|*vI$F)GFpBnAb|Kqv*p;#m?C84#)>lmH`ybL$;dJ?vTMbe znpmyNuB!ASQfdvZG7SD6t8+$sQqD zF$J}cD;s7Ov1WGZLqYA6#x8wG2s{~p|07^cyYF%VMvP+41dvO(ss zK}hLC(OSOp5RmWVAcQknIv)whUt?HE=>s5MY%B-IKuGCBiELG4gN$Jzr4Q`4-(zF> zF%Ckoi0I@JkRxMQNa+J0b!;q0$3RHwLp|TRiw!b{g_J%Nt*MrcrFtBMV0o_diGVC0 z!$L|Q010MeIm8BGmp)X#P;kB>a^_Iw;ZB+qyY!(HZ>uy&qBz4ft&QlC11WtdqbO7M z$Q1SnyY!(V^_HCBhd7gbe-gC%L!OCU`cT^*$goHLWRI{*A4>SaQZmA%8)6G&mp&Bz z{*ul1$Tqn+*`%>cAId<4jpphY8g}VJ&Aq>xOm=k)jRuKL)ZB+`7_KoGQu=0.6.2"}, + "dynamodb": aws_common, # Starting with 7.14.0 python client is checking if it is connected to elasticsearch client. If its not it throws # UnsupportedProductError # https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/release-notes.html#rn-7-14-0 @@ -557,6 +558,7 @@ def get_long_description(): "dbt = datahub.ingestion.source.dbt.dbt_core:DBTCoreSource", "dbt-cloud = datahub.ingestion.source.dbt.dbt_cloud:DBTCloudSource", "druid = datahub.ingestion.source.sql.druid:DruidSource", + "dynamodb = datahub.ingestion.source.dynamodb.dynamodb:DynamoDBSource", "elasticsearch = datahub.ingestion.source.elastic_search:ElasticsearchSource", "feast = datahub.ingestion.source.feast:FeastRepositorySource", "glue = datahub.ingestion.source.aws.glue:GlueSource", diff --git a/metadata-ingestion/src/datahub/ingestion/source/dynamodb/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/dynamodb/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py b/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py new file mode 100644 index 0000000000000..6b7c118373673 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py @@ -0,0 +1,469 @@ +import logging +from dataclasses import field +from typing import Any, Counter, Dict, Iterable, List, Optional, Type, Union + +import boto3 +import pydantic +from botocore.client import BaseClient +from pydantic.fields import Field + +from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.source_common import DatasetSourceConfigMixin +from datahub.emitter.mce_builder import ( + make_data_platform_urn, + make_dataplatform_instance_urn, + make_dataset_urn_with_platform_instance, +) +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceCapability +from datahub.ingestion.api.workunit import MetadataWorkUnit +from 
datahub.ingestion.source.schema_inference.object import SchemaDescription +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalHandler, + StaleEntityRemovalSourceReport, + StatefulIngestionConfigBase, + StatefulStaleMetadataRemovalConfig, +) +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionSourceBase, +) +from datahub.metadata.com.linkedin.pegasus2avro.schema import ( + ArrayTypeClass, + BooleanTypeClass, + BytesTypeClass, + NullTypeClass, + NumberTypeClass, + RecordTypeClass, + SchemaField, + SchemaFieldDataType, + SchemalessClass, + SchemaMetadata, + StringTypeClass, + UnionTypeClass, +) +from datahub.metadata.schema_classes import ( + DataPlatformInstanceClass, + DatasetPropertiesClass, +) + +MAX_ITEMS_TO_RETRIEVE = 100 +PAGE_SIZE = 100 +MAX_SCHEMA_SIZE = 300 +MAX_PRIMARY_KEYS_SIZE = 100 + +logger: logging.Logger = logging.getLogger(__name__) + + +class DynamoDBConfig(DatasetSourceConfigMixin, StatefulIngestionConfigBase): + # TODO: refactor the config to use AwsConnectionConfig and create a method get_dynamodb_client + # in the class to provide optional region name input + aws_access_key_id: str = Field(description="AWS Access Key ID.") + aws_secret_access_key: pydantic.SecretStr = Field(description="AWS Secret Key.") + + # This config option allows user to include a list of items from a table when we scan and construct the schema, + # the key of this dict is table name and the value is the list of item primary keys in dynamodb format, + # if the table use composite key then the value should have partition key and sort key present + include_table_item: Optional[Dict[str, List[Dict]]] = Field( + default=None, + description="[Advanced] The primary keys of items of a table in dynamodb format the user would like to include in schema. " + 'Refer "Advanced Configurations" section for more details', + ) + + table_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="regex patterns for tables to filter in ingestion.", + ) + # Custom Stateful Ingestion settings + stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None + + +class DynamoDBSourceReport(StaleEntityRemovalSourceReport): + filtered: List[str] = field(default_factory=list) + + def report_dropped(self, name: str) -> None: + self.filtered.append(name) + + +# map attribute data types to native types +_attribute_type_to_native_type_mapping: Dict[str, str] = { + "N": "Numbers", + "B": "Bytes", + "S": "String", + "M": "Map", + "L": "List", + "SS": "String List", + "NS": "Number List", + "BS": "Binary Set", + "NULL": "Null", + # if the attribute type is NULL the attribute value will be true or false. 
+ "BOOL": "Boolean", + "mixed": "mixed", +} +# map DynamoDB attribute types to DataHub classes +_attribute_type_to_field_type_mapping: Dict[str, Type] = { + "N": NumberTypeClass, + "B": BytesTypeClass, + "S": StringTypeClass, + "M": RecordTypeClass, + "L": ArrayTypeClass, + "SS": ArrayTypeClass, + "NS": ArrayTypeClass, + "BS": ArrayTypeClass, + "NULL": BooleanTypeClass, + "BOOL": BooleanTypeClass, + "mixed": UnionTypeClass, +} + + +@platform_name("DynamoDB", id="dynamodb") +@config_class(DynamoDBConfig) +@support_status(SupportStatus.TESTING) +@capability( + SourceCapability.PLATFORM_INSTANCE, + "By default, platform_instance will use the AWS account id", +) +@capability( + SourceCapability.DELETION_DETECTION, + "Optionally enabled via `stateful_ingestion.remove_stale_metadata`", + supported=True, +) +class DynamoDBSource(StatefulIngestionSourceBase): + """ + This plugin extracts the following: + + AWS DynamoDB table names with their region, and infer schema of attribute names and types by scanning + the table + + """ + + config: DynamoDBConfig + report: DynamoDBSourceReport + platform: str + + def __init__(self, ctx: PipelineContext, config: DynamoDBConfig, platform: str): + super().__init__(config, ctx) + self.config = config + self.report = DynamoDBSourceReport() + self.platform = platform + + @classmethod + def create(cls, config_dict: dict, ctx: PipelineContext) -> "DynamoDBSource": + config = DynamoDBConfig.parse_obj(config_dict) + return cls(ctx, config, "dynamodb") + + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] + + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + # This is a offline call to get available region names from botocore library + session = boto3.Session() + dynamodb_regions = session.get_available_regions("dynamodb") + logger.info(f"region names {dynamodb_regions}") + + # traverse databases in sorted order so output is consistent + for region in dynamodb_regions: + try: + # create a new dynamodb client for each region, + # it seems for one client we could only list the table of one specific region, + # the list_tables() method don't take any config that related to region + # TODO: list table returns maximum number 100, need to implement pagination here + dynamodb_client = boto3.client( + "dynamodb", + region_name=region, + aws_access_key_id=self.config.aws_access_key_id + if self.config.aws_access_key_id + else None, + aws_secret_access_key=self.config.aws_secret_access_key.get_secret_value() + if self.config.aws_secret_access_key + else None, + ) + table_names: List[str] = dynamodb_client.list_tables()["TableNames"] + except Exception as ex: + # TODO: If regions is config input then this would be self.report.report_warning, + # we can create dynamodb client to take aws region or regions as user input + logger.info(f"exception happen in region {region}, skipping: {ex}") + continue + for table_name in sorted(table_names): + if not self.config.table_pattern.allowed(table_name): + continue + table_info = dynamodb_client.describe_table(TableName=table_name)[ + "Table" + ] + account_id = table_info["TableArn"].split(":")[4] + if not self.config.table_pattern.allowed(table_name): + self.report.report_dropped(table_name) + continue + platform_instance = self.config.platform_instance or account_id + dataset_name = f"{region}.{table_name}" + dataset_urn = 
make_dataset_urn_with_platform_instance( + platform=self.platform, + platform_instance=platform_instance, + name=dataset_name, + ) + dataset_properties = DatasetPropertiesClass( + tags=[], + customProperties={ + "table.arn": table_info["TableArn"], + "table.totalItems": str(table_info["ItemCount"]), + }, + ) + primary_key_dict = self.extract_primary_key_from_key_schema(table_info) + table_schema = self.construct_schema_from_dynamodb( + dynamodb_client, table_name + ) + schema_metadata = self.construct_schema_metadata( + table_name, + dataset_urn, + dataset_properties, + table_schema, + primary_key_dict, + ) + + yield MetadataChangeProposalWrapper( + entityUrn=dataset_urn, + aspect=schema_metadata, + ).as_workunit() + + yield MetadataChangeProposalWrapper( + entityUrn=dataset_urn, + aspect=dataset_properties, + ).as_workunit() + + platform_instance_aspect = DataPlatformInstanceClass( + platform=make_data_platform_urn(self.platform), + instance=make_dataplatform_instance_urn( + self.platform, platform_instance + ), + ) + + yield MetadataChangeProposalWrapper( + entityUrn=dataset_urn, + aspect=platform_instance_aspect, + ).as_workunit() + + def construct_schema_from_dynamodb( + self, + dynamodb_client: BaseClient, + table_name: str, + ) -> Dict[str, SchemaDescription]: + """ + This will use the dynamodb client to scan the given table to retrieve items with pagination, + and construct the schema of this table by reading the attributes of the retrieved items + """ + paginator = dynamodb_client.get_paginator("scan") + schema: Dict[str, SchemaDescription] = {} + """ + https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/dynamodb.html#DynamoDB.Paginator.Scan + Note that the behavior of the pagination does not align with the documentation according to https://stackoverflow.com/questions/39201093/how-to-use-boto3-pagination + + What we'll do is to create a paginator and boto3 library handles the pagination automatically. We'll iterate through pages + and retrieve the items from page. + + The MaxItems is the total number of items to return, and PageSize is the size of each page, we are assigning same value + to these two config. 
If MaxItems is more than PageSize then we expect MaxItems / PageSize pages in response_iterator will return + """ + self.include_table_item_to_schema(dynamodb_client, table_name, schema) + response_iterator = paginator.paginate( + TableName=table_name, + PaginationConfig={ + "MaxItems": MAX_ITEMS_TO_RETRIEVE, + "PageSize": PAGE_SIZE, + }, + ) + # iterate through pagination result to retrieve items + for page in response_iterator: + items = page["Items"] + if len(items) > 0: + self.construct_schema_from_items(items, schema) + + return schema + + def include_table_item_to_schema( + self, + dynamodb_client: Any, + table_name: str, + schema: Dict[str, SchemaDescription], + ) -> None: + """ + It will look up in the config include_table_item dict to see if the current table name exists as key, + if it exists then get the items by primary key from the table and put it to schema + """ + if self.config.include_table_item is None: + return + if table_name not in self.config.include_table_item.keys(): + return + primary_key_list = self.config.include_table_item.get(table_name) + assert isinstance(primary_key_list, List) + if len(primary_key_list) > MAX_PRIMARY_KEYS_SIZE: + logger.info( + f"the provided primary keys list size exceeded the max size for table {table_name}, we'll only process the first {MAX_PRIMARY_KEYS_SIZE} items" + ) + primary_key_list = primary_key_list[0:MAX_PRIMARY_KEYS_SIZE] + items = [] + response = dynamodb_client.batch_get_item( + RequestItems={table_name: {"Keys": primary_key_list}} + ).get("Responses", None) + if response is None: + logger.error( + f"failed to retrieve item from table {table_name} by the given key {primary_key_list}" + ) + return + items = response.get(table_name) + + self.construct_schema_from_items(items, schema) + + def construct_schema_from_items( + slef, items: List[Dict[str, Dict]], schema: Dict[str, SchemaDescription] + ) -> None: + """ + https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/dynamodb.html#DynamoDB.Client.scan + each item in the list is a dict, the key represents the attribute name, + and the value is a one entry dict, more details in the below code comments + we are writing our own construct schema method, take the attribute name as key and SchemaDescription as value + """ + for document in items: + # the key is the attribute name and the value is a dict with only one entry, + # whose key is the data type and value is the data + for key, value in document.items(): + if value is not None: + data_type = list(value.keys())[0] + if key not in schema: + schema[key] = { + "types": Counter(data_type), + "count": 1, + # It seems we don't have collapsed field name so we are using attribute name here + "delimited_name": key, + "type": data_type, + "nullable": False, + } + else: + # update the type count + schema[key]["types"].update({data_type: 1}) + schema[key]["count"] += 1 + # if we found an attribute name with different attribute type, we consider this attribute type as "mixed" + field_types = schema[key]["types"] + if len(field_types.keys()) > 1: + schema[key]["type"] = "mixed" + + def construct_schema_metadata( + self, + table_name: str, + dataset_urn: str, + dataset_properties: DatasetPropertiesClass, + schema: Dict[str, SchemaDescription], + primary_key_dict: Dict[str, str], + ) -> SchemaMetadata: + """ " + To construct the schema metadata, it will first sort the schema by the occurrence of attribute names + in descending order and truncate the schema by MAX_SCHEMA_SIZE, and then start to construct the + schema 
metadata sorted by attribute name + """ + + canonical_schema: List[SchemaField] = [] + schema_size = len(schema.values()) + table_fields = list(schema.values()) + + if schema_size > MAX_SCHEMA_SIZE: + # downsample the schema, using frequency as the sort key + self.report.report_warning( + key=dataset_urn, + reason=f"Downsampling the table schema because MAX_SCHEMA_SIZE threshold is {MAX_SCHEMA_SIZE}", + ) + # Add this information to the custom properties so user can know they are looking at down sampled schema + dataset_properties.customProperties["schema.downsampled"] = "True" + dataset_properties.customProperties["schema.totalFields"] = f"{schema_size}" + # append each schema field (sort so output is consistent) + for schema_field in sorted( + table_fields, + key=lambda x: x["delimited_name"], + )[0:MAX_SCHEMA_SIZE]: + field_path = schema_field["delimited_name"] + native_data_type = self.get_native_type(schema_field["type"], table_name) + type = self.get_field_type(schema_field["type"], table_name) + description = None + nullable = True + if field_path in primary_key_dict: + description = ( + "Partition Key" + if primary_key_dict.get(field_path) == "HASH" + else "Sort Key" + ) + # primary key should not be nullable + nullable = False + + field = SchemaField( + fieldPath=field_path, + nativeDataType=native_data_type, + type=type, + description=description, + nullable=nullable, + recursive=False, + ) + canonical_schema.append(field) + + # create schema metadata object for table + schema_metadata = SchemaMetadata( + schemaName=table_name, + platform=f"urn:li:dataPlatform:{self.platform}", + version=0, + hash="", + platformSchema=SchemalessClass(), + fields=canonical_schema, + ) + return schema_metadata + + def extract_primary_key_from_key_schema( + self, table_info: Dict[str, Any] + ) -> Dict[str, str]: + key_schema = table_info.get("KeySchema") + primary_key_dict = {} + assert isinstance(key_schema, List) + for key in key_schema: + attribute_name = key.get("AttributeName") + key_type = key.get("KeyType") + primary_key_dict[attribute_name] = key_type + return primary_key_dict + + def get_native_type(self, attribute_type: Union[type, str], table_name: str) -> str: + assert isinstance(attribute_type, str) + type_string: Optional[str] = _attribute_type_to_native_type_mapping.get( + attribute_type + ) + if type_string is None: + self.report.report_warning( + table_name, f"unable to map type {attribute_type} to native data type" + ) + return _attribute_type_to_native_type_mapping[attribute_type] + return type_string + + def get_field_type( + self, attribute_type: Union[type, str], table_name: str + ) -> SchemaFieldDataType: + assert isinstance(attribute_type, str) + type_class: Optional[type] = _attribute_type_to_field_type_mapping.get( + attribute_type + ) + + if type_class is None: + self.report.report_warning( + table_name, + f"unable to map type {attribute_type} to metadata schema field type", + ) + type_class = NullTypeClass + return SchemaFieldDataType(type=type_class()) + + def get_report(self) -> DynamoDBSourceReport: + return self.report diff --git a/metadata-ingestion/tests/integration/dynamodb/dynamodb_default_platform_instance_mces_golden.json b/metadata-ingestion/tests/integration/dynamodb/dynamodb_default_platform_instance_mces_golden.json new file mode 100644 index 0000000000000..f3d6c9809f5d2 --- /dev/null +++ b/metadata-ingestion/tests/integration/dynamodb/dynamodb_default_platform_instance_mces_golden.json @@ -0,0 +1,132 @@ +[ +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:dynamodb,123456789012.us-west-2.Location,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "Location", + "platform": "urn:li:dataPlatform:dynamodb", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.Schemaless": {} + }, + "fields": [ + { + "fieldPath": "address", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "String", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "city", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "String", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "partitionKey", + "nullable": false, + "description": "Partition Key", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "String", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "zip", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "Numbers", + "recursive": false, + "isPartOfKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1693396800000, + "runId": "dynamodb-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dynamodb,123456789012.us-west-2.Location,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "table.arn": "arn:aws:dynamodb:us-west-2:123456789012:table/Location", + "table.totalItems": "1" + }, + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1693396800000, + "runId": "dynamodb-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dynamodb,123456789012.us-west-2.Location,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:dynamodb", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:dynamodb,123456789012)" + } + }, + "systemMetadata": { + "lastObserved": 1693396800000, + "runId": "dynamodb-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dynamodb,123456789012.us-west-2.Location,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1693396800000, + "runId": "dynamodb-test" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/dynamodb/dynamodb_platform_instance_mces_golden.json b/metadata-ingestion/tests/integration/dynamodb/dynamodb_platform_instance_mces_golden.json new file mode 100644 index 0000000000000..b1176b1fd5786 --- /dev/null +++ b/metadata-ingestion/tests/integration/dynamodb/dynamodb_platform_instance_mces_golden.json @@ -0,0 +1,132 @@ +[ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dynamodb,dynamodb_test.us-west-2.Location,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "Location", + "platform": "urn:li:dataPlatform:dynamodb", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + 
"hash": "", + "platformSchema": { + "com.linkedin.schema.Schemaless": {} + }, + "fields": [ + { + "fieldPath": "address", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "String", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "city", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "String", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "partitionKey", + "nullable": false, + "description": "Partition Key", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "String", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "zip", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "Numbers", + "recursive": false, + "isPartOfKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1693396800000, + "runId": "dynamodb-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dynamodb,dynamodb_test.us-west-2.Location,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "table.arn": "arn:aws:dynamodb:us-west-2:123456789012:table/Location", + "table.totalItems": "1" + }, + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1693396800000, + "runId": "dynamodb-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dynamodb,dynamodb_test.us-west-2.Location,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:dynamodb", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:dynamodb,dynamodb_test)" + } + }, + "systemMetadata": { + "lastObserved": 1693396800000, + "runId": "dynamodb-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dynamodb,dynamodb_test.us-west-2.Location,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1693396800000, + "runId": "dynamodb-test" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/dynamodb/test_dynamodb.py b/metadata-ingestion/tests/integration/dynamodb/test_dynamodb.py new file mode 100644 index 0000000000000..ef2446ddd8d62 --- /dev/null +++ b/metadata-ingestion/tests/integration/dynamodb/test_dynamodb.py @@ -0,0 +1,95 @@ +import pathlib + +import boto3 +import pytest +from freezegun import freeze_time +from moto import mock_dynamodb + +from datahub.ingestion.run.pipeline import Pipeline +from tests.test_helpers import mce_helpers + +test_resources_dir = pathlib.Path(__file__).parent +FROZEN_TIME = "2023-08-30 12:00:00" + + +@freeze_time(FROZEN_TIME) +@mock_dynamodb +@pytest.mark.integration +def test_dynamodb(pytestconfig, tmp_path, mock_time): + boto3.setup_default_session() + client = boto3.client("dynamodb", region_name="us-west-2") + client.create_table( + TableName="Location", + KeySchema=[ + {"AttributeName": "partitionKey", "KeyType": "HASH"}, + ], + AttributeDefinitions=[ + {"AttributeName": "partitionKey", "AttributeType": "S"}, + ], + ProvisionedThroughput={"ReadCapacityUnits": 10, "WriteCapacityUnits": 10}, + ) + client.put_item( + TableName="Location", + Item={ + "partitionKey": {"S": "1"}, + "city": {"S": "San Francisco"}, + 
"address": {"S": "1st Market st"}, + "zip": {"N": "94000"}, + }, + ) + + pipeline_default_platform_instance = Pipeline.create( + { + "run_id": "dynamodb-test", + "source": { + "type": "dynamodb", + "config": { + "aws_access_key_id": "test", + "aws_secret_access_key": "test", + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/dynamodb_default_platform_instance_mces.json", + }, + }, + } + ) + pipeline_default_platform_instance.run() + pipeline_default_platform_instance.raise_from_status() + mce_helpers.check_golden_file( + pytestconfig, + output_path=f"{tmp_path}/dynamodb_default_platform_instance_mces.json", + golden_path=test_resources_dir + / "dynamodb_default_platform_instance_mces_golden.json", + ignore_paths=mce_helpers.IGNORE_PATH_TIMESTAMPS, + ) + + pipeline_with_platform_instance = Pipeline.create( + { + "run_id": "dynamodb-test", + "source": { + "type": "dynamodb", + "config": { + "platform_instance": "dynamodb_test", + "aws_access_key_id": "test", + "aws_secret_access_key": "test", + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/dynamodb_platform_instance_mces.json", + }, + }, + } + ) + pipeline_with_platform_instance.run() + pipeline_with_platform_instance.raise_from_status() + mce_helpers.check_golden_file( + pytestconfig, + output_path=f"{tmp_path}/dynamodb_platform_instance_mces.json", + golden_path=test_resources_dir / "dynamodb_platform_instance_mces_golden.json", + ignore_paths=mce_helpers.IGNORE_PATH_TIMESTAMPS, + ) diff --git a/metadata-service/war/src/main/resources/boot/data_platforms.json b/metadata-service/war/src/main/resources/boot/data_platforms.json index 2abe81d93236c..7a7cec60aa25f 100644 --- a/metadata-service/war/src/main/resources/boot/data_platforms.json +++ b/metadata-service/war/src/main/resources/boot/data_platforms.json @@ -544,5 +544,15 @@ "type": "FILE_SYSTEM", "logoUrl": "/assets/platforms/gcslogo.svg" } + }, + { + "urn": "urn:li:dataPlatform:dynamodb", + "aspect": { + "datasetNameDelimiter": ".", + "name": "dynamodb", + "displayName": "DynamoDB", + "type": "KEY_VALUE_STORE", + "logoUrl": "/assets/platforms/dynamodblogo.png" + } } ] From 99d7eb756c09a3313a4c1bda6f96a0953004b58c Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Sat, 16 Sep 2023 02:06:04 +0530 Subject: [PATCH 04/37] feat(ingest/bigquery): support bigquery profiling with sampling (#8794) --- .../ingestion/source/ge_data_profiler.py | 222 ++++++++++++------ .../ingestion/source/ge_profiling_config.py | 20 +- 2 files changed, 162 insertions(+), 80 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py index 4394d108486be..01e083d566168 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py @@ -616,6 +616,9 @@ def generate_dataset_profile( # noqa: C901 (complexity) logger.debug(f"profiling {self.dataset_name}: flushing stage 1 queries") self.query_combiner.flush() + if self.config.use_sampling and not self.config.limit: + self.update_dataset_batch_use_sampling(profile) + columns_profiling_queue: List[_SingleColumnSpec] = [] if columns_to_profile: for column in all_columns: @@ -737,6 +740,61 @@ def generate_dataset_profile( # noqa: C901 (complexity) self.query_combiner.flush() return profile + def update_dataset_batch_use_sampling(self, profile: DatasetProfileClass) -> None: 
+ if ( + self.dataset.engine.dialect.name.lower() == BIGQUERY + and profile.rowCount + and profile.rowCount > self.config.sample_size + ): + """ + According to BigQuery Sampling Docs(https://cloud.google.com/bigquery/docs/table-sampling), + BigQuery does not cache the results of a query that includes a TABLESAMPLE clause and the + query may return different results every time. Calculating different column level metrics + on different sampling results is possible however each query execution would incur the cost + of reading data from storage. Also, using different table samples may create non-coherent + representation of column level metrics, for example, minimum value of a column in one sample + can be greater than maximum value of the column in another sample. + + It is observed that for a simple select * query with TABLESAMPLE, results are cached and + stored in temporary table. This can be (ab)used and all column level profiling calculations + can be performed against it. + + Risks: + 1. All the risks mentioned in notes of `create_bigquery_temp_table` are also + applicable here. + 2. TABLESAMPLE query may read entire table for small tables that are written + as single data block. This may incorrectly label datasetProfile's partition as + "SAMPLE", although profile is for entire table. + 3. Table Sampling in BigQuery is a Pre-GA (Preview) feature. + """ + sample_pc = 100 * self.config.sample_size / profile.rowCount + sql = ( + f"SELECT * FROM {str(self.dataset._table)} " + + f"TABLESAMPLE SYSTEM ({sample_pc:.3f} percent)" + ) + temp_table_name = create_bigquery_temp_table( + self, + sql, + self.dataset_name, + self.dataset.engine.engine.raw_connection(), + ) + if temp_table_name: + self.dataset._table = sa.text(temp_table_name) + logger.debug(f"Setting table name to be {self.dataset._table}") + + if ( + profile.partitionSpec + and profile.partitionSpec.type == PartitionTypeClass.FULL_TABLE + ): + profile.partitionSpec = PartitionSpecClass( + type=PartitionTypeClass.QUERY, partition="SAMPLE" + ) + elif ( + profile.partitionSpec + and profile.partitionSpec.type == PartitionTypeClass.PARTITION + ): + profile.partitionSpec.partition += " SAMPLE" + @dataclasses.dataclass class GEContext: @@ -961,84 +1019,18 @@ def _generate_single_profile( if platform == BIGQUERY and ( custom_sql or self.config.limit or self.config.offset ): - # On BigQuery, we need to bypass GE's mechanism for creating temporary tables because - # it requires create/delete table permissions. - import google.cloud.bigquery.job.query - from google.cloud.bigquery.dbapi.cursor import Cursor as BigQueryCursor - - raw_connection = self.base_engine.raw_connection() - try: - cursor: "BigQueryCursor" = cast( - "BigQueryCursor", raw_connection.cursor() - ) - if custom_sql is not None: - # Note that limit and offset are not supported for custom SQL. 
- # Presence of custom SQL represents that the bigquery table - # is either partitioned or sharded - bq_sql = custom_sql - else: - bq_sql = f"SELECT * FROM `{table}`" - if self.config.limit: - bq_sql += f" LIMIT {self.config.limit}" - if self.config.offset: - bq_sql += f" OFFSET {self.config.offset}" - try: - cursor.execute(bq_sql) - except Exception as e: - if not self.config.catch_exceptions: - raise e - logger.exception( - f"Encountered exception while profiling {pretty_name}" - ) - self.report.report_warning( - pretty_name, - f"Profiling exception {e} when running custom sql {bq_sql}", - ) - return None - - # Great Expectations batch v2 API, which is the one we're using, requires - # a concrete table name against which profiling is executed. Normally, GE - # creates a table with an expiry time of 24 hours. However, we don't want the - # temporary tables to stick around that long, so we'd also have to delete them - # ourselves. As such, the profiler required create and delete table permissions - # on BigQuery. - # - # It turns out that we can (ab)use the BigQuery cached results feature - # to avoid creating temporary tables ourselves. For almost all queries, BigQuery - # will store the results in a temporary, cached results table when an explicit - # destination table is not provided. These tables are pretty easy to identify - # because they live in "anonymous datasets" and have a name that looks like - # "project-id._d60e97aec7f471046a960419adb6d44e98300db7.anon10774d0ea85fd20fe9671456c5c53d5f1b85e1b17bedb232dfce91661a219ee3" - # These tables are per-user and per-project, so there's no risk of permissions escalation. - # As per the docs, the cached results tables typically have a lifetime of 24 hours, - # which should be plenty for our purposes. - # See https://cloud.google.com/bigquery/docs/cached-results for more details. - # - # The code below extracts the name of the cached results table from the query job - # and points GE to that table for profiling. - # - # Risks: - # 1. If the query results are larger than the maximum response size, BigQuery will - # not cache the results. According to the docs https://cloud.google.com/bigquery/quotas, - # the maximum response size is 10 GB compressed. - # 2. The cache lifetime of 24 hours is "best-effort" and hence not guaranteed. - # 3. Tables with column-level security may not be cached, and tables with row-level - # security will not be cached. - # 4. BigQuery "discourages" using cached results directly, but notes that - # the current semantics do allow it. - # - # The better long-term solution would be to use a subquery avoid this whole - # temporary table dance. However, that would require either a) upgrading to - # use GE's batch v3 API or b) bypassing GE altogether. - - query_job: Optional[ - "google.cloud.bigquery.job.query.QueryJob" - ] = cursor._query_job - assert query_job - temp_destination_table = query_job.destination - bigquery_temp_table = f"{temp_destination_table.project}.{temp_destination_table.dataset_id}.{temp_destination_table.table_id}" - finally: - raw_connection.close() + if custom_sql is not None: + # Note that limit and offset are not supported for custom SQL. 
+ bq_sql = custom_sql + else: + bq_sql = f"SELECT * FROM `{table}`" + if self.config.limit: + bq_sql += f" LIMIT {self.config.limit}" + if self.config.offset: + bq_sql += f" OFFSET {self.config.offset}" + bigquery_temp_table = create_bigquery_temp_table( + self, bq_sql, pretty_name, self.base_engine.raw_connection() + ) if platform == BIGQUERY: if bigquery_temp_table: @@ -1128,6 +1120,7 @@ def _get_ge_dataset( **batch_kwargs, }, ) + if platform == BIGQUERY: # This is done as GE makes the name as DATASET.TABLE # but we want it to be PROJECT.DATASET.TABLE instead for multi-project setups @@ -1153,3 +1146,76 @@ def _get_column_types_to_ignore(dialect_name: str) -> List[str]: return ["JSON"] return [] + + +def create_bigquery_temp_table( + instance: Union[DatahubGEProfiler, _SingleDatasetProfiler], + bq_sql: str, + table_pretty_name: str, + raw_connection: Any, +) -> Optional[str]: + # On BigQuery, we need to bypass GE's mechanism for creating temporary tables because + # it requires create/delete table permissions. + import google.cloud.bigquery.job.query + from google.cloud.bigquery.dbapi.cursor import Cursor as BigQueryCursor + + try: + cursor: "BigQueryCursor" = cast("BigQueryCursor", raw_connection.cursor()) + try: + cursor.execute(bq_sql) + except Exception as e: + if not instance.config.catch_exceptions: + raise e + logger.exception( + f"Encountered exception while profiling {table_pretty_name}" + ) + instance.report.report_warning( + table_pretty_name, + f"Profiling exception {e} when running custom sql {bq_sql}", + ) + return None + + # Great Expectations batch v2 API, which is the one we're using, requires + # a concrete table name against which profiling is executed. Normally, GE + # creates a table with an expiry time of 24 hours. However, we don't want the + # temporary tables to stick around that long, so we'd also have to delete them + # ourselves. As such, the profiler required create and delete table permissions + # on BigQuery. + # + # It turns out that we can (ab)use the BigQuery cached results feature + # to avoid creating temporary tables ourselves. For almost all queries, BigQuery + # will store the results in a temporary, cached results table when an explicit + # destination table is not provided. These tables are pretty easy to identify + # because they live in "anonymous datasets" and have a name that looks like + # "project-id._d60e97aec7f471046a960419adb6d44e98300db7.anon10774d0ea85fd20fe9671456c5c53d5f1b85e1b17bedb232dfce91661a219ee3" + # These tables are per-user and per-project, so there's no risk of permissions escalation. + # As per the docs, the cached results tables typically have a lifetime of 24 hours, + # which should be plenty for our purposes. + # See https://cloud.google.com/bigquery/docs/cached-results for more details. + # + # The code below extracts the name of the cached results table from the query job + # and points GE to that table for profiling. + # + # Risks: + # 1. If the query results are larger than the maximum response size, BigQuery will + # not cache the results. According to the docs https://cloud.google.com/bigquery/quotas, + # the maximum response size is 10 GB compressed. + # 2. The cache lifetime of 24 hours is "best-effort" and hence not guaranteed. + # 3. Tables with column-level security may not be cached, and tables with row-level + # security will not be cached. + # 4. BigQuery "discourages" using cached results directly, but notes that + # the current semantics do allow it. 
+ # + # The better long-term solution would be to use a subquery avoid this whole + # temporary table dance. However, that would require either a) upgrading to + # use GE's batch v3 API or b) bypassing GE altogether. + + query_job: Optional[ + "google.cloud.bigquery.job.query.QueryJob" + ] = cursor._query_job + assert query_job + temp_destination_table = query_job.destination + bigquery_temp_table = f"{temp_destination_table.project}.{temp_destination_table.dataset_id}.{temp_destination_table.table_id}" + return bigquery_temp_table + finally: + raw_connection.close() diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py index 1488b55062b68..77761c529ba0b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py @@ -145,10 +145,26 @@ class GEProfilingConfig(ConfigModel): # Hidden option - used for debugging purposes. catch_exceptions: bool = Field(default=True, description="") - partition_profiling_enabled: bool = Field(default=True, description="") + partition_profiling_enabled: bool = Field( + default=True, + description="Whether to profile partitioned tables. Only BigQuery supports this. " + "If enabled, latest partition data is used for profiling.", + ) partition_datetime: Optional[datetime.datetime] = Field( default=None, - description="For partitioned datasets profile only the partition which matches the datetime or profile the latest one if not set. Only Bigquery supports this.", + description="If specified, profile only the partition which matches this datetime. " + "If not specified, profile the latest partition. Only Bigquery supports this.", + ) + use_sampling: bool = Field( + default=True, + description="Whether to profile column level stats on sample of table. Only BigQuery supports this. " + "If enabled, profiling is done on rows sampled from table. Sampling is not done for smaller tables. ", + ) + + sample_size: int = Field( + default=1000, + description="Number of rows to be sampled from table for column level profiling." 
+ "Applicable only if `use_sampling` is set to True.", ) @pydantic.root_validator(pre=True) From 5882fe407535b2362dcfcda7c1e123e6067d7e89 Mon Sep 17 00:00:00 2001 From: Kos Korchak <97058061+kkorchak@users.noreply.github.com> Date: Mon, 18 Sep 2023 16:14:02 -0400 Subject: [PATCH 05/37] Fix for edit_documentation and glossary_navigation cypress tests (#8838) --- .../cypress/e2e/glossary/glossary_navigation.js | 6 ++---- .../cypress/e2e/mutations/edit_documentation.js | 13 +++++++------ 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js b/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js index cd5622d0cd903..de9fa7ecda1f0 100644 --- a/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js +++ b/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js @@ -28,8 +28,6 @@ describe("glossary sidebar navigation test", () => { //ensure the new term is under the parent term group in the navigation sidebar cy.get('*[class^="GlossaryBrowser"]').contains(glossaryTermGroup).click(); cy.get('*[class^="GlossaryEntitiesList"]').contains(glossaryTerm).should("be.visible"); - cy.get('*[class^="GlossaryBrowser"] [aria-label="down"]').click().wait(1000); - cy.get('*[class^="GlossaryBrowser"]').contains(glossaryTerm).should("not.exist"); //move a term group from the root level to be under a parent term group cy.goToGlossaryList(); cy.clickOptionWithText(glossaryTermGroup); @@ -41,8 +39,8 @@ describe("glossary sidebar navigation test", () => { cy.get("button").contains("Move").click(); cy.waitTextVisible("Moved Term Group!"); //ensure it is no longer on the sidebar navigator at the top level but shows up under the new parent - cy.get('*[class^="GlossaryBrowser"] [aria-label="down"]').click().wait(1000); - cy.get('*[class^="GlossaryBrowser"]').contains(glossaryTermGroup).should("not.exist"); + cy.get('*[class^="GlossaryBrowser"]').contains(glossaryParentGroup).click(); + cy.get('*[class^="GlossaryEntitiesList"]').contains(glossaryTermGroup).should("be.visible"); //delete a term group cy.goToGlossaryList(); cy.clickOptionWithText(glossaryParentGroup); diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js b/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js index e4e5a39ce1100..83b66e2cb2549 100644 --- a/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js @@ -37,8 +37,8 @@ describe("edit documentation and link to dataset", () => { cy.contains("Sample doc").trigger("mouseover", { force: true }); cy.get('[data-icon="delete"]').click(); cy.waitTextVisible("Link Removed"); - cy.get("button").contains("Add Link").click(); - cy.get("#addLinkForm_url").type(wrong_url); + cy.get("button").contains("Add Link").click().wait(1000); + cy.get('[role="dialog"] #addLinkForm_url').type(wrong_url); cy.waitTextVisible("This field must be a valid url."); cy.focused().clear(); cy.waitTextVisible("A URL is required."); @@ -54,9 +54,9 @@ describe("edit documentation and link to dataset", () => { it("open test domain page, remove and add dataset link", () => { cy.loginWithCredentials(); cy.visit("/domain/urn:li:domain:marketing/Entities"); - cy.get("[role='tab']").contains("Documentation").click(); - cy.get("button").contains("Add Link").click(); - cy.get("#addLinkForm_url").type(wrong_url); + cy.waitTextVisible("SampleCypressKafkaDataset"); + cy.get("button").contains("Add 
Link").click().wait(1000); + cy.get('[role="dialog"] #addLinkForm_url').type(wrong_url); cy.waitTextVisible("This field must be a valid url."); cy.focused().clear(); cy.waitTextVisible("A URL is required."); @@ -66,6 +66,7 @@ describe("edit documentation and link to dataset", () => { cy.get('[role="dialog"] button').contains("Add").click(); cy.waitTextVisible("Link Added"); cy.get("[role='tab']").contains("Documentation").click(); + cy.waitTextVisible("Edit"); cy.get(`[href='${correct_url}']`).should("be.visible"); cy.contains("Sample doc").trigger("mouseover", { force: true }); cy.get('[data-icon="delete"]').click(); @@ -94,4 +95,4 @@ describe("edit documentation and link to dataset", () => { cy.waitTextVisible("Foo field description has changed"); cy.waitTextVisible("(edited)"); }); -}); +}); \ No newline at end of file From 85fa5a1c4fdf2b4c4439558fa3a4cbbfd3491fbf Mon Sep 17 00:00:00 2001 From: Chris Collins Date: Mon, 18 Sep 2023 16:14:33 -0400 Subject: [PATCH 06/37] feat(ui/java) Update domains to be nested (#8841) Allow the ability to now nest domains underneath other domains. This should work much like the business glossary where you can add domains underneath other domains, move domains underneath other domains or at the root, and navigate domains using a nice new navigator. --- .../datahub/graphql/GmsGraphQLEngine.java | 15 +- .../exception/DataHubGraphQLErrorCode.java | 1 + .../graphql/featureflags/FeatureFlags.java | 1 + .../resolvers/config/AppConfigResolver.java | 1 + .../domain/CreateDomainResolver.java | 29 ++- .../domain/DeleteDomainResolver.java | 6 + .../domain/DomainEntitiesResolver.java | 12 +- .../resolvers/domain/ListDomainsResolver.java | 16 +- .../domain/ParentDomainsResolver.java | 59 +++++ .../resolvers/mutate/MoveDomainResolver.java | 89 +++++++ .../resolvers/mutate/UpdateNameResolver.java | 14 ++ .../resolvers/mutate/util/DomainUtils.java | 222 ++++++++++++++++++ .../src/main/resources/app.graphql | 7 + .../src/main/resources/entity.graphql | 50 +++- .../domain/CreateDomainResolverTest.java | 177 +++++++++++++- .../domain/DeleteDomainResolverTest.java | 27 +++ .../domain/ListDomainsResolverTest.java | 48 +++- .../domain/MoveDomainResolverTest.java | 140 +++++++++++ .../domain/ParentDomainsResolverTest.java | 95 ++++++++ .../glossary/UpdateNameResolverTest.java | 12 + datahub-web-react/src/app/SearchRoutes.tsx | 14 +- datahub-web-react/src/app/analytics/event.ts | 9 + .../src/app/domain/CreateDomainModal.tsx | 97 ++++++-- .../src/app/domain/DomainIcon.tsx | 11 + .../src/app/domain/DomainRoutes.tsx | 39 +++ .../src/app/domain/DomainSearch.tsx | 143 +++++++++++ .../src/app/domain/DomainsContext.tsx | 21 ++ .../src/app/domain/DomainsList.tsx | 12 +- .../src/app/domain/ManageDomainsPage.tsx | 31 ++- .../nestedDomains/DomainsSidebarHeader.tsx | 58 +++++ .../app/domain/nestedDomains/DomainsTitle.tsx | 18 ++ .../nestedDomains/ManageDomainsPageV2.tsx | 60 +++++ .../nestedDomains/ManageDomainsSidebar.tsx | 28 +++ .../app/domain/nestedDomains/RootDomains.tsx | 31 +++ .../domainNavigator/DomainNavigator.tsx | 37 +++ .../domainNavigator/DomainNode.tsx | 137 +++++++++++ .../domainNavigator/useHasDomainChildren.ts | 29 +++ .../src/app/domain/useListDomains.tsx | 27 +++ datahub-web-react/src/app/domain/utils.ts | 72 +++++- .../src/app/entity/EntityRegistry.tsx | 6 + .../src/app/entity/domain/DomainEntity.tsx | 22 +- .../domain/preview/DomainEntitiesSnippet.tsx | 45 ++++ .../src/app/entity/domain/preview/Preview.tsx | 21 +- .../entity/glossaryNode/preview/Preview.tsx | 2 +- 
.../entity/glossaryTerm/preview/Preview.tsx | 2 +- .../EntityDropdown/DomainParentSelect.tsx | 108 +++++++++ .../shared/EntityDropdown/EntityDropdown.tsx | 35 +-- .../shared/EntityDropdown/MoveDomainModal.tsx | 102 ++++++++ .../EntityDropdown/NodeParentSelect.tsx | 79 ++----- .../shared/EntityDropdown/useDeleteEntity.tsx | 7 + .../EntityDropdown/useHandleDeleteDomain.ts | 27 +++ .../useHandleMoveDomainComplete.ts | 40 ++++ .../EntityDropdown/useParentSelector.ts | 76 ++++++ .../app/entity/shared/EntityDropdown/utils.ts | 50 +++- .../src/app/entity/shared/constants.ts | 1 + .../containers/profile/EntityProfile.tsx | 2 + .../containers/profile/header/EntityName.tsx | 28 ++- .../PlatformContentContainer.tsx | 1 + .../PlatformContent/PlatformContentView.tsx | 13 +- .../profile/sidebar/Domain/SetDomainModal.tsx | 78 +++--- .../src/app/entity/shared/types.ts | 3 + .../src/app/glossary/BusinessGlossaryPage.tsx | 6 - .../src/app/glossary/GlossarySidebar.tsx | 12 +- .../policy/PolicyPrivilegeForm.tsx | 88 ++++--- .../src/app/preview/DefaultPreviewCard.tsx | 8 +- .../renderer/component/DomainSearchList.tsx | 58 ++++- .../renderer/component/HoverEntityTooltip.tsx | 6 +- .../src/app/search/SearchResultList.tsx | 3 +- .../src/app/search/SearchResults.tsx | 3 +- .../autoComplete/AutoCompleteEntity.tsx | 6 +- .../src/app/search/filters/FilterOption.tsx | 21 +- .../{ParentNodes.tsx => ParentEntities.tsx} | 53 +++-- .../src/app/search/filters/utils.tsx | 15 ++ .../src/app/search/sidebar/BrowseSidebar.tsx | 3 +- .../src/app/search/sidebar/ExpandableNode.tsx | 30 +-- .../src/app/shared/LogoCountCard.tsx | 26 +- .../src/app/shared/admin/HeaderLinks.tsx | 9 +- .../src/app/shared/components.tsx | 49 ++++ .../src/app/shared/deleteUtils.ts | 4 +- .../src/app/shared/sidebar/components.tsx | 23 ++ .../src/app/shared/styleUtils.ts | 7 + .../src/app/shared/tags/AddTagsTermsModal.tsx | 6 +- .../src/app/shared/tags/DomainLink.tsx | 9 +- datahub-web-react/src/app/shared/useToggle.ts | 24 +- datahub-web-react/src/app/useAppConfig.ts | 5 + datahub-web-react/src/appConfigContext.tsx | 1 + datahub-web-react/src/conf/Global.ts | 1 + datahub-web-react/src/graphql/app.graphql | 1 + datahub-web-react/src/graphql/domain.graphql | 30 ++- .../src/graphql/fragments.graphql | 32 +++ datahub-web-react/src/graphql/preview.graphql | 5 + datahub-web-react/src/graphql/search.graphql | 10 + .../authorization/ResolvedResourceSpec.java | 32 --- .../com/linkedin/domain/DomainProperties.pdl | 15 ++ .../DomainFieldResolverProvider.java | 68 +++++- .../authorization/DataHubAuthorizerTest.java | 145 ++++++++++-- .../src/main/resources/application.yml | 1 + .../datahubusage/DataHubUsageEventType.java | 1 + node_modules/.yarn-integrity | 12 + .../cypress/cypress/e2e/mutations/domains.js | 23 +- yarn.lock | 4 + 101 files changed, 3083 insertions(+), 415 deletions(-) create mode 100644 datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/ParentDomainsResolver.java create mode 100644 datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/MoveDomainResolver.java create mode 100644 datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/MoveDomainResolverTest.java create mode 100644 datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/ParentDomainsResolverTest.java create mode 100644 datahub-web-react/src/app/domain/DomainIcon.tsx create mode 100644 datahub-web-react/src/app/domain/DomainRoutes.tsx create mode 100644 
datahub-web-react/src/app/domain/DomainSearch.tsx create mode 100644 datahub-web-react/src/app/domain/DomainsContext.tsx create mode 100644 datahub-web-react/src/app/domain/nestedDomains/DomainsSidebarHeader.tsx create mode 100644 datahub-web-react/src/app/domain/nestedDomains/DomainsTitle.tsx create mode 100644 datahub-web-react/src/app/domain/nestedDomains/ManageDomainsPageV2.tsx create mode 100644 datahub-web-react/src/app/domain/nestedDomains/ManageDomainsSidebar.tsx create mode 100644 datahub-web-react/src/app/domain/nestedDomains/RootDomains.tsx create mode 100644 datahub-web-react/src/app/domain/nestedDomains/domainNavigator/DomainNavigator.tsx create mode 100644 datahub-web-react/src/app/domain/nestedDomains/domainNavigator/DomainNode.tsx create mode 100644 datahub-web-react/src/app/domain/nestedDomains/domainNavigator/useHasDomainChildren.ts create mode 100644 datahub-web-react/src/app/domain/useListDomains.tsx create mode 100644 datahub-web-react/src/app/entity/domain/preview/DomainEntitiesSnippet.tsx create mode 100644 datahub-web-react/src/app/entity/shared/EntityDropdown/DomainParentSelect.tsx create mode 100644 datahub-web-react/src/app/entity/shared/EntityDropdown/MoveDomainModal.tsx create mode 100644 datahub-web-react/src/app/entity/shared/EntityDropdown/useHandleDeleteDomain.ts create mode 100644 datahub-web-react/src/app/entity/shared/EntityDropdown/useHandleMoveDomainComplete.ts create mode 100644 datahub-web-react/src/app/entity/shared/EntityDropdown/useParentSelector.ts rename datahub-web-react/src/app/search/filters/{ParentNodes.tsx => ParentEntities.tsx} (54%) create mode 100644 datahub-web-react/src/app/shared/components.tsx create mode 100644 datahub-web-react/src/app/shared/sidebar/components.tsx create mode 100644 datahub-web-react/src/app/shared/styleUtils.ts create mode 100644 node_modules/.yarn-integrity create mode 100644 yarn.lock diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java index 682710ad5d539..d86234cf59306 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java @@ -81,6 +81,7 @@ import com.linkedin.datahub.graphql.generated.Notebook; import com.linkedin.datahub.graphql.generated.Owner; import com.linkedin.datahub.graphql.generated.OwnershipTypeEntity; +import com.linkedin.datahub.graphql.generated.ParentDomainsResult; import com.linkedin.datahub.graphql.generated.PolicyMatchCriterionValue; import com.linkedin.datahub.graphql.generated.QueryEntity; import com.linkedin.datahub.graphql.generated.QuerySubject; @@ -124,6 +125,7 @@ import com.linkedin.datahub.graphql.resolvers.domain.DeleteDomainResolver; import com.linkedin.datahub.graphql.resolvers.domain.DomainEntitiesResolver; import com.linkedin.datahub.graphql.resolvers.domain.ListDomainsResolver; +import com.linkedin.datahub.graphql.resolvers.domain.ParentDomainsResolver; import com.linkedin.datahub.graphql.resolvers.domain.SetDomainResolver; import com.linkedin.datahub.graphql.resolvers.domain.UnsetDomainResolver; import com.linkedin.datahub.graphql.resolvers.embed.UpdateEmbedResolver; @@ -186,6 +188,7 @@ import com.linkedin.datahub.graphql.resolvers.mutate.BatchSetDomainResolver; import com.linkedin.datahub.graphql.resolvers.mutate.BatchUpdateDeprecationResolver; import 
com.linkedin.datahub.graphql.resolvers.mutate.BatchUpdateSoftDeletedResolver; +import com.linkedin.datahub.graphql.resolvers.mutate.MoveDomainResolver; import com.linkedin.datahub.graphql.resolvers.mutate.MutableTypeBatchResolver; import com.linkedin.datahub.graphql.resolvers.mutate.MutableTypeResolver; import com.linkedin.datahub.graphql.resolvers.mutate.RemoveLinkResolver; @@ -944,6 +947,7 @@ private void configureMutationResolvers(final RuntimeWiring.Builder builder) { .dataFetcher("removeGroup", new RemoveGroupResolver(this.entityClient)) .dataFetcher("updateUserStatus", new UpdateUserStatusResolver(this.entityClient)) .dataFetcher("createDomain", new CreateDomainResolver(this.entityClient, this.entityService)) + .dataFetcher("moveDomain", new MoveDomainResolver(this.entityService, this.entityClient)) .dataFetcher("deleteDomain", new DeleteDomainResolver(entityClient)) .dataFetcher("setDomain", new SetDomainResolver(this.entityClient, this.entityService)) .dataFetcher("batchSetDomain", new BatchSetDomainResolver(this.entityService)) @@ -1029,6 +1033,13 @@ private void configureGenericEntityResolvers(final RuntimeWiring.Builder builder .dataFetcher("entities", new EntityTypeBatchResolver(entityTypes, (env) -> ((BrowseResults) env.getSource()).getEntities())) ) + .type("ParentDomainsResult", typeWiring -> typeWiring + .dataFetcher("domains", new EntityTypeBatchResolver(entityTypes, + (env) -> { + final ParentDomainsResult result = env.getSource(); + return result != null ? result.getDomains() : null; + })) + ) .type("EntityRelationshipLegacy", typeWiring -> typeWiring .dataFetcher("entity", new EntityTypeResolver(entityTypes, (env) -> ((EntityRelationshipLegacy) env.getSource()).getEntity())) @@ -1675,8 +1686,8 @@ private void configureGlossaryRelationshipResolvers(final RuntimeWiring.Builder private void configureDomainResolvers(final RuntimeWiring.Builder builder) { builder.type("Domain", typeWiring -> typeWiring .dataFetcher("entities", new DomainEntitiesResolver(this.entityClient)) - .dataFetcher("relationships", new EntityRelationshipsResultResolver(graphClient) - ) + .dataFetcher("parentDomains", new ParentDomainsResolver(this.entityClient)) + .dataFetcher("relationships", new EntityRelationshipsResultResolver(graphClient)) ); builder.type("DomainAssociation", typeWiring -> typeWiring .dataFetcher("domain", diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/exception/DataHubGraphQLErrorCode.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/exception/DataHubGraphQLErrorCode.java index db3e1dd03e419..44695c334855f 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/exception/DataHubGraphQLErrorCode.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/exception/DataHubGraphQLErrorCode.java @@ -4,6 +4,7 @@ public enum DataHubGraphQLErrorCode { BAD_REQUEST(400), UNAUTHORIZED(403), NOT_FOUND(404), + CONFLICT(409), SERVER_ERROR(500); private final int _code; diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java index de3c217db01ec..4d6133f18df05 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java @@ -16,4 +16,5 @@ public class FeatureFlags { private PreProcessHooks preProcessHooks; private 
boolean showAcrylInfo = false; private boolean showAccessManagement = false; + private boolean nestedDomainsEnabled = false; } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java index 09df985b19cf5..f6bc68caa0821 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java @@ -172,6 +172,7 @@ public CompletableFuture get(final DataFetchingEnvironment environmen .setShowBrowseV2(_featureFlags.isShowBrowseV2()) .setShowAcrylInfo(_featureFlags.isShowAcrylInfo()) .setShowAccessManagement(_featureFlags.isShowAccessManagement()) + .setNestedDomainsEnabled(_featureFlags.isNestedDomainsEnabled()) .build(); appConfig.setFeatureFlags(featureFlagsConfig); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/CreateDomainResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/CreateDomainResolver.java index 39aa1ea28da20..1930cdc1f8667 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/CreateDomainResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/CreateDomainResolver.java @@ -1,14 +1,18 @@ package com.linkedin.datahub.graphql.resolvers.domain; import com.linkedin.common.AuditStamp; +import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; import com.linkedin.data.template.SetMode; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.authorization.AuthorizationUtils; import com.linkedin.datahub.graphql.exception.AuthorizationException; +import com.linkedin.datahub.graphql.exception.DataHubGraphQLErrorCode; +import com.linkedin.datahub.graphql.exception.DataHubGraphQLException; import com.linkedin.datahub.graphql.generated.CreateDomainInput; import com.linkedin.datahub.graphql.generated.OwnerEntityType; import com.linkedin.datahub.graphql.generated.OwnershipType; +import com.linkedin.datahub.graphql.resolvers.mutate.util.DomainUtils; import com.linkedin.datahub.graphql.resolvers.mutate.util.OwnerUtils; import com.linkedin.domain.DomainProperties; import com.linkedin.entity.client.EntityClient; @@ -19,8 +23,11 @@ import com.linkedin.mxe.MetadataChangeProposal; import graphql.schema.DataFetcher; import graphql.schema.DataFetchingEnvironment; + +import java.net.URISyntaxException; import java.util.UUID; import java.util.concurrent.CompletableFuture; + import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -45,9 +52,9 @@ public CompletableFuture get(DataFetchingEnvironment environment) throws final QueryContext context = environment.getContext(); final CreateDomainInput input = bindArgument(environment.getArgument("input"), CreateDomainInput.class); + final Urn parentDomain = input.getParentDomain() != null ? UrnUtils.getUrn(input.getParentDomain()) : null; return CompletableFuture.supplyAsync(() -> { - if (!AuthorizationUtils.canCreateDomains(context)) { throw new AuthorizationException("Unauthorized to perform this action. 
Please contact your DataHub administrator."); } @@ -64,6 +71,17 @@ public CompletableFuture get(DataFetchingEnvironment environment) throws throw new IllegalArgumentException("This Domain already exists!"); } + if (parentDomain != null && !_entityClient.exists(parentDomain, context.getAuthentication())) { + throw new IllegalArgumentException("Parent Domain does not exist!"); + } + + if (DomainUtils.hasNameConflict(input.getName(), parentDomain, context, _entityClient)) { + throw new DataHubGraphQLException( + String.format("\"%s\" already exists in this domain. Please pick a unique name.", input.getName()), + DataHubGraphQLErrorCode.CONFLICT + ); + } + // Create the MCP final MetadataChangeProposal proposal = buildMetadataChangeProposalWithKey(key, DOMAIN_ENTITY_NAME, DOMAIN_PROPERTIES_ASPECT_NAME, mapDomainProperties(input, context)); @@ -77,6 +95,8 @@ public CompletableFuture get(DataFetchingEnvironment environment) throws } OwnerUtils.addCreatorAsOwner(context, domainUrn, OwnerEntityType.CORP_USER, ownershipType, _entityService); return domainUrn; + } catch (DataHubGraphQLException e) { + throw e; } catch (Exception e) { log.error("Failed to create Domain with id: {}, name: {}: {}", input.getId(), input.getName(), e.getMessage()); throw new RuntimeException(String.format("Failed to create Domain with id: %s, name: %s", input.getId(), input.getName()), e); @@ -89,6 +109,13 @@ private DomainProperties mapDomainProperties(final CreateDomainInput input, fina result.setName(input.getName()); result.setDescription(input.getDescription(), SetMode.IGNORE_NULL); result.setCreated(new AuditStamp().setActor(UrnUtils.getUrn(context.getActorUrn())).setTime(System.currentTimeMillis())); + if (input.getParentDomain() != null) { + try { + result.setParentDomain(Urn.createFromString(input.getParentDomain())); + } catch (URISyntaxException e) { + throw new RuntimeException(String.format("Failed to create Domain Urn from string: %s", input.getParentDomain()), e); + } + } return result; } } \ No newline at end of file diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/DeleteDomainResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/DeleteDomainResolver.java index 60a03fcddcc4d..9ab90e8b4ff72 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/DeleteDomainResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/DeleteDomainResolver.java @@ -4,6 +4,7 @@ import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.authorization.AuthorizationUtils; import com.linkedin.datahub.graphql.exception.AuthorizationException; +import com.linkedin.datahub.graphql.resolvers.mutate.util.DomainUtils; import com.linkedin.entity.client.EntityClient; import graphql.schema.DataFetcher; import graphql.schema.DataFetchingEnvironment; @@ -32,6 +33,11 @@ public CompletableFuture get(final DataFetchingEnvironment environment) if (AuthorizationUtils.canManageDomains(context) || AuthorizationUtils.canDeleteEntity(urn, context)) { try { + // Make sure there are no child domains + if (DomainUtils.hasChildDomains(urn, context, _entityClient)) { + throw new RuntimeException(String.format("Cannot delete domain %s which has child domains", domainUrn)); + } + _entityClient.deleteEntity(urn, context.getAuthentication()); log.info(String.format("I've successfully deleted the entity %s with urn", domainUrn)); diff --git 
a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/DomainEntitiesResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/DomainEntitiesResolver.java index 06bfa36fc3c14..0bf551c4683e6 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/DomainEntitiesResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/DomainEntitiesResolver.java @@ -1,6 +1,5 @@ package com.linkedin.datahub.graphql.resolvers.domain; -import com.google.common.collect.ImmutableList; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.generated.Domain; import com.linkedin.datahub.graphql.generated.DomainEntitiesInput; @@ -67,17 +66,22 @@ public CompletableFuture get(final DataFetchingEnvironment enviro try { + final CriterionArray criteria = new CriterionArray(); final Criterion filterCriterion = new Criterion() .setField(DOMAINS_FIELD_NAME + ".keyword") .setCondition(Condition.EQUAL) .setValue(urn); + criteria.add(filterCriterion); + if (input.getFilters() != null) { + input.getFilters().forEach(filter -> { + criteria.add(new Criterion().setField(filter.getField()).setValue(filter.getValue())); + }); + } return UrnSearchResultsMapper.map(_entityClient.searchAcrossEntities( SEARCHABLE_ENTITY_TYPES.stream().map(EntityTypeMapper::getName).collect(Collectors.toList()), query, - new Filter().setOr(new ConjunctiveCriterionArray( - new ConjunctiveCriterion().setAnd(new CriterionArray(ImmutableList.of(filterCriterion))) - )), + new Filter().setOr(new ConjunctiveCriterionArray(new ConjunctiveCriterion().setAnd(criteria))), start, count, null, diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/ListDomainsResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/ListDomainsResolver.java index 6ed8639592d6e..3a751e502eb10 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/ListDomainsResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/ListDomainsResolver.java @@ -1,22 +1,24 @@ package com.linkedin.datahub.graphql.resolvers.domain; import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; import com.linkedin.datahub.graphql.QueryContext; -import com.linkedin.datahub.graphql.authorization.AuthorizationUtils; -import com.linkedin.datahub.graphql.exception.AuthorizationException; import com.linkedin.datahub.graphql.generated.Domain; import com.linkedin.datahub.graphql.generated.EntityType; import com.linkedin.datahub.graphql.generated.ListDomainsInput; import com.linkedin.datahub.graphql.generated.ListDomainsResult; +import com.linkedin.datahub.graphql.resolvers.mutate.util.DomainUtils; import com.linkedin.entity.client.EntityClient; import com.linkedin.metadata.Constants; import com.linkedin.metadata.query.SearchFlags; +import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.query.filter.SortCriterion; import com.linkedin.metadata.query.filter.SortOrder; import com.linkedin.metadata.search.SearchEntity; import com.linkedin.metadata.search.SearchResult; import graphql.schema.DataFetcher; import graphql.schema.DataFetchingEnvironment; + import java.util.ArrayList; import java.util.List; import java.util.concurrent.CompletableFuture; @@ -30,7 +32,6 @@ * Resolver used for listing all Domains defined within DataHub. 
Requires the MANAGE_DOMAINS platform privilege. */ public class ListDomainsResolver implements DataFetcher> { - private static final Integer DEFAULT_START = 0; private static final Integer DEFAULT_COUNT = 20; private static final String DEFAULT_QUERY = ""; @@ -48,18 +49,19 @@ public CompletableFuture get(final DataFetchingEnvironment en return CompletableFuture.supplyAsync(() -> { - if (AuthorizationUtils.canCreateDomains(context)) { final ListDomainsInput input = bindArgument(environment.getArgument("input"), ListDomainsInput.class); final Integer start = input.getStart() == null ? DEFAULT_START : input.getStart(); final Integer count = input.getCount() == null ? DEFAULT_COUNT : input.getCount(); final String query = input.getQuery() == null ? DEFAULT_QUERY : input.getQuery(); + final Urn parentDomainUrn = input.getParentDomain() != null ? UrnUtils.getUrn(input.getParentDomain()) : null; + final Filter filter = DomainUtils.buildParentDomainFilter(parentDomainUrn); try { - // First, get all group Urns. + // First, get all domain Urns. final SearchResult gmsResult = _entityClient.search( Constants.DOMAIN_ENTITY_NAME, query, - null, + filter, new SortCriterion().setField(DOMAIN_CREATED_TIME_INDEX_FIELD_NAME).setOrder(SortOrder.DESCENDING), start, count, @@ -78,8 +80,6 @@ public CompletableFuture get(final DataFetchingEnvironment en } catch (Exception e) { throw new RuntimeException("Failed to list domains", e); } - } - throw new AuthorizationException("Unauthorized to perform this action. Please contact your DataHub administrator."); }); } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/ParentDomainsResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/ParentDomainsResolver.java new file mode 100644 index 0000000000000..dcaa7d61ed90c --- /dev/null +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/domain/ParentDomainsResolver.java @@ -0,0 +1,59 @@ +package com.linkedin.datahub.graphql.resolvers.domain; + +import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.datahub.graphql.QueryContext; +import com.linkedin.datahub.graphql.generated.Entity; +import com.linkedin.datahub.graphql.generated.ParentDomainsResult; +import com.linkedin.datahub.graphql.resolvers.mutate.util.DomainUtils; +import com.linkedin.entity.client.EntityClient; +import graphql.schema.DataFetcher; +import graphql.schema.DataFetchingEnvironment; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.CompletableFuture; + +import static com.linkedin.metadata.Constants.DOMAIN_ENTITY_NAME; + +public class ParentDomainsResolver implements DataFetcher> { + + private final EntityClient _entityClient; + + public ParentDomainsResolver(final EntityClient entityClient) { + _entityClient = entityClient; + } + + @Override + public CompletableFuture get(DataFetchingEnvironment environment) { + final QueryContext context = environment.getContext(); + final Urn urn = UrnUtils.getUrn(((Entity) environment.getSource()).getUrn()); + final List parentDomains = new ArrayList<>(); + final Set visitedParentUrns = new HashSet<>(); + + if (!DOMAIN_ENTITY_NAME.equals(urn.getEntityType())) { + throw new IllegalArgumentException(String.format("Failed to resolve parents for entity type %s", urn)); + } + + return CompletableFuture.supplyAsync(() -> { + try { + Entity parentDomain = DomainUtils.getParentDomain(urn, 
context, _entityClient); + + while (parentDomain != null && !visitedParentUrns.contains(parentDomain.getUrn())) { + parentDomains.add(parentDomain); + visitedParentUrns.add(parentDomain.getUrn()); + parentDomain = DomainUtils.getParentDomain(Urn.createFromString(parentDomain.getUrn()), context, _entityClient); + } + + final ParentDomainsResult result = new ParentDomainsResult(); + result.setCount(parentDomains.size()); + result.setDomains(parentDomains); + return result; + } catch (Exception e) { + throw new RuntimeException(String.format("Failed to load parent domains for entity %s", urn), e); + } + }); + } +} diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/MoveDomainResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/MoveDomainResolver.java new file mode 100644 index 0000000000000..e5e3a5a0ee42e --- /dev/null +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/MoveDomainResolver.java @@ -0,0 +1,89 @@ +package com.linkedin.datahub.graphql.resolvers.mutate; + +import com.linkedin.common.urn.CorpuserUrn; +import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.data.template.SetMode; +import com.linkedin.datahub.graphql.QueryContext; +import com.linkedin.datahub.graphql.authorization.AuthorizationUtils; +import com.linkedin.datahub.graphql.exception.AuthorizationException; +import com.linkedin.datahub.graphql.exception.DataHubGraphQLErrorCode; +import com.linkedin.datahub.graphql.exception.DataHubGraphQLException; +import com.linkedin.datahub.graphql.generated.MoveDomainInput; +import com.linkedin.datahub.graphql.resolvers.ResolverUtils; +import com.linkedin.datahub.graphql.resolvers.mutate.util.DomainUtils; +import com.linkedin.domain.DomainProperties; +import com.linkedin.entity.client.EntityClient; +import com.linkedin.metadata.Constants; +import com.linkedin.metadata.entity.EntityService; +import com.linkedin.metadata.entity.EntityUtils; +import graphql.schema.DataFetcher; +import graphql.schema.DataFetchingEnvironment; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; + +import java.util.concurrent.CompletableFuture; + +@Slf4j +@RequiredArgsConstructor +public class MoveDomainResolver implements DataFetcher> { + + private final EntityService _entityService; + private final EntityClient _entityClient; + + @Override + public CompletableFuture get(DataFetchingEnvironment environment) throws Exception { + final MoveDomainInput input = ResolverUtils.bindArgument(environment.getArgument("input"), MoveDomainInput.class); + final QueryContext context = environment.getContext(); + final Urn resourceUrn = UrnUtils.getUrn(input.getResourceUrn()); + final Urn newParentDomainUrn = input.getParentDomain() != null ? UrnUtils.getUrn(input.getParentDomain()) : null; + + return CompletableFuture.supplyAsync(() -> { + if (!AuthorizationUtils.canManageDomains(context)) { + throw new AuthorizationException("Unauthorized to perform this action. 
Please contact your DataHub administrator."); + } + + try { + if (!resourceUrn.getEntityType().equals(Constants.DOMAIN_ENTITY_NAME)) { + throw new IllegalArgumentException("Resource is not a domain."); + } + + DomainProperties properties = (DomainProperties) EntityUtils.getAspectFromEntity( + resourceUrn.toString(), + Constants.DOMAIN_PROPERTIES_ASPECT_NAME, _entityService, + null + ); + + if (properties == null) { + throw new IllegalArgumentException("Domain properties do not exist."); + } + + if (newParentDomainUrn != null) { + if (!newParentDomainUrn.getEntityType().equals(Constants.DOMAIN_ENTITY_NAME)) { + throw new IllegalArgumentException("Parent entity is not a domain."); + } + if (!_entityService.exists(newParentDomainUrn)) { + throw new IllegalArgumentException("Parent entity does not exist."); + } + } + + if (DomainUtils.hasNameConflict(properties.getName(), newParentDomainUrn, context, _entityClient)) { + throw new DataHubGraphQLException( + String.format("\"%s\" already exists in the destination domain. Please pick a unique name.", properties.getName()), + DataHubGraphQLErrorCode.CONFLICT + ); + } + + properties.setParentDomain(newParentDomainUrn, SetMode.REMOVE_IF_NULL); + Urn actor = CorpuserUrn.createFromString(context.getActorUrn()); + MutationUtils.persistAspect(resourceUrn, Constants.DOMAIN_PROPERTIES_ASPECT_NAME, properties, actor, _entityService); + return true; + } catch (DataHubGraphQLException e) { + throw e; + } catch (Exception e) { + log.error("Failed to move domain {} to parent {} : {}", input.getResourceUrn(), input.getParentDomain(), e.getMessage()); + throw new RuntimeException(String.format("Failed to move domain %s to %s", input.getResourceUrn(), input.getParentDomain()), e); + } + }); + } +} diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/UpdateNameResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/UpdateNameResolver.java index 225bee54142c4..0e316ac1296ee 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/UpdateNameResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/UpdateNameResolver.java @@ -6,8 +6,11 @@ import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.authorization.AuthorizationUtils; import com.linkedin.datahub.graphql.exception.AuthorizationException; +import com.linkedin.datahub.graphql.exception.DataHubGraphQLErrorCode; +import com.linkedin.datahub.graphql.exception.DataHubGraphQLException; import com.linkedin.datahub.graphql.generated.UpdateNameInput; import com.linkedin.datahub.graphql.resolvers.dataproduct.DataProductAuthorizationUtils; +import com.linkedin.datahub.graphql.resolvers.mutate.util.DomainUtils; import com.linkedin.datahub.graphql.resolvers.mutate.util.GlossaryUtils; import com.linkedin.dataproduct.DataProductProperties; import com.linkedin.domain.DomainProperties; @@ -124,14 +127,25 @@ private Boolean updateDomainName( try { DomainProperties domainProperties = (DomainProperties) EntityUtils.getAspectFromEntity( targetUrn.toString(), Constants.DOMAIN_PROPERTIES_ASPECT_NAME, _entityService, null); + if (domainProperties == null) { throw new IllegalArgumentException("Domain does not exist"); } + + if (DomainUtils.hasNameConflict(input.getName(), DomainUtils.getParentDomainSafely(domainProperties), context, _entityClient)) { + throw new DataHubGraphQLException( + String.format("\"%s\" already exists in this domain. 
Please pick a unique name.", input.getName()), + DataHubGraphQLErrorCode.CONFLICT + ); + } + domainProperties.setName(input.getName()); Urn actor = CorpuserUrn.createFromString(context.getActorUrn()); persistAspect(targetUrn, Constants.DOMAIN_PROPERTIES_ASPECT_NAME, domainProperties, actor, _entityService); return true; + } catch (DataHubGraphQLException e) { + throw e; } catch (Exception e) { throw new RuntimeException(String.format("Failed to perform update against input %s", input), e); } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/DomainUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/DomainUtils.java index b57160be09d32..585fbdf53a2ba 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/DomainUtils.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/DomainUtils.java @@ -5,29 +5,55 @@ import com.linkedin.common.UrnArray; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; +import com.linkedin.data.DataMap; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.authorization.AuthorizationUtils; import com.datahub.authorization.ConjunctivePrivilegeGroup; import com.datahub.authorization.DisjunctivePrivilegeGroup; +import com.linkedin.datahub.graphql.generated.Entity; import com.linkedin.datahub.graphql.generated.ResourceRefInput; +import com.linkedin.datahub.graphql.types.common.mappers.UrnToEntityMapper; +import com.linkedin.domain.DomainProperties; import com.linkedin.domain.Domains; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.client.EntityClient; import com.linkedin.metadata.Constants; import com.linkedin.metadata.authorization.PoliciesConfig; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.entity.EntityUtils; +import com.linkedin.metadata.query.filter.Condition; +import com.linkedin.metadata.query.filter.ConjunctiveCriterion; +import com.linkedin.metadata.query.filter.ConjunctiveCriterionArray; +import com.linkedin.metadata.query.filter.Criterion; +import com.linkedin.metadata.query.filter.CriterionArray; +import com.linkedin.metadata.query.filter.Filter; +import com.linkedin.metadata.search.SearchEntity; +import com.linkedin.metadata.search.SearchResult; import com.linkedin.mxe.MetadataChangeProposal; + +import com.linkedin.r2.RemoteInvocationException; import java.util.ArrayList; +import java.util.Collections; import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; import javax.annotation.Nonnull; import javax.annotation.Nullable; + import lombok.extern.slf4j.Slf4j; import static com.linkedin.datahub.graphql.resolvers.mutate.MutationUtils.*; +import static com.linkedin.metadata.Constants.*; // TODO: Move to consuming from DomainService. 
@Slf4j public class DomainUtils { + private static final String PARENT_DOMAIN_INDEX_FIELD_NAME = "parentDomain.keyword"; + private static final String HAS_PARENT_DOMAIN_INDEX_FIELD_NAME = "hasParentDomain"; + private static final String NAME_INDEX_FIELD_NAME = "name"; + private static final ConjunctivePrivilegeGroup ALL_PRIVILEGES_GROUP = new ConjunctivePrivilegeGroup(ImmutableList.of( PoliciesConfig.EDIT_ENTITY_PRIVILEGE.getType() )); @@ -85,4 +111,200 @@ public static void validateDomain(Urn domainUrn, EntityService entityService) { throw new IllegalArgumentException(String.format("Failed to validate Domain with urn %s. Urn does not exist.", domainUrn)); } } + + private static List buildRootDomainCriteria() { + final List criteria = new ArrayList<>(); + + criteria.add( + new Criterion() + .setField(HAS_PARENT_DOMAIN_INDEX_FIELD_NAME) + .setValue("false") + .setCondition(Condition.EQUAL) + ); + criteria.add( + new Criterion() + .setField(HAS_PARENT_DOMAIN_INDEX_FIELD_NAME) + .setValue("") + .setCondition(Condition.IS_NULL) + ); + + return criteria; + } + + private static List buildParentDomainCriteria(@Nonnull final Urn parentDomainUrn) { + final List criteria = new ArrayList<>(); + + criteria.add( + new Criterion() + .setField(HAS_PARENT_DOMAIN_INDEX_FIELD_NAME) + .setValue("true") + .setCondition(Condition.EQUAL) + ); + criteria.add( + new Criterion() + .setField(PARENT_DOMAIN_INDEX_FIELD_NAME) + .setValue(parentDomainUrn.toString()) + .setCondition(Condition.EQUAL) + ); + + return criteria; + } + + private static Criterion buildNameCriterion(@Nonnull final String name) { + return new Criterion() + .setField(NAME_INDEX_FIELD_NAME) + .setValue(name) + .setCondition(Condition.EQUAL); + } + + /** + * Builds a filter that ORs together the root parent criterion / ANDs together the parent domain criterion. + * The reason for the OR on root is elastic can have a null|false value to represent an root domain in the index. + * @param name an optional name to AND in to each condition of the filter + * @param parentDomainUrn the parent domain (null means root). 
+ * @return the Filter + */ + public static Filter buildNameAndParentDomainFilter(@Nullable final String name, @Nullable final Urn parentDomainUrn) { + if (parentDomainUrn == null) { + return new Filter().setOr( + new ConjunctiveCriterionArray( + buildRootDomainCriteria().stream().map(parentCriterion -> { + final CriterionArray array = new CriterionArray(parentCriterion); + if (name != null) { + array.add(buildNameCriterion(name)); + } + return new ConjunctiveCriterion().setAnd(array); + }).collect(Collectors.toList()) + ) + ); + } + + final CriterionArray andArray = new CriterionArray(buildParentDomainCriteria(parentDomainUrn)); + if (name != null) { + andArray.add(buildNameCriterion(name)); + } + return new Filter().setOr( + new ConjunctiveCriterionArray( + new ConjunctiveCriterion().setAnd(andArray) + ) + ); + } + + public static Filter buildParentDomainFilter(@Nullable final Urn parentDomainUrn) { + return buildNameAndParentDomainFilter(null, parentDomainUrn); + } + + /** + * Check if a domain has any child domains + * @param domainUrn the URN of the domain to check + * @param context query context (includes authorization context to authorize the request) + * @param entityClient client used to perform the check + * @return true if the domain has any child domains, false if it does not + */ + public static boolean hasChildDomains( + @Nonnull final Urn domainUrn, + @Nonnull final QueryContext context, + @Nonnull final EntityClient entityClient + ) throws RemoteInvocationException { + Filter parentDomainFilter = buildParentDomainFilter(domainUrn); + // Search for entities matching parent domain + // Limit count to 1 for existence check + final SearchResult searchResult = entityClient.filter( + DOMAIN_ENTITY_NAME, + parentDomainFilter, + null, + 0, + 1, + context.getAuthentication()); + return (searchResult.getNumEntities() > 0); + } + + private static Map getDomainsByNameAndParent( + @Nonnull final String name, + @Nullable final Urn parentDomainUrn, + @Nonnull final QueryContext context, + @Nonnull final EntityClient entityClient + ) { + try { + final Filter filter = buildNameAndParentDomainFilter(name, parentDomainUrn); + + final SearchResult searchResult = entityClient.filter( + DOMAIN_ENTITY_NAME, + filter, + null, + 0, + 1000, + context.getAuthentication()); + + final Set domainUrns = searchResult.getEntities() + .stream() + .map(SearchEntity::getEntity) + .collect(Collectors.toSet()); + + return entityClient.batchGetV2( + DOMAIN_ENTITY_NAME, + domainUrns, + Collections.singleton(DOMAIN_PROPERTIES_ASPECT_NAME), + context.getAuthentication()); + } catch (Exception e) { + throw new RuntimeException("Failed fetching Domains by name and parent", e); + } + } + + public static boolean hasNameConflict( + @Nonnull final String name, + @Nullable final Urn parentDomainUrn, + @Nonnull final QueryContext context, + @Nonnull final EntityClient entityClient + ) { + final Map entities = getDomainsByNameAndParent(name, parentDomainUrn, context, entityClient); + + // Even though we searched by name, do one more pass to check the name is unique + return entities.values().stream().anyMatch(entityResponse -> { + if (entityResponse.getAspects().containsKey(DOMAIN_PROPERTIES_ASPECT_NAME)) { + DataMap dataMap = entityResponse.getAspects().get(DOMAIN_PROPERTIES_ASPECT_NAME).getValue().data(); + DomainProperties domainProperties = new DomainProperties(dataMap); + return (domainProperties.hasName() && domainProperties.getName().equals(name)); + } + return false; + }); + } + + @Nullable + public static Entity 
getParentDomain( + @Nonnull final Urn urn, + @Nonnull final QueryContext context, + @Nonnull final EntityClient entityClient + ) { + try { + final EntityResponse entityResponse = entityClient.getV2( + DOMAIN_ENTITY_NAME, + urn, + Collections.singleton(DOMAIN_PROPERTIES_ASPECT_NAME), + context.getAuthentication() + ); + + if (entityResponse != null && entityResponse.getAspects().containsKey(DOMAIN_PROPERTIES_ASPECT_NAME)) { + final DomainProperties properties = new DomainProperties(entityResponse.getAspects().get(DOMAIN_PROPERTIES_ASPECT_NAME).getValue().data()); + final Urn parentDomainUrn = getParentDomainSafely(properties); + return parentDomainUrn != null ? UrnToEntityMapper.map(parentDomainUrn) : null; + } + } catch (Exception e) { + throw new RuntimeException(String.format("Failed to retrieve parent domain for entity %s", urn), e); + } + + return null; + } + + /** + * Get a parent domain only if hasParentDomain was set. There is strange elastic behavior where moving a domain + * to the root leaves the parentDomain field set but makes hasParentDomain false. This helper makes sure that queries + * to elastic where hasParentDomain=false and parentDomain=value only gives us the parentDomain if hasParentDomain=true. + * @param properties the domain properties aspect + * @return the parentDomain or null + */ + @Nullable + public static Urn getParentDomainSafely(@Nonnull final DomainProperties properties) { + return properties.hasParentDomain() ? properties.getParentDomain() : null; + } } \ No newline at end of file diff --git a/datahub-graphql-core/src/main/resources/app.graphql b/datahub-graphql-core/src/main/resources/app.graphql index a5057bcf644da..075a3b0fac43b 100644 --- a/datahub-graphql-core/src/main/resources/app.graphql +++ b/datahub-graphql-core/src/main/resources/app.graphql @@ -441,10 +441,17 @@ type FeatureFlagsConfig { Whether we should show CTAs in the UI related to moving to Managed DataHub by Acryl. """ showAcrylInfo: Boolean! + """ Whether we should show AccessManagement tab in the datahub UI. """ showAccessManagement: Boolean! + + """ + Enables the nested Domains feature that allows users to have sub-Domains. + If this is off, Domains appear "flat" again. + """ + nestedDomainsEnabled: Boolean! } """ diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index 044c405942a3c..39f86948c77c4 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -434,6 +434,11 @@ type Mutation { """ createDomain(input: CreateDomainInput!): String + """ + Moves a domain to be parented under another domain. + """ + moveDomain(input: MoveDomainInput!): Boolean + """ Delete a Domain """ @@ -7735,6 +7740,21 @@ input UpdateParentNodeInput { resourceUrn: String! } +""" +Input for updating the parent domain of a domain. +""" +input MoveDomainInput { + """ + The new parent domain urn. If parentDomain is null, this will remove the parent from this entity + """ + parentDomain: String + + """ + The primary key of the resource to update the parent domain for + """ + resourceUrn: String! 
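[Editor's note — illustrative sketch, not part of the patch.] As the MoveDomainInput description above states, a null parentDomain means "remove the parent". On the Java side, MoveDomainResolver implements this with a `SetMode.REMOVE_IF_NULL` write against the DomainProperties aspect, which also explains the `hasParentDomain()` guard in getParentDomainSafely. The sketch below shows that behavior in isolation under the assumption of standard generated Pegasus setters; the class name, `main` method, and example URNs are hypothetical.

```java
import com.linkedin.common.urn.Urn;
import com.linkedin.common.urn.UrnUtils;
import com.linkedin.data.template.SetMode;
import com.linkedin.domain.DomainProperties;

/** Illustrative only: how a null parent in MoveDomainInput maps to the aspect write. */
public class MoveDomainSketch {
  public static void main(String[] args) {
    final Urn parent = UrnUtils.getUrn("urn:li:domain:marketing"); // example urn
    final DomainProperties props = new DomainProperties().setName("email-campaigns");

    // Moving under a parent: the parentDomain field is set on the properties aspect.
    props.setParentDomain(parent, SetMode.REMOVE_IF_NULL);
    System.out.println(props.hasParentDomain()); // true

    // Moving to the root: null with REMOVE_IF_NULL removes the field entirely,
    // matching the GraphQL contract that a null parentDomain clears the parent.
    props.setParentDomain(null, SetMode.REMOVE_IF_NULL);
    System.out.println(props.hasParentDomain()); // false
  }
}
```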
+} + """ Input for updating the name of an entity """ @@ -9584,15 +9604,31 @@ type Domain implements Entity { """ entities(input: DomainEntitiesInput): SearchResults + """ + Recursively get the lineage of parent domains for this entity + """ + parentDomains: ParentDomainsResult + """ Edges extending from this entity """ relationships(input: RelationshipsInput!): EntityRelationshipsResult } +""" +All of the parent domains starting from a single Domain through all of its ancestors +""" +type ParentDomainsResult { + """ + The number of parent domains bubbling up for this entity + """ + count: Int! - - + """ + A list of parent domains in order from direct parent, to parent's parent etc. If there are no parents, return an empty list + """ + domains: [Entity!]! +} """ Properties about a domain @@ -9652,6 +9688,11 @@ input CreateDomainInput { Optional description for the Domain """ description: String + + """ + Optional parent domain urn for the domain + """ + parentDomain: String } """ @@ -9672,6 +9713,11 @@ input ListDomainsInput { Optional search query """ query: String + + """ + Optional parent domain + """ + parentDomain: String } """ diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/CreateDomainResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/CreateDomainResolverTest.java index 8c19f1dc3eb34..560a3865ce9e1 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/CreateDomainResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/CreateDomainResolverTest.java @@ -6,35 +6,57 @@ import com.linkedin.common.urn.UrnUtils; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.generated.CreateDomainInput; +import com.linkedin.datahub.graphql.resolvers.mutate.util.DomainUtils; import com.linkedin.domain.DomainProperties; +import com.linkedin.entity.Aspect; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.EnvelopedAspect; +import com.linkedin.entity.EnvelopedAspectMap; import com.linkedin.entity.client.EntityClient; import com.linkedin.events.metadata.ChangeType; import com.linkedin.metadata.Constants; import com.linkedin.metadata.key.DomainKey; +import com.linkedin.metadata.search.SearchEntity; +import com.linkedin.metadata.search.SearchEntityArray; +import com.linkedin.metadata.search.SearchResult; import com.linkedin.metadata.utils.GenericRecordUtils; import com.linkedin.metadata.entity.EntityService; import com.linkedin.mxe.MetadataChangeProposal; import com.linkedin.r2.RemoteInvocationException; import graphql.schema.DataFetchingEnvironment; + +import java.util.HashMap; +import java.util.Map; import java.util.concurrent.CompletionException; import org.mockito.Mockito; import org.testng.annotations.Test; import static com.linkedin.datahub.graphql.TestUtils.*; +import static com.linkedin.metadata.Constants.DOMAIN_PROPERTIES_ASPECT_NAME; import static org.testng.Assert.*; public class CreateDomainResolverTest { + private static final Urn TEST_DOMAIN_URN = Urn.createFromTuple("domain", "test-id"); + private static final Urn TEST_PARENT_DOMAIN_URN = Urn.createFromTuple("domain", "test-parent-id"); + private static final CreateDomainInput TEST_INPUT = new CreateDomainInput( "test-id", "test-name", - "test-description" + "test-description", + TEST_PARENT_DOMAIN_URN.toString() + ); + + private static final CreateDomainInput TEST_INPUT_NO_PARENT_DOMAIN = new 
CreateDomainInput( + "test-id", + "test-name", + "test-description", + null ); + private static final Urn TEST_ACTOR_URN = UrnUtils.getUrn("urn:li:corpuser:test"); - private static final String TEST_ENTITY_URN = "urn:li:dataset:(urn:li:dataPlatform:mysql,my-test,PROD)"; - private static final String TEST_TAG_1_URN = "urn:li:tag:test-id-1"; - private static final String TEST_TAG_2_URN = "urn:li:tag:test-id-2"; + @Test public void testGetSuccess() throws Exception { @@ -43,12 +65,31 @@ public void testGetSuccess() throws Exception { EntityService mockService = getMockEntityService(); CreateDomainResolver resolver = new CreateDomainResolver(mockClient, mockService); + Mockito.when(mockClient.exists( + Mockito.eq(TEST_DOMAIN_URN), + Mockito.any(Authentication.class) + )).thenReturn(false); + + Mockito.when(mockClient.exists( + Mockito.eq(TEST_PARENT_DOMAIN_URN), + Mockito.any(Authentication.class) + )).thenReturn(true); + // Execute resolver QueryContext mockContext = getMockAllowContext(); DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class); Mockito.when(mockEnv.getArgument(Mockito.eq("input"))).thenReturn(TEST_INPUT); Mockito.when(mockEnv.getContext()).thenReturn(mockContext); + Mockito.when(mockClient.filter( + Mockito.eq(Constants.DOMAIN_ENTITY_NAME), + Mockito.eq(DomainUtils.buildNameAndParentDomainFilter(TEST_INPUT.getName(), TEST_PARENT_DOMAIN_URN)), + Mockito.eq(null), + Mockito.any(Integer.class), + Mockito.any(Integer.class), + Mockito.any(Authentication.class) + )).thenReturn(new SearchResult().setEntities(new SearchEntityArray())); + resolver.get(mockEnv).get(); final DomainKey key = new DomainKey(); @@ -60,6 +101,7 @@ public void testGetSuccess() throws Exception { props.setDescription("test-description"); props.setName("test-name"); props.setCreated(new AuditStamp().setActor(TEST_ACTOR_URN).setTime(0L)); + props.setParentDomain(TEST_PARENT_DOMAIN_URN); proposal.setAspectName(Constants.DOMAIN_PROPERTIES_ASPECT_NAME); proposal.setAspect(GenericRecordUtils.serializeAspect(props)); proposal.setChangeType(ChangeType.UPSERT); @@ -72,6 +114,133 @@ public void testGetSuccess() throws Exception { ); } + @Test + public void testGetSuccessNoParentDomain() throws Exception { + EntityClient mockClient = Mockito.mock(EntityClient.class); + EntityService mockService = Mockito.mock(EntityService.class); + CreateDomainResolver resolver = new CreateDomainResolver(mockClient, mockService); + + Mockito.when(mockClient.exists( + Mockito.eq(TEST_DOMAIN_URN), + Mockito.any(Authentication.class) + )).thenReturn(false); + + QueryContext mockContext = getMockAllowContext(); + DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class); + Mockito.when(mockEnv.getArgument(Mockito.eq("input"))).thenReturn(TEST_INPUT_NO_PARENT_DOMAIN); + Mockito.when(mockEnv.getContext()).thenReturn(mockContext); + + Mockito.when(mockClient.filter( + Mockito.eq(Constants.DOMAIN_ENTITY_NAME), + Mockito.eq(DomainUtils.buildNameAndParentDomainFilter(TEST_INPUT.getName(), null)), + Mockito.eq(null), + Mockito.any(Integer.class), + Mockito.any(Integer.class), + Mockito.any(Authentication.class) + )).thenReturn(new SearchResult().setEntities(new SearchEntityArray())); + + resolver.get(mockEnv).get(); + + final DomainKey key = new DomainKey(); + key.setId("test-id"); + final MetadataChangeProposal proposal = new MetadataChangeProposal(); + proposal.setEntityKeyAspect(GenericRecordUtils.serializeAspect(key)); + proposal.setEntityType(Constants.DOMAIN_ENTITY_NAME); + DomainProperties props = 
new DomainProperties(); + props.setDescription("test-description"); + props.setName("test-name"); + props.setCreated(new AuditStamp().setActor(TEST_ACTOR_URN).setTime(0L)); + proposal.setAspectName(Constants.DOMAIN_PROPERTIES_ASPECT_NAME); + proposal.setAspect(GenericRecordUtils.serializeAspect(props)); + proposal.setChangeType(ChangeType.UPSERT); + + Mockito.verify(mockClient, Mockito.times(1)).ingestProposal( + Mockito.argThat(new CreateDomainProposalMatcher(proposal)), + Mockito.any(Authentication.class), + Mockito.eq(false) + ); + } + + @Test + public void testGetInvalidParent() throws Exception { + EntityClient mockClient = Mockito.mock(EntityClient.class); + EntityService mockService = Mockito.mock(EntityService.class); + CreateDomainResolver resolver = new CreateDomainResolver(mockClient, mockService); + + Mockito.when(mockClient.exists( + Mockito.eq(TEST_DOMAIN_URN), + Mockito.any(Authentication.class) + )).thenReturn(false); + + Mockito.when(mockClient.exists( + Mockito.eq(TEST_PARENT_DOMAIN_URN), + Mockito.any(Authentication.class) + )).thenReturn(false); + + QueryContext mockContext = getMockAllowContext(); + DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class); + Mockito.when(mockEnv.getArgument(Mockito.eq("input"))).thenReturn(TEST_INPUT); + Mockito.when(mockEnv.getContext()).thenReturn(mockContext); + + assertThrows(CompletionException.class, () -> resolver.get(mockEnv).join()); + } + + @Test + public void testGetNameConflict() throws Exception { + EntityClient mockClient = Mockito.mock(EntityClient.class); + EntityService mockService = Mockito.mock(EntityService.class); + CreateDomainResolver resolver = new CreateDomainResolver(mockClient, mockService); + + Mockito.when(mockClient.exists( + Mockito.eq(TEST_DOMAIN_URN), + Mockito.any(Authentication.class) + )).thenReturn(false); + + Mockito.when(mockClient.exists( + Mockito.eq(TEST_PARENT_DOMAIN_URN), + Mockito.any(Authentication.class) + )).thenReturn(true); + + QueryContext mockContext = getMockAllowContext(); + DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class); + Mockito.when(mockEnv.getArgument(Mockito.eq("input"))).thenReturn(TEST_INPUT); + Mockito.when(mockEnv.getContext()).thenReturn(mockContext); + + Mockito.when(mockClient.filter( + Mockito.eq(Constants.DOMAIN_ENTITY_NAME), + Mockito.eq(DomainUtils.buildNameAndParentDomainFilter(TEST_INPUT.getName(), TEST_PARENT_DOMAIN_URN)), + Mockito.eq(null), + Mockito.any(Integer.class), + Mockito.any(Integer.class), + Mockito.any(Authentication.class) + )).thenReturn(new SearchResult().setEntities( + new SearchEntityArray(new SearchEntity().setEntity(TEST_DOMAIN_URN)) + )); + + DomainProperties domainProperties = new DomainProperties(); + domainProperties.setDescription(TEST_INPUT.getDescription()); + domainProperties.setName(TEST_INPUT.getName()); + domainProperties.setCreated(new AuditStamp().setActor(TEST_ACTOR_URN).setTime(0L)); + domainProperties.setParentDomain(TEST_PARENT_DOMAIN_URN); + + EntityResponse entityResponse = new EntityResponse(); + EnvelopedAspectMap envelopedAspectMap = new EnvelopedAspectMap(); + envelopedAspectMap.put(DOMAIN_PROPERTIES_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(domainProperties.data()))); + entityResponse.setAspects(envelopedAspectMap); + + Map entityResponseMap = new HashMap<>(); + entityResponseMap.put(TEST_DOMAIN_URN, entityResponse); + + Mockito.when(mockClient.batchGetV2( + Mockito.eq(Constants.DOMAIN_ENTITY_NAME), + Mockito.any(), + Mockito.any(), + 
Mockito.any(Authentication.class) + )).thenReturn(entityResponseMap); + + assertThrows(CompletionException.class, () -> resolver.get(mockEnv).join()); + } + @Test public void testGetUnauthorized() throws Exception { // Create resolver diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/DeleteDomainResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/DeleteDomainResolverTest.java index 1c450b0e85424..9bcdbe6d2a0e0 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/DeleteDomainResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/DeleteDomainResolverTest.java @@ -4,6 +4,7 @@ import com.linkedin.common.urn.Urn; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.entity.client.EntityClient; +import com.linkedin.metadata.search.SearchResult; import graphql.schema.DataFetchingEnvironment; import java.util.concurrent.CompletionException; import org.mockito.Mockito; @@ -28,6 +29,10 @@ public void testGetSuccess() throws Exception { Mockito.when(mockEnv.getArgument(Mockito.eq("urn"))).thenReturn(TEST_URN); Mockito.when(mockEnv.getContext()).thenReturn(mockContext); + // Domain has 0 child domains + Mockito.when(mockClient.filter(Mockito.eq("domain"), Mockito.any(), Mockito.any(), Mockito.eq(0), Mockito.eq(1), Mockito.any())) + .thenReturn(new SearchResult().setNumEntities(0)); + assertTrue(resolver.get(mockEnv).get()); Mockito.verify(mockClient, Mockito.times(1)).deleteEntity( @@ -36,6 +41,28 @@ public void testGetSuccess() throws Exception { ); } + @Test + public void testDeleteWithChildDomains() throws Exception { + EntityClient mockClient = Mockito.mock(EntityClient.class); + DeleteDomainResolver resolver = new DeleteDomainResolver(mockClient); + + // Execute resolver + QueryContext mockContext = getMockAllowContext(); + DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class); + Mockito.when(mockEnv.getArgument(Mockito.eq("urn"))).thenReturn(TEST_URN); + Mockito.when(mockEnv.getContext()).thenReturn(mockContext); + + // Domain has child domains + Mockito.when(mockClient.filter(Mockito.eq("domain"), Mockito.any(), Mockito.any(), Mockito.eq(0), Mockito.eq(1), Mockito.any())) + .thenReturn(new SearchResult().setNumEntities(1)); + + assertThrows(CompletionException.class, () -> resolver.get(mockEnv).join()); + + Mockito.verify(mockClient, Mockito.times(0)).deleteEntity( + Mockito.any(), + Mockito.any(Authentication.class)); + } + @Test public void testGetUnauthorized() throws Exception { // Create resolver diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/ListDomainsResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/ListDomainsResolverTest.java index c143f3480fcff..bd8a8f98de497 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/ListDomainsResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/ListDomainsResolverTest.java @@ -5,6 +5,7 @@ import com.linkedin.common.urn.Urn; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.generated.ListDomainsInput; +import com.linkedin.datahub.graphql.resolvers.mutate.util.DomainUtils; import com.linkedin.entity.client.EntityClient; import com.linkedin.metadata.Constants; import com.linkedin.metadata.query.SearchFlags; @@ 
-28,9 +29,14 @@ public class ListDomainsResolverTest { private static final Urn TEST_DOMAIN_URN = Urn.createFromTuple("domain", "test-id"); + private static final Urn TEST_PARENT_DOMAIN_URN = Urn.createFromTuple("domain", "test-parent-id"); private static final ListDomainsInput TEST_INPUT = new ListDomainsInput( - 0, 20, null + 0, 20, null, TEST_PARENT_DOMAIN_URN.toString() + ); + + private static final ListDomainsInput TEST_INPUT_NO_PARENT_DOMAIN = new ListDomainsInput( + 0, 20, null, null ); @Test @@ -41,7 +47,7 @@ public void testGetSuccess() throws Exception { Mockito.when(mockClient.search( Mockito.eq(Constants.DOMAIN_ENTITY_NAME), Mockito.eq(""), - Mockito.eq(null), + Mockito.eq(DomainUtils.buildParentDomainFilter(TEST_PARENT_DOMAIN_URN)), Mockito.eq(new SortCriterion().setField(DOMAIN_CREATED_TIME_INDEX_FIELD_NAME).setOrder(SortOrder.DESCENDING)), Mockito.eq(0), Mockito.eq(20), @@ -71,6 +77,44 @@ public void testGetSuccess() throws Exception { assertEquals(resolver.get(mockEnv).get().getDomains().get(0).getUrn(), TEST_DOMAIN_URN.toString()); } + @Test + public void testGetSuccessNoParentDomain() throws Exception { + // Create resolver + EntityClient mockClient = Mockito.mock(EntityClient.class); + + Mockito.when(mockClient.search( + Mockito.eq(Constants.DOMAIN_ENTITY_NAME), + Mockito.eq(""), + Mockito.eq(DomainUtils.buildParentDomainFilter(null)), + Mockito.eq(new SortCriterion().setField(DOMAIN_CREATED_TIME_INDEX_FIELD_NAME).setOrder(SortOrder.DESCENDING)), + Mockito.eq(0), + Mockito.eq(20), + Mockito.any(Authentication.class), + Mockito.eq(new SearchFlags().setFulltext(true)) + )).thenReturn( + new SearchResult() + .setFrom(0) + .setPageSize(1) + .setNumEntities(1) + .setEntities(new SearchEntityArray(ImmutableSet.of(new SearchEntity().setEntity(TEST_DOMAIN_URN)))) + ); + + ListDomainsResolver resolver = new ListDomainsResolver(mockClient); + + // Execute resolver + QueryContext mockContext = getMockAllowContext(); + DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class); + Mockito.when(mockEnv.getArgument(Mockito.eq("input"))).thenReturn(TEST_INPUT_NO_PARENT_DOMAIN); + Mockito.when(mockEnv.getContext()).thenReturn(mockContext); + + // Data Assertions + assertEquals((int) resolver.get(mockEnv).get().getStart(), 0); + assertEquals((int) resolver.get(mockEnv).get().getCount(), 1); + assertEquals((int) resolver.get(mockEnv).get().getTotal(), 1); + assertEquals(resolver.get(mockEnv).get().getDomains().size(), 1); + assertEquals(resolver.get(mockEnv).get().getDomains().get(0).getUrn(), TEST_DOMAIN_URN.toString()); + } + @Test public void testGetUnauthorized() throws Exception { // Create resolver diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/MoveDomainResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/MoveDomainResolverTest.java new file mode 100644 index 0000000000000..4059c180b0eb0 --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/MoveDomainResolverTest.java @@ -0,0 +1,140 @@ +package com.linkedin.datahub.graphql.resolvers.domain; + +import com.datahub.authentication.Authentication; +import com.linkedin.common.AuditStamp; +import com.linkedin.common.urn.CorpuserUrn; +import com.linkedin.common.urn.Urn; +import com.linkedin.datahub.graphql.QueryContext; +import com.linkedin.datahub.graphql.generated.MoveDomainInput; +import com.linkedin.datahub.graphql.resolvers.mutate.MoveDomainResolver; +import 
com.linkedin.datahub.graphql.resolvers.mutate.MutationUtils; +import com.linkedin.datahub.graphql.resolvers.mutate.util.DomainUtils; +import com.linkedin.domain.DomainProperties; +import com.linkedin.entity.client.EntityClient; +import com.linkedin.metadata.Constants; +import com.linkedin.metadata.entity.EntityService; +import com.linkedin.metadata.search.SearchEntityArray; +import com.linkedin.metadata.search.SearchResult; +import com.linkedin.mxe.MetadataChangeProposal; +import graphql.schema.DataFetchingEnvironment; +import org.mockito.Mockito; +import org.testng.annotations.Test; + +import java.util.concurrent.CompletionException; + +import static com.linkedin.datahub.graphql.TestUtils.*; +import static com.linkedin.metadata.Constants.*; +import static org.testng.Assert.assertThrows; +import static org.testng.Assert.assertTrue; + +public class MoveDomainResolverTest { + + private static final String CONTAINER_URN = "urn:li:container:00005397daf94708a8822b8106cfd451"; + private static final String PARENT_DOMAIN_URN = "urn:li:domain:00005397daf94708a8822b8106cfd451"; + private static final String DOMAIN_URN = "urn:li:domain:11115397daf94708a8822b8106cfd451"; + private static final MoveDomainInput INPUT = new MoveDomainInput(PARENT_DOMAIN_URN, DOMAIN_URN); + private static final MoveDomainInput INVALID_INPUT = new MoveDomainInput(CONTAINER_URN, DOMAIN_URN); + private static final CorpuserUrn TEST_ACTOR_URN = new CorpuserUrn("test"); + + private MetadataChangeProposal setupTests(DataFetchingEnvironment mockEnv, EntityService mockService, EntityClient mockClient) throws Exception { + QueryContext mockContext = getMockAllowContext(); + Mockito.when(mockContext.getAuthentication()).thenReturn(Mockito.mock(Authentication.class)); + Mockito.when(mockContext.getActorUrn()).thenReturn(TEST_ACTOR_URN.toString()); + Mockito.when(mockEnv.getContext()).thenReturn(mockContext); + + final String name = "test name"; + Mockito.when(mockService.getAspect( + Urn.createFromString(DOMAIN_URN), + Constants.DOMAIN_PROPERTIES_ASPECT_NAME, + 0)) + .thenReturn(new DomainProperties().setName(name)); + + Mockito.when(mockClient.filter( + Mockito.eq(Constants.DOMAIN_ENTITY_NAME), + Mockito.eq(DomainUtils.buildNameAndParentDomainFilter(name, Urn.createFromString(PARENT_DOMAIN_URN))), + Mockito.eq(null), + Mockito.any(Integer.class), + Mockito.any(Integer.class), + Mockito.any(Authentication.class) + )).thenReturn(new SearchResult().setEntities(new SearchEntityArray())); + + DomainProperties properties = new DomainProperties(); + properties.setName(name); + properties.setParentDomain(Urn.createFromString(PARENT_DOMAIN_URN)); + return MutationUtils.buildMetadataChangeProposalWithUrn(Urn.createFromString(DOMAIN_URN), + DOMAIN_PROPERTIES_ASPECT_NAME, properties); + } + + @Test + public void testGetSuccess() throws Exception { + EntityService mockService = Mockito.mock(EntityService.class); + EntityClient mockClient = Mockito.mock(EntityClient.class); + Mockito.when(mockService.exists(Urn.createFromString(PARENT_DOMAIN_URN))).thenReturn(true); + DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class); + Mockito.when(mockEnv.getArgument("input")).thenReturn(INPUT); + + MoveDomainResolver resolver = new MoveDomainResolver(mockService, mockClient); + setupTests(mockEnv, mockService, mockClient); + + assertTrue(resolver.get(mockEnv).get()); + Mockito.verify(mockService, Mockito.times(1)).ingestProposal( + Mockito.any(MetadataChangeProposal.class), + Mockito.any(AuditStamp.class), + Mockito.eq(false) + ); 
+ } + + @Test + public void testGetFailureEntityDoesNotExist() throws Exception { + EntityService mockService = Mockito.mock(EntityService.class); + EntityClient mockClient = Mockito.mock(EntityClient.class); + Mockito.when(mockService.exists(Urn.createFromString(PARENT_DOMAIN_URN))).thenReturn(true); + DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class); + Mockito.when(mockEnv.getArgument("input")).thenReturn(INPUT); + + QueryContext mockContext = getMockAllowContext(); + Mockito.when(mockContext.getAuthentication()).thenReturn(Mockito.mock(Authentication.class)); + Mockito.when(mockContext.getActorUrn()).thenReturn(TEST_ACTOR_URN.toString()); + Mockito.when(mockEnv.getContext()).thenReturn(mockContext); + + Mockito.when(mockService.getAspect( + Urn.createFromString(DOMAIN_URN), + DOMAIN_PROPERTIES_ASPECT_NAME, + 0)) + .thenReturn(null); + + MoveDomainResolver resolver = new MoveDomainResolver(mockService, mockClient); + assertThrows(CompletionException.class, () -> resolver.get(mockEnv).join()); + verifyNoIngestProposal(mockService); + } + + @Test + public void testGetFailureParentDoesNotExist() throws Exception { + EntityService mockService = Mockito.mock(EntityService.class); + EntityClient mockClient = Mockito.mock(EntityClient.class); + Mockito.when(mockService.exists(Urn.createFromString(PARENT_DOMAIN_URN))).thenReturn(false); + DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class); + Mockito.when(mockEnv.getArgument("input")).thenReturn(INPUT); + + MoveDomainResolver resolver = new MoveDomainResolver(mockService, mockClient); + setupTests(mockEnv, mockService, mockClient); + + assertThrows(CompletionException.class, () -> resolver.get(mockEnv).join()); + verifyNoIngestProposal(mockService); + } + + @Test + public void testGetFailureParentIsNotDomain() throws Exception { + EntityService mockService = Mockito.mock(EntityService.class); + EntityClient mockClient = Mockito.mock(EntityClient.class); + Mockito.when(mockService.exists(Urn.createFromString(PARENT_DOMAIN_URN))).thenReturn(true); + DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class); + Mockito.when(mockEnv.getArgument("input")).thenReturn(INVALID_INPUT); + + MoveDomainResolver resolver = new MoveDomainResolver(mockService, mockClient); + setupTests(mockEnv, mockService, mockClient); + + assertThrows(CompletionException.class, () -> resolver.get(mockEnv).join()); + verifyNoIngestProposal(mockService); + } +} diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/ParentDomainsResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/ParentDomainsResolverTest.java new file mode 100644 index 0000000000000..7bd7c3afac001 --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/domain/ParentDomainsResolverTest.java @@ -0,0 +1,95 @@ +package com.linkedin.datahub.graphql.resolvers.domain; + +import com.datahub.authentication.Authentication; +import com.linkedin.common.urn.Urn; +import com.linkedin.datahub.graphql.QueryContext; +import com.linkedin.datahub.graphql.generated.Domain; +import com.linkedin.datahub.graphql.generated.EntityType; +import com.linkedin.datahub.graphql.generated.ParentDomainsResult; +import com.linkedin.domain.DomainProperties; +import com.linkedin.entity.Aspect; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.EnvelopedAspect; +import com.linkedin.entity.EnvelopedAspectMap; +import 
com.linkedin.entity.client.EntityClient; +import graphql.schema.DataFetchingEnvironment; +import org.mockito.Mockito; +import org.testng.annotations.Test; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import static com.linkedin.metadata.Constants.*; +import static org.testng.Assert.assertEquals; + +public class ParentDomainsResolverTest { + @Test + public void testGetSuccessForDomain() throws Exception { + EntityClient mockClient = Mockito.mock(EntityClient.class); + QueryContext mockContext = Mockito.mock(QueryContext.class); + Mockito.when(mockContext.getAuthentication()).thenReturn(Mockito.mock(Authentication.class)); + DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class); + Mockito.when(mockEnv.getContext()).thenReturn(mockContext); + + Urn domainUrn = Urn.createFromString("urn:li:domain:00005397daf94708a8822b8106cfd451"); + Domain domainEntity = new Domain(); + domainEntity.setUrn(domainUrn.toString()); + domainEntity.setType(EntityType.DOMAIN); + Mockito.when(mockEnv.getSource()).thenReturn(domainEntity); + + final DomainProperties parentDomain1 = new DomainProperties().setParentDomain(Urn.createFromString( + "urn:li:domain:11115397daf94708a8822b8106cfd451") + ).setName("test def"); + final DomainProperties parentDomain2 = new DomainProperties().setParentDomain(Urn.createFromString( + "urn:li:domain:22225397daf94708a8822b8106cfd451") + ).setName("test def 2"); + + Map domainAspects = new HashMap<>(); + domainAspects.put(DOMAIN_PROPERTIES_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(parentDomain1.data()))); + + Map parentDomain1Aspects = new HashMap<>(); + parentDomain1Aspects.put(DOMAIN_PROPERTIES_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect( + new DomainProperties().setName("domain parent 1").setParentDomain(parentDomain2.getParentDomain()).data() + ))); + + Map parentDomain2Aspects = new HashMap<>(); + parentDomain2Aspects.put(DOMAIN_PROPERTIES_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect( + new DomainProperties().setName("domain parent 2").data() + ))); + + Mockito.when(mockClient.getV2( + Mockito.eq(domainUrn.getEntityType()), + Mockito.eq(domainUrn), + Mockito.eq(Collections.singleton(DOMAIN_PROPERTIES_ASPECT_NAME)), + Mockito.any(Authentication.class) + )).thenReturn(new EntityResponse().setAspects(new EnvelopedAspectMap(domainAspects))); + + Mockito.when(mockClient.getV2( + Mockito.eq(parentDomain1.getParentDomain().getEntityType()), + Mockito.eq(parentDomain1.getParentDomain()), + Mockito.eq(Collections.singleton(DOMAIN_PROPERTIES_ASPECT_NAME)), + Mockito.any(Authentication.class) + )).thenReturn(new EntityResponse().setAspects(new EnvelopedAspectMap(parentDomain1Aspects))); + + Mockito.when(mockClient.getV2( + Mockito.eq(parentDomain2.getParentDomain().getEntityType()), + Mockito.eq(parentDomain2.getParentDomain()), + Mockito.eq(Collections.singleton(DOMAIN_PROPERTIES_ASPECT_NAME)), + Mockito.any(Authentication.class) + )).thenReturn(new EntityResponse().setAspects(new EnvelopedAspectMap(parentDomain2Aspects))); + + ParentDomainsResolver resolver = new ParentDomainsResolver(mockClient); + ParentDomainsResult result = resolver.get(mockEnv).get(); + + Mockito.verify(mockClient, Mockito.times(3)).getV2( + Mockito.any(), + Mockito.any(), + Mockito.any(), + Mockito.any() + ); + assertEquals(result.getCount(), 2); + assertEquals(result.getDomains().get(0).getUrn(), parentDomain1.getParentDomain().toString()); + assertEquals(result.getDomains().get(1).getUrn(), 
parentDomain2.getParentDomain().toString()); + } +} \ No newline at end of file diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/glossary/UpdateNameResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/glossary/UpdateNameResolverTest.java index 064e2dd3bd59b..eee9cfbae8fcb 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/glossary/UpdateNameResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/glossary/UpdateNameResolverTest.java @@ -8,12 +8,15 @@ import com.linkedin.datahub.graphql.generated.UpdateNameInput; import com.linkedin.datahub.graphql.resolvers.mutate.MutationUtils; import com.linkedin.datahub.graphql.resolvers.mutate.UpdateNameResolver; +import com.linkedin.datahub.graphql.resolvers.mutate.util.DomainUtils; import com.linkedin.domain.DomainProperties; import com.linkedin.entity.client.EntityClient; import com.linkedin.glossary.GlossaryNodeInfo; import com.linkedin.glossary.GlossaryTermInfo; import com.linkedin.metadata.Constants; import com.linkedin.metadata.entity.EntityService; +import com.linkedin.metadata.search.SearchEntityArray; +import com.linkedin.metadata.search.SearchResult; import com.linkedin.mxe.MetadataChangeProposal; import graphql.schema.DataFetchingEnvironment; import org.mockito.Mockito; @@ -121,6 +124,15 @@ public void testGetSuccessForDomain() throws Exception { 0)) .thenReturn(new DomainProperties().setName(name)); + Mockito.when(mockClient.filter( + Mockito.eq(Constants.DOMAIN_ENTITY_NAME), + Mockito.eq(DomainUtils.buildNameAndParentDomainFilter(INPUT_FOR_DOMAIN.getName(), null)), + Mockito.eq(null), + Mockito.any(Integer.class), + Mockito.any(Integer.class), + Mockito.any(Authentication.class) + )).thenReturn(new SearchResult().setEntities(new SearchEntityArray())); + DomainProperties properties = new DomainProperties(); properties.setName(NEW_NAME); final MetadataChangeProposal proposal = MutationUtils.buildMetadataChangeProposalWithUrn(Urn.createFromString(DOMAIN_URN), diff --git a/datahub-web-react/src/app/SearchRoutes.tsx b/datahub-web-react/src/app/SearchRoutes.tsx index 82606befd2663..d2ad4ab6f4db1 100644 --- a/datahub-web-react/src/app/SearchRoutes.tsx +++ b/datahub-web-react/src/app/SearchRoutes.tsx @@ -8,20 +8,27 @@ import { EntityPage } from './entity/EntityPage'; import { BrowseResultsPage } from './browse/BrowseResultsPage'; import { SearchPage } from './search/SearchPage'; import { AnalyticsPage } from './analyticsDashboard/components/AnalyticsPage'; -import { ManageDomainsPage } from './domain/ManageDomainsPage'; import { ManageIngestionPage } from './ingest/ManageIngestionPage'; import GlossaryRoutes from './glossary/GlossaryRoutes'; import { SettingsPage } from './settings/SettingsPage'; +import DomainRoutes from './domain/DomainRoutes'; +import { useIsNestedDomainsEnabled } from './useAppConfig'; +import { ManageDomainsPage } from './domain/ManageDomainsPage'; /** * Container for all searchable page routes */ export const SearchRoutes = (): JSX.Element => { const entityRegistry = useEntityRegistry(); + const isNestedDomainsEnabled = useIsNestedDomainsEnabled(); + const entities = isNestedDomainsEnabled + ? 
entityRegistry.getEntitiesForSearchRoutes() + : entityRegistry.getNonGlossaryEntities(); + return ( - {entityRegistry.getNonGlossaryEntities().map((entity) => ( + {entities.map((entity) => ( { /> } /> } /> - } /> + {isNestedDomainsEnabled && } />} + {!isNestedDomainsEnabled && } />} } /> } /> } /> diff --git a/datahub-web-react/src/app/analytics/event.ts b/datahub-web-react/src/app/analytics/event.ts index 84173b522fb07..28cd61ff3171a 100644 --- a/datahub-web-react/src/app/analytics/event.ts +++ b/datahub-web-react/src/app/analytics/event.ts @@ -55,6 +55,7 @@ export enum EventType { ShowStandardHomepageEvent, CreateGlossaryEntityEvent, CreateDomainEvent, + MoveDomainEvent, CreateIngestionSourceEvent, UpdateIngestionSourceEvent, DeleteIngestionSourceEvent, @@ -454,6 +455,13 @@ export interface CreateGlossaryEntityEvent extends BaseEvent { export interface CreateDomainEvent extends BaseEvent { type: EventType.CreateDomainEvent; + parentDomainUrn?: string; +} + +export interface MoveDomainEvent extends BaseEvent { + type: EventType.MoveDomainEvent; + oldParentDomainUrn?: string; + parentDomainUrn?: string; } // Managed Ingestion Events @@ -653,6 +661,7 @@ export type Event = | ShowStandardHomepageEvent | CreateGlossaryEntityEvent | CreateDomainEvent + | MoveDomainEvent | CreateIngestionSourceEvent | UpdateIngestionSourceEvent | DeleteIngestionSourceEvent diff --git a/datahub-web-react/src/app/domain/CreateDomainModal.tsx b/datahub-web-react/src/app/domain/CreateDomainModal.tsx index 9fd24b551c0af..ca1bc30596003 100644 --- a/datahub-web-react/src/app/domain/CreateDomainModal.tsx +++ b/datahub-web-react/src/app/domain/CreateDomainModal.tsx @@ -5,9 +5,12 @@ import { useCreateDomainMutation } from '../../graphql/domain.generated'; import { useEnterKeyListener } from '../shared/useEnterKeyListener'; import { validateCustomUrnId } from '../shared/textUtil'; import analytics, { EventType } from '../analytics'; +import DomainParentSelect from '../entity/shared/EntityDropdown/DomainParentSelect'; +import { useIsNestedDomainsEnabled } from '../useAppConfig'; +import { useDomainsContext } from './DomainsContext'; const SuggestedNamesGroup = styled.div` - margin-top: 12px; + margin-top: 8px; `; const ClickableTag = styled(Tag)` @@ -16,9 +19,38 @@ const ClickableTag = styled(Tag)` } `; +const FormItem = styled(Form.Item)` + .ant-form-item-label { + padding-bottom: 2px; + } +`; + +const FormItemWithMargin = styled(FormItem)` + margin-bottom: 16px; +`; + +const FormItemNoMargin = styled(FormItem)` + margin-bottom: 0; +`; + +const FormItemLabel = styled(Typography.Text)` + font-weight: 600; + color: #373d44; +`; + +const AdvancedLabel = styled(Typography.Text)` + color: #373d44; +`; + type Props = { onClose: () => void; - onCreate: (urn: string, id: string | undefined, name: string, description: string | undefined) => void; + onCreate: ( + urn: string, + id: string | undefined, + name: string, + description: string | undefined, + parentDomain?: string, + ) => void; }; const SUGGESTED_DOMAIN_NAMES = ['Engineering', 'Marketing', 'Sales', 'Product']; @@ -28,7 +60,12 @@ const NAME_FIELD_NAME = 'name'; const DESCRIPTION_FIELD_NAME = 'description'; export default function CreateDomainModal({ onClose, onCreate }: Props) { + const isNestedDomainsEnabled = useIsNestedDomainsEnabled(); const [createDomainMutation] = useCreateDomainMutation(); + const { entityData } = useDomainsContext(); + const [selectedParentUrn, setSelectedParentUrn] = useState( + (isNestedDomainsEnabled && entityData?.urn) || '', + ); const 
[createButtonEnabled, setCreateButtonEnabled] = useState(false); const [form] = Form.useForm(); @@ -39,6 +76,7 @@ export default function CreateDomainModal({ onClose, onCreate }: Props) { id: form.getFieldValue(ID_FIELD_NAME), name: form.getFieldValue(NAME_FIELD_NAME), description: form.getFieldValue(DESCRIPTION_FIELD_NAME), + parentDomain: selectedParentUrn || undefined, }, }, }) @@ -46,6 +84,7 @@ export default function CreateDomainModal({ onClose, onCreate }: Props) { if (!errors) { analytics.event({ type: EventType.CreateDomainEvent, + parentDomainUrn: selectedParentUrn || undefined, }); message.success({ content: `Created domain!`, @@ -56,6 +95,7 @@ export default function CreateDomainModal({ onClose, onCreate }: Props) { form.getFieldValue(ID_FIELD_NAME), form.getFieldValue(NAME_FIELD_NAME), form.getFieldValue(DESCRIPTION_FIELD_NAME), + selectedParentUrn || undefined, ); form.resetFields(); } @@ -74,7 +114,7 @@ export default function CreateDomainModal({ onClose, onCreate }: Props) { return ( field.errors.length > 0)); }} > - Name}> - Give your new Domain a name. - Parent (optional)}> + + + )} + Name}> + - + {SUGGESTED_DOMAIN_NAMES.map((name) => { return ( @@ -134,29 +181,29 @@ export default function CreateDomainModal({ onClose, onCreate }: Props) { ); })} - - Description}> - - An optional description for your new domain. You can change this later. - - + Description} + help="You can always change the description later." + > + - - + + - Advanced} key="1"> - Domain Id}> - - By default, a random UUID will be generated to uniquely identify this domain. If - you'd like to provide a custom id instead to more easily keep track of this domain, + Advanced Options} key="1"> + Domain Id} + help="By default, a random UUID will be generated to uniquely identify this domain. If + you'd like to provide a custom id instead to more easily keep track of this domain, you may provide it here. Be careful, you cannot easily change the domain id after - creation. 
- - + ({ @@ -170,8 +217,8 @@ export default function CreateDomainModal({ onClose, onCreate }: Props) { ]} > - - + + diff --git a/datahub-web-react/src/app/domain/DomainIcon.tsx b/datahub-web-react/src/app/domain/DomainIcon.tsx new file mode 100644 index 0000000000000..0fe9892f0c281 --- /dev/null +++ b/datahub-web-react/src/app/domain/DomainIcon.tsx @@ -0,0 +1,11 @@ +import Icon from '@ant-design/icons/lib/components/Icon'; +import React from 'react'; +import { ReactComponent as DomainsIcon } from '../../images/domain.svg'; + +type Props = { + style?: React.CSSProperties; +}; + +export default function DomainIcon({ style }: Props) { + return ; +} diff --git a/datahub-web-react/src/app/domain/DomainRoutes.tsx b/datahub-web-react/src/app/domain/DomainRoutes.tsx new file mode 100644 index 0000000000000..56811ddc48c0c --- /dev/null +++ b/datahub-web-react/src/app/domain/DomainRoutes.tsx @@ -0,0 +1,39 @@ +import React, { useState } from 'react'; +import styled from 'styled-components/macro'; +import { Switch, Route } from 'react-router-dom'; +import { PageRoutes } from '../../conf/Global'; +import { EntityPage } from '../entity/EntityPage'; +import { useEntityRegistry } from '../useEntityRegistry'; +import ManageDomainsPageV2 from './nestedDomains/ManageDomainsPageV2'; +import { EntityType } from '../../types.generated'; +import ManageDomainsSidebar from './nestedDomains/ManageDomainsSidebar'; +import { DomainsContext } from './DomainsContext'; +import { GenericEntityProperties } from '../entity/shared/types'; + +const ContentWrapper = styled.div` + display: flex; + flex: 1; + overflow: hidden; +`; + +export default function DomainRoutes() { + const entityRegistry = useEntityRegistry(); + const [entityData, setEntityData] = useState(null); + const [parentDomainsToUpdate, setParentDomainsToUpdate] = useState([]); + + return ( + + + + + } + /> + } /> + + + + ); +} diff --git a/datahub-web-react/src/app/domain/DomainSearch.tsx b/datahub-web-react/src/app/domain/DomainSearch.tsx new file mode 100644 index 0000000000000..e82dae9c2c9e6 --- /dev/null +++ b/datahub-web-react/src/app/domain/DomainSearch.tsx @@ -0,0 +1,143 @@ +import React, { CSSProperties, useRef, useState } from 'react'; +import { Link } from 'react-router-dom'; +import styled from 'styled-components/macro'; +import Highlight from 'react-highlighter'; +import { useGetSearchResultsForMultipleQuery } from '../../graphql/search.generated'; +import { EntityType } from '../../types.generated'; +import { IconStyleType } from '../entity/Entity'; +import { ANTD_GRAY } from '../entity/shared/constants'; +import { SearchBar } from '../search/SearchBar'; +import ClickOutside from '../shared/ClickOutside'; +import { useEntityRegistry } from '../useEntityRegistry'; +import DomainIcon from './DomainIcon'; +import ParentEntities from '../search/filters/ParentEntities'; +import { getParentDomains } from './utils'; + +const DomainSearchWrapper = styled.div` + position: relative; +`; + +const ResultsWrapper = styled.div` + background-color: white; + border-radius: 5px; + box-shadow: 0 3px 6px -4px rgb(0 0 0 / 12%), 0 6px 16px 0 rgb(0 0 0 / 8%), 0 9px 28px 8px rgb(0 0 0 / 5%); + max-height: 380px; + overflow: auto; + padding: 8px; + position: absolute; + max-height: 210px; + overflow: auto; + width: calc(100% - 24px); + left: 12px; + top: 45px; + z-index: 1; +`; + +const SearchResult = styled(Link)` + color: #262626; + display: flex; + align-items: center; + gap: 8px; + height: 100%; + padding: 6px 8px; + width: 100%; + &:hover { + background-color: 
${ANTD_GRAY[3]}; + color: #262626; + } +`; + +const IconWrapper = styled.span``; + +const highlightMatchStyle: CSSProperties = { + fontWeight: 'bold', + background: 'none', + padding: 0, +}; + +function DomainSearch() { + const [query, setQuery] = useState(''); + const [isSearchBarFocused, setIsSearchBarFocused] = useState(false); + const entityRegistry = useEntityRegistry(); + + const { data } = useGetSearchResultsForMultipleQuery({ + variables: { + input: { + types: [EntityType.Domain], + query, + start: 0, + count: 50, + }, + }, + skip: !query, + }); + + const searchResults = data?.searchAcrossEntities?.searchResults; + const timerRef = useRef(-1); + const handleQueryChange = (q: string) => { + window.clearTimeout(timerRef.current); + timerRef.current = window.setTimeout(() => { + setQuery(q); + }, 250); + }; + + return ( + + setIsSearchBarFocused(false)}> + null} + onQueryChange={(q) => handleQueryChange(q)} + entityRegistry={entityRegistry} + onFocus={() => setIsSearchBarFocused(true)} + /> + {isSearchBarFocused && searchResults && !!searchResults.length && ( + + {searchResults.map((result) => { + return ( + setIsSearchBarFocused(false)} + > + + {result.entity.type === EntityType.Domain ? ( + + ) : ( + entityRegistry.getIcon(result.entity.type, 12, IconStyleType.ACCENT) + )} + +
+ + + {entityRegistry.getDisplayName(result.entity.type, result.entity)} + +
+
+ ); + })} +
+ )} +
+
+ ); +} + +export default DomainSearch; diff --git a/datahub-web-react/src/app/domain/DomainsContext.tsx b/datahub-web-react/src/app/domain/DomainsContext.tsx new file mode 100644 index 0000000000000..ecbdaebd03817 --- /dev/null +++ b/datahub-web-react/src/app/domain/DomainsContext.tsx @@ -0,0 +1,21 @@ +import React, { useContext } from 'react'; +import { GenericEntityProperties } from '../entity/shared/types'; + +export interface DomainsContextType { + entityData: GenericEntityProperties | null; + setEntityData: (entityData: GenericEntityProperties | null) => void; + parentDomainsToUpdate: string[]; + setParentDomainsToUpdate: (values: string[]) => void; +} + +export const DomainsContext = React.createContext({ + entityData: null, + setEntityData: () => {}, + parentDomainsToUpdate: [], // used to tell domains to refetch their children count after updates (create, move, delete) + setParentDomainsToUpdate: () => {}, +}); + +export const useDomainsContext = () => { + const { entityData, setEntityData, parentDomainsToUpdate, setParentDomainsToUpdate } = useContext(DomainsContext); + return { entityData, setEntityData, parentDomainsToUpdate, setParentDomainsToUpdate }; +}; diff --git a/datahub-web-react/src/app/domain/DomainsList.tsx b/datahub-web-react/src/app/domain/DomainsList.tsx index f5fea36e32bda..b1095726808fe 100644 --- a/datahub-web-react/src/app/domain/DomainsList.tsx +++ b/datahub-web-react/src/app/domain/DomainsList.tsx @@ -18,8 +18,8 @@ import { OnboardingTour } from '../onboarding/OnboardingTour'; import { DOMAINS_INTRO_ID, DOMAINS_CREATE_DOMAIN_ID } from '../onboarding/config/DomainsOnboardingConfig'; import { getElasticCappedTotalValueText } from '../entity/shared/constants'; import { StyledTable } from '../entity/shared/components/styled/StyledTable'; -import { IconStyleType } from '../entity/Entity'; import { DomainOwnersColumn, DomainListMenuColumn, DomainNameColumn } from './DomainListColumns'; +import DomainIcon from './DomainIcon'; const DomainsContainer = styled.div``; @@ -82,7 +82,6 @@ export const DomainsList = () => { }, 2000); }; - const logoIcon = entityRegistry.getIcon(EntityType.Domain, 12, IconStyleType.ACCENT); const allColumns = [ { title: 'Name', @@ -91,7 +90,14 @@ export const DomainsList = () => { sorter: (sourceA, sourceB) => { return sourceA.name.localeCompare(sourceB.name); }, - render: DomainNameColumn(logoIcon), + render: DomainNameColumn( + , + ), }, { title: 'Owners', diff --git a/datahub-web-react/src/app/domain/ManageDomainsPage.tsx b/datahub-web-react/src/app/domain/ManageDomainsPage.tsx index 6172ac0246f58..3e19da1875037 100644 --- a/datahub-web-react/src/app/domain/ManageDomainsPage.tsx +++ b/datahub-web-react/src/app/domain/ManageDomainsPage.tsx @@ -1,7 +1,9 @@ import { Typography } from 'antd'; -import React from 'react'; +import React, { useState } from 'react'; import styled from 'styled-components'; import { DomainsList } from './DomainsList'; +import { DomainsContext } from './DomainsContext'; +import { GenericEntityProperties } from '../entity/shared/types'; const PageContainer = styled.div` padding-top: 20px; @@ -22,17 +24,22 @@ const PageTitle = styled(Typography.Title)` const ListContainer = styled.div``; export const ManageDomainsPage = () => { + const [entityData, setEntityData] = useState(null); + const [parentDomainsToUpdate, setParentDomainsToUpdate] = useState([]); + return ( - - - Domains - - View your DataHub Domains. Take administrative actions. - - - - - - + + + + Domains + + View your DataHub Domains. 
Take administrative actions. + + + + + + + ); }; diff --git a/datahub-web-react/src/app/domain/nestedDomains/DomainsSidebarHeader.tsx b/datahub-web-react/src/app/domain/nestedDomains/DomainsSidebarHeader.tsx new file mode 100644 index 0000000000000..d9ff18514d8cf --- /dev/null +++ b/datahub-web-react/src/app/domain/nestedDomains/DomainsSidebarHeader.tsx @@ -0,0 +1,58 @@ +import { useApolloClient } from '@apollo/client'; +import { PlusOutlined } from '@ant-design/icons'; +import { Button } from 'antd'; +import React, { useState } from 'react'; +import { Link } from 'react-router-dom'; +import styled from 'styled-components'; +import { ANTD_GRAY, ANTD_GRAY_V2 } from '../../entity/shared/constants'; +import DomainsTitle from './DomainsTitle'; +import { PageRoutes } from '../../../conf/Global'; +import CreateDomainModal from '../CreateDomainModal'; +import { updateListDomainsCache } from '../utils'; +import { useDomainsContext } from '../DomainsContext'; + +const HeaderWrapper = styled.div` + border-bottom: 1px solid ${ANTD_GRAY[4]}; + padding: 16px; + font-size: 20px; + display: flex; + align-items: center; + justify-content: space-between; +`; + +const StyledButton = styled(Button)` + box-shadow: none; + border-color: ${ANTD_GRAY_V2[6]}; +`; + +const StyledLink = styled(Link)` + color: inherit; + + &:hover { + color: inherit; + } +`; + +export default function DomainsSidebarHeader() { + const { setParentDomainsToUpdate } = useDomainsContext(); + const [isCreatingDomain, setIsCreatingDomain] = useState(false); + const client = useApolloClient(); + + return ( + + + + + } onClick={() => setIsCreatingDomain(true)} /> + {isCreatingDomain && ( + setIsCreatingDomain(false)} + onCreate={(urn, id, name, description, parentDomain) => { + updateListDomainsCache(client, urn, id, name, description, parentDomain); + if (parentDomain) setParentDomainsToUpdate([parentDomain]); + }} + /> + )} + + ); +} diff --git a/datahub-web-react/src/app/domain/nestedDomains/DomainsTitle.tsx b/datahub-web-react/src/app/domain/nestedDomains/DomainsTitle.tsx new file mode 100644 index 0000000000000..3aa7c8330d079 --- /dev/null +++ b/datahub-web-react/src/app/domain/nestedDomains/DomainsTitle.tsx @@ -0,0 +1,18 @@ +import React from 'react'; +import styled from 'styled-components'; +import DomainIcon from '../DomainIcon'; + +const IconWrapper = styled.span` + margin-right: 10px; +`; + +export default function DomainsTitle() { + return ( + + + + + Domains + + ); +} diff --git a/datahub-web-react/src/app/domain/nestedDomains/ManageDomainsPageV2.tsx b/datahub-web-react/src/app/domain/nestedDomains/ManageDomainsPageV2.tsx new file mode 100644 index 0000000000000..486169c3559d3 --- /dev/null +++ b/datahub-web-react/src/app/domain/nestedDomains/ManageDomainsPageV2.tsx @@ -0,0 +1,60 @@ +import { useApolloClient } from '@apollo/client'; +import { Button } from 'antd'; +import { PlusOutlined } from '@ant-design/icons'; +import React, { useEffect, useState } from 'react'; +import styled from 'styled-components/macro'; +import DomainsTitle from './DomainsTitle'; +import RootDomains from './RootDomains'; +import { DOMAINS_CREATE_DOMAIN_ID, DOMAINS_INTRO_ID } from '../../onboarding/config/DomainsOnboardingConfig'; +import { OnboardingTour } from '../../onboarding/OnboardingTour'; +import { ANTD_GRAY_V2 } from '../../entity/shared/constants'; +import CreateDomainModal from '../CreateDomainModal'; +import { updateListDomainsCache } from '../utils'; +import { useDomainsContext } from '../DomainsContext'; + +const PageWrapper = styled.div` + 
background-color: ${ANTD_GRAY_V2[1]}; + flex: 1; + display: flex; + flex-direction: column; +`; + +const Header = styled.div` + display: flex; + justify-content: space-between; + padding: 32px 24px; + font-size: 30px; + align-items: center; +`; + +export default function ManageDomainsPageV2() { + const { setEntityData, setParentDomainsToUpdate } = useDomainsContext(); + const [isCreatingDomain, setIsCreatingDomain] = useState(false); + const client = useApolloClient(); + + useEffect(() => { + setEntityData(null); + }, [setEntityData]); + + return ( + + +
+ + +
+ + {isCreatingDomain && ( + setIsCreatingDomain(false)} + onCreate={(urn, id, name, description, parentDomain) => { + updateListDomainsCache(client, urn, id, name, description, parentDomain); + if (parentDomain) setParentDomainsToUpdate([parentDomain]); + }} + /> + )} +
+ ); +} diff --git a/datahub-web-react/src/app/domain/nestedDomains/ManageDomainsSidebar.tsx b/datahub-web-react/src/app/domain/nestedDomains/ManageDomainsSidebar.tsx new file mode 100644 index 0000000000000..827031138dcdb --- /dev/null +++ b/datahub-web-react/src/app/domain/nestedDomains/ManageDomainsSidebar.tsx @@ -0,0 +1,28 @@ +import React, { useState } from 'react'; +import { MAX_BROWSER_WIDTH, MIN_BROWSWER_WIDTH } from '../../glossary/BusinessGlossaryPage'; +import { ProfileSidebarResizer } from '../../entity/shared/containers/profile/sidebar/ProfileSidebarResizer'; +import DomainsSidebarHeader from './DomainsSidebarHeader'; +import { SidebarWrapper } from '../../shared/sidebar/components'; +import DomainNavigator from './domainNavigator/DomainNavigator'; +import DomainSearch from '../DomainSearch'; + +export default function ManageDomainsSidebar() { + const [browserWidth, setBrowserWith] = useState(window.innerWidth * 0.2); + + return ( + <> + + + + + + + setBrowserWith(Math.min(Math.max(width, MIN_BROWSWER_WIDTH), MAX_BROWSER_WIDTH)) + } + initialSize={browserWidth} + isSidebarOnLeft + /> + + ); +} diff --git a/datahub-web-react/src/app/domain/nestedDomains/RootDomains.tsx b/datahub-web-react/src/app/domain/nestedDomains/RootDomains.tsx new file mode 100644 index 0000000000000..757119919e336 --- /dev/null +++ b/datahub-web-react/src/app/domain/nestedDomains/RootDomains.tsx @@ -0,0 +1,31 @@ +import React from 'react'; +import styled from 'styled-components'; +import { Message } from '../../shared/Message'; +import { ResultWrapper } from '../../search/SearchResultList'; +import { useEntityRegistry } from '../../useEntityRegistry'; +import { EntityType } from '../../../types.generated'; +import useListDomains from '../useListDomains'; + +const DomainsWrapper = styled.div` + overflow: auto; + padding: 0 28px 16px 28px; +`; + +export default function RootDomains() { + const entityRegistry = useEntityRegistry(); + const { loading, error, data, sortedDomains } = useListDomains({}); + + return ( + <> + {!data && loading && } + {error && } + + {sortedDomains?.map((domain) => ( + + {entityRegistry.renderSearchResult(EntityType.Domain, { entity: domain, matchedFields: [] })} + + ))} + + + ); +} diff --git a/datahub-web-react/src/app/domain/nestedDomains/domainNavigator/DomainNavigator.tsx b/datahub-web-react/src/app/domain/nestedDomains/domainNavigator/DomainNavigator.tsx new file mode 100644 index 0000000000000..0fbcffb9a260c --- /dev/null +++ b/datahub-web-react/src/app/domain/nestedDomains/domainNavigator/DomainNavigator.tsx @@ -0,0 +1,37 @@ +import { Alert } from 'antd'; +import React from 'react'; +import styled from 'styled-components'; +import useListDomains from '../../useListDomains'; +import DomainNode from './DomainNode'; +import { Domain } from '../../../../types.generated'; + +const NavigatorWrapper = styled.div` + font-size: 14px; + max-height: calc(100% - 65px); + padding: 8px 8px 16px 16px; + overflow: auto; +`; + +interface Props { + domainUrnToHide?: string; + selectDomainOverride?: (domain: Domain) => void; +} + +export default function DomainNavigator({ domainUrnToHide, selectDomainOverride }: Props) { + const { sortedDomains, error } = useListDomains({}); + + return ( + + {error && } + {sortedDomains?.map((domain) => ( + + ))} + + ); +} diff --git a/datahub-web-react/src/app/domain/nestedDomains/domainNavigator/DomainNode.tsx b/datahub-web-react/src/app/domain/nestedDomains/domainNavigator/DomainNode.tsx new file mode 100644 index 0000000000000..09c8e13853bb7 --- 
/dev/null +++ b/datahub-web-react/src/app/domain/nestedDomains/domainNavigator/DomainNode.tsx @@ -0,0 +1,137 @@ +import { Typography } from 'antd'; +import React, { useEffect, useMemo } from 'react'; +import { useHistory } from 'react-router'; +import styled from 'styled-components'; +import { Domain } from '../../../../types.generated'; +import { useEntityRegistry } from '../../../useEntityRegistry'; +import { RotatingTriangle } from '../../../shared/sidebar/components'; +import DomainIcon from '../../DomainIcon'; +import useListDomains from '../../useListDomains'; +import useToggle from '../../../shared/useToggle'; +import { BodyContainer, BodyGridExpander } from '../../../shared/components'; +import { ANTD_GRAY_V2 } from '../../../entity/shared/constants'; +import { useDomainsContext } from '../../DomainsContext'; +import { applyOpacity } from '../../../shared/styleUtils'; +import useHasDomainChildren from './useHasDomainChildren'; + +const RowWrapper = styled.div` + align-items: center; + display: flex; + padding: 2px 2px 4px 0; + overflow: hidden; +`; + +const NameWrapper = styled(Typography.Text)<{ isSelected: boolean; addLeftPadding: boolean }>` + flex: 1; + overflow: hidden; + padding: 2px; + ${(props) => + props.isSelected && `background-color: ${applyOpacity(props.theme.styles['primary-color'] || '', 10)};`} + ${(props) => props.addLeftPadding && 'padding-left: 22px;'} + + &:hover { + ${(props) => !props.isSelected && `background-color: ${ANTD_GRAY_V2[1]};`} + cursor: pointer; + } + + svg { + margin-right: 6px; + } +`; + +const ButtonWrapper = styled.span` + margin-right: 4px; + font-size: 16px; + height: 16px; + width: 16px; + + svg { + height: 10px; + width: 10px; + } + + .ant-btn { + height: 16px; + width: 16px; + } +`; + +const StyledExpander = styled(BodyGridExpander)` + padding-left: 24px; +`; + +interface Props { + domain: Domain; + numDomainChildren: number; + domainUrnToHide?: string; + selectDomainOverride?: (domain: Domain) => void; +} + +export default function DomainNode({ domain, numDomainChildren, domainUrnToHide, selectDomainOverride }: Props) { + const shouldHideDomain = domainUrnToHide === domain.urn; + const history = useHistory(); + const entityRegistry = useEntityRegistry(); + const { entityData } = useDomainsContext(); + const { isOpen, isClosing, toggle, toggleOpen } = useToggle({ + initialValue: false, + closeDelay: 250, + }); + const { sortedDomains } = useListDomains({ parentDomain: domain.urn, skip: !isOpen || shouldHideDomain }); + const isOnEntityPage = entityData && entityData.urn === domain.urn; + const displayName = entityRegistry.getDisplayName(domain.type, isOnEntityPage ? 
entityData : domain); + const isInSelectMode = !!selectDomainOverride; + const hasDomainChildren = useHasDomainChildren({ domainUrn: domain.urn, numDomainChildren }); + + const shouldAutoOpen = useMemo( + () => !isInSelectMode && entityData?.parentDomains?.domains.some((parent) => parent.urn === domain.urn), + [isInSelectMode, entityData, domain.urn], + ); + + useEffect(() => { + if (shouldAutoOpen) toggleOpen(); + }, [shouldAutoOpen, toggleOpen]); + + function handleSelectDomain() { + if (selectDomainOverride) { + selectDomainOverride(domain); + } else { + history.push(entityRegistry.getEntityUrl(domain.type, domain.urn)); + } + } + + if (shouldHideDomain) return null; + + return ( + <> + + {hasDomainChildren && ( + + + + )} + + {!isInSelectMode && } + {displayName} + + + + + {sortedDomains?.map((childDomain) => ( + + ))} + + + + ); +} diff --git a/datahub-web-react/src/app/domain/nestedDomains/domainNavigator/useHasDomainChildren.ts b/datahub-web-react/src/app/domain/nestedDomains/domainNavigator/useHasDomainChildren.ts new file mode 100644 index 0000000000000..d16d5de23fbaf --- /dev/null +++ b/datahub-web-react/src/app/domain/nestedDomains/domainNavigator/useHasDomainChildren.ts @@ -0,0 +1,29 @@ +import { useEffect } from 'react'; +import { useGetDomainChildrenCountLazyQuery } from '../../../../graphql/domain.generated'; +import { useDomainsContext } from '../../DomainsContext'; + +interface Props { + domainUrn: string; + numDomainChildren: number; // number that comes from parent query to render this domain +} + +export default function useHasDomainChildren({ domainUrn, numDomainChildren }: Props) { + const { parentDomainsToUpdate, setParentDomainsToUpdate } = useDomainsContext(); + const [getDomainChildrenCount, { data: childrenData }] = useGetDomainChildrenCountLazyQuery(); + + useEffect(() => { + let timer; + // fetch updated children count to determine if we show triangle toggle + if (parentDomainsToUpdate.includes(domainUrn)) { + timer = setTimeout(() => { + getDomainChildrenCount({ variables: { urn: domainUrn } }); + setParentDomainsToUpdate(parentDomainsToUpdate.filter((urn) => urn !== domainUrn)); + }, 2000); + } + return () => { + if (timer) window.clearTimeout(timer); + }; + }, [domainUrn, getDomainChildrenCount, parentDomainsToUpdate, setParentDomainsToUpdate]); + + return childrenData ? 
!!childrenData.domain?.children?.total : !!numDomainChildren; +} diff --git a/datahub-web-react/src/app/domain/useListDomains.tsx b/datahub-web-react/src/app/domain/useListDomains.tsx new file mode 100644 index 0000000000000..74f6b454f11d4 --- /dev/null +++ b/datahub-web-react/src/app/domain/useListDomains.tsx @@ -0,0 +1,27 @@ +import { useListDomainsQuery } from '../../graphql/domain.generated'; +import { useSortedDomains } from './utils'; + +interface Props { + parentDomain?: string; + skip?: boolean; + sortBy?: 'displayName'; +} + +export default function useListDomains({ parentDomain, skip, sortBy = 'displayName' }: Props) { + const { data, error, loading, refetch } = useListDomainsQuery({ + skip, + variables: { + input: { + start: 0, + count: 1000, // don't paginate the home page, get all root level domains + parentDomain, + }, + }, + fetchPolicy: 'network-only', // always use network request first to populate cache + nextFetchPolicy: 'cache-first', // then use cache after that so we can manipulate it + }); + + const sortedDomains = useSortedDomains(data?.listDomains?.domains, sortBy); + + return { data, sortedDomains, error, loading, refetch }; +} diff --git a/datahub-web-react/src/app/domain/utils.ts b/datahub-web-react/src/app/domain/utils.ts index 3af161bc44565..8273c33e2c41d 100644 --- a/datahub-web-react/src/app/domain/utils.ts +++ b/datahub-web-react/src/app/domain/utils.ts @@ -1,9 +1,18 @@ +import { ApolloClient } from '@apollo/client'; +import { useEffect } from 'react'; +import { isEqual } from 'lodash'; import { ListDomainsDocument, ListDomainsQuery } from '../../graphql/domain.generated'; +import { Entity, EntityType } from '../../types.generated'; +import { GenericEntityProperties } from '../entity/shared/types'; +import usePrevious from '../shared/usePrevious'; +import { useDomainsContext } from './DomainsContext'; +import { useEntityRegistry } from '../useEntityRegistry'; +import EntityRegistry from '../entity/EntityRegistry'; /** * Add an entry to the list domains cache. */ -export const addToListDomainsCache = (client, newDomain, pageSize) => { +export const addToListDomainsCache = (client, newDomain, pageSize, parentDomain?: string) => { // Read the data from our cache for this query. const currData: ListDomainsQuery | null = client.readQuery({ query: ListDomainsDocument, @@ -11,6 +20,7 @@ export const addToListDomainsCache = (client, newDomain, pageSize) => { input: { start: 0, count: pageSize, + parentDomain, }, }, }); @@ -25,6 +35,7 @@ export const addToListDomainsCache = (client, newDomain, pageSize) => { input: { start: 0, count: pageSize, + parentDomain, }, }, data: { @@ -38,10 +49,39 @@ export const addToListDomainsCache = (client, newDomain, pageSize) => { }); }; +export const updateListDomainsCache = ( + client: ApolloClient, + urn: string, + id: string | undefined, + name: string, + description: string | undefined, + parentDomain?: string, +) => { + addToListDomainsCache( + client, + { + urn, + id: id || null, + type: EntityType.Domain, + properties: { + name, + description: description || null, + }, + ownership: null, + entities: null, + children: null, + dataProducts: null, + parentDomains: null, + }, + 1000, + parentDomain, + ); +}; + /** * Remove an entry from the list domains cache. */ -export const removeFromListDomainsCache = (client, urn, page, pageSize) => { +export const removeFromListDomainsCache = (client, urn, page, pageSize, parentDomain?: string) => { // Read the data from our cache for this query. 
const currData: ListDomainsQuery | null = client.readQuery({ query: ListDomainsDocument, @@ -49,6 +89,7 @@ export const removeFromListDomainsCache = (client, urn, page, pageSize) => { input: { start: (page - 1) * pageSize, count: pageSize, + parentDomain, }, }, }); @@ -63,6 +104,7 @@ export const removeFromListDomainsCache = (client, urn, page, pageSize) => { input: { start: (page - 1) * pageSize, count: pageSize, + parentDomain, }, }, data: { @@ -75,3 +117,29 @@ export const removeFromListDomainsCache = (client, urn, page, pageSize) => { }, }); }; + +export function useUpdateDomainEntityDataOnChange(entityData: GenericEntityProperties | null, entityType: EntityType) { + const { setEntityData } = useDomainsContext(); + const previousEntityData = usePrevious(entityData); + + useEffect(() => { + if (EntityType.Domain === entityType && !isEqual(entityData, previousEntityData)) { + setEntityData(entityData); + } + }); +} + +export function useSortedDomains(domains?: Array, sortBy?: 'displayName') { + const entityRegistry = useEntityRegistry(); + if (!domains || !sortBy) return domains; + return [...domains].sort((a, b) => { + const nameA = entityRegistry.getDisplayName(EntityType.Domain, a) || ''; + const nameB = entityRegistry.getDisplayName(EntityType.Domain, b) || ''; + return nameA.localeCompare(nameB); + }); +} + +export function getParentDomains(domain: T, entityRegistry: EntityRegistry) { + const props = entityRegistry.getGenericEntityProperties(EntityType.Domain, domain); + return props?.parentDomains?.domains ?? []; +} diff --git a/datahub-web-react/src/app/entity/EntityRegistry.tsx b/datahub-web-react/src/app/entity/EntityRegistry.tsx index 56b085cf69f4a..6642c2c7b0467 100644 --- a/datahub-web-react/src/app/entity/EntityRegistry.tsx +++ b/datahub-web-react/src/app/entity/EntityRegistry.tsx @@ -45,6 +45,12 @@ export default class EntityRegistry { return this.entities; } + getEntitiesForSearchRoutes(): Array> { + return this.entities.filter( + (entity) => !GLOSSARY_ENTITY_TYPES.includes(entity.type) && entity.type !== EntityType.Domain, + ); + } + getNonGlossaryEntities(): Array> { return this.entities.filter((entity) => !GLOSSARY_ENTITY_TYPES.includes(entity.type)); } diff --git a/datahub-web-react/src/app/entity/domain/DomainEntity.tsx b/datahub-web-react/src/app/entity/domain/DomainEntity.tsx index 3b3045abe2a7c..68c06935dbbe5 100644 --- a/datahub-web-react/src/app/entity/domain/DomainEntity.tsx +++ b/datahub-web-react/src/app/entity/domain/DomainEntity.tsx @@ -1,5 +1,4 @@ import * as React from 'react'; -import { FolderOutlined } from '@ant-design/icons'; import { Domain, EntityType, SearchResult } from '../../../types.generated'; import { Entity, EntityCapabilityType, IconStyleType, PreviewType } from '../Entity'; import { Preview } from './preview/Preview'; @@ -14,7 +13,7 @@ import { EntityMenuItems } from '../shared/EntityDropdown/EntityDropdown'; import { EntityActionItem } from '../shared/entity/EntityActions'; import DataProductsTab from './DataProductsTab/DataProductsTab'; import { EntityProfileTab } from '../shared/constants'; -// import { EntityActionItem } from '../shared/entity/EntityActions'; +import DomainIcon from '../../domain/DomainIcon'; /** * Definition of the DataHub Domain entity. 
@@ -24,21 +23,26 @@ export class DomainEntity implements Entity { icon = (fontSize: number, styleType: IconStyleType, color?: string) => { if (styleType === IconStyleType.TAB_VIEW) { - return ; + return ; } if (styleType === IconStyleType.HIGHLIGHT) { - return ; + return ; } if (styleType === IconStyleType.SVG) { return ( - + ); } return ( - { useEntityQuery={useGetDomainQuery} useUpdateQuery={undefined} getOverrideProperties={this.getOverridePropertiesFromEntity} - headerDropdownItems={new Set([EntityMenuItems.DELETE])} + headerDropdownItems={new Set([EntityMenuItems.MOVE, EntityMenuItems.DELETE])} headerActionItems={new Set([EntityActionItem.BATCH_ADD_DOMAIN])} isNameEditable tabs={[ @@ -102,11 +106,11 @@ export class DomainEntity implements Entity { renderPreview = (_: PreviewType, data: Domain) => { return ( ); @@ -116,11 +120,11 @@ export class DomainEntity implements Entity { const data = result.entity as Domain; return ( ); diff --git a/datahub-web-react/src/app/entity/domain/preview/DomainEntitiesSnippet.tsx b/datahub-web-react/src/app/entity/domain/preview/DomainEntitiesSnippet.tsx new file mode 100644 index 0000000000000..6d36964004d64 --- /dev/null +++ b/datahub-web-react/src/app/entity/domain/preview/DomainEntitiesSnippet.tsx @@ -0,0 +1,45 @@ +import { DatabaseOutlined, FileDoneOutlined } from '@ant-design/icons'; +import { VerticalDivider } from '@remirror/react'; +import React from 'react'; +import styled from 'styled-components'; +import { SearchResultFields_Domain_Fragment } from '../../../../graphql/search.generated'; +import { ANTD_GRAY_V2 } from '../../shared/constants'; +import DomainIcon from '../../../domain/DomainIcon'; +import { pluralize } from '../../../shared/textUtil'; + +const Wrapper = styled.div` + color: ${ANTD_GRAY_V2[8]}; + font-size: 12px; + display: flex; + align-items: center; + + svg { + margin-right: 4px; + } +`; + +const StyledDivider = styled(VerticalDivider)` + &&& { + margin: 0 8px; + } +`; + +interface Props { + domain: SearchResultFields_Domain_Fragment; +} + +export default function DomainEntitiesSnippet({ domain }: Props) { + const entityCount = domain.entities?.total || 0; + const subDomainCount = domain.children?.total || 0; + const dataProductCount = domain.dataProducts?.total || 0; + + return ( + + {entityCount} {entityCount === 1 ? 
'entity' : 'entities'} + + {subDomainCount} {pluralize(subDomainCount, 'sub-domain')} + + {dataProductCount} {pluralize(dataProductCount, 'data product')} + + ); +} diff --git a/datahub-web-react/src/app/entity/domain/preview/Preview.tsx b/datahub-web-react/src/app/entity/domain/preview/Preview.tsx index 18cb2bb75df03..83198f6eba2d8 100644 --- a/datahub-web-react/src/app/entity/domain/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/domain/preview/Preview.tsx @@ -1,23 +1,24 @@ import React from 'react'; -import { EntityType, Owner, SearchInsight } from '../../../../types.generated'; +import { Domain, EntityType, Owner, SearchInsight } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { useEntityRegistry } from '../../../useEntityRegistry'; -import { IconStyleType } from '../../Entity'; +import DomainEntitiesSnippet from './DomainEntitiesSnippet'; +import DomainIcon from '../../../domain/DomainIcon'; export const Preview = ({ + domain, urn, name, description, owners, - count, insights, logoComponent, }: { + domain: Domain; urn: string; name: string; description?: string | null; owners?: Array | null; - count?: number | null; insights?: Array | null; logoComponent?: JSX.Element; }): JSX.Element => { @@ -29,11 +30,19 @@ export const Preview = ({ urn={urn} description={description || ''} type="Domain" - typeIcon={entityRegistry.getIcon(EntityType.Domain, 14, IconStyleType.ACCENT)} + typeIcon={ + + } owners={owners} insights={insights} logoComponent={logoComponent} - entityCount={count || undefined} + parentEntities={domain.parentDomains?.domains} + snippet={} /> ); }; diff --git a/datahub-web-react/src/app/entity/glossaryNode/preview/Preview.tsx b/datahub-web-react/src/app/entity/glossaryNode/preview/Preview.tsx index 6c6ea163c6786..3938049059e4d 100644 --- a/datahub-web-react/src/app/entity/glossaryNode/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/glossaryNode/preview/Preview.tsx @@ -27,7 +27,7 @@ export const Preview = ({ owners={owners} logoComponent={} type={entityRegistry.getEntityName(EntityType.GlossaryNode)} - parentNodes={parentNodes} + parentEntities={parentNodes?.nodes} /> ); }; diff --git a/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx b/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx index b6802e37652cb..ee87633cb6fa9 100644 --- a/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx @@ -39,7 +39,7 @@ export const Preview = ({ type="Glossary Term" typeIcon={entityRegistry.getIcon(EntityType.GlossaryTerm, 14, IconStyleType.ACCENT)} deprecation={deprecation} - parentNodes={parentNodes} + parentEntities={parentNodes?.nodes} domain={domain} entityTitleSuffix={ View Related Entities diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/DomainParentSelect.tsx b/datahub-web-react/src/app/entity/shared/EntityDropdown/DomainParentSelect.tsx new file mode 100644 index 0000000000000..d43b04ec11a16 --- /dev/null +++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/DomainParentSelect.tsx @@ -0,0 +1,108 @@ +import React, { MouseEvent } from 'react'; +import { Select } from 'antd'; +import { CloseCircleFilled } from '@ant-design/icons'; +import styled from 'styled-components'; +import { Domain, EntityType } from '../../../../types.generated'; +import { useEntityRegistry } from '../../../useEntityRegistry'; +import ClickOutside from '../../../shared/ClickOutside'; 
+import { BrowserWrapper } from '../../../shared/tags/AddTagsTermsModal'; +import useParentSelector from './useParentSelector'; +import DomainNavigator from '../../../domain/nestedDomains/domainNavigator/DomainNavigator'; +import { useDomainsContext } from '../../../domain/DomainsContext'; +import ParentEntities from '../../../search/filters/ParentEntities'; +import { getParentDomains } from '../../../domain/utils'; + +const SearchResultContainer = styled.div` + display: flex; + flex-direction: column; + justify-content: center; +`; + +// filter out entity itself and its children +export function filterResultsForMove(entity: Domain, entityUrn: string) { + return ( + entity.urn !== entityUrn && + entity.__typename === 'Domain' && + !entity.parentDomains?.domains.some((node) => node.urn === entityUrn) + ); +} + +interface Props { + selectedParentUrn: string; + setSelectedParentUrn: (parent: string) => void; + isMoving?: boolean; +} + +export default function DomainParentSelect({ selectedParentUrn, setSelectedParentUrn, isMoving }: Props) { + const entityRegistry = useEntityRegistry(); + const { entityData } = useDomainsContext(); + const domainUrn = entityData?.urn; + + const { + searchResults, + searchQuery, + isFocusedOnInput, + selectedParentName, + selectParentFromBrowser, + onSelectParent, + handleSearch, + clearSelectedParent, + setIsFocusedOnInput, + } = useParentSelector({ + entityType: EntityType.Domain, + entityData, + selectedParentUrn, + setSelectedParentUrn, + }); + const domainSearchResultsFiltered = + isMoving && domainUrn + ? searchResults.filter((r) => filterResultsForMove(r.entity as Domain, domainUrn)) + : searchResults; + + function selectDomain(domain: Domain) { + selectParentFromBrowser(domain.urn, entityRegistry.getDisplayName(EntityType.Domain, domain)); + } + + const isShowingDomainNavigator = !searchQuery && isFocusedOnInput; + + const handleFocus = () => setIsFocusedOnInput(true); + const handleClickOutside = () => setIsFocusedOnInput(false); + + const handleClear = (event: MouseEvent) => { + // Prevent, otherwise antd will close the select menu but leaves it focused + event.stopPropagation(); + clearSelectedParent(); + }; + + return ( + + + + + + + ); +} diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx b/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx index 3442c57ba2d61..be975249b2670 100644 --- a/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx +++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx @@ -20,7 +20,10 @@ import { ANTD_GRAY } from '../constants'; import { useEntityRegistry } from '../../../useEntityRegistry'; import useDeleteEntity from './useDeleteEntity'; import { getEntityProfileDeleteRedirectPath } from '../../../shared/deleteUtils'; -import { isDeleteDisabled } from './utils'; +import { shouldDisplayChildDeletionWarning, isDeleteDisabled, isMoveDisabled } from './utils'; +import { useUserContext } from '../../../context/useUserContext'; +import MoveDomainModal from './MoveDomainModal'; +import { useIsNestedDomainsEnabled } from '../../../useAppConfig'; export enum EntityMenuItems { COPY_URL, @@ -89,8 +92,10 @@ function EntityDropdown(props: Props) { options, } = props; + const me = useUserContext(); const entityRegistry = useEntityRegistry(); const [updateDeprecation] = useUpdateDeprecationMutation(); + const isNestedDomainsEnabled = useIsNestedDomainsEnabled(); const { onDeleteEntity, hasBeenDeleted } = useDeleteEntity( urn, 
entityType, @@ -131,9 +136,9 @@ function EntityDropdown(props: Props) { const pageUrl = window.location.href; const isGlossaryEntity = entityType === EntityType.GlossaryNode || entityType === EntityType.GlossaryTerm; - const entityHasChildren = !!entityData?.children?.total; - const canManageGlossaryEntity = !!entityData?.privileges?.canManageEntity; + const isDomainEntity = entityType === EntityType.Domain; const canCreateGlossaryEntity = !!entityData?.privileges?.canManageChildren; + const isDomainMoveHidden = !isNestedDomainsEnabled && isDomainEntity; /** * A default path to redirect to if the entity is deleted. @@ -192,10 +197,10 @@ function EntityDropdown(props: Props) { )} - {menuItems.has(EntityMenuItems.MOVE) && ( + {!isDomainMoveHidden && menuItems.has(EntityMenuItems.MOVE) && ( setIsMoveModalVisible(true)} > @@ -206,17 +211,16 @@ function EntityDropdown(props: Props) { {menuItems.has(EntityMenuItems.DELETE) && ( @@ -252,7 +256,10 @@ function EntityDropdown(props: Props) { refetch={refetchForEntity} /> )} - {isMoveModalVisible && setIsMoveModalVisible(false)} />} + {isMoveModalVisible && isGlossaryEntity && ( + setIsMoveModalVisible(false)} /> + )} + {isMoveModalVisible && isDomainEntity && setIsMoveModalVisible(false)} />} {hasBeenDeleted && !onDelete && deleteRedirectPath && } ); diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/MoveDomainModal.tsx b/datahub-web-react/src/app/entity/shared/EntityDropdown/MoveDomainModal.tsx new file mode 100644 index 0000000000000..cdbf6fdabf3c9 --- /dev/null +++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/MoveDomainModal.tsx @@ -0,0 +1,102 @@ +import React, { useState } from 'react'; +import styled from 'styled-components/macro'; +import { message, Button, Modal, Typography, Form } from 'antd'; +import { useRefetch } from '../EntityContext'; +import { useEntityRegistry } from '../../../useEntityRegistry'; +import { useMoveDomainMutation } from '../../../../graphql/domain.generated'; +import DomainParentSelect from './DomainParentSelect'; +import { useHandleMoveDomainComplete } from './useHandleMoveDomainComplete'; +import { useDomainsContext } from '../../../domain/DomainsContext'; +import { EntityType } from '../../../../types.generated'; + +const StyledItem = styled(Form.Item)` + margin-bottom: 0; +`; + +const OptionalWrapper = styled.span` + font-weight: normal; +`; + +interface Props { + onClose: () => void; +} + +function MoveDomainModal(props: Props) { + const { onClose } = props; + const { entityData } = useDomainsContext(); + const domainUrn = entityData?.urn; + const [form] = Form.useForm(); + const entityRegistry = useEntityRegistry(); + const [selectedParentUrn, setSelectedParentUrn] = useState(''); + const refetch = useRefetch(); + + const [moveDomainMutation] = useMoveDomainMutation(); + + const { handleMoveDomainComplete } = useHandleMoveDomainComplete(); + + function moveDomain() { + if (!domainUrn) return; + + moveDomainMutation({ + variables: { + input: { + resourceUrn: domainUrn, + parentDomain: selectedParentUrn || undefined, + }, + }, + }) + .then(() => { + message.loading({ content: 'Updating...', duration: 2 }); + const newParentToUpdate = selectedParentUrn || undefined; + handleMoveDomainComplete(domainUrn, newParentToUpdate); + setTimeout(() => { + message.success({ + content: `Moved ${entityRegistry.getEntityName(EntityType.Domain)}!`, + duration: 2, + }); + refetch(); + }, 2000); + }) + .catch((e) => { + message.destroy(); + message.error({ content: `Failed to move: \n ${e.message || 
''}`, duration: 3 }); + }); + onClose(); + } + + return ( + + + + + } + > +
+ + Move To (optional) + + } + > + + + + +
+
+ ); +} + +export default MoveDomainModal; diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/NodeParentSelect.tsx b/datahub-web-react/src/app/entity/shared/EntityDropdown/NodeParentSelect.tsx index 86c2b84a67c3d..c3bfac35c2ca6 100644 --- a/datahub-web-react/src/app/entity/shared/EntityDropdown/NodeParentSelect.tsx +++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/NodeParentSelect.tsx @@ -1,12 +1,12 @@ -import React, { useState, useEffect } from 'react'; +import React from 'react'; import { Select } from 'antd'; -import { useGetSearchResultsLazyQuery } from '../../../../graphql/search.generated'; -import { EntityType, GlossaryNode } from '../../../../types.generated'; +import { EntityType, GlossaryNode, SearchResult } from '../../../../types.generated'; import { useEntityRegistry } from '../../../useEntityRegistry'; import { useEntityData } from '../EntityContext'; import ClickOutside from '../../../shared/ClickOutside'; import GlossaryBrowser from '../../../glossary/GlossaryBrowser/GlossaryBrowser'; import { BrowserWrapper } from '../../../shared/tags/AddTagsTermsModal'; +import useParentSelector from './useParentSelector'; // filter out entity itself and its children export function filterResultsForMove(entity: GlossaryNode, entityUrn: string) { @@ -25,60 +25,29 @@ interface Props { function NodeParentSelect(props: Props) { const { selectedParentUrn, setSelectedParentUrn, isMoving } = props; - const [selectedParentName, setSelectedParentName] = useState(''); - const [isFocusedOnInput, setIsFocusedOnInput] = useState(false); - const [searchQuery, setSearchQuery] = useState(''); const entityRegistry = useEntityRegistry(); const { entityData, urn: entityDataUrn, entityType } = useEntityData(); - const [nodeSearch, { data: nodeData }] = useGetSearchResultsLazyQuery(); - let nodeSearchResults = nodeData?.search?.searchResults || []; - if (isMoving) { - nodeSearchResults = nodeSearchResults.filter((r) => - filterResultsForMove(r.entity as GlossaryNode, entityDataUrn), - ); - } - - useEffect(() => { - if (entityData && selectedParentUrn === entityDataUrn) { - const displayName = entityRegistry.getDisplayName(EntityType.GlossaryNode, entityData); - setSelectedParentName(displayName); - } - }, [entityData, entityRegistry, selectedParentUrn, entityDataUrn]); - - function handleSearch(text: string) { - setSearchQuery(text); - nodeSearch({ - variables: { - input: { - type: EntityType.GlossaryNode, - query: text, - start: 0, - count: 5, - }, - }, - }); - } + const { + searchResults, + searchQuery, + isFocusedOnInput, + selectedParentName, + selectParentFromBrowser, + onSelectParent, + handleSearch, + clearSelectedParent, + setIsFocusedOnInput, + } = useParentSelector({ + entityType: EntityType.GlossaryNode, + entityData, + selectedParentUrn, + setSelectedParentUrn, + }); - function onSelectParentNode(parentNodeUrn: string) { - const selectedNode = nodeSearchResults.find((result) => result.entity.urn === parentNodeUrn); - if (selectedNode) { - setSelectedParentUrn(parentNodeUrn); - const displayName = entityRegistry.getDisplayName(selectedNode.entity.type, selectedNode.entity); - setSelectedParentName(displayName); - } - } - - function clearSelectedParent() { - setSelectedParentUrn(''); - setSelectedParentName(''); - setSearchQuery(''); - } - - function selectNodeFromBrowser(urn: string, displayName: string) { - setIsFocusedOnInput(false); - setSelectedParentUrn(urn); - setSelectedParentName(displayName); + let nodeSearchResults: SearchResult[] = []; + if (isMoving) { + 
nodeSearchResults = searchResults.filter((r) => filterResultsForMove(r.entity as GlossaryNode, entityDataUrn)); } const isShowingGlossaryBrowser = !searchQuery && isFocusedOnInput; @@ -91,7 +60,7 @@ function NodeParentSelect(props: Props) { allowClear filterOption={false} value={selectedParentName} - onSelect={onSelectParentNode} + onSelect={onSelectParent} onSearch={handleSearch} onClear={clearSelectedParent} onFocus={() => setIsFocusedOnInput(true)} @@ -107,7 +76,7 @@ function NodeParentSelect(props: Props) { diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/useDeleteEntity.tsx b/datahub-web-react/src/app/entity/shared/EntityDropdown/useDeleteEntity.tsx index c4647b995337b..1e4737135ed74 100644 --- a/datahub-web-react/src/app/entity/shared/EntityDropdown/useDeleteEntity.tsx +++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/useDeleteEntity.tsx @@ -6,6 +6,7 @@ import { getDeleteEntityMutation } from '../../../shared/deleteUtils'; import analytics, { EventType } from '../../../analytics'; import { useGlossaryEntityData } from '../GlossaryEntityContext'; import { getParentNodeToUpdate, updateGlossarySidebar } from '../../../glossary/utils'; +import { useHandleDeleteDomain } from './useHandleDeleteDomain'; /** * Performs the flow for deleting an entity of a given type. @@ -25,6 +26,7 @@ function useDeleteEntity( const [hasBeenDeleted, setHasBeenDeleted] = useState(false); const entityRegistry = useEntityRegistry(); const { isInGlossaryContext, urnsToUpdate, setUrnsToUpdate } = useGlossaryEntityData(); + const { handleDeleteDomain } = useHandleDeleteDomain({ entityData, urn }); const maybeDeleteEntity = getDeleteEntityMutation(type)(); const deleteEntity = (maybeDeleteEntity && maybeDeleteEntity[0]) || undefined; @@ -47,6 +49,11 @@ function useDeleteEntity( duration: 2, }); } + + if (entityData.type === EntityType.Domain) { + handleDeleteDomain(); + } + setTimeout( () => { setHasBeenDeleted(true); diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/useHandleDeleteDomain.ts b/datahub-web-react/src/app/entity/shared/EntityDropdown/useHandleDeleteDomain.ts new file mode 100644 index 0000000000000..ebbb8f9968a6a --- /dev/null +++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/useHandleDeleteDomain.ts @@ -0,0 +1,27 @@ +import { useApolloClient } from '@apollo/client'; +import { GenericEntityProperties } from '../types'; +import { removeFromListDomainsCache } from '../../../domain/utils'; +import { useDomainsContext } from '../../../domain/DomainsContext'; + +interface DeleteDomainProps { + entityData: GenericEntityProperties; + urn: string; +} + +export function useHandleDeleteDomain({ entityData, urn }: DeleteDomainProps) { + const client = useApolloClient(); + const { parentDomainsToUpdate, setParentDomainsToUpdate } = useDomainsContext(); + + const handleDeleteDomain = () => { + if (entityData.parentDomains && entityData.parentDomains.domains.length > 0) { + const parentDomainUrn = entityData.parentDomains.domains[0].urn; + + removeFromListDomainsCache(client, urn, 1, 1000, parentDomainUrn); + setParentDomainsToUpdate([...parentDomainsToUpdate, parentDomainUrn]); + } else { + removeFromListDomainsCache(client, urn, 1, 1000); + } + }; + + return { handleDeleteDomain }; +} diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/useHandleMoveDomainComplete.ts b/datahub-web-react/src/app/entity/shared/EntityDropdown/useHandleMoveDomainComplete.ts new file mode 100644 index 0000000000000..81f19331e18b7 --- /dev/null +++ 
b/datahub-web-react/src/app/entity/shared/EntityDropdown/useHandleMoveDomainComplete.ts @@ -0,0 +1,40 @@ +import { useApolloClient } from '@apollo/client'; +import { removeFromListDomainsCache, updateListDomainsCache } from '../../../domain/utils'; +import { useDomainsContext } from '../../../domain/DomainsContext'; +import { Domain } from '../../../../types.generated'; +import analytics from '../../../analytics/analytics'; +import { EventType } from '../../../analytics'; + +export function useHandleMoveDomainComplete() { + const client = useApolloClient(); + const { entityData, parentDomainsToUpdate, setParentDomainsToUpdate } = useDomainsContext(); + + const handleMoveDomainComplete = (urn: string, newParentUrn?: string) => { + if (!entityData) return; + + const domain = entityData as Domain; + const oldParentUrn = domain.parentDomains?.domains.length ? domain.parentDomains.domains[0].urn : undefined; + + analytics.event({ + type: EventType.MoveDomainEvent, + oldParentDomainUrn: oldParentUrn, + parentDomainUrn: newParentUrn, + }); + + removeFromListDomainsCache(client, urn, 1, 1000, oldParentUrn); + updateListDomainsCache( + client, + domain.urn, + undefined, + domain.properties?.name ?? '', + domain.properties?.description ?? '', + newParentUrn, + ); + const newParentDomainsToUpdate = [...parentDomainsToUpdate]; + if (oldParentUrn) newParentDomainsToUpdate.push(oldParentUrn); + if (newParentUrn) newParentDomainsToUpdate.push(newParentUrn); + setParentDomainsToUpdate(newParentDomainsToUpdate); + }; + + return { handleMoveDomainComplete }; +} diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/useParentSelector.ts b/datahub-web-react/src/app/entity/shared/EntityDropdown/useParentSelector.ts new file mode 100644 index 0000000000000..32b5d8ca790cc --- /dev/null +++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/useParentSelector.ts @@ -0,0 +1,76 @@ +import { useEffect, useState } from 'react'; +import { useGetSearchResultsLazyQuery } from '../../../../graphql/search.generated'; +import { EntityType } from '../../../../types.generated'; +import { useEntityRegistry } from '../../../useEntityRegistry'; +import { GenericEntityProperties } from '../types'; + +interface Props { + entityType: EntityType; + entityData: GenericEntityProperties | null; + selectedParentUrn: string; + setSelectedParentUrn: (parent: string) => void; +} + +export default function useParentSelector({ entityType, entityData, selectedParentUrn, setSelectedParentUrn }: Props) { + const [selectedParentName, setSelectedParentName] = useState(); + const [isFocusedOnInput, setIsFocusedOnInput] = useState(false); + const [searchQuery, setSearchQuery] = useState(''); + const entityRegistry = useEntityRegistry(); + + const [search, { data }] = useGetSearchResultsLazyQuery(); + const searchResults = data?.search?.searchResults || []; + + useEffect(() => { + if (entityData && selectedParentUrn === entityData.urn) { + const displayName = entityRegistry.getDisplayName(entityType, entityData); + setSelectedParentName(displayName); + } + }, [entityData, entityRegistry, selectedParentUrn, entityData?.urn, entityType]); + + function handleSearch(text: string) { + setSearchQuery(text); + search({ + variables: { + input: { + type: entityType, + query: text, + start: 0, + count: 5, + }, + }, + }); + } + + function onSelectParent(parentUrn: string) { + const selectedParent = searchResults.find((result) => result.entity.urn === parentUrn); + if (selectedParent) { + setSelectedParentUrn(parentUrn); + const displayName = 
entityRegistry.getDisplayName(selectedParent.entity.type, selectedParent.entity); + setSelectedParentName(displayName); + } + } + + function clearSelectedParent() { + setSelectedParentUrn(''); + setSelectedParentName(undefined); + setSearchQuery(''); + } + + function selectParentFromBrowser(urn: string, displayName: string) { + setIsFocusedOnInput(false); + setSelectedParentUrn(urn); + setSelectedParentName(displayName); + } + + return { + searchQuery, + searchResults, + isFocusedOnInput, + selectedParentName, + onSelectParent, + handleSearch, + setIsFocusedOnInput, + selectParentFromBrowser, + clearSelectedParent, + }; +} diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/utils.ts b/datahub-web-react/src/app/entity/shared/EntityDropdown/utils.ts index 9e3d14cfd32e1..0a4c2c34441a4 100644 --- a/datahub-web-react/src/app/entity/shared/EntityDropdown/utils.ts +++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/utils.ts @@ -1,7 +1,11 @@ -import { EntityType } from '../../../../types.generated'; +import { EntityType, PlatformPrivileges } from '../../../../types.generated'; import { GenericEntityProperties } from '../types'; -export function isDeleteDisabled(entityType: EntityType, entityData: GenericEntityProperties | null) { +export function isDeleteDisabled( + entityType: EntityType, + entityData: GenericEntityProperties | null, + platformPrivileges: PlatformPrivileges | null | undefined, +) { if (entityType === EntityType.GlossaryTerm || entityType === EntityType.GlossaryNode) { const entityHasChildren = !!entityData?.children?.total; const canManageGlossaryEntity = !!entityData?.privileges?.canManageEntity; @@ -11,5 +15,47 @@ export function isDeleteDisabled(entityType: EntityType, entityData: GenericEnti if (entityType === EntityType.DataProduct) { return false; // TODO: update with permissions } + if (entityType === EntityType.Domain) { + const entityHasChildren = !!entityData?.children?.total; + const canManageDomains = !!platformPrivileges?.manageDomains; + const canDeleteDomainEntity = !entityHasChildren && canManageDomains; + return !canDeleteDomainEntity; + } + return false; +} + +export function isMoveDisabled( + entityType: EntityType, + entityData: GenericEntityProperties | null, + platformPrivileges: PlatformPrivileges | null | undefined, +) { + if (entityType === EntityType.GlossaryTerm || entityType === EntityType.GlossaryNode) { + const canManageGlossaryEntity = !!entityData?.privileges?.canManageEntity; + return !canManageGlossaryEntity; + } + if (entityType === EntityType.Domain) { + const canManageDomains = !!platformPrivileges?.manageDomains; + return !canManageDomains; + } + return false; +} + +export function shouldDisplayChildDeletionWarning( + entityType: EntityType, + entityData: GenericEntityProperties | null, + platformPrivileges: PlatformPrivileges | null | undefined, +) { + if (entityType === EntityType.GlossaryTerm || entityType === EntityType.GlossaryNode) { + const entityHasChildren = !!entityData?.children?.total; + const canManageGlossaryEntity = !!entityData?.privileges?.canManageEntity; + const hasTooltip = entityHasChildren && canManageGlossaryEntity; + return hasTooltip; + } + if (entityType === EntityType.Domain) { + const entityHasChildren = !!entityData?.children?.total; + const canManageDomains = !!platformPrivileges?.manageDomains; + const hasTooltip = entityHasChildren && canManageDomains; + return hasTooltip; + } return false; } diff --git a/datahub-web-react/src/app/entity/shared/constants.ts 
b/datahub-web-react/src/app/entity/shared/constants.ts index 447780fb0d641..9df5923d18542 100644 --- a/datahub-web-react/src/app/entity/shared/constants.ts +++ b/datahub-web-react/src/app/entity/shared/constants.ts @@ -21,6 +21,7 @@ export const ANTD_GRAY = { }; export const ANTD_GRAY_V2 = { + 1: '#F8F9Fa', 2: '#F3F5F6', 5: '#DDE0E4', 6: '#B2B8BD', diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/EntityProfile.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/EntityProfile.tsx index 8a559013c892c..5384eb94429ed 100644 --- a/datahub-web-react/src/app/entity/shared/containers/profile/EntityProfile.tsx +++ b/datahub-web-react/src/app/entity/shared/containers/profile/EntityProfile.tsx @@ -45,6 +45,7 @@ import { LINEAGE_GRAPH_TIME_FILTER_ID, } from '../../../../onboarding/config/LineageGraphOnboardingConfig'; import { useAppConfig } from '../../../../useAppConfig'; +import { useUpdateDomainEntityDataOnChange } from '../../../../domain/utils'; type Props = { urn: string; @@ -212,6 +213,7 @@ export const EntityProfile = ({ useGetDataForProfile({ urn, entityType, useEntityQuery, getOverrideProperties }); useUpdateGlossaryEntityDataOnChange(entityData, entityType); + useUpdateDomainEntityDataOnChange(entityData, entityType); const maybeUpdateEntity = useUpdateQuery?.({ onCompleted: () => refetch(), diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityName.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityName.tsx index d6df1cf8818df..762bd5f9111a0 100644 --- a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityName.tsx +++ b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityName.tsx @@ -33,17 +33,27 @@ function EntityName(props: Props) { const { urn, entityType, entityData } = useEntityData(); const entityName = entityData ? entityRegistry.getDisplayName(entityType, entityData) : ''; const [updatedName, setUpdatedName] = useState(entityName); + const [isEditing, setIsEditing] = useState(false); useEffect(() => { setUpdatedName(entityName); }, [entityName]); - const [updateName] = useUpdateNameMutation(); + const [updateName, { loading: isMutatingName }] = useUpdateNameMutation(); - const handleSaveName = (name: string) => { + const handleStartEditing = () => { + setIsEditing(true); + }; + + const handleChangeName = (name: string) => { + if (name === entityName) { + setIsEditing(false); + return; + } setUpdatedName(name); updateName({ variables: { input: { name, urn } } }) .then(() => { + setIsEditing(false); message.success({ content: 'Name Updated', duration: 2 }); refetch(); if (isInGlossaryContext) { @@ -62,13 +72,19 @@ function EntityName(props: Props) { return ( <> {isNameEditable ? 
( - + {updatedName} ) : ( - - {entityData && entityRegistry.getDisplayName(entityType, entityData)} - + {entityName} )} ); diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/header/PlatformContent/PlatformContentContainer.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/header/PlatformContent/PlatformContentContainer.tsx index 5e87f093c3778..0eb223c04d439 100644 --- a/datahub-web-react/src/app/entity/shared/containers/profile/header/PlatformContent/PlatformContentContainer.tsx +++ b/datahub-web-react/src/app/entity/shared/containers/profile/header/PlatformContent/PlatformContentContainer.tsx @@ -50,6 +50,7 @@ function PlatformContentContainer() { parentContainers={entityData?.parentContainers?.containers} parentContainersRef={contentRef} areContainersTruncated={isContentTruncated} + parentEntities={entityData?.parentDomains?.domains} /> ); } diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/header/PlatformContent/PlatformContentView.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/header/PlatformContent/PlatformContentView.tsx index 51a422ba93418..1090dac501d0b 100644 --- a/datahub-web-react/src/app/entity/shared/containers/profile/header/PlatformContent/PlatformContentView.tsx +++ b/datahub-web-react/src/app/entity/shared/containers/profile/header/PlatformContent/PlatformContentView.tsx @@ -2,15 +2,16 @@ import React from 'react'; import styled from 'styled-components'; import { Typography, Image } from 'antd'; import { Maybe } from 'graphql/jsutils/Maybe'; -import { Container, GlossaryNode } from '../../../../../../../types.generated'; +import { Container, Entity } from '../../../../../../../types.generated'; import { ANTD_GRAY } from '../../../../constants'; import ContainerLink from './ContainerLink'; -import ParentNodesView, { +import { StyledRightOutlined, ParentNodesWrapper as ParentContainersWrapper, Ellipsis, StyledTooltip, } from './ParentNodesView'; +import ParentEntities from '../../../../../../search/filters/ParentEntities'; const LogoIcon = styled.span` display: flex; @@ -75,14 +76,14 @@ interface Props { typeIcon?: JSX.Element; entityType?: string; parentContainers?: Maybe[] | null; - parentNodes?: GlossaryNode[] | null; + parentEntities?: Entity[] | null; parentContainersRef: React.RefObject; areContainersTruncated: boolean; } function PlatformContentView(props: Props) { const { - parentNodes, + parentEntities, platformName, platformLogoUrl, platformNames, @@ -103,7 +104,7 @@ function PlatformContentView(props: Props) { {typeIcon && {typeIcon}} {entityType} - {(!!platformName || !!instanceId || !!parentContainers?.length || !!parentNodes?.length) && ( + {(!!platformName || !!instanceId || !!parentContainers?.length || !!parentEntities?.length) && ( )} {platformName && ( @@ -146,7 +147,7 @@ function PlatformContentView(props: Props) { {directParentContainer && } - + ); } diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/sidebar/Domain/SetDomainModal.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/sidebar/Domain/SetDomainModal.tsx index fe49409b00653..405442e8d7f50 100644 --- a/datahub-web-react/src/app/entity/shared/containers/profile/sidebar/Domain/SetDomainModal.tsx +++ b/datahub-web-react/src/app/entity/shared/containers/profile/sidebar/Domain/SetDomainModal.tsx @@ -2,14 +2,16 @@ import React, { useRef, useState } from 'react'; import { Button, Form, message, Modal, Select } from 'antd'; import { useGetSearchResultsLazyQuery } from 
'../../../../../../../graphql/search.generated'; -import { Entity, EntityType } from '../../../../../../../types.generated'; +import { Domain, Entity, EntityType } from '../../../../../../../types.generated'; import { useBatchSetDomainMutation } from '../../../../../../../graphql/mutations.generated'; import { useEntityRegistry } from '../../../../../../useEntityRegistry'; import { useEnterKeyListener } from '../../../../../../shared/useEnterKeyListener'; -import { useGetRecommendations } from '../../../../../../shared/recommendation'; import { DomainLabel } from '../../../../../../shared/DomainLabel'; import { handleBatchError } from '../../../../utils'; import { tagRender } from '../tagRenderer'; +import { BrowserWrapper } from '../../../../../../shared/tags/AddTagsTermsModal'; +import DomainNavigator from '../../../../../../domain/nestedDomains/domainNavigator/DomainNavigator'; +import ClickOutside from '../../../../../../shared/ClickOutside'; type Props = { urns: string[]; @@ -28,6 +30,7 @@ type SelectedDomain = { export const SetDomainModal = ({ urns, onCloseModal, refetch, defaultValue, onOkOverride, titleOverride }: Props) => { const entityRegistry = useEntityRegistry(); + const [isFocusedOnInput, setIsFocusedOnInput] = useState(false); const [inputValue, setInputValue] = useState(''); const [selectedDomain, setSelectedDomain] = useState( defaultValue @@ -42,8 +45,8 @@ export const SetDomainModal = ({ urns, onCloseModal, refetch, defaultValue, onOk const domainSearchResults = domainSearchData?.search?.searchResults?.map((searchResult) => searchResult.entity) || []; const [batchSetDomainMutation] = useBatchSetDomainMutation(); - const [recommendedData] = useGetRecommendations([EntityType.Domain]); const inputEl = useRef(null); + const isShowingDomainNavigator = !inputValue && isFocusedOnInput; const onModalClose = () => { setInputValue(''); @@ -74,7 +77,7 @@ export const SetDomainModal = ({ urns, onCloseModal, refetch, defaultValue, onOk ); }; - const domainResult = !inputValue || inputValue.length === 0 ? recommendedData : domainSearchResults; + const domainResult = !inputValue || inputValue.length === 0 ? [] : domainSearchResults; const domainSearchOptions = domainResult?.map((result) => { return renderSearchResult(result); @@ -95,6 +98,15 @@ export const SetDomainModal = ({ urns, onCloseModal, refetch, defaultValue, onOk } }; + function selectDomainFromBrowser(domain: Domain) { + setIsFocusedOnInput(false); + setSelectedDomain({ + displayName: entityRegistry.getDisplayName(EntityType.Domain, domain), + type: EntityType.Domain, + urn: domain.urn, + }); + } + const onDeselectDomain = () => { setInputValue(''); setSelectedDomain(undefined); @@ -148,6 +160,11 @@ export const SetDomainModal = ({ urns, onCloseModal, refetch, defaultValue, onOk setInputValue(''); } + function handleCLickOutside() { + // delay closing the domain navigator so we don't get a UI "flash" between showing search results and navigator + setTimeout(() => setIsFocusedOnInput(false), 0); + } + return (
- + + + + + +
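Both `SetDomainModal` above and `PolicyPrivilegeForm` below wire the new `DomainNavigator` into an antd `Select` the same way: the navigator is rendered while the input is focused and the search text is empty, and click-outside closes it through a zero-delay timeout so the swap back to search results does not flash. The sketch below isolates that pattern; `Navigator` and `ClickOutside` are passed in as stand-ins because their real prop types are only partially visible here.

```tsx
// Sketch only: the focus/empty-query navigator pattern used in this patch.
import React, { useState } from 'react';
import { Select } from 'antd';

interface Props {
    Navigator: React.ComponentType;
    ClickOutside: React.ComponentType<{ onClickOutside: () => void; children: React.ReactNode }>;
    onSearch: (text: string) => void;
}

export default function SelectWithNavigator({ Navigator, ClickOutside, onSearch }: Props) {
    const [inputValue, setInputValue] = useState('');
    const [isFocusedOnInput, setIsFocusedOnInput] = useState(false);

    // Show the browse tree only when the user has focused the input but typed nothing.
    const isShowingNavigator = !inputValue && isFocusedOnInput;

    const handleClickOutside = () => {
        // Delay closing so the navigator does not flash while the dropdown is being dismissed.
        setTimeout(() => setIsFocusedOnInput(false), 0);
    };

    return (
        <ClickOutside onClickOutside={handleClickOutside}>
            <Select
                showSearch
                filterOption={false}
                onFocus={() => setIsFocusedOnInput(true)}
                onBlur={() => setInputValue('')}
                onSearch={(text: string) => {
                    setInputValue(text.trim());
                    onSearch(text);
                }}
            />
            {isShowingNavigator && <Navigator />}
        </ClickOutside>
    );
}
```

Clearing the input on blur, as `handleBlur` does in the form below, keeps the navigator from reappearing with a stale query the next time the field is focused.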
diff --git a/datahub-web-react/src/app/entity/shared/types.ts b/datahub-web-react/src/app/entity/shared/types.ts index e36f5050a24b7..6596711d4e82a 100644 --- a/datahub-web-react/src/app/entity/shared/types.ts +++ b/datahub-web-react/src/app/entity/shared/types.ts @@ -37,6 +37,7 @@ import { FabricType, BrowsePathV2, DataJobInputOutput, + ParentDomainsResult, } from '../../../types.generated'; import { FetchedEntity } from '../../lineage/types'; @@ -65,6 +66,7 @@ export type EntitySubHeaderSection = { export type GenericEntityProperties = { urn?: string; + type?: EntityType; name?: Maybe; properties?: Maybe<{ description?: Maybe; @@ -98,6 +100,7 @@ export type GenericEntityProperties = { status?: Maybe; deprecation?: Maybe; parentContainers?: Maybe; + parentDomains?: Maybe; children?: Maybe; parentNodes?: Maybe; isAChildren?: Maybe; diff --git a/datahub-web-react/src/app/glossary/BusinessGlossaryPage.tsx b/datahub-web-react/src/app/glossary/BusinessGlossaryPage.tsx index 2adeb6b1684dc..11f54cb5078e6 100644 --- a/datahub-web-react/src/app/glossary/BusinessGlossaryPage.tsx +++ b/datahub-web-react/src/app/glossary/BusinessGlossaryPage.tsx @@ -38,12 +38,6 @@ const MainContentWrapper = styled.div` flex-direction: column; `; -export const BrowserWrapper = styled.div<{ width: number }>` - max-height: 100%; - width: ${(props) => props.width}px; - min-width: ${(props) => props.width}px; -`; - export const MAX_BROWSER_WIDTH = 500; export const MIN_BROWSWER_WIDTH = 200; diff --git a/datahub-web-react/src/app/glossary/GlossarySidebar.tsx b/datahub-web-react/src/app/glossary/GlossarySidebar.tsx index 0bdcbf707ce09..2d620fb06df38 100644 --- a/datahub-web-react/src/app/glossary/GlossarySidebar.tsx +++ b/datahub-web-react/src/app/glossary/GlossarySidebar.tsx @@ -1,14 +1,8 @@ import React, { useState } from 'react'; -import styled from 'styled-components/macro'; import GlossarySearch from './GlossarySearch'; import GlossaryBrowser from './GlossaryBrowser/GlossaryBrowser'; import { ProfileSidebarResizer } from '../entity/shared/containers/profile/sidebar/ProfileSidebarResizer'; - -const BrowserWrapper = styled.div<{ width: number }>` - max-height: 100%; - width: ${(props) => props.width}px; - min-width: ${(props) => props.width}px; -`; +import { SidebarWrapper } from '../shared/sidebar/components'; export const MAX_BROWSER_WIDTH = 500; export const MIN_BROWSWER_WIDTH = 200; @@ -18,10 +12,10 @@ export default function GlossarySidebar() { return ( <> - + - + setBrowserWith(Math.min(Math.max(width, MIN_BROWSWER_WIDTH), MAX_BROWSER_WIDTH)) diff --git a/datahub-web-react/src/app/permissions/policy/PolicyPrivilegeForm.tsx b/datahub-web-react/src/app/permissions/policy/PolicyPrivilegeForm.tsx index c57273c2ea3d9..1520388a5033a 100644 --- a/datahub-web-react/src/app/permissions/policy/PolicyPrivilegeForm.tsx +++ b/datahub-web-react/src/app/permissions/policy/PolicyPrivilegeForm.tsx @@ -1,4 +1,4 @@ -import React, { useMemo } from 'react'; +import React, { useMemo, useState } from 'react'; import { Link } from 'react-router-dom'; import { Form, Select, Tag, Tooltip, Typography } from 'antd'; import styled from 'styled-components/macro'; @@ -9,7 +9,7 @@ import { useGetSearchResultsForMultipleLazyQuery, useGetSearchResultsLazyQuery, } from '../../../graphql/search.generated'; -import { ResourceFilter, PolicyType, EntityType } from '../../../types.generated'; +import { ResourceFilter, PolicyType, EntityType, Domain } from '../../../types.generated'; import { convertLegacyResourceFilter, createCriterionValue, @@ -21,6 
+21,9 @@ import { mapResourceTypeToPrivileges, setFieldValues, } from './policyUtils'; +import DomainNavigator from '../../domain/nestedDomains/domainNavigator/DomainNavigator'; +import { BrowserWrapper } from '../../shared/tags/AddTagsTermsModal'; +import ClickOutside from '../../shared/ClickOutside'; type Props = { policyType: PolicyType; @@ -55,6 +58,8 @@ export default function PolicyPrivilegeForm({ setPrivileges, }: Props) { const entityRegistry = useEntityRegistry(); + const [domainInputValue, setDomainInputValue] = useState(''); + const [isFocusedOnInput, setIsFocusedOnInput] = useState(false); // Configuration used for displaying options const { @@ -98,6 +103,7 @@ export default function PolicyPrivilegeForm({ const resourceSelectValue = resourceEntities.map((criterionValue) => criterionValue.value); const domainSelectValue = getFieldValues(resources.filter, 'DOMAIN').map((criterionValue) => criterionValue.value); const privilegesSelectValue = privileges; + const isShowingDomainNavigator = !domainInputValue && isFocusedOnInput; // Construct privilege options for dropdown const platformPrivileges = policiesConfig?.platformPrivileges || []; @@ -193,13 +199,14 @@ export default function PolicyPrivilegeForm({ }; // When a domain is selected, add its urn to the list of domains - const onSelectDomain = (domain) => { + const onSelectDomain = (domainUrn, domainObj?: Domain) => { const filter = resources.filter || { criteria: [], }; + const domainEntity = domainObj || getEntityFromSearchResults(domainSearchResults, domainUrn); const updatedFilter = setFieldValues(filter, 'DOMAIN', [ ...domains, - createCriterionValueWithEntity(domain, getEntityFromSearchResults(domainSearchResults, domain) || null), + createCriterionValueWithEntity(domainUrn, domainEntity || null), ]); setResources({ ...resources, @@ -207,6 +214,11 @@ export default function PolicyPrivilegeForm({ }); }; + function selectDomainFromBrowser(domain: Domain) { + onSelectDomain(domain.urn, domain); + setIsFocusedOnInput(false); + } + // When a domain is deselected, remove its urn from the list of domains const onDeselectDomain = (domain) => { const filter = resources.filter || { @@ -243,6 +255,7 @@ export default function PolicyPrivilegeForm({ // Handle domain search, if the domain type has an associated EntityType mapping. const handleDomainSearch = (text: string) => { const trimmedText: string = text.trim(); + setDomainInputValue(trimmedText); searchDomains({ variables: { input: { @@ -276,6 +289,15 @@ export default function PolicyPrivilegeForm({ : displayStr; }; + function handleCLickOutside() { + // delay closing the domain navigator so we don't get a UI "flash" between showing search results and navigator + setTimeout(() => setIsFocusedOnInput(false), 0); + } + + function handleBlur() { + setDomainInputValue(''); + } + return ( {showResourceFilterInput && ( @@ -342,33 +364,41 @@ export default function PolicyPrivilegeForm({ )} {showResourceFilterInput && ( - Domain}> + Select Domains}> - Search for domains the policy should apply to. If none is selected, policy is applied to{' '} - all resources in all domains. + The policy will apply to any chosen domains and all their nested domains. If none are + selected, the policy is applied to all resources of in all domains. 
- + + + + + + )} Privileges}> diff --git a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx index 319c8ed0a3e1d..36c4c020e7131 100644 --- a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx +++ b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx @@ -14,10 +14,10 @@ import { CorpUser, Deprecation, Domain, - ParentNodesResult, EntityPath, DataProduct, Health, + Entity, } from '../../types.generated'; import TagTermGroup from '../shared/tags/TagTermGroup'; import { ANTD_GRAY } from '../entity/shared/constants'; @@ -191,7 +191,7 @@ interface Props { // how the listed node is connected to the source node degree?: number; parentContainers?: ParentContainersResult | null; - parentNodes?: ParentNodesResult | null; + parentEntities?: Entity[] | null; previewType?: Maybe; paths?: EntityPath[]; health?: Health[]; @@ -231,7 +231,7 @@ export default function DefaultPreviewCard({ onClick, degree, parentContainers, - parentNodes, + parentEntities, platforms, logoUrls, previewType, @@ -280,7 +280,7 @@ export default function DefaultPreviewCard({ typeIcon={typeIcon} entityType={type} parentContainers={parentContainers?.containers} - parentNodes={parentNodes?.nodes} + parentEntities={parentEntities} parentContainersRef={contentRef} areContainersTruncated={isContentTruncated} /> diff --git a/datahub-web-react/src/app/recommendations/renderer/component/DomainSearchList.tsx b/datahub-web-react/src/app/recommendations/renderer/component/DomainSearchList.tsx index d3cc35ef6a932..c82521dab1bc9 100644 --- a/datahub-web-react/src/app/recommendations/renderer/component/DomainSearchList.tsx +++ b/datahub-web-react/src/app/recommendations/renderer/component/DomainSearchList.tsx @@ -1,10 +1,14 @@ +import { ArrowRightOutlined } from '@ant-design/icons'; import React from 'react'; import { Link } from 'react-router-dom'; import styled from 'styled-components'; import { Domain, EntityType, RecommendationContent } from '../../../../types.generated'; -import { IconStyleType } from '../../../entity/Entity'; import { LogoCountCard } from '../../../shared/LogoCountCard'; import { useEntityRegistry } from '../../../useEntityRegistry'; +import DomainIcon from '../../../domain/DomainIcon'; +import { PageRoutes } from '../../../../conf/Global'; +import { HomePageButton } from '../../../shared/components'; +import { HoverEntityTooltip } from './HoverEntityTooltip'; const DomainListContainer = styled.div` display: flex; @@ -13,6 +17,17 @@ const DomainListContainer = styled.div` flex-wrap: wrap; `; +const AllDomainsWrapper = styled.div` + color: ${(props) => props.theme.styles['primary-color']}; + font-size: 14px; +`; + +const AllDomainsText = styled.div` + margin-bottom: 8px; +`; + +const NUM_DOMAIN_CARDS = 9; + type Props = { content: Array; onClick?: (index: number) => void; @@ -23,7 +38,8 @@ export const DomainSearchList = ({ content, onClick }: Props) => { const domainsWithCounts: Array<{ domain: Domain; count?: number }> = content .map((cnt) => ({ domain: cnt.entity, count: cnt.params?.contentParams?.count })) - .filter((domainWithCount) => domainWithCount.domain !== null && domainWithCount !== undefined) as Array<{ + .filter((domainWithCount) => domainWithCount?.domain !== null) + .slice(0, NUM_DOMAIN_CARDS) as Array<{ domain: Domain; count?: number; }>; @@ -31,18 +47,34 @@ export const DomainSearchList = ({ content, onClick }: Props) => { return ( {domainsWithCounts.map((domain, index) => ( - onClick?.(index)} - > - - + + onClick?.(index)} + > + 
+ } + count={domain.count} + /> + + ))} + + + + View All Domains + + + + ); }; diff --git a/datahub-web-react/src/app/recommendations/renderer/component/HoverEntityTooltip.tsx b/datahub-web-react/src/app/recommendations/renderer/component/HoverEntityTooltip.tsx index a39a39cd52db9..9ff0a1a2f940b 100644 --- a/datahub-web-react/src/app/recommendations/renderer/component/HoverEntityTooltip.tsx +++ b/datahub-web-react/src/app/recommendations/renderer/component/HoverEntityTooltip.tsx @@ -1,3 +1,4 @@ +import { TooltipPlacement } from 'antd/es/tooltip'; import { Tooltip } from 'antd'; import React from 'react'; import { Entity } from '../../../../types.generated'; @@ -9,9 +10,10 @@ type Props = { // whether the tooltip can be opened or if it should always stay closed canOpen?: boolean; children: React.ReactNode; + placement?: TooltipPlacement; }; -export const HoverEntityTooltip = ({ entity, canOpen = true, children }: Props) => { +export const HoverEntityTooltip = ({ entity, canOpen = true, children, placement }: Props) => { const entityRegistry = useEntityRegistry(); if (!entity || !entity.type || !entity.urn) { @@ -23,7 +25,7 @@ export const HoverEntityTooltip = ({ entity, canOpen = true, children }: Props) {entityRegistry.renderPreview(entity.type, PreviewType.HOVER_CARD, entity)}} diff --git a/datahub-web-react/src/app/search/SearchResultList.tsx b/datahub-web-react/src/app/search/SearchResultList.tsx index 386b22f34602b..f8ca9a46d1a81 100644 --- a/datahub-web-react/src/app/search/SearchResultList.tsx +++ b/datahub-web-react/src/app/search/SearchResultList.tsx @@ -31,7 +31,7 @@ const ThinDivider = styled(Divider)` margin-bottom: 16px; `; -const ResultWrapper = styled.div<{ showUpdatedStyles: boolean }>` +export const ResultWrapper = styled.div<{ showUpdatedStyles: boolean }>` ${(props) => props.showUpdatedStyles && ` @@ -39,7 +39,6 @@ const ResultWrapper = styled.div<{ showUpdatedStyles: boolean }>` border-radius: 5px; margin: 0 auto 8px auto; padding: 8px 16px; - max-width: 1200px; border-bottom: 1px solid ${ANTD_GRAY[5]}; `} `; diff --git a/datahub-web-react/src/app/search/SearchResults.tsx b/datahub-web-react/src/app/search/SearchResults.tsx index d21213f462f54..b93e835970196 100644 --- a/datahub-web-react/src/app/search/SearchResults.tsx +++ b/datahub-web-react/src/app/search/SearchResults.tsx @@ -27,6 +27,7 @@ import useToggleSidebar from './useToggleSidebar'; import SearchSortSelect from './sorting/SearchSortSelect'; import { combineSiblingsInSearchResults } from './utils/combineSiblingsInSearchResults'; import SearchQuerySuggester from './suggestions/SearchQuerySugggester'; +import { ANTD_GRAY_V2 } from '../entity/shared/constants'; const SearchResultsWrapper = styled.div<{ v2Styles: boolean }>` display: flex; @@ -55,7 +56,7 @@ const ResultContainer = styled.div<{ v2Styles: boolean }>` ? 
` display: flex; flex-direction: column; - background-color: #F8F9FA; + background-color: ${ANTD_GRAY_V2[1]}; ` : ` max-width: calc(100% - 260px); diff --git a/datahub-web-react/src/app/search/autoComplete/AutoCompleteEntity.tsx b/datahub-web-react/src/app/search/autoComplete/AutoCompleteEntity.tsx index d241a3895f19f..2154837fa5e26 100644 --- a/datahub-web-react/src/app/search/autoComplete/AutoCompleteEntity.tsx +++ b/datahub-web-react/src/app/search/autoComplete/AutoCompleteEntity.tsx @@ -10,6 +10,8 @@ import AutoCompleteEntityIcon from './AutoCompleteEntityIcon'; import { SuggestionText } from './styledComponents'; import AutoCompletePlatformNames from './AutoCompletePlatformNames'; import { getPlatformName } from '../../entity/shared/utils'; +import { getParentEntities } from '../filters/utils'; +import ParentEntities from '../filters/ParentEntities'; const AutoCompleteEntityWrapper = styled.div` display: flex; @@ -76,11 +78,12 @@ export default function AutoCompleteEntity({ query, entity, siblings, hasParentT // Need to reverse parentContainers since it returns direct parent first. const orderedParentContainers = [...parentContainers].reverse(); const subtype = genericEntityProps?.subTypes?.typeNames?.[0]; + const parentEntities = getParentEntities(entity) || []; const showPlatforms = !!platforms.length; const showPlatformDivider = !!platforms.length && !!parentContainers.length; const showParentContainers = !!parentContainers.length; - const showHeader = showPlatforms || showParentContainers; + const showHeader = showPlatforms || showParentContainers || parentEntities.length > 0; return ( @@ -96,6 +99,7 @@ export default function AutoCompleteEntity({ query, entity, siblings, hasParentT {showPlatforms && } {showPlatformDivider && } {showParentContainers && } + )} ` @@ -102,6 +102,10 @@ const ArrowButton = styled(Button)<{ isOpen: boolean }>` `} `; +const ParentWrapper = styled.div` + max-width: 220px; +`; + interface Props { filterOption: FilterOptionType; selectedFilterOptions: FilterOptionType[]; @@ -124,8 +128,7 @@ export default function FilterOption({ const shouldShowIcon = field === PLATFORM_FILTER_NAME && icon !== null; const shouldShowTagColor = field === TAGS_FILTER_NAME && entity?.type === EntityType.Tag; const isSubTypeFilter = field === TYPE_NAMES_FILTER_NAME; - const isGlossaryTerm = entity?.type === EntityType.GlossaryTerm; - const parentNodes: GlossaryNode[] = isGlossaryTerm ? (entity as GlossaryTerm).parentNodes?.nodes || [] : []; + const parentEntities: Entity[] = getParentEntities(entity as Entity) || []; // only entity type filters return 10,000 max aggs const countText = count === MAX_COUNT_VAL && field === ENTITY_SUB_TYPE_FILTER_NAME ? 
'10k+' : formatNumber(count); @@ -143,7 +146,7 @@ export default function FilterOption({ return ( <> - 0} addPadding={addPadding}> + 0} addPadding={addPadding}> - {isGlossaryTerm && } + {parentEntities.length > 0 && ( + + + + )} {shouldShowIcon && <>{icon}} {shouldShowTagColor && ( diff --git a/datahub-web-react/src/app/search/filters/ParentNodes.tsx b/datahub-web-react/src/app/search/filters/ParentEntities.tsx similarity index 54% rename from datahub-web-react/src/app/search/filters/ParentNodes.tsx rename to datahub-web-react/src/app/search/filters/ParentEntities.tsx index 7012f07c16e64..2504d5f0ff25a 100644 --- a/datahub-web-react/src/app/search/filters/ParentNodes.tsx +++ b/datahub-web-react/src/app/search/filters/ParentEntities.tsx @@ -2,19 +2,16 @@ import { FolderOpenOutlined } from '@ant-design/icons'; import { Tooltip, Typography } from 'antd'; import React from 'react'; import styled from 'styled-components'; -import { EntityType, GlossaryNode, GlossaryTerm } from '../../../types.generated'; +import { Entity } from '../../../types.generated'; import { ANTD_GRAY } from '../../entity/shared/constants'; import { useEntityRegistry } from '../../useEntityRegistry'; -const NUM_VISIBLE_NODES = 2; - const ParentNodesWrapper = styled.div` font-size: 12px; color: ${ANTD_GRAY[7]}; display: flex; align-items: center; margin-bottom: 3px; - max-width: 220px; overflow: hidden; `; @@ -27,54 +24,62 @@ export const ArrowWrapper = styled.span` margin: 0 3px; `; +const StyledTooltip = styled(Tooltip)` + display: flex; + white-space: nowrap; + overflow: hidden; +`; + +const DEFAULT_NUM_VISIBLE = 2; + interface Props { - glossaryTerm: GlossaryTerm; + parentEntities: Entity[]; + numVisible?: number; } -export default function ParentNodes({ glossaryTerm }: Props) { +export default function ParentEntities({ parentEntities, numVisible = DEFAULT_NUM_VISIBLE }: Props) { const entityRegistry = useEntityRegistry(); - const parentNodes: GlossaryNode[] = glossaryTerm.parentNodes?.nodes || []; - // parent nodes are returned with direct parent first - const orderedParentNodes = [...parentNodes].reverse(); - const visibleNodes = orderedParentNodes.slice(orderedParentNodes.length - NUM_VISIBLE_NODES); - const numHiddenNodes = orderedParentNodes.length - NUM_VISIBLE_NODES; - const includeNodePathTooltip = parentNodes.length > NUM_VISIBLE_NODES; + // parent nodes/domains are returned with direct parent first + const orderedParentEntities = [...parentEntities].reverse(); + const numHiddenEntities = orderedParentEntities.length - numVisible; + const hasHiddenEntities = numHiddenEntities > 0; + const visibleNodes = hasHiddenEntities ? 
orderedParentEntities.slice(numHiddenEntities) : orderedParentEntities; - if (!parentNodes.length) return null; + if (!parentEntities.length) return null; return ( - - {orderedParentNodes.map((glossaryNode, index) => ( + {orderedParentEntities.map((parentEntity, index) => ( <> - {entityRegistry.getDisplayName(EntityType.GlossaryNode, glossaryNode)} + {entityRegistry.getDisplayName(parentEntity.type, parentEntity)} - {index !== orderedParentNodes.length - 1 && {'>'}} + {index !== orderedParentEntities.length - 1 && {'>'}} ))} } > - {numHiddenNodes > 0 && - [...Array(numHiddenNodes)].map(() => ( + {hasHiddenEntities && + [...Array(numHiddenEntities)].map(() => ( <> {'>'} ))} - {visibleNodes.map((glossaryNode, index) => { - const displayName = entityRegistry.getDisplayName(EntityType.GlossaryNode, glossaryNode); + {visibleNodes.map((parentEntity, index) => { + const displayName = entityRegistry.getDisplayName(parentEntity.type, parentEntity); return ( <> - + {displayName} {index !== visibleNodes.length - 1 && {'>'}} @@ -82,6 +87,6 @@ export default function ParentNodes({ glossaryTerm }: Props) { ); })} - + ); } diff --git a/datahub-web-react/src/app/search/filters/utils.tsx b/datahub-web-react/src/app/search/filters/utils.tsx index fbde71d6a2e9a..6ea9d0e8baa4f 100644 --- a/datahub-web-react/src/app/search/filters/utils.tsx +++ b/datahub-web-react/src/app/search/filters/utils.tsx @@ -14,10 +14,12 @@ import { AggregationMetadata, DataPlatform, DataPlatformInstance, + Domain, Entity, EntityType, FacetFilterInput, FacetMetadata, + GlossaryTerm, } from '../../../types.generated'; import { IconStyleType } from '../../entity/Entity'; import { @@ -331,3 +333,16 @@ export function canCreateViewFromFilters(activeFilters: FacetFilterInput[]) { } return true; } + +export function getParentEntities(entity: Entity): Entity[] | null { + if (!entity) { + return null; + } + if (entity.type === EntityType.GlossaryTerm) { + return (entity as GlossaryTerm).parentNodes?.nodes || []; + } + if (entity.type === EntityType.Domain) { + return (entity as Domain).parentDomains?.domains || []; + } + return null; +} diff --git a/datahub-web-react/src/app/search/sidebar/BrowseSidebar.tsx b/datahub-web-react/src/app/search/sidebar/BrowseSidebar.tsx index b5e9272cc5273..0d3d40c4a71af 100644 --- a/datahub-web-react/src/app/search/sidebar/BrowseSidebar.tsx +++ b/datahub-web-react/src/app/search/sidebar/BrowseSidebar.tsx @@ -6,13 +6,14 @@ import { BrowseProvider } from './BrowseContext'; import SidebarLoadingError from './SidebarLoadingError'; import { SEARCH_RESULTS_BROWSE_SIDEBAR_ID } from '../../onboarding/config/SearchOnboardingConfig'; import useSidebarEntities from './useSidebarEntities'; +import { ANTD_GRAY_V2 } from '../../entity/shared/constants'; const Sidebar = styled.div<{ visible: boolean; width: number }>` height: 100%; width: ${(props) => (props.visible ? 
`${props.width}px` : '0')}; transition: width 250ms ease-in-out; border-right: 1px solid ${(props) => props.theme.styles['border-color-base']}; - background-color: #f8f9fa; + background-color: ${ANTD_GRAY_V2[1]}; background: white; `; diff --git a/datahub-web-react/src/app/search/sidebar/ExpandableNode.tsx b/datahub-web-react/src/app/search/sidebar/ExpandableNode.tsx index 32d2c4af948ef..ba93cf94fba2b 100644 --- a/datahub-web-react/src/app/search/sidebar/ExpandableNode.tsx +++ b/datahub-web-react/src/app/search/sidebar/ExpandableNode.tsx @@ -1,9 +1,10 @@ import React, { MouseEventHandler, ReactNode } from 'react'; import styled from 'styled-components'; import { VscTriangleRight } from 'react-icons/vsc'; -import { Button, Typography } from 'antd'; +import { Typography } from 'antd'; import { UpCircleOutlined } from '@ant-design/icons'; import { ANTD_GRAY } from '../../entity/shared/constants'; +import { BaseButton, BodyContainer, BodyGridExpander, RotatingButton } from '../../shared/components'; const Layout = styled.div` margin-left: 8px; @@ -11,17 +12,6 @@ const Layout = styled.div` const HeaderContainer = styled.div``; -const BodyGridExpander = styled.div<{ isOpen: boolean }>` - display: grid; - grid-template-rows: ${(props) => (props.isOpen ? '1fr' : '0fr')}; - transition: grid-template-rows 250ms; - overflow: hidden; -`; - -const BodyContainer = styled.div` - min-height: 0; -`; - type ExpandableNodeProps = { isOpen: boolean; header: ReactNode; @@ -68,22 +58,6 @@ ExpandableNode.HeaderLeft = styled.div` align-items: center; `; -const BaseButton = styled(Button)` - &&& { - display: flex; - align-items: center; - justify-content: center; - border: none; - box-shadow: none; - border-radius: 50%; - } -`; - -const RotatingButton = styled(BaseButton)<{ deg: number }>` - transform: rotate(${(props) => props.deg}deg); - transition: transform 250ms; -`; - ExpandableNode.StaticButton = ({ icon, onClick }: { icon: JSX.Element; onClick?: () => void }) => { const onClickButton: MouseEventHandler = (e) => { e.stopPropagation(); diff --git a/datahub-web-react/src/app/shared/LogoCountCard.tsx b/datahub-web-react/src/app/shared/LogoCountCard.tsx index 3e2f74ebe5166..ebf0d9cd4f54e 100644 --- a/datahub-web-react/src/app/shared/LogoCountCard.tsx +++ b/datahub-web-react/src/app/shared/LogoCountCard.tsx @@ -1,27 +1,9 @@ import React from 'react'; -import { Image, Typography, Button } from 'antd'; +import { Image, Typography } from 'antd'; import styled from 'styled-components'; import { ANTD_GRAY } from '../entity/shared/constants'; import { formatNumber } from './formatNumber'; - -const Container = styled(Button)` - margin-right: 12px; - margin-left: 12px; - margin-bottom: 12px; - width: 160px; - height: 140px; - display: flex; - justify-content: center; - border-radius: 4px; - align-items: center; - flex-direction: column; - border: 1px solid ${ANTD_GRAY[4]}; - box-shadow: ${(props) => props.theme.styles['box-shadow']}; - &&:hover { - box-shadow: ${(props) => props.theme.styles['box-shadow-hover']}; - } - white-space: unset; -`; +import { HomePageButton } from './components'; const PlatformLogo = styled(Image)` max-height: 32px; @@ -53,7 +35,7 @@ type Props = { export const LogoCountCard = ({ logoUrl, logoComponent, name, count, onClick }: Props) => { return ( - + {(logoUrl && ) || logoComponent} @@ -68,6 +50,6 @@ export const LogoCountCard = ({ logoUrl, logoComponent, name, count, onClick }: {count !== undefined && {formatNumber(count)}} - + ); }; diff --git 
a/datahub-web-react/src/app/shared/admin/HeaderLinks.tsx b/datahub-web-react/src/app/shared/admin/HeaderLinks.tsx index 39035d5bff562..ced7d8642576b 100644 --- a/datahub-web-react/src/app/shared/admin/HeaderLinks.tsx +++ b/datahub-web-react/src/app/shared/admin/HeaderLinks.tsx @@ -5,7 +5,6 @@ import { BarChartOutlined, BookOutlined, SettingOutlined, - FolderOutlined, SolutionOutlined, DownOutlined, } from '@ant-design/icons'; @@ -16,6 +15,7 @@ import { ANTD_GRAY } from '../../entity/shared/constants'; import { HOME_PAGE_INGESTION_ID } from '../../onboarding/config/HomePageOnboardingConfig'; import { useUpdateEducationStepIdsAllowlist } from '../../onboarding/useUpdateEducationStepIdsAllowlist'; import { useUserContext } from '../../context/useUserContext'; +import DomainIcon from '../../domain/DomainIcon'; const LinkWrapper = styled.span` margin-right: 0px; @@ -124,7 +124,12 @@ export function HeaderLinks(props: Props) { - + Domains Manage related groups of data assets diff --git a/datahub-web-react/src/app/shared/components.tsx b/datahub-web-react/src/app/shared/components.tsx new file mode 100644 index 0000000000000..68d2fb52cfdba --- /dev/null +++ b/datahub-web-react/src/app/shared/components.tsx @@ -0,0 +1,49 @@ +import { Button } from 'antd'; +import styled from 'styled-components'; +import { ANTD_GRAY } from '../entity/shared/constants'; + +export const HomePageButton = styled(Button)` + margin-right: 12px; + margin-left: 12px; + margin-bottom: 12px; + width: 160px; + height: 140px; + display: flex; + justify-content: center; + border-radius: 4px; + align-items: center; + flex-direction: column; + border: 1px solid ${ANTD_GRAY[4]}; + box-shadow: ${(props) => props.theme.styles['box-shadow']}; + &&:hover { + box-shadow: ${(props) => props.theme.styles['box-shadow-hover']}; + } + white-space: unset; +`; + +export const BaseButton = styled(Button)` + &&& { + display: flex; + align-items: center; + justify-content: center; + border: none; + box-shadow: none; + border-radius: 50%; + } +`; + +export const RotatingButton = styled(BaseButton)<{ deg: number }>` + transform: rotate(${(props) => props.deg}deg); + transition: transform 250ms; +`; + +export const BodyGridExpander = styled.div<{ isOpen: boolean }>` + display: grid; + grid-template-rows: ${(props) => (props.isOpen ? '1fr' : '0fr')}; + transition: grid-template-rows 250ms; + overflow: hidden; +`; + +export const BodyContainer = styled.div` + min-height: 0; +`; diff --git a/datahub-web-react/src/app/shared/deleteUtils.ts b/datahub-web-react/src/app/shared/deleteUtils.ts index c1bfeac37372b..37a3758712ad6 100644 --- a/datahub-web-react/src/app/shared/deleteUtils.ts +++ b/datahub-web-react/src/app/shared/deleteUtils.ts @@ -1,3 +1,4 @@ +import { PageRoutes } from '../../conf/Global'; import { useDeleteAssertionMutation } from '../../graphql/assertion.generated'; import { useDeleteDataProductMutation } from '../../graphql/dataProduct.generated'; import { useDeleteDomainMutation } from '../../graphql/domain.generated'; @@ -18,10 +19,11 @@ export const getEntityProfileDeleteRedirectPath = (type: EntityType, entityData: switch (type) { case EntityType.CorpGroup: case EntityType.CorpUser: - case EntityType.Domain: case EntityType.Tag: // Return Home. return '/'; + case EntityType.Domain: + return `${PageRoutes.DOMAINS}`; case EntityType.GlossaryNode: case EntityType.GlossaryTerm: // Return to glossary page. 
diff --git a/datahub-web-react/src/app/shared/sidebar/components.tsx b/datahub-web-react/src/app/shared/sidebar/components.tsx new file mode 100644 index 0000000000000..5d123d6022790 --- /dev/null +++ b/datahub-web-react/src/app/shared/sidebar/components.tsx @@ -0,0 +1,23 @@ +import React from 'react'; +import { RightOutlined } from '@ant-design/icons'; +import styled from 'styled-components'; +import { RotatingButton } from '../components'; + +export const SidebarWrapper = styled.div<{ width: number }>` + max-height: 100%; + width: ${(props) => props.width}px; + min-width: ${(props) => props.width}px; +`; + +export function RotatingTriangle({ isOpen, onClick }: { isOpen: boolean; onClick?: () => void }) { + return ( + } + onClick={onClick} + /> + ); +} diff --git a/datahub-web-react/src/app/shared/styleUtils.ts b/datahub-web-react/src/app/shared/styleUtils.ts new file mode 100644 index 0000000000000..21bc866218cb8 --- /dev/null +++ b/datahub-web-react/src/app/shared/styleUtils.ts @@ -0,0 +1,7 @@ +export function applyOpacity(hexColor: string, opacity: number) { + if (hexColor.length !== 7) return hexColor; + + const updatedOpacity = Math.round(opacity * 2.55); + + return hexColor + updatedOpacity.toString(16).padStart(2, '0'); +} diff --git a/datahub-web-react/src/app/shared/tags/AddTagsTermsModal.tsx b/datahub-web-react/src/app/shared/tags/AddTagsTermsModal.tsx index 01e11ceb9a738..80d239def391c 100644 --- a/datahub-web-react/src/app/shared/tags/AddTagsTermsModal.tsx +++ b/datahub-web-react/src/app/shared/tags/AddTagsTermsModal.tsx @@ -50,15 +50,15 @@ const StyleTag = styled(CustomTag)` line-height: 16px; `; -export const BrowserWrapper = styled.div<{ isHidden: boolean }>` +export const BrowserWrapper = styled.div<{ isHidden: boolean; width?: string; maxHeight?: number }>` background-color: white; border-radius: 5px; box-shadow: 0 3px 6px -4px rgb(0 0 0 / 12%), 0 6px 16px 0 rgb(0 0 0 / 8%), 0 9px 28px 8px rgb(0 0 0 / 5%); - max-height: 380px; + max-height: ${(props) => (props.maxHeight ? props.maxHeight : '380')}px; overflow: auto; position: absolute; transition: opacity 0.2s; - width: 480px; + width: ${(props) => (props.width ? 
props.width : '480px')}; z-index: 1051; ${(props) => props.isHidden && diff --git a/datahub-web-react/src/app/shared/tags/DomainLink.tsx b/datahub-web-react/src/app/shared/tags/DomainLink.tsx index 1c14b71369ed6..a14114ce43e43 100644 --- a/datahub-web-react/src/app/shared/tags/DomainLink.tsx +++ b/datahub-web-react/src/app/shared/tags/DomainLink.tsx @@ -3,10 +3,10 @@ import React from 'react'; import { Link } from 'react-router-dom'; import styled from 'styled-components'; import { Domain, EntityType } from '../../../types.generated'; -import { IconStyleType } from '../../entity/Entity'; import { HoverEntityTooltip } from '../../recommendations/renderer/component/HoverEntityTooltip'; import { useEntityRegistry } from '../../useEntityRegistry'; import { ANTD_GRAY } from '../../entity/shared/constants'; +import DomainIcon from '../../domain/DomainIcon'; const DomainLinkContainer = styled(Link)` display: inline-block; @@ -39,7 +39,12 @@ function DomainContent({ domain, name, closable, onClose, tagStyle, fontSize }: return ( - {entityRegistry.getIcon(EntityType.Domain, fontSize || 10, IconStyleType.ACCENT, ANTD_GRAY[9])} + {displayName} diff --git a/datahub-web-react/src/app/shared/useToggle.ts b/datahub-web-react/src/app/shared/useToggle.ts index b020bf030f079..a73c702c4351b 100644 --- a/datahub-web-react/src/app/shared/useToggle.ts +++ b/datahub-web-react/src/app/shared/useToggle.ts @@ -1,4 +1,4 @@ -import { useState } from 'react'; +import { useMemo, useState } from 'react'; const NOOP = (_: boolean) => {}; @@ -9,25 +9,39 @@ const useToggle = ({ initialValue = false, closeDelay = 0, openDelay = 0, onTogg const isClosing = transition === 'closing'; const isTransitioning = transition !== null; - const toggle = () => { - if (isOpen) { + const toggleClose = useMemo( + () => () => { setTransition('closing'); window.setTimeout(() => { setIsOpen(false); setTransition(null); onToggle(false); }, closeDelay); - } else { + }, + [closeDelay, onToggle], + ); + + const toggleOpen = useMemo( + () => () => { setTransition('opening'); window.setTimeout(() => { setIsOpen(true); setTransition(null); onToggle(true); }, openDelay); + }, + [openDelay, onToggle], + ); + + const toggle = () => { + if (isOpen) { + toggleClose(); + } else { + toggleOpen(); } }; - return { isOpen, isClosing, isOpening, isTransitioning, toggle } as const; + return { isOpen, isClosing, isOpening, isTransitioning, toggle, toggleOpen, toggleClose } as const; }; export default useToggle; diff --git a/datahub-web-react/src/app/useAppConfig.ts b/datahub-web-react/src/app/useAppConfig.ts index cdc8f92210a0d..821d00b9017c3 100644 --- a/datahub-web-react/src/app/useAppConfig.ts +++ b/datahub-web-react/src/app/useAppConfig.ts @@ -12,3 +12,8 @@ export function useIsShowAcrylInfoEnabled() { const appConfig = useAppConfig(); return appConfig.config.featureFlags.showAcrylInfo; } + +export function useIsNestedDomainsEnabled() { + const appConfig = useAppConfig(); + return appConfig.config.featureFlags.nestedDomainsEnabled; +} diff --git a/datahub-web-react/src/appConfigContext.tsx b/datahub-web-react/src/appConfigContext.tsx index 096c2fd6ef0e5..4087ad453687c 100644 --- a/datahub-web-react/src/appConfigContext.tsx +++ b/datahub-web-react/src/appConfigContext.tsx @@ -49,6 +49,7 @@ export const DEFAULT_APP_CONFIG = { showBrowseV2: true, showAcrylInfo: false, showAccessManagement: false, + nestedDomainsEnabled: true, }, }; diff --git a/datahub-web-react/src/conf/Global.ts b/datahub-web-react/src/conf/Global.ts index e1220b8c81b53..82378bb621427 100644 
--- a/datahub-web-react/src/conf/Global.ts +++ b/datahub-web-react/src/conf/Global.ts @@ -24,6 +24,7 @@ export enum PageRoutes { INGESTION = '/ingestion', SETTINGS = '/settings', DOMAINS = '/domains', + DOMAIN = '/domain', GLOSSARY = '/glossary', SETTINGS_VIEWS = '/settings/views', EMBED = '/embed', diff --git a/datahub-web-react/src/graphql/app.graphql b/datahub-web-react/src/graphql/app.graphql index 228fa1c9430d0..4e9bbb11d8c5a 100644 --- a/datahub-web-react/src/graphql/app.graphql +++ b/datahub-web-react/src/graphql/app.graphql @@ -64,6 +64,7 @@ query appConfig { showBrowseV2 showAcrylInfo showAccessManagement + nestedDomainsEnabled } } } diff --git a/datahub-web-react/src/graphql/domain.graphql b/datahub-web-react/src/graphql/domain.graphql index d72ff336bf9e7..951b93fcba9af 100644 --- a/datahub-web-react/src/graphql/domain.graphql +++ b/datahub-web-react/src/graphql/domain.graphql @@ -2,10 +2,14 @@ query getDomain($urn: String!) { domain(urn: $urn) { urn id + type properties { name description } + parentDomains { + ...parentDomainsFields + } ownership { ...ownershipFields } @@ -23,6 +27,9 @@ query getDomain($urn: String!) { } } } + children: relationships(input: { types: ["IsPartOf"], direction: INCOMING, start: 0, count: 0 }) { + total + } } } @@ -33,16 +40,29 @@ query listDomains($input: ListDomainsInput!) { total domains { urn + id + type properties { name description } + parentDomains { + ...parentDomainsFields + } ownership { ...ownershipFields } - entities(input: { start: 0, count: 1 }) { - total - } + ...domainEntitiesFields + } + } +} + +query getDomainChildrenCount($urn: String!) { + domain(urn: $urn) { + urn + type + children: relationships(input: { types: ["IsPartOf"], direction: INCOMING, start: 0, count: 0 }) { + total } } } @@ -51,6 +71,10 @@ mutation createDomain($input: CreateDomainInput!) { createDomain(input: $input) } +mutation moveDomain($input: MoveDomainInput!) { + moveDomain(input: $input) +} + mutation deleteDomain($urn: String!) { deleteDomain(urn: $urn) } diff --git a/datahub-web-react/src/graphql/fragments.graphql b/datahub-web-react/src/graphql/fragments.graphql index c3ac2139e687b..72474911b9310 100644 --- a/datahub-web-react/src/graphql/fragments.graphql +++ b/datahub-web-react/src/graphql/fragments.graphql @@ -82,6 +82,20 @@ fragment parentNodesFields on ParentNodesResult { } } +fragment parentDomainsFields on ParentDomainsResult { + count + domains { + urn + type + ... 
on Domain { + properties { + name + description + } + } + } +} + fragment ownershipFields on Ownership { owners { owner { @@ -931,6 +945,20 @@ fragment parentContainerFields on Container { } } +fragment domainEntitiesFields on Domain { + entities(input: { start: 0, count: 0 }) { + total + } + dataProducts: entities( + input: { start: 0, count: 0, filters: [{ field: "_entityType", value: "DATA_PRODUCT" }] } + ) { + total + } + children: relationships(input: { types: ["IsPartOf"], direction: INCOMING, start: 0, count: 0 }) { + total + } +} + fragment entityDomain on DomainAssociation { domain { urn @@ -939,6 +967,10 @@ fragment entityDomain on DomainAssociation { name description } + parentDomains { + ...parentDomainsFields + } + ...domainEntitiesFields } associatedUrn } diff --git a/datahub-web-react/src/graphql/preview.graphql b/datahub-web-react/src/graphql/preview.graphql index 03635ab1b66d5..e104d62c67074 100644 --- a/datahub-web-react/src/graphql/preview.graphql +++ b/datahub-web-react/src/graphql/preview.graphql @@ -304,7 +304,12 @@ fragment entityPreview on Entity { urn properties { name + description + } + parentDomains { + ...parentDomainsFields } + ...domainEntitiesFields } ... on Container { ...entityContainer diff --git a/datahub-web-react/src/graphql/search.graphql b/datahub-web-react/src/graphql/search.graphql index 94ff263c02039..2297c2d0c1d07 100644 --- a/datahub-web-react/src/graphql/search.graphql +++ b/datahub-web-react/src/graphql/search.graphql @@ -155,6 +155,9 @@ fragment autoCompleteFields on Entity { properties { name } + parentDomains { + ...parentDomainsFields + } } ... on DataProduct { properties { @@ -671,6 +674,10 @@ fragment searchResultFields on Entity { ownership { ...ownershipFields } + parentDomains { + ...parentDomainsFields + } + ...domainEntitiesFields } ... on Container { properties { @@ -825,6 +832,9 @@ fragment facetFields on FacetMetadata { properties { name } + parentDomains { + ...parentDomainsFields + } } ... on Container { platform { diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResolvedResourceSpec.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResolvedResourceSpec.java index 0dae1bd386ccd..53dd0be44f963 100644 --- a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResolvedResourceSpec.java +++ b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResolvedResourceSpec.java @@ -3,7 +3,6 @@ import java.util.Collections; import java.util.Map; import java.util.Set; -import javax.annotation.Nullable; import lombok.Getter; import lombok.RequiredArgsConstructor; import lombok.ToString; @@ -26,21 +25,6 @@ public Set getFieldValues(ResourceFieldType resourceFieldType) { return fieldResolvers.get(resourceFieldType).getFieldValuesFuture().join().getValues(); } - /** - * Fetch the entity-registry type for a resource. ('dataset', 'dashboard', 'chart'). - * @return the entity type. - */ - public String getType() { - if (!fieldResolvers.containsKey(ResourceFieldType.RESOURCE_TYPE)) { - throw new UnsupportedOperationException( - "Failed to resolve resource type! No field resolver for RESOURCE_TYPE provided."); - } - Set resourceTypes = - fieldResolvers.get(ResourceFieldType.RESOURCE_TYPE).getFieldValuesFuture().join().getValues(); - assert resourceTypes.size() == 1; // There should always be a single resource type. - return resourceTypes.stream().findFirst().get(); - } - /** * Fetch the owners for a resource. * @return a set of owner urns, or empty set if none exist. 
@@ -51,20 +35,4 @@ public Set getOwners() { } return fieldResolvers.get(ResourceFieldType.OWNER).getFieldValuesFuture().join().getValues(); } - - /** - * Fetch the domain for a Resolved Resource Spec - * @return a Domain or null if one does not exist. - */ - @Nullable - public String getDomain() { - if (!fieldResolvers.containsKey(ResourceFieldType.DOMAIN)) { - return null; - } - Set domains = fieldResolvers.get(ResourceFieldType.DOMAIN).getFieldValuesFuture().join().getValues(); - if (domains.size() > 0) { - return domains.stream().findFirst().get(); - } - return null; - } } diff --git a/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl index 5c8c8a4912e4c..89f44a433b7ba 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl @@ -1,6 +1,7 @@ namespace com.linkedin.domain import com.linkedin.common.AuditStamp +import com.linkedin.common.Urn /** * Information about a Domain @@ -36,4 +37,18 @@ record DomainProperties { } } created: optional AuditStamp + + /** + * Optional: Parent of the domain + */ + @Relationship = { + "name": "IsPartOf", + "entityTypes": [ "domain" ], + } + @Searchable = { + "fieldName": "parentDomain", + "fieldType": "URN", + "hasValuesFieldName": "hasParentDomain" + } + parentDomain: optional Urn } diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DomainFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DomainFieldResolverProvider.java index ae87812f3b79c..68c1dd4f644e5 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DomainFieldResolverProvider.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DomainFieldResolverProvider.java @@ -6,15 +6,23 @@ import com.datahub.authorization.ResourceSpec; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; +import com.linkedin.domain.DomainProperties; import com.linkedin.domain.Domains; import com.linkedin.entity.EntityResponse; import com.linkedin.entity.EnvelopedAspect; import com.linkedin.entity.client.EntityClient; + import java.util.Collections; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; import java.util.stream.Collectors; + import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import javax.annotation.Nonnull; + import static com.linkedin.metadata.Constants.*; @@ -38,8 +46,40 @@ public FieldResolver getFieldResolver(ResourceSpec resourceSpec) { return FieldResolver.getResolverFromFunction(resourceSpec, this::getDomains); } + private Set getBatchedParentDomains(@Nonnull final Set urns) { + final Set parentUrns = new HashSet<>(); + + try { + final Map batchResponse = _entityClient.batchGetV2( + DOMAIN_ENTITY_NAME, + urns, + Collections.singleton(DOMAIN_PROPERTIES_ASPECT_NAME), + _systemAuthentication + ); + + batchResponse.forEach((urn, entityResponse) -> { + if (entityResponse.getAspects().containsKey(DOMAIN_PROPERTIES_ASPECT_NAME)) { + final DomainProperties properties = new DomainProperties(entityResponse.getAspects().get(DOMAIN_PROPERTIES_ASPECT_NAME).getValue().data()); + if (properties.hasParentDomain()) { + parentUrns.add(properties.getParentDomain()); + } + } + }); + + } catch (Exception e) { + log.error( + "Error while 
retrieving parent domains for {} urns including \"{}\"", + urns.size(), + urns.stream().findFirst().map(Urn::toString).orElse(""), + e + ); + } + + return parentUrns; + } + private FieldResolver.FieldValue getDomains(ResourceSpec resourceSpec) { - Urn entityUrn = UrnUtils.getUrn(resourceSpec.getResource()); + final Urn entityUrn = UrnUtils.getUrn(resourceSpec.getResource()); // In the case that the entity is a domain, the associated domain is the domain itself if (entityUrn.getEntityType().equals(DOMAIN_ENTITY_NAME)) { return FieldResolver.FieldValue.builder() @@ -47,7 +87,7 @@ private FieldResolver.FieldValue getDomains(ResourceSpec resourceSpec) { .build(); } - EnvelopedAspect domainsAspect; + final EnvelopedAspect domainsAspect; try { EntityResponse response = _entityClient.getV2(entityUrn.getEntityType(), entityUrn, Collections.singleton(DOMAINS_ASPECT_NAME), _systemAuthentication); @@ -59,9 +99,25 @@ private FieldResolver.FieldValue getDomains(ResourceSpec resourceSpec) { log.error("Error while retrieving domains aspect for urn {}", entityUrn, e); return FieldResolver.emptyFieldValue(); } - Domains domains = new Domains(domainsAspect.getValue().data()); - return FieldResolver.FieldValue.builder() - .values(domains.getDomains().stream().map(Object::toString).collect(Collectors.toSet())) - .build(); + + /* + * Build up a set of all directly referenced domains and any of the domains' parent domains. + * To avoid cycles we remove any parents we've already visited to prevent an infinite loop cycle. + */ + + final Set domainUrns = new HashSet<>(new Domains(domainsAspect.getValue().data()).getDomains()); + Set batchedParentUrns = getBatchedParentDomains(domainUrns); + batchedParentUrns.removeAll(domainUrns); + + while (!batchedParentUrns.isEmpty()) { + domainUrns.addAll(batchedParentUrns); + batchedParentUrns = getBatchedParentDomains(batchedParentUrns); + batchedParentUrns.removeAll(domainUrns); + } + + return FieldResolver.FieldValue.builder().values(domainUrns + .stream() + .map(Object::toString) + .collect(Collectors.toSet())).build(); } } diff --git a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java index 1ed794be15490..2e48123fb1813 100644 --- a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java +++ b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java @@ -12,7 +12,10 @@ import com.linkedin.common.OwnershipType; import com.linkedin.common.UrnArray; import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; import com.linkedin.data.template.StringArray; +import com.linkedin.domain.DomainProperties; +import com.linkedin.domain.Domains; import com.linkedin.entity.Aspect; import com.linkedin.entity.EntityResponse; import com.linkedin.entity.EnvelopedAspect; @@ -25,16 +28,19 @@ import com.linkedin.policy.DataHubActorFilter; import com.linkedin.policy.DataHubPolicyInfo; import com.linkedin.policy.DataHubResourceFilter; + import java.util.Collections; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Optional; import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; -import static com.linkedin.metadata.Constants.DATAHUB_POLICY_INFO_ASPECT_NAME; -import static com.linkedin.metadata.Constants.OWNERSHIP_ASPECT_NAME; -import static 
com.linkedin.metadata.Constants.POLICY_ENTITY_NAME; +import javax.annotation.Nullable; + +import static com.linkedin.metadata.Constants.*; import static com.linkedin.metadata.authorization.PoliciesConfig.ACTIVE_POLICY_STATE; import static com.linkedin.metadata.authorization.PoliciesConfig.INACTIVE_POLICY_STATE; import static com.linkedin.metadata.authorization.PoliciesConfig.METADATA_POLICY_TYPE; @@ -52,6 +58,9 @@ public class DataHubAuthorizerTest { public static final String DATAHUB_SYSTEM_CLIENT_ID = "__datahub_system"; + private static final Urn PARENT_DOMAIN_URN = UrnUtils.getUrn("urn:li:domain:parent"); + private static final Urn CHILD_DOMAIN_URN = UrnUtils.getUrn("urn:li:domain:child"); + private EntityClient _entityClient; private DataHubAuthorizer _dataHubAuthorizer; @@ -61,39 +70,71 @@ public void setupTest() throws Exception { // Init mocks. final Urn activePolicyUrn = Urn.createFromString("urn:li:dataHubPolicy:0"); - final DataHubPolicyInfo activePolicy = createDataHubPolicyInfo(true, ImmutableList.of("EDIT_ENTITY_TAGS")); + final DataHubPolicyInfo activePolicy = createDataHubPolicyInfo(true, ImmutableList.of("EDIT_ENTITY_TAGS"), null); final EnvelopedAspectMap activeAspectMap = new EnvelopedAspectMap(); activeAspectMap.put(DATAHUB_POLICY_INFO_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(activePolicy.data()))); final Urn inactivePolicyUrn = Urn.createFromString("urn:li:dataHubPolicy:1"); - final DataHubPolicyInfo inactivePolicy = createDataHubPolicyInfo(false, ImmutableList.of("EDIT_ENTITY_OWNERS")); + final DataHubPolicyInfo inactivePolicy = createDataHubPolicyInfo(false, ImmutableList.of("EDIT_ENTITY_OWNERS"), null); final EnvelopedAspectMap inactiveAspectMap = new EnvelopedAspectMap(); inactiveAspectMap.put(DATAHUB_POLICY_INFO_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(inactivePolicy.data()))); + final Urn parentDomainPolicyUrn = Urn.createFromString("urn:li:dataHubPolicy:2"); + final DataHubPolicyInfo parentDomainPolicy = createDataHubPolicyInfo(true, ImmutableList.of("EDIT_ENTITY_DOCS"), PARENT_DOMAIN_URN); + final EnvelopedAspectMap parentDomainPolicyAspectMap = new EnvelopedAspectMap(); + parentDomainPolicyAspectMap.put(DATAHUB_POLICY_INFO_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(parentDomainPolicy.data()))); + + final Urn childDomainPolicyUrn = Urn.createFromString("urn:li:dataHubPolicy:3"); + final DataHubPolicyInfo childDomainPolicy = createDataHubPolicyInfo(true, ImmutableList.of("EDIT_ENTITY_STATUS"), CHILD_DOMAIN_URN); + final EnvelopedAspectMap childDomainPolicyAspectMap = new EnvelopedAspectMap(); + childDomainPolicyAspectMap.put(DATAHUB_POLICY_INFO_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(childDomainPolicy.data()))); + final SearchResult policySearchResult = new SearchResult(); - policySearchResult.setNumEntities(2); - policySearchResult.setEntities(new SearchEntityArray(ImmutableList.of(new SearchEntity().setEntity(activePolicyUrn), - new SearchEntity().setEntity(inactivePolicyUrn)))); + policySearchResult.setNumEntities(3); + policySearchResult.setEntities( + new SearchEntityArray( + ImmutableList.of( + new SearchEntity().setEntity(activePolicyUrn), + new SearchEntity().setEntity(inactivePolicyUrn), + new SearchEntity().setEntity(parentDomainPolicyUrn), + new SearchEntity().setEntity(childDomainPolicyUrn) + ) + ) + ); when(_entityClient.search(eq("dataHubPolicy"), eq(""), isNull(), any(), anyInt(), anyInt(), any(), eq(new SearchFlags().setFulltext(true)))).thenReturn(policySearchResult); 
when(_entityClient.batchGetV2(eq(POLICY_ENTITY_NAME), - eq(ImmutableSet.of(activePolicyUrn, inactivePolicyUrn)), eq(null), any())).thenReturn( + eq(ImmutableSet.of(activePolicyUrn, inactivePolicyUrn, parentDomainPolicyUrn, childDomainPolicyUrn)), eq(null), any())).thenReturn( ImmutableMap.of( activePolicyUrn, new EntityResponse().setUrn(activePolicyUrn).setAspects(activeAspectMap), - inactivePolicyUrn, new EntityResponse().setUrn(inactivePolicyUrn).setAspects(inactiveAspectMap) + inactivePolicyUrn, new EntityResponse().setUrn(inactivePolicyUrn).setAspects(inactiveAspectMap), + parentDomainPolicyUrn, new EntityResponse().setUrn(parentDomainPolicyUrn).setAspects(parentDomainPolicyAspectMap), + childDomainPolicyUrn, new EntityResponse().setUrn(childDomainPolicyUrn).setAspects(childDomainPolicyAspectMap) ) ); final List userUrns = ImmutableList.of(Urn.createFromString("urn:li:corpuser:user3"), Urn.createFromString("urn:li:corpuser:user4")); final List groupUrns = ImmutableList.of(Urn.createFromString("urn:li:corpGroup:group3"), Urn.createFromString("urn:li:corpGroup:group4")); - EntityResponse entityResponse = new EntityResponse(); - EnvelopedAspectMap envelopedAspectMap = new EnvelopedAspectMap(); - envelopedAspectMap.put(OWNERSHIP_ASPECT_NAME, new EnvelopedAspect() + EntityResponse ownershipResponse = new EntityResponse(); + EnvelopedAspectMap ownershipAspectMap = new EnvelopedAspectMap(); + ownershipAspectMap.put(OWNERSHIP_ASPECT_NAME, new EnvelopedAspect() .setValue(new com.linkedin.entity.Aspect(createOwnershipAspect(userUrns, groupUrns).data()))); - entityResponse.setAspects(envelopedAspectMap); + ownershipResponse.setAspects(ownershipAspectMap); when(_entityClient.getV2(any(), any(), eq(Collections.singleton(OWNERSHIP_ASPECT_NAME)), any())) - .thenReturn(entityResponse); + .thenReturn(ownershipResponse); + + // Mocks to get domains on a resource + when(_entityClient.getV2(any(), any(), eq(Collections.singleton(DOMAINS_ASPECT_NAME)), any())) + .thenReturn(createDomainsResponse(CHILD_DOMAIN_URN)); + + // Mocks to get parent domains on a domain + when(_entityClient.batchGetV2(any(), eq(Collections.singleton(CHILD_DOMAIN_URN)), eq(Collections.singleton(DOMAIN_PROPERTIES_ASPECT_NAME)), any())) + .thenReturn(createDomainPropertiesBatchResponse(PARENT_DOMAIN_URN)); + + // Mocks to reach the stopping point on domain parents + when(_entityClient.batchGetV2(any(), eq(Collections.singleton(PARENT_DOMAIN_URN)), eq(Collections.singleton(DOMAIN_PROPERTIES_ASPECT_NAME)), any())) + .thenReturn(createDomainPropertiesBatchResponse(null)); final Authentication systemAuthentication = new Authentication( new Actor(ActorType.USER, DATAHUB_SYSTEM_CLIENT_ID), @@ -229,7 +270,46 @@ public void testAuthorizedActorsActivePolicy() throws Exception { )); } - private DataHubPolicyInfo createDataHubPolicyInfo(boolean active, List privileges) throws Exception { + @Test + public void testAuthorizationOnDomainWithPrivilegeIsAllowed() { + ResourceSpec resourceSpec = new ResourceSpec("dataset", "urn:li:dataset:test"); + + AuthorizationRequest request = new AuthorizationRequest( + "urn:li:corpuser:test", + "EDIT_ENTITY_STATUS", + Optional.of(resourceSpec) + ); + + assertEquals(_dataHubAuthorizer.authorize(request).getType(), AuthorizationResult.Type.ALLOW); + } + + @Test + public void testAuthorizationOnDomainWithParentPrivilegeIsAllowed() { + ResourceSpec resourceSpec = new ResourceSpec("dataset", "urn:li:dataset:test"); + + AuthorizationRequest request = new AuthorizationRequest( + "urn:li:corpuser:test", + 
"EDIT_ENTITY_DOCS", + Optional.of(resourceSpec) + ); + + assertEquals(_dataHubAuthorizer.authorize(request).getType(), AuthorizationResult.Type.ALLOW); + } + + @Test + public void testAuthorizationOnDomainWithoutPrivilegeIsDenied() { + ResourceSpec resourceSpec = new ResourceSpec("dataset", "urn:li:dataset:test"); + + AuthorizationRequest request = new AuthorizationRequest( + "urn:li:corpuser:test", + "EDIT_ENTITY_DOC_LINKS", + Optional.of(resourceSpec) + ); + + assertEquals(_dataHubAuthorizer.authorize(request).getType(), AuthorizationResult.Type.DENY); + } + + private DataHubPolicyInfo createDataHubPolicyInfo(boolean active, List privileges, @Nullable final Urn domain) throws Exception { final DataHubPolicyInfo dataHubPolicyInfo = new DataHubPolicyInfo(); dataHubPolicyInfo.setType(METADATA_POLICY_TYPE); dataHubPolicyInfo.setState(active ? ACTIVE_POLICY_STATE : INACTIVE_POLICY_STATE); @@ -252,7 +332,13 @@ private DataHubPolicyInfo createDataHubPolicyInfo(boolean active, List p final DataHubResourceFilter resourceFilter = new DataHubResourceFilter(); resourceFilter.setAllResources(true); resourceFilter.setType("dataset"); + + if (domain != null) { + resourceFilter.setFilter(FilterUtils.newFilter(ImmutableMap.of(ResourceFieldType.DOMAIN, Collections.singletonList(domain.toString())))); + } + dataHubPolicyInfo.setResources(resourceFilter); + return dataHubPolicyInfo; } @@ -284,6 +370,33 @@ private Ownership createOwnershipAspect(final List userOwners, final List domainUrns = ImmutableList.of(domainUrn); + final EntityResponse domainsResponse = new EntityResponse(); + EnvelopedAspectMap domainsAspectMap = new EnvelopedAspectMap(); + final Domains domains = new Domains(); + domains.setDomains(new UrnArray(domainUrns)); + domainsAspectMap.put(DOMAINS_ASPECT_NAME, new EnvelopedAspect() + .setValue(new com.linkedin.entity.Aspect(domains.data()))); + domainsResponse.setAspects(domainsAspectMap); + return domainsResponse; + } + + private Map createDomainPropertiesBatchResponse(@Nullable final Urn parentDomainUrn) { + final Map batchResponse = new HashMap<>(); + final EntityResponse response = new EntityResponse(); + EnvelopedAspectMap aspectMap = new EnvelopedAspectMap(); + final DomainProperties properties = new DomainProperties(); + if (parentDomainUrn != null) { + properties.setParentDomain(parentDomainUrn); + } + aspectMap.put(DOMAIN_PROPERTIES_ASPECT_NAME, new EnvelopedAspect() + .setValue(new com.linkedin.entity.Aspect(properties.data()))); + response.setAspects(aspectMap); + batchResponse.put(parentDomainUrn, response); + return batchResponse; + } + private AuthorizerContext createAuthorizerContext(final Authentication systemAuthentication, final EntityClient entityClient) { return new AuthorizerContext(Collections.emptyMap(), new DefaultResourceSpecResolver(systemAuthentication, entityClient)); } diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml index 6fd7b9e6a295c..ea959bebf25ad 100644 --- a/metadata-service/configuration/src/main/resources/application.yml +++ b/metadata-service/configuration/src/main/resources/application.yml @@ -300,6 +300,7 @@ featureFlags: preProcessHooks: uiEnabled: ${PRE_PROCESS_HOOKS_UI_ENABLED:true} # Circumvents Kafka for processing index updates for UI changes sourced from GraphQL to avoid processing delays showAcrylInfo: ${SHOW_ACRYL_INFO:false} # Show different CTAs within DataHub around moving to Managed DataHub. Set to true for the demo site. 
+ nestedDomainsEnabled: ${NESTED_DOMAINS_ENABLED:true} # Enables the nested Domains feature that allows users to have sub-Domains. If this is off, Domains appear "flat" again entityChangeEvents: enabled: ${ENABLE_ENTITY_CHANGE_EVENTS_HOOK:true} diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/datahubusage/DataHubUsageEventType.java b/metadata-service/services/src/main/java/com/linkedin/metadata/datahubusage/DataHubUsageEventType.java index 14b301f93f4ef..036fb20b33f20 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/datahubusage/DataHubUsageEventType.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/datahubusage/DataHubUsageEventType.java @@ -53,6 +53,7 @@ public enum DataHubUsageEventType { SHOW_STANDARD_HOME_PAGE_EVENT("ShowStandardHomepageEvent"), CREATE_GLOSSARY_ENTITY_EVENT("CreateGlossaryEntityEvent"), CREATE_DOMAIN_EVENT("CreateDomainEvent"), + MOVE_DOMAIN_EVENT("MoveDomainEvent"), CREATE_INGESTION_SOURCE_EVENT("CreateIngestionSourceEvent"), UPDATE_INGESTION_SOURCE_EVENT("UpdateIngestionSourceEvent"), DELETE_INGESTION_SOURCE_EVENT("DeleteIngestionSourceEvent"), diff --git a/node_modules/.yarn-integrity b/node_modules/.yarn-integrity new file mode 100644 index 0000000000000..42a6cb985ab1b --- /dev/null +++ b/node_modules/.yarn-integrity @@ -0,0 +1,12 @@ +{ + "systemParams": "darwin-arm64-93", + "modulesFolders": [ + "node_modules" + ], + "flags": [], + "linkedModules": [], + "topLevelPatterns": [], + "lockfileEntries": {}, + "files": [], + "artifacts": {} +} \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/domains.js b/smoke-test/tests/cypress/cypress/e2e/mutations/domains.js index c3608e235391c..3de0e9b4b893e 100644 --- a/smoke-test/tests/cypress/cypress/e2e/mutations/domains.js +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/domains.js @@ -1,14 +1,32 @@ +import { aliasQuery, hasOperationName } from "../utils"; + const test_domain_id = Math.floor(Math.random() * 100000); const test_domain = `CypressDomainTest ${test_domain_id}` const test_domain_urn = `urn:li:domain:${test_domain_id}` describe("add remove domain", () => { + beforeEach(() => { + cy.intercept("POST", "/api/v2/graphql", (req) => { + aliasQuery(req, "appConfig"); + }); + }); + + const setDomainsFeatureFlag = (isOn) => { + cy.intercept("POST", "/api/v2/graphql", (req) => { + if (hasOperationName(req, "appConfig")) { + req.reply((res) => { + res.body.data.appConfig.featureFlags.nestedDomainsEnabled = isOn; + }); + } + }); + }; + it("create domain", () => { cy.loginWithCredentials(); cy.goToDomainList(); cy.clickOptionWithText("New Domain"); - cy.waitTextVisible("Create new Domain"); + cy.waitTextVisible("Create New Domain"); cy.get('[data-testid="create-domain-name"]').click().type(test_domain) cy.clickOptionWithText('Advanced') cy.get('[data-testid="create-domain-id"]').click().type(test_domain_id) @@ -17,6 +35,7 @@ describe("add remove domain", () => { }) it("add entities to domain", () => { + setDomainsFeatureFlag(false); cy.loginWithCredentials(); cy.goToDomainList(); cy.clickOptionWithText(test_domain); @@ -32,6 +51,7 @@ describe("add remove domain", () => { }) it("remove entity from domain", () => { + setDomainsFeatureFlag(false); cy.loginWithCredentials(); cy.goToDomainList(); cy.removeDomainFromDataset( @@ -42,6 +62,7 @@ describe("add remove domain", () => { }) it("delete a domain and ensure dangling reference is deleted on entities", () => { + setDomainsFeatureFlag(false); 
cy.loginWithCredentials(); cy.goToDomainList(); cy.get('[data-testid="dropdown-menu-' + test_domain_urn + '"]').click(); diff --git a/yarn.lock b/yarn.lock new file mode 100644 index 0000000000000..fb57ccd13afbd --- /dev/null +++ b/yarn.lock @@ -0,0 +1,4 @@ +# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY. +# yarn lockfile v1 + + From 67af68284f952b2a466d58be5b4427c488de522b Mon Sep 17 00:00:00 2001 From: Gabe Lyons Date: Tue, 19 Sep 2023 09:02:24 -0700 Subject: [PATCH 07/37] dcs(ml-models): enhancing ml model documentation (#8848) --- docs/api/tutorials/ml.md | 146 +++++++++++++++--- .../examples/library/create_mlfeature.py | 13 +- .../library/create_mlfeature_table.py | 21 ++- .../examples/library/create_mlmodel.py | 8 +- .../examples/library/create_mlmodel_group.py | 4 +- .../examples/library/create_mlprimarykey.py | 34 ++++ .../examples/library/read_mlprimarykey.py | 12 ++ 7 files changed, 201 insertions(+), 37 deletions(-) create mode 100644 metadata-ingestion/examples/library/create_mlprimarykey.py create mode 100644 metadata-ingestion/examples/library/read_mlprimarykey.py diff --git a/docs/api/tutorials/ml.md b/docs/api/tutorials/ml.md index cb77556d48ebf..e88c941c90467 100644 --- a/docs/api/tutorials/ml.md +++ b/docs/api/tutorials/ml.md @@ -7,11 +7,12 @@ import TabItem from '@theme/TabItem'; Machine learning systems have become a crucial feature in modern data stacks. However, the relationships between the different components of a machine learning system, such as features, models, and feature tables, can be complex. -Thus, it is essential for these systems to be discoverable to facilitate easy access and utilization by other members of the organization. +DataHub makes these relationships discoverable and facilitate utilization by other members of the organization. -For more information on ML entities, please refer to the following docs: +For technical details on ML entities, please refer to the following docs: - [MlFeature](/docs/generated/metamodel/entities/mlFeature.md) +- [MlPrimaryKey](/docs/generated/metamodel/entities/mlPrimaryKey.md) - [MlFeatureTable](/docs/generated/metamodel/entities/mlFeatureTable.md) - [MlModel](/docs/generated/metamodel/entities/mlModel.md) - [MlModelGroup](/docs/generated/metamodel/entities/mlModelGroup.md) @@ -20,9 +21,11 @@ For more information on ML entities, please refer to the following docs: This guide will show you how to -- Create ML entities: MlFeature, MlFeatureTable, MlModel, MlModelGroup -- Read ML entities: MlFeature, MlFeatureTable, MlModel, MlModelGroup -- Attach MlFeatureTable or MlModel to MlFeature +- Create ML entities: MlFeature, MlFeatureTable, MlModel, MlModelGroup, MlPrimaryKey +- Read ML entities: MlFeature, MlFeatureTable, MlModel, MlModelGroup, MlPrimaryKey +- Attach MlModel to MlFeature +- Attach MlFeatures to MlFeatureTable +- Attached MlFeatures to upstream Datasets that power them ## Prerequisites @@ -33,6 +36,8 @@ For detailed steps, please refer to [Datahub Quickstart Guide](/docs/quickstart. ### Create MlFeature +An ML Feature represents an instance of a feature that can be used across different machine learning models. Features are organized into Feature Tables to be consumed by machine learning models. For example, if we were modeling features for a Users Feature Table, the Features would be `age`, `sign_up_date`, `active_in_past_30_days` and so forth.Using Features in DataHub allows users to see the sources a feature was generated from and how a feature is used to train models. 
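Each feature is addressed by an `mlFeature` URN derived from its feature table name and feature name. As a rough sketch using the same `mce_builder` helpers as the full example below (the table and feature names here are illustrative):

```python
import datahub.emitter.mce_builder as builder

# Illustrative names - substitute your own feature table and feature
feature_urn = builder.make_ml_feature_urn(
    feature_table_name="users_feature_table",
    feature_name="user_signup_date",
)

# Produces a URN of the form urn:li:mlFeature:(users_feature_table,user_signup_date),
# which is the identifier used when attaching the feature to tables and models later on.
print(feature_urn)
```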
+ @@ -40,13 +45,31 @@ For detailed steps, please refer to [Datahub Quickstart Guide](/docs/quickstart. {{ inline /metadata-ingestion/examples/library/create_mlfeature.py show_path_as_comment }} ``` -Note that when creating a feature, you can access a list of data sources using `sources`. +Note that when creating a feature, you create upstream lineage to the data warehouse using `sources`. + + + + +### Create MlPrimaryKey + +An ML Primary Key represents a specific element of a Feature Table that indicates what group the other features belong to. For example, if a Feature Table contained features for Users, the ML Primary Key would likely be `user_id` or some similar unique identifier for a user. Using ML Primary Keys in DataHub allow users to indicate how ML Feature Tables are structured. + + + + +```python +{{ inline /metadata-ingestion/examples/library/create_mlprimarykey.py show_path_as_comment }} +``` + +Note that when creating a primary key, you create upstream lineage to the data warehouse using `sources`. ### Create MlFeatureTable +A feature table represents a group of similar Features that can all be used together to train a model. For example, if there was a Users Feature Table, it would contain documentation around how to use the Users collection of Features and references to each Feature and ML Primary Key contained within it. + @@ -54,14 +77,14 @@ Note that when creating a feature, you can access a list of data sources using ` {{ inline /metadata-ingestion/examples/library/create_mlfeature_table.py show_path_as_comment }} ``` -Note that when creating a feature table, you can access a list of features using `mlFeatures`. +Note that when creating a feature table, you connect the table to its features and primary key using `mlFeatures` and `mlPrimaryKeys`. ### Create MlModel -Please note that an MlModel represents the outcome of a single training run for a model, not the collective results of all model runs. +An ML Model in Acryl represents an individual version of a trained Machine Learning Model. Another way to think about the ML Model entity is as an istance of a training run. An ML Model entity tracks the exact ML Features used in that instance of training, along with the training results. This entity does not represents all versions of a ML Model. For example, if we train a model for homepage customization on a certain day, that would be a ML Model in DataHub. If you re-train the model the next day off of new data or with different parameters, that would produce a second ML Model entity. @@ -70,15 +93,15 @@ Please note that an MlModel represents the outcome of a single training run for {{ inline /metadata-ingestion/examples/library/create_mlmodel.py show_path_as_comment }} ``` -Note that when creating a model, you can access a list of features using `mlFeatures`. -Additionally, you can access the relationship to model groups with `groups`. +Note that when creating a model, you link it to a list of features using `mlFeatures`. This indicates how the individual instance of the model was trained. +Additionally, you can access the relationship to model groups with `groups`. An ML Model is connected to the warehouse tables it depends on via its dependency on the ML Features it reads from. ### Create MlModelGroup -Please note that an MlModelGroup serves as a container for all the runs of a single ML model. +An ML Model Group represents the grouping of all training runs of a single Machine Learning model category. 
It will store documentation about the group of ML Models, along with references to each individual ML Model instance.
@@ -94,18 +117,14 @@ Please note that an MlModelGroup serves as a container for all the runs of a sin

You can search the entities in DataHub UI.

## Read ML Entities

### Read MLFeature
@@ -192,6 +211,93 @@
+### Read MlPrimaryKey + + + + +```json +query { + mlPrimaryKey(urn: "urn:li:mlPrimaryKey:(user_features,user_id)"){ + name + featureNamespace + description + dataType + properties { + description + dataType + version { + versionTag + } + } + } +} +``` + +Expected response: + +```json +{ + "data": { + "mlPrimaryKey": { + "name": "user_id", + "featureNamespace": "user_features", + "description": "User's internal ID", + "dataType": "ORDINAL", + "properties": { + "description": "User's internal ID", + "dataType": "ORDINAL", + "version": null + } + } + }, + "extensions": {} +} +``` + + + + +```json +curl --location --request POST 'http://localhost:8080/api/graphql' \ +--header 'Authorization: Bearer ' \ +--header 'Content-Type: application/json' \ +--data-raw '{ + "query": "query { mlPrimaryKey(urn: \"urn:li:mlPrimaryKey:(user_features,user_id)\"){ name featureNamespace description dataType properties { description dataType version { versionTag } } }}" +}' +``` + +Expected response: + +```json +{ + "data": { + "mlPrimaryKey": { + "name": "user_id", + "featureNamespace": "user_features", + "description": "User's internal ID", + "dataType": "ORDINAL", + "properties": { + "description": "User's internal ID", + "dataType": "ORDINAL", + "version": null + } + } + }, + "extensions": {} +} +``` + + + + +```python +{{ inline /metadata-ingestion/examples/library/read_mlprimarykey.py show_path_as_comment }} +``` + + + + ### Read MLFeatureTable @@ -232,8 +338,7 @@ Expected Response: { "name": "test_BOOL_LIST_feature" }, - ... - { + ...{ "name": "test_STRING_feature" } ] @@ -273,8 +378,7 @@ Expected Response: { "name": "test_BOOL_LIST_feature" }, - ... - { + ...{ "name": "test_STRING_feature" } ] @@ -507,14 +611,10 @@ Expected Response: (Note that this entity does not exist in the sample ingestion You can access to `Features` or `Group` Tab of each entity to view the added entities. -
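If you prefer to verify these attachments programmatically rather than in the UI, a minimal sketch following the same `DataHubGraph` pattern as `read_mlprimarykey.py` above (the endpoint and URN are illustrative assumptions):

```python
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
from datahub.metadata.schema_classes import MLFeatureTablePropertiesClass

# Assumes a locally running DataHub instance
graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

# Illustrative URN; the platform and table name follow the builder examples in this guide
urn = "urn:li:mlFeatureTable:(urn:li:dataPlatform:feast,users_feature_table)"

# mlFeatures and mlPrimaryKeys on this aspect list the URNs attached to the feature table
properties = graph.get_aspect(entity_urn=urn, aspect_type=MLFeatureTablePropertiesClass)
print(properties)
```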

- diff --git a/metadata-ingestion/examples/library/create_mlfeature.py b/metadata-ingestion/examples/library/create_mlfeature.py index 81104fdb4984f..0f6d146dbf144 100644 --- a/metadata-ingestion/examples/library/create_mlfeature.py +++ b/metadata-ingestion/examples/library/create_mlfeature.py @@ -7,11 +7,11 @@ emitter = DatahubRestEmitter(gms_server="http://localhost:8080", extra_headers={}) dataset_urn = builder.make_dataset_urn( - name="fct_users_deleted", platform="hive", env="PROD" + name="fct_users_created", platform="hive", env="PROD" ) feature_urn = builder.make_ml_feature_urn( - feature_table_name="my-feature-table", - feature_name="my-feature", + feature_table_name="users_feature_table", + feature_name="user_signup_date", ) # Create feature @@ -21,7 +21,12 @@ entityUrn=feature_urn, aspectName="mlFeatureProperties", aspect=models.MLFeaturePropertiesClass( - description="my feature", sources=[dataset_urn], dataType="TEXT" + description="Represents the date the user created their account", + # attaching a source to a feature creates lineage between the feature + # and the upstream dataset. This is how lineage between your data warehouse + # and machine learning ecosystem is established. + sources=[dataset_urn], + dataType="TIME", ), ) diff --git a/metadata-ingestion/examples/library/create_mlfeature_table.py b/metadata-ingestion/examples/library/create_mlfeature_table.py index 1a8fa142376e4..d579d36a0811a 100644 --- a/metadata-ingestion/examples/library/create_mlfeature_table.py +++ b/metadata-ingestion/examples/library/create_mlfeature_table.py @@ -7,18 +7,31 @@ emitter = DatahubRestEmitter(gms_server="http://localhost:8080", extra_headers={}) feature_table_urn = builder.make_ml_feature_table_urn( - feature_table_name="my-feature-table", platform="feast" + feature_table_name="users_feature_table", platform="feast" ) + feature_urns = [ builder.make_ml_feature_urn( - feature_name="my-feature", feature_table_name="my-feature-table" + feature_name="user_signup_date", feature_table_name="users_feature_table" ), builder.make_ml_feature_urn( - feature_name="my-feature2", feature_table_name="my-feature-table" + feature_name="user_last_active_date", feature_table_name="users_feature_table" ), ] + +primary_key_urns = [ + builder.make_ml_primary_key_urn( + feature_table_name="users_feature_table", + primary_key_name="user_id", + ) +] + feature_table_properties = models.MLFeatureTablePropertiesClass( - description="Test description", mlFeatures=feature_urns + description="Test description", + # link your features to a feature table + mlFeatures=feature_urns, + # link your primary keys to the feature table + mlPrimaryKeys=primary_key_urns, ) # MCP creation diff --git a/metadata-ingestion/examples/library/create_mlmodel.py b/metadata-ingestion/examples/library/create_mlmodel.py index 630e682eff842..92ca8b93e8208 100644 --- a/metadata-ingestion/examples/library/create_mlmodel.py +++ b/metadata-ingestion/examples/library/create_mlmodel.py @@ -6,19 +6,19 @@ # Create an emitter to DataHub over REST emitter = DatahubRestEmitter(gms_server="http://localhost:8080", extra_headers={}) model_urn = builder.make_ml_model_urn( - model_name="my-test-model", platform="science", env="PROD" + model_name="my-recommendations-model-run-1", platform="science", env="PROD" ) model_group_urns = [ builder.make_ml_model_group_urn( - group_name="my-model-group", platform="science", env="PROD" + group_name="my-recommendations-model-group", platform="science", env="PROD" ) ] feature_urns = [ builder.make_ml_feature_urn( - 
feature_name="my-feature", feature_table_name="my-feature-table" + feature_name="user_signup_date", feature_table_name="users_feature_table" ), builder.make_ml_feature_urn( - feature_name="my-feature2", feature_table_name="my-feature-table" + feature_name="user_last_active_date", feature_table_name="users_feature_table" ), ] diff --git a/metadata-ingestion/examples/library/create_mlmodel_group.py b/metadata-ingestion/examples/library/create_mlmodel_group.py index 325c6e4cc3ccd..e39d26ac0f64e 100644 --- a/metadata-ingestion/examples/library/create_mlmodel_group.py +++ b/metadata-ingestion/examples/library/create_mlmodel_group.py @@ -6,7 +6,7 @@ # Create an emitter to DataHub over REST emitter = DatahubRestEmitter(gms_server="http://localhost:8080", extra_headers={}) model_group_urn = builder.make_ml_model_group_urn( - group_name="my-model-group", platform="science", env="PROD" + group_name="my-recommendations-model-group", platform="science", env="PROD" ) @@ -16,7 +16,7 @@ entityUrn=model_group_urn, aspectName="mlModelGroupProperties", aspect=models.MLModelGroupPropertiesClass( - description="my model group", + description="Grouping of ml model training runs related to home page recommendations.", ), ) diff --git a/metadata-ingestion/examples/library/create_mlprimarykey.py b/metadata-ingestion/examples/library/create_mlprimarykey.py new file mode 100644 index 0000000000000..3fb397183a07f --- /dev/null +++ b/metadata-ingestion/examples/library/create_mlprimarykey.py @@ -0,0 +1,34 @@ +import datahub.emitter.mce_builder as builder +import datahub.metadata.schema_classes as models +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.emitter.rest_emitter import DatahubRestEmitter + +# Create an emitter to DataHub over REST +emitter = DatahubRestEmitter(gms_server="http://localhost:8080", extra_headers={}) + +dataset_urn = builder.make_dataset_urn( + name="fct_users_created", platform="hive", env="PROD" +) +primary_key_urn = builder.make_ml_primary_key_urn( + feature_table_name="users_feature_table", + primary_key_name="user_id", +) + +# Create feature +metadata_change_proposal = MetadataChangeProposalWrapper( + entityType="mlPrimaryKey", + changeType=models.ChangeTypeClass.UPSERT, + entityUrn=primary_key_urn, + aspectName="mlPrimaryKeyProperties", + aspect=models.MLPrimaryKeyPropertiesClass( + description="Represents the id of the user the other features relate to.", + # attaching a source to a ml primary key creates lineage between the feature + # and the upstream dataset. This is how lineage between your data warehouse + # and machine learning ecosystem is established. + sources=[dataset_urn], + dataType="TEXT", + ), +) + +# Emit metadata! 
+emitter.emit(metadata_change_proposal) diff --git a/metadata-ingestion/examples/library/read_mlprimarykey.py b/metadata-ingestion/examples/library/read_mlprimarykey.py new file mode 100644 index 0000000000000..ce2e87cae0b92 --- /dev/null +++ b/metadata-ingestion/examples/library/read_mlprimarykey.py @@ -0,0 +1,12 @@ +from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph + +# Imports for metadata model classes +from datahub.metadata.schema_classes import MLPrimaryKeyPropertiesClass + +# First we get the current owners +gms_endpoint = "http://localhost:8080" +graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint)) + +urn = "urn:li:mlPrimaryKey:(user_features,user_id)" +result = graph.get_aspect(entity_urn=urn, aspect_type=MLPrimaryKeyPropertiesClass) +print(result) From 47b7e2984cc958e8d232609b0444fb27d60e9ab4 Mon Sep 17 00:00:00 2001 From: Gabe Lyons Date: Tue, 19 Sep 2023 09:07:08 -0700 Subject: [PATCH 08/37] logging(lineage): adding some lineage explorer and impact analysis logging (#8849) --- datahub-web-react/src/app/analytics/event.ts | 16 ++++++++++++++++ .../styled/search/EmbeddedListSearch.tsx | 17 ++++++++++++++++- .../src/app/lineage/LineageExplorer.tsx | 13 ++++++++++++- .../datahubusage/DataHubUsageEventType.java | 2 ++ 4 files changed, 46 insertions(+), 2 deletions(-) diff --git a/datahub-web-react/src/app/analytics/event.ts b/datahub-web-react/src/app/analytics/event.ts index 28cd61ff3171a..2734026400933 100644 --- a/datahub-web-react/src/app/analytics/event.ts +++ b/datahub-web-react/src/app/analytics/event.ts @@ -35,6 +35,8 @@ export enum EventType { SearchBarExploreAllClickEvent, SearchResultsExploreAllClickEvent, SearchAcrossLineageEvent, + VisualLineageViewEvent, + VisualLineageExpandGraphEvent, SearchAcrossLineageResultsViewEvent, DownloadAsCsvEvent, SignUpEvent, @@ -340,12 +342,23 @@ export interface HomePageRecommendationClickEvent extends BaseEvent { index?: number; } +export interface VisualLineageViewEvent extends BaseEvent { + type: EventType.VisualLineageViewEvent; + entityType?: EntityType; +} + +export interface VisualLineageExpandGraphEvent extends BaseEvent { + type: EventType.VisualLineageExpandGraphEvent; + targetEntityType?: EntityType; +} + export interface SearchAcrossLineageEvent extends BaseEvent { type: EventType.SearchAcrossLineageEvent; query: string; entityTypeFilter?: EntityType; pageNumber: number; originPath: string; + maxDegree?: string; } export interface SearchAcrossLineageResultsViewEvent extends BaseEvent { type: EventType.SearchAcrossLineageResultsViewEvent; @@ -353,6 +366,7 @@ export interface SearchAcrossLineageResultsViewEvent extends BaseEvent { entityTypeFilter?: EntityType; page?: number; total: number; + maxDegree?: string; } export interface DownloadAsCsvEvent extends BaseEvent { @@ -641,6 +655,8 @@ export type Event = | RecommendationImpressionEvent | SearchAcrossLineageEvent | SearchAcrossLineageResultsViewEvent + | VisualLineageViewEvent + | VisualLineageExpandGraphEvent | DownloadAsCsvEvent | RecommendationClickEvent | HomePageRecommendationClickEvent diff --git a/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearch.tsx b/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearch.tsx index 4119a341c5f1b..e27a63b98f012 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearch.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearch.tsx @@ -7,7 +7,7 @@ import { 
FacetMetadata, SearchAcrossEntitiesInput, } from '../../../../../../types.generated'; -import { UnionType } from '../../../../../search/utils/constants'; +import { DEGREE_FILTER_NAME, UnionType } from '../../../../../search/utils/constants'; import { SearchCfg } from '../../../../../../conf'; import { EmbeddedListSearchResults } from './EmbeddedListSearchResults'; import EmbeddedListSearchHeader from './EmbeddedListSearchHeader'; @@ -27,6 +27,7 @@ import { import { useEntityContext } from '../../../EntityContext'; import { EntityActionProps } from './EntitySearchResults'; import { useUserContext } from '../../../../../context/useUserContext'; +import analytics, { EventType } from '../../../../../analytics'; const Container = styled.div` display: flex; @@ -266,6 +267,20 @@ export const EmbeddedListSearch = ({ const finalFacets = (fixedFilters && removeFixedFiltersFromFacets(fixedFilters, data?.facets || [])) || data?.facets; + // used for logging impact anlaysis events + const degreeFilter = filters.find((filter) => filter.field === DEGREE_FILTER_NAME); + + // we already have some lineage logging through Tab events, but this adds additional context, particularly degree + if (!loading && (degreeFilter?.values?.length || 0) > 0) { + analytics.event({ + type: EventType.SearchAcrossLineageResultsViewEvent, + query, + page, + total: data?.total || 0, + maxDegree: degreeFilter?.values?.sort()?.reverse()[0] || '1', + }); + } + return ( {error && } diff --git a/datahub-web-react/src/app/lineage/LineageExplorer.tsx b/datahub-web-react/src/app/lineage/LineageExplorer.tsx index 2683b9125ad28..ed0b26bde11ef 100644 --- a/datahub-web-react/src/app/lineage/LineageExplorer.tsx +++ b/datahub-web-react/src/app/lineage/LineageExplorer.tsx @@ -17,6 +17,7 @@ import { SHOW_COLUMNS_URL_PARAMS, useIsShowColumnsMode } from './utils/useIsShow import { ErrorSection } from '../shared/error/ErrorSection'; import usePrevious from '../shared/usePrevious'; import { useGetLineageTimeParams } from './utils/useGetLineageTimeParams'; +import analytics, { EventType } from '../analytics'; const DEFAULT_DISTANCE_FROM_TOP = 106; @@ -85,7 +86,13 @@ export default function LineageExplorer({ urn, type }: Props) { // they should be added to the dependency array below. 
useEffect(() => { setAsyncEntities({}); - }, [isHideSiblingMode, startTimeMillis, endTimeMillis]); + // this can also be our hook for emitting the tracking event + + analytics.event({ + type: EventType.VisualLineageViewEvent, + entityType: entityData?.type, + }); + }, [isHideSiblingMode, startTimeMillis, endTimeMillis, entityData?.type]); useEffect(() => { if (showColumns) { @@ -183,6 +190,10 @@ export default function LineageExplorer({ urn, type }: Props) { onLineageExpand={(asyncData: EntityAndType) => { resetAsyncEntity(asyncData.entity.urn); maybeAddAsyncLoadedEntity(asyncData); + analytics.event({ + type: EventType.VisualLineageExpandGraphEvent, + targetEntityType: asyncData?.type, + }); }} refetchCenterNode={() => { refetch().then(() => { diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/datahubusage/DataHubUsageEventType.java b/metadata-service/services/src/main/java/com/linkedin/metadata/datahubusage/DataHubUsageEventType.java index 036fb20b33f20..c1018e2031b17 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/datahubusage/DataHubUsageEventType.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/datahubusage/DataHubUsageEventType.java @@ -67,6 +67,8 @@ public enum DataHubUsageEventType { MANUALLY_DELETE_LINEAGE_EVENT("ManuallyDeleteLineageEvent"), LINEAGE_GRAPH_TIME_RANGE_SELECTION_EVENT("LineageGraphTimeRangeSelectionEvent"), LINEAGE_TAB_TIME_RANGE_SELECTION_EVENT("LineageTabTimeRangeSelectionEvent"), + VISUAL_LINEAGE_EXPAND_GRAPH_EVENT("VisualLineageExpandGraphEvent"), + VISUAL_LINEAGE_VIEW_EVENT("VisualLineageViewEvent"), CREATE_QUERY_EVENT("CreateQueryEvent"), DELETE_QUERY_EVENT("DeleteQueryEvent"), UPDATE_QUERY_EVENT("UpdateQueryEvent"), From 35eb194fa38092b37a654fd89238f6556c55aa43 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 20 Sep 2023 01:01:53 -0700 Subject: [PATCH 09/37] fix(gms): lower telemetry error log level (#8860) --- .../src/main/java/com/datahub/telemetry/TrackingService.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/telemetry/TrackingService.java b/metadata-service/auth-impl/src/main/java/com/datahub/telemetry/TrackingService.java index 85f25895e0d49..ac27e1a16c8b7 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/telemetry/TrackingService.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/telemetry/TrackingService.java @@ -102,7 +102,8 @@ public void emitAnalyticsEvent(@Nonnull final JsonNode event) { try { _mixpanelAPI.sendMessage(_mixpanelMessageBuilder.event(getClientId(), eventType, sanitizedEvent)); } catch (IOException e) { - log.error("Failed to send event to Mixpanel", e); + log.info("Failed to send event to Mixpanel; this does not affect the functionality of the application"); + log.debug("Failed to send event to Mixpanel", e); } } From 9fdfa49028f588b339a632b05cf521b659ac20ca Mon Sep 17 00:00:00 2001 From: siladitya <68184387+siladitya2@users.noreply.github.com> Date: Wed, 20 Sep 2023 16:55:19 +0200 Subject: [PATCH 10/37] fix(datahub-gms) usage stats queryRange API's Authorization error for Dataset Owners (#8819) Co-authored-by: si-chakraborty --- .../com/linkedin/metadata/resources/usage/UsageStats.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java 
b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java index ddfdec0315f6b..be70cf9c494ef 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java @@ -380,8 +380,10 @@ public Task query(@ActionParam(PARAM_RESOURCE) @Nonnull String public Task queryRange(@ActionParam(PARAM_RESOURCE) @Nonnull String resource, @ActionParam(PARAM_DURATION) @Nonnull WindowDuration duration, @ActionParam(PARAM_RANGE) UsageTimeRange range) { Authentication auth = AuthenticationContext.getAuthentication(); + Urn resourceUrn = UrnUtils.getUrn(resource); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.VIEW_DATASET_USAGE_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.VIEW_DATASET_USAGE_PRIVILEGE), + new ResourceSpec(resourceUrn.getEntityType(), resourceUrn.toString()))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to query usage."); } From b46635959c42ff5ea94ea0668024d501c82775cf Mon Sep 17 00:00:00 2001 From: Zachary McNellis Date: Wed, 20 Sep 2023 14:20:10 -0400 Subject: [PATCH 11/37] docs(observability): Add Custom Assertion user guide (#8854) Co-authored-by: John Joyce --- docs-website/sidebars.js | 1 + .../observe/custom-assertions.md | 316 ++++++++++++++++++ 2 files changed, 317 insertions(+) create mode 100644 docs/managed-datahub/observe/custom-assertions.md diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index 12691e9f8268a..03ea38fd622d4 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -437,6 +437,7 @@ module.exports = { Observability: [ "docs/managed-datahub/observe/freshness-assertions", "docs/managed-datahub/observe/volume-assertions", + "docs/managed-datahub/observe/custom-assertions", ], }, ], diff --git a/docs/managed-datahub/observe/custom-assertions.md b/docs/managed-datahub/observe/custom-assertions.md new file mode 100644 index 0000000000000..d52ac4b38cb4b --- /dev/null +++ b/docs/managed-datahub/observe/custom-assertions.md @@ -0,0 +1,316 @@ +--- +description: This page provides an overview of working with DataHub SQL Assertions +--- +import FeatureAvailability from '@site/src/components/FeatureAvailability'; + + +# Custom Assertions + + + + +> ⚠️ The **Custom Assertions** feature is currently in private beta, part of the **Acryl Observe** module, and may only be available to a +> limited set of design partners. +> +> If you are interested in trying it and providing feedback, please reach out to your Acryl Customer Success +> representative. + +## Introduction + +Can you remember a time when the meaning of Data Warehouse Table that you depended on fundamentally changed, with little or no notice? +If the answer is yes, how did you find out? We'll take a guess - someone looking at an internal reporting dashboard or worse, a user using your your product, sounded an alarm when +a number looked a bit out of the ordinary. Perhaps your table initially tracked purchases made on your company's e-commerce web store, but suddenly began to include purchases made +through your company's new mobile app. 
+
+There are many reasons why an important Table on Snowflake, Redshift, or BigQuery may change in its meaning - application code bugs, new feature rollouts,
+changes to key metric definitions, etc. Oftentimes, these changes break important assumptions made about the data used in building key downstream data products
+like reporting dashboards or data-driven product features.
+
+What if you could reduce the time to detect these incidents, so that the people responsible for the data were made aware of data
+issues _before_ anyone else? With Acryl DataHub **Custom Assertions**, you can.
+
+Acryl DataHub allows users to define complex expectations about a particular warehouse Table through custom SQL queries, and then monitor those expectations over time as the table grows and changes.
+
+In this article, we'll cover the basics of monitoring Custom Assertions - what they are, how to configure them, and more - so that you and your team can
+start building trust in your most important data assets.
+
+Let's get started!
+
+## Support
+
+Custom Assertions are currently supported for:
+
+1. Snowflake
+2. Redshift
+3. BigQuery
+
+Note that an Ingestion Source _must_ be configured with the data platform of your choice in Acryl DataHub's **Ingestion**
+tab.
+
+> Note that SQL Assertions are not yet supported if you are connecting to your warehouse
+> using the DataHub CLI or a Remote Ingestion Executor.
+
+## What is a Custom Assertion?
+
+A **Custom Assertion** is a highly configurable Data Quality rule used to monitor a Data Warehouse Table
+for unexpected or sudden changes in its meaning. Custom Assertions are defined through a raw SQL query that is evaluated against
+the Table. You have full control over the SQL query, and can use any SQL features supported by your Data Warehouse.
+Custom Assertions can be particularly useful when you have complex tables or relationships
+that are used to generate important metrics or reports, and where the meaning of the table is expected to be stable over time.
+If you have existing SQL queries that you already use to monitor your data, you may find that Custom Assertions are an easy way to port them
+to Acryl DataHub to get started.
+
+For example, imagine that you have a Table that tracks the number of purchases made on your company's e-commerce web store.
+You have a SQL query that you use to calculate the number of purchases made in the past 24 hours, and you'd like to monitor this
+metric over time to ensure that it is always greater than 1000. You can use a Custom Assertion to do this!
+
+
+### Anatomy of a Custom Assertion
+
+At the most basic level, **Custom Assertions** consist of a few important parts:
+
+1. An **Evaluation Schedule**
+2. A **Query**
+3. A **Condition Type**
+4. An **Assertion Description**
+
+In this section, we'll give an overview of each.
+
+#### 1. Evaluation Schedule
+
+The **Evaluation Schedule**: This defines how often to query the given warehouse Table. This should usually
+be configured to match the expected change frequency of the Table, although it can also be less frequent depending
+on the requirements. You can also specify particular days of the week, hours in the day, or even
+minutes in an hour.
+
+
+#### 2. Query
+
+The **Query**: This is the SQL query that will be used to evaluate the Table. The query should return a single row with a single column. Currently only numeric values are supported (integers and floats). The query can be as simple or as complex as you'd like, and can use any SQL features supported by your Data Warehouse. This requires that the configured user account has read access to the asset. Make sure to use the fully qualified name of the Table in your query.
+
+
+Use the "Try it out" button to test your query and ensure that it returns a single row with a single column. The query will be run against the Table in the context of the configured user account, so ensure that the user has read access to the Table.
+
+
+#### 3. Condition Type
+
+The **Condition Type**: This defines the conditions under which the Assertion will **fail**. The list of supported operations is:
+- **Is Equal To**: The assertion will fail if the query result is equal to the configured value
+- **Is Not Equal To**: The assertion will fail if the query result is not equal to the configured value
+- **Is Greater Than**: The assertion will fail if the query result is greater than the configured value
+- **Is Less Than**: The assertion will fail if the query result is less than the configured value
+- **Is False**: The assertion will fail if the query result is false (i.e. 0)
+- **Is outside a range**: The assertion will fail if the query result is outside the configured range
+- **Grows More Than**: The assertion will fail if the query result grows more than the configured amount. This can be either a percentage (**Percentage**) or an absolute value (**Differential**).
+- **Grows Less Than**: The assertion will fail if the query result grows less than the configured amount. This can be either a percentage (**Percentage**) or an absolute value (**Differential**).
+- **Growth is outside a range**: The assertion will fail if the query result growth is outside the configured range. This can be either a percentage (**Percentage**) or an absolute value (**Differential**).
+
+Custom Assertions also have an off switch: they can be started or stopped at any time with the click of a button.
+
+#### 4. Assertion Description
+
+The **Assertion Description**: This is a human-readable description of the Assertion. It should be used to describe the meaning of the Assertion, and can be used to provide additional context to users who are viewing the Assertion.
+
+
+## Creating a Custom Assertion
+
+### Prerequisites
+
+1. **Permissions**: To create or delete Custom Assertions for a specific entity on DataHub, you'll need to be granted the
+   `Edit Assertions` and `Edit Monitors` privileges for the entity. This is granted to Entity owners by default.
+
+2. **Data Platform Connection**: In order to create a Custom Assertion, you'll need to have an **Ingestion Source** configured to your
+   Data Platform: Snowflake, BigQuery, or Redshift under the **Integrations** tab.
+
+Once these are in place, you're ready to create your Custom Assertions!
+
+### Steps
+
+1. Navigate to the Table you want to monitor
+2. Click the **Validations** tab
+

+ +

+ +3. Click **+ Create Assertion** + +

+ +

+ +4. Choose **Custom** + +5. Configure the evaluation **schedule**. This is the frequency at which the assertion will be evaluated to produce a pass or fail result, and the times + when the query will be executed. + +6. Provide a SQL **query** that will be used to evaluate the Table. The query should return a single row with a single column. Currently only numeric values are supported (integer and floats). The query can be as simple or as complex as you'd like, and can use any SQL features supported by your Data Warehouse. Make sure to use the fully qualified name of the Table in your query. + +
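+As a rough sketch, a query for the hypothetical purchases table from the introduction might look like the following. The table and column names are illustrative only, and the Snowflake-style `DATEADD` call would need to be adapted for your warehouse (for example, `TIMESTAMP_SUB` on BigQuery):
+
+```sql
+-- Count purchases recorded in the last 24 hours.
+-- "analytics.ecommerce.purchases" and "purchase_time" are example names, not part of your warehouse.
+SELECT COUNT(*)
+FROM analytics.ecommerce.purchases
+WHERE purchase_time >= DATEADD(hour, -24, CURRENT_TIMESTAMP())
+```
+
+A query like this returns a single row with a single numeric column, so it can be paired with a condition type such as **Is Less Than** `1000` in the next step.
+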

+ +

+ +7. Configure the evaluation **condition type**. This determines the cases in which the new assertion will fail when it is evaluated. + +

+ +

+ +8. Add a **description** for the assertion. This is a human-readable description of the Assertion. It should be used to describe the meaning of the Assertion, and can be used to provide additional context to users who are viewing the Assertion. + +

+ +

+ +9. (Optional) Use the **Try it out** button to test your query and ensure that it returns a single row with a single column, and passes the configured condition type. + +

+ +

+ +10. Click **Next** +11. Configure actions that should be taken when the Custom Assertion passes or fails + +

+ +

+
+- **Raise incident**: Automatically raise a new DataHub Incident for the Table whenever the Custom Assertion is failing. This
+  may indicate that the Table is unfit for consumption. Configure Slack Notifications under **Settings** to be notified when
+  an incident is created due to an Assertion failure.
+- **Resolve incident**: Automatically resolve any incidents that were raised due to failures in this Custom Assertion. Note that
+  any other incidents will not be impacted.
+
+12. Click **Save**.
+
+And that's it! DataHub will now begin to monitor your Custom Assertion for the table.
+
+To view the time of the next Custom Assertion evaluation, simply click **Custom** and then click on your
+new Assertion:
+

+ +

+ +Once your assertion has run, you will begin to see Success or Failure status for the Table + +

+ +

+ + +## Stopping a Custom Assertion + +In order to temporarily stop the evaluation of a Custom Assertion: + +1. Navigate to the **Validations** tab of the Table with the assertion +2. Click **Custom** to open the Custom Assertions list +3. Click the three-dot menu on the right side of the assertion you want to disable +4. Click **Stop** + +

+ +

+ +To resume the Custom Assertion, simply click **Turn On**. + +

+ +

+ + +## Creating Custom Assertions via API + +Under the hood, Acryl DataHub implements Custom Assertion Monitoring using two "entity" concepts: + +- **Assertion**: The specific expectation for the custom assertion, e.g. "The table was changed in the past 7 hours" + or "The table is changed on a schedule of every day by 8am". This is the "what". + +- **Monitor**: The process responsible for evaluating the Assertion on a given evaluation schedule and using specific + mechanisms. This is the "how". + +Note that to create or delete Assertions and Monitors for a specific entity on DataHub, you'll need the +`Edit Assertions` and `Edit Monitors` privileges for it. + +#### GraphQL + +In order to create a Custom Assertion that is being monitored on a specific **Evaluation Schedule**, you'll need to use 2 +GraphQL mutation queries to create a Custom Assertion entity and create an Assertion Monitor entity responsible for evaluating it. + +Start by creating the Custom Assertion entity using the `createSqlAssertion` query and hang on to the 'urn' field of the Assertion entity +you get back. Then continue by creating a Monitor entity using the `createAssertionMonitor`. + +##### Examples + +To create a Custom Assertion Entity that checks whether a query result is greater than 100: + +```json +mutation createSqlAssertion { + createSqlAssertion( + input: { + entityUrn: "", + type: METRIC, + description: "", + statement: "", + operator: GREATER_THAN, + parameters: { + value: { + value: "100", + type: NUMBER + } + } + } + ) { + urn + } +} +``` + +The supported custom assertion types are `METRIC` and `METRIC_CHANGE`. If you choose `METRIC_CHANGE`, +you will need to provide a `changeType` parameter with either `ABSOLUTE` or `PERCENTAGE` values. +The supported operator types are `EQUAL_TO`, `NOT_EQUAL_TO`, `GREATER_THAN`, `GREATER_THAN_OR_EQUAL_TO`, `LESS_THAN`, `LESS_THAN_OR_EQUAL_TO`, and `BETWEEN` (requires minValue, maxValue). +The supported parameter types are `NUMBER`. + +To create an Assertion Monitor Entity that evaluates the custom assertion every 8 hours: + +```json +mutation createAssertionMonitor { + createAssertionMonitor( + input: { + entityUrn: "", + assertionUrn: "", + schedule: { + cron: "0 */8 * * *", + timezone: "America/Los_Angeles" + }, + parameters: { + type: DATASET_SQL + } + } + ) { + urn + } +} +``` + +This entity defines _when_ to run the check (Using CRON format - every 8th hour) and _how_ to run the check (using the Information Schema). + +After creating the monitor, the new assertion will start to be evaluated every 8 hours in your selected timezone. + +You can delete assertions along with their monitors using GraphQL mutations: `deleteAssertion` and `deleteMonitor`. + +### Tips + +:::info +**Authorization** + +Remember to always provide a DataHub Personal Access Token when calling the GraphQL API. 
To do so, just add the 'Authorization' header as follows: + +``` +Authorization: Bearer +``` + +**Exploring GraphQL API** + +Also, remember that you can play with an interactive version of the Acryl GraphQL API at `https://your-account-id.acryl.io/api/graphiql` +::: From 6c6216aaa2a3a4f9723f0e508b6ebac8ba1230f2 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 20 Sep 2023 12:00:23 -0700 Subject: [PATCH 12/37] fix(airflow): fix provider loading exception (#8861) --- .../airflow-plugin/setup.py | 3 ++- .../src/datahub_airflow_plugin/__init__.py | 16 +++++++++++++ .../datahub_airflow_plugin/hooks/datahub.py | 23 +++++++++++++++---- metadata-ingestion/setup.py | 1 - 4 files changed, 37 insertions(+), 6 deletions(-) diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index 18e605ae76ebd..47069f59c314d 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -80,7 +80,8 @@ def get_long_description(): entry_points = { - "airflow.plugins": "acryl-datahub-airflow-plugin = datahub_airflow_plugin.datahub_plugin:DatahubPlugin" + "airflow.plugins": "acryl-datahub-airflow-plugin = datahub_airflow_plugin.datahub_plugin:DatahubPlugin", + "apache_airflow_provider": ["provider_info=datahub_provider:get_provider_info"], } diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/__init__.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/__init__.py index b2c45d3a1e75d..e4040e3a17dfd 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/__init__.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/__init__.py @@ -18,4 +18,20 @@ def get_provider_info(): "package-name": f"{__package_name__}", "name": f"{__package_name__}", "description": "Datahub metadata collector plugin", + "connection-types": [ + { + "hook-class-name": "datahub_airflow_plugin.hooks.datahub.DatahubRestHook", + "connection-type": "datahub-rest", + }, + { + "hook-class-name": "datahub_airflow_plugin.hooks.datahub.DatahubKafkaHook", + "connection-type": "datahub-kafka", + }, + ], + # Deprecated method of providing connection types, kept for backwards compatibility. + # We can remove with Airflow 3. + "hook-class-names": [ + "datahub_airflow_plugin.hooks.datahub.DatahubRestHook", + "datahub_airflow_plugin.hooks.datahub.DatahubKafkaHook", + ], } diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/datahub.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/datahub.py index aed858c6c4df0..8fb7363f8cad1 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/datahub.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/datahub.py @@ -29,7 +29,7 @@ class DatahubRestHook(BaseHook): conn_name_attr = "datahub_rest_conn_id" default_conn_name = "datahub_rest_default" - conn_type = "datahub_rest" + conn_type = "datahub-rest" hook_name = "DataHub REST Server" def __init__(self, datahub_rest_conn_id: str = default_conn_name) -> None: @@ -50,6 +50,15 @@ def get_ui_field_behaviour() -> Dict: }, } + def test_connection(self) -> Tuple[bool, str]: + try: + emitter = self.make_emitter() + emitter.test_connection() + except Exception as e: + return False, str(e) + + return True, "Successfully connected to DataHub." 
+ def _get_config(self) -> Tuple[str, Optional[str], Optional[int]]: conn: "Connection" = self.get_connection(self.datahub_rest_conn_id) @@ -99,7 +108,7 @@ class DatahubKafkaHook(BaseHook): conn_name_attr = "datahub_kafka_conn_id" default_conn_name = "datahub_kafka_default" - conn_type = "datahub_kafka" + conn_type = "datahub-kafka" hook_name = "DataHub Kafka Sink" def __init__(self, datahub_kafka_conn_id: str = default_conn_name) -> None: @@ -194,9 +203,15 @@ def get_underlying_hook(self) -> Union[DatahubRestHook, DatahubKafkaHook]: # We need to figure out the underlying hook type. First check the # conn_type. If that fails, attempt to guess using the conn id name. - if conn.conn_type == DatahubRestHook.conn_type: + if ( + conn.conn_type == DatahubRestHook.conn_type + or conn.conn_type == DatahubRestHook.conn_type.replace("-", "_") + ): return DatahubRestHook(self.datahub_conn_id) - elif conn.conn_type == DatahubKafkaHook.conn_type: + elif ( + conn.conn_type == DatahubKafkaHook.conn_type + or conn.conn_type == DatahubKafkaHook.conn_type.replace("-", "_") + ): return DatahubKafkaHook(self.datahub_conn_id) elif "rest" in self.datahub_conn_id: return DatahubRestHook(self.datahub_conn_id) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index a119eba25be2a..b9169186174fa 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -643,7 +643,6 @@ def get_long_description(): "datahub = datahub.ingestion.reporting.datahub_ingestion_run_summary_provider:DatahubIngestionRunSummaryProvider", "file = datahub.ingestion.reporting.file_reporter:FileReporter", ], - "apache_airflow_provider": ["provider_info=datahub_provider:get_provider_info"], } From 04833dd48f1ecf3bf6e6e0e93fd27f3f81cb8a82 Mon Sep 17 00:00:00 2001 From: Kos Korchak <97058061+kkorchak@users.noreply.github.com> Date: Wed, 20 Sep 2023 15:05:15 -0400 Subject: [PATCH 13/37] fix(): Fix glossary_navigation.js (#8864) --- .../cypress/cypress/e2e/glossary/glossary_navigation.js | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js b/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js index de9fa7ecda1f0..e0d2bf240d74d 100644 --- a/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js +++ b/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js @@ -20,8 +20,7 @@ describe("glossary sidebar navigation test", () => { cy.waitTextVisible("No documentation yet"); cy.openThreeDotDropdown(); cy.clickOptionWithText("Move"); - cy.get('[role="dialog"] [data-icon="close-circle"]').click({force: true}); - cy.get('[role="dialog"]').contains(glossaryTermGroup).click(); + cy.get('[role="dialog"]').contains(glossaryTermGroup).click({force: true}); cy.get('[role="dialog"]').contains(glossaryTermGroup).should("be.visible"); cy.get("button").contains("Move").click(); cy.waitTextVisible("Moved Glossary Term!"); @@ -33,8 +32,7 @@ describe("glossary sidebar navigation test", () => { cy.clickOptionWithText(glossaryTermGroup); cy.openThreeDotDropdown(); cy.clickOptionWithText("Move"); - cy.get('[role="dialog"] [data-icon="close-circle"]').click({force: true}); - cy.get('[role="dialog"]').contains(glossaryParentGroup).click(); + cy.get('[role="dialog"]').contains(glossaryParentGroup).click({force: true}); cy.get('[role="dialog"]').contains(glossaryParentGroup).should("be.visible"); cy.get("button").contains("Move").click(); cy.waitTextVisible("Moved Term Group!"); From 
bf9209231e25a1ede94e0655b44506b9564b5f93 Mon Sep 17 00:00:00 2001 From: Kos Korchak <97058061+kkorchak@users.noreply.github.com> Date: Wed, 20 Sep 2023 15:46:25 -0400 Subject: [PATCH 14/37] test(cypress): Managing Secrets Cypress test (#8863) --- .../cypress/e2e/mutations/managing_secrets.js | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 smoke-test/tests/cypress/cypress/e2e/mutations/managing_secrets.js diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/managing_secrets.js b/smoke-test/tests/cypress/cypress/e2e/mutations/managing_secrets.js new file mode 100644 index 0000000000000..466bb2ef0757e --- /dev/null +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/managing_secrets.js @@ -0,0 +1,105 @@ +const number = Math.floor(Math.random() * 100000); +const accound_id = `account${number}`; +const warehouse_id = `warehouse${number}`; +const username = `user${number}`; +const password = `password${number}`; +const role = `role${number}`; +const ingestion_source_name = `ingestion source ${number}`; + +describe("managing secrets for ingestion creation", () => { + it("create a secret, create ingestion source using a secret, remove a secret", () => { + cy.loginWithCredentials(); + //navigate to the manage ingestion page → secrets + cy.goToIngestionPage(); + cy.clickOptionWithText("Secrets"); + //create a new secret + cy.clickOptionWithText("Create new secret"); + cy.get('[role="dialog"]').contains("Create a new Secret").should("be.visible"); + cy.get('[role="dialog"] #name').type(`secretname${number}`); + cy.get('[role="dialog"] #value').type(`secretvalue${number}`); + cy.get('[role="dialog"] #description').type(`secretdescription${number}`); + cy.get('#createSecretButton').click(); + cy.waitTextVisible("Successfully created Secret!"); + cy.waitTextVisible(`secretname${number}`); + cy.waitTextVisible(`secretdescription${number}`).wait(5000)//prevent issue with missing secret + //create an ingestion source using a secret + cy.goToIngestionPage(); + cy.clickOptionWithText("Create new source"); + cy.clickOptionWithText("Snowflake"); + cy.waitTextVisible("Snowflake Recipe"); + cy.get("#account_id").type(accound_id); + cy.get("#warehouse").type(warehouse_id); + cy.get("#username").type(username); + cy.get("#password").click().wait(1000); + cy.contains(`secretname${number}`).click({force: true}); + cy.focused().blur(); + cy.get("#role").type(role); + cy.get("button").contains("Next").click(); + cy.waitTextVisible("Configure an Ingestion Schedule"); + cy.get("button").contains("Next").click(); + cy.waitTextVisible("Give this ingestion source a name."); + cy.get('[data-testid="source-name-input"]').type(ingestion_source_name); + cy.get("button").contains("Save").click(); + cy.waitTextVisible("Successfully created ingestion source!").wait(5000)//prevent issue with missing form data + cy.waitTextVisible(ingestion_source_name); + cy.get("button").contains("Pending...").should("be.visible"); + //remove a secret + cy.clickOptionWithText("Secrets"); + cy.waitTextVisible(`secretname${number}`); + cy.get('[data-icon="delete"]').first().click(); + cy.waitTextVisible("Confirm Secret Removal"); + cy.get("button").contains("Yes").click(); + cy.waitTextVisible("Removed secret."); + cy.ensureTextNotPresent(`secretname${number}`); + cy.ensureTextNotPresent(`secretdescription${number}`); + //remove ingestion source + cy.goToIngestionPage(); + cy.get('[data-testid="delete-button"]').first().click(); + cy.waitTextVisible("Confirm Ingestion Source Removal"); + 
cy.get("button").contains("Yes").click(); + cy.waitTextVisible("Removed ingestion source."); + cy.ensureTextNotPresent(ingestion_source_name) + //verify secret is not present during ingestion source creation for password dropdown + cy.clickOptionWithText("Create new source"); + cy.clickOptionWithText("Snowflake"); + cy.waitTextVisible("Snowflake Recipe"); + cy.get("#account_id").type(accound_id); + cy.get("#warehouse").type(warehouse_id); + cy.get("#username").type(username); + cy.get("#password").click().wait(1000); + cy.ensureTextNotPresent(`secretname${number}`); + //verify secret can be added during ingestion source creation and used successfully + cy.clickOptionWithText("Create Secret"); + cy.get('[role="dialog"]').contains("Create a new Secret").should("be.visible"); + cy.get('[role="dialog"] #name').type(`secretname${number}`); + cy.get('[role="dialog"] #value').type(`secretvalue${number}`); + cy.get('[role="dialog"] #description').type(`secretdescription${number}`); + cy.get('#createSecretButton').click(); + cy.waitTextVisible("Created secret!"); + cy.get("#role").type(role); + cy.get("button").contains("Next").click(); + cy.waitTextVisible("Configure an Ingestion Schedule"); + cy.get("button").contains("Next").click(); + cy.waitTextVisible("Give this ingestion source a name."); + cy.get('[data-testid="source-name-input"]').type(ingestion_source_name); + cy.get("button").contains("Save").click(); + cy.waitTextVisible("Successfully created ingestion source!").wait(5000)//prevent issue with missing form data + cy.waitTextVisible(ingestion_source_name); + cy.get("button").contains("Pending...").should("be.visible"); + //Remove ingestion source and secret + cy.goToIngestionPage(); + cy.get('[data-testid="delete-button"]').first().click(); + cy.waitTextVisible("Confirm Ingestion Source Removal"); + cy.get("button").contains("Yes").click(); + cy.waitTextVisible("Removed ingestion source."); + cy.ensureTextNotPresent(ingestion_source_name) + cy.clickOptionWithText("Secrets"); + cy.waitTextVisible(`secretname${number}`); + cy.get('[data-icon="delete"]').first().click(); + cy.waitTextVisible("Confirm Secret Removal"); + cy.get("button").contains("Yes").click(); + cy.waitTextVisible("Removed secret."); + cy.ensureTextNotPresent(`secretname${number}`); + cy.ensureTextNotPresent(`secretdescription${number}`); + }) +}); \ No newline at end of file From ee7930b465f64ea1af2b7876859cc37bdce7aecb Mon Sep 17 00:00:00 2001 From: Chris Collins Date: Thu, 21 Sep 2023 09:28:36 -0400 Subject: [PATCH 15/37] feat(ui) Make certain things disabled if read only mode is enabled (#8870) --- .../src/app/domain/nestedDomains/ManageDomainsPageV2.tsx | 1 + .../src/app/entity/user/UserEditProfileModal.tsx | 6 ++++++ datahub-web-react/src/app/settings/SettingsPage.tsx | 3 ++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/datahub-web-react/src/app/domain/nestedDomains/ManageDomainsPageV2.tsx b/datahub-web-react/src/app/domain/nestedDomains/ManageDomainsPageV2.tsx index 486169c3559d3..0e5c035df00c1 100644 --- a/datahub-web-react/src/app/domain/nestedDomains/ManageDomainsPageV2.tsx +++ b/datahub-web-react/src/app/domain/nestedDomains/ManageDomainsPageV2.tsx @@ -17,6 +17,7 @@ const PageWrapper = styled.div` flex: 1; display: flex; flex-direction: column; + overflow: hidden; `; const Header = styled.div` diff --git a/datahub-web-react/src/app/entity/user/UserEditProfileModal.tsx b/datahub-web-react/src/app/entity/user/UserEditProfileModal.tsx index e36bf1972a56e..d9314df7e11ae 100644 --- 
a/datahub-web-react/src/app/entity/user/UserEditProfileModal.tsx +++ b/datahub-web-react/src/app/entity/user/UserEditProfileModal.tsx @@ -138,6 +138,7 @@ export default function UserEditProfileModal({ visible, onClose, onSave, editMod placeholder="John Smith" value={data.name} onChange={(event) => setData({ ...data, name: event.target.value })} + disabled={readOnlyModeEnabled} />
setData({ ...data, title: event.target.value })} + disabled={readOnlyModeEnabled} /> setData({ ...data, team: event.target.value })} + disabled={readOnlyModeEnabled} />
setData({ ...data, email: event.target.value })} + disabled={readOnlyModeEnabled} /> setData({ ...data, slack: event.target.value })} + disabled={readOnlyModeEnabled} /> setData({ ...data, phone: event.target.value })} + disabled={readOnlyModeEnabled} /> diff --git a/datahub-web-react/src/app/settings/SettingsPage.tsx b/datahub-web-react/src/app/settings/SettingsPage.tsx index 339cc0cf44bac..06592656ac719 100644 --- a/datahub-web-react/src/app/settings/SettingsPage.tsx +++ b/datahub-web-react/src/app/settings/SettingsPage.tsx @@ -89,12 +89,13 @@ export const SettingsPage = () => { const isPoliciesEnabled = config?.policiesConfig.enabled; const isIdentityManagementEnabled = config?.identityManagementConfig.enabled; const isViewsEnabled = config?.viewsConfig.enabled; + const { readOnlyModeEnabled } = config.featureFlags; const showPolicies = (isPoliciesEnabled && me && me?.platformPrivileges?.managePolicies) || false; const showUsersGroups = (isIdentityManagementEnabled && me && me?.platformPrivileges?.manageIdentities) || false; const showViews = isViewsEnabled || false; const showOwnershipTypes = me && me?.platformPrivileges?.manageOwnershipTypes; - const showHomePagePosts = me && me?.platformPrivileges?.manageGlobalAnnouncements; + const showHomePagePosts = me && me?.platformPrivileges?.manageGlobalAnnouncements && !readOnlyModeEnabled; return ( From 6ce35c9654eb85d94ddf86a1b81ef14139d30292 Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Thu, 21 Sep 2023 21:35:58 +0530 Subject: [PATCH 16/37] fix(ingest): fix mode lint error (#8875) --- metadata-ingestion/src/datahub/ingestion/source/mode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/mode.py b/metadata-ingestion/src/datahub/ingestion/source/mode.py index 0cf9932ba0878..a000c66a406c2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mode.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mode.py @@ -746,7 +746,7 @@ def get_request(): # respect Retry-After sleep_time = error_response.headers.get("retry-after") if sleep_time is not None: - time.sleep(sleep_time) + time.sleep(float(sleep_time)) raise HTTPError429 raise http_error From 21eb4dfc12cb0fb0591ab943edaa591b5bfa2682 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Thu, 21 Sep 2023 13:01:55 -0500 Subject: [PATCH 17/37] feat(search): update to support OpenSearch 2.x (#8852) --- build.gradle | 21 +- datahub-graphql-core/build.gradle | 1 + .../analytics/service/AnalyticsService.java | 38 +- .../graphql/resolvers/ResolverUtilsTest.java | 4 +- .../auth/ListAccessTokensResolverTest.java | 10 +- .../GetIngestionSourceResolverTest.java | 3 +- .../resolvers/mutate/SiblingsUtilsTest.java | 4 +- .../datahub/graphql/utils/DateUtilTest.java | 5 +- .../upgrade/config/NoCodeCleanupConfig.java | 2 +- .../DeleteLegacySearchIndicesStep.java | 6 +- .../nocodecleanup/NoCodeCleanupUpgrade.java | 2 +- .../steps/BuildIndicesPostStep.java | 4 +- .../steps/BuildIndicesPreStep.java | 10 +- .../elasticsearch/steps/CleanIndicesStep.java | 2 +- .../system/elasticsearch/util/IndexUtils.java | 12 +- docker/build.gradle | 10 + docker/docker-compose-with-cassandra.yml | 7 +- docker/docker-compose-without-neo4j.yml | 7 +- docker/docker-compose.yml | 7 +- docker/elasticsearch/env/docker.env | 1 + .../docker-compose-m1.quickstart.yml | 7 +- ...er-compose-without-neo4j-m1.quickstart.yml | 7 +- 
...ocker-compose-without-neo4j.quickstart.yml | 7 +- .../quickstart/docker-compose.quickstart.yml | 7 +- li-utils/build.gradle | 1 + .../mxe-utils-avro-1.7/build.gradle | 1 + .../java/datahub-client/build.gradle | 1 + .../resources/MetadataChangeProposal.avsc | 5 + .../datahub/protobuf/ProtobufDatasetTest.java | 2 +- .../datahub/protobuf/ProtobufUtilsTest.java | 2 +- .../protobuf/model/ProtobufEnumTest.java | 2 +- .../protobuf/model/ProtobufFieldTest.java | 2 +- .../protobuf/model/ProtobufGraphTest.java | 2 +- .../protobuf/model/ProtobufMessageTest.java | 2 +- .../model/ProtobufOneOfFieldTest.java | 2 +- .../protobuf/visitors/VisitContextTest.java | 2 +- .../visitors/dataset/DatasetVisitorTest.java | 2 +- .../dataset/DescriptionVisitorTest.java | 2 +- .../visitors/dataset/DomainVisitorTest.java | 2 +- .../InstitutionalMemoryVisitorTest.java | 2 +- .../KafkaTopicPropertyVisitorTest.java | 2 +- .../dataset/OwnershipVisitorTest.java | 2 +- .../visitors/dataset/PropertyVisitorTest.java | 2 +- .../dataset/TermAssociationVisitorTest.java | 2 +- .../ProtobufExtensionFieldVisitorTest.java | 2 +- .../field/SchemaFieldVisitorTest.java | 2 +- .../protobuf/visitors/tag/TagVisitorTest.java | 2 +- metadata-io/build.gradle | 15 +- .../graph/elastic/ESGraphQueryDAO.java | 38 +- .../graph/elastic/ESGraphWriteDAO.java | 10 +- .../elastic/ElasticSearchGraphService.java | 4 +- .../graph/elastic/TimeFilterUtils.java | 6 +- .../candidatesource/MostPopularSource.java | 22 +- .../candidatesource/RecentlyEditedSource.java | 24 +- .../candidatesource/RecentlyViewedSource.java | 24 +- .../elasticsearch/ElasticSearchService.java | 2 +- .../indexbuilder/ESIndexBuilder.java | 74 ++-- .../indexbuilder/ReindexConfig.java | 2 +- .../elasticsearch/query/ESBrowseDAO.java | 30 +- .../elasticsearch/query/ESSearchDAO.java | 32 +- .../request/AggregationQueryBuilder.java | 4 +- .../request/AutocompleteRequestHandler.java | 18 +- .../query/request/PITAwareSearchRequest.java | 4 +- .../query/request/SearchQueryBuilder.java | 42 +- .../query/request/SearchRequestHandler.java | 34 +- .../elasticsearch/update/BulkListener.java | 12 +- .../elasticsearch/update/ESBulkProcessor.java | 26 +- .../elasticsearch/update/ESWriteDAO.java | 18 +- .../metadata/search/utils/ESUtils.java | 28 +- .../systemmetadata/ESSystemMetadataDAO.java | 40 +- .../ElasticSearchSystemMetadataService.java | 16 +- .../ElasticSearchTimeseriesAspectService.java | 40 +- .../TimeseriesAspectIndexBuilders.java | 2 +- .../elastic/query/ESAggregatedStatsDAO.java | 36 +- .../linkedin/metadata/AspectUtilsTest.java | 2 +- .../metadata/ESTestConfiguration.java | 153 -------- .../com/linkedin/metadata/EbeanTestUtils.java | 12 +- .../update/BulkListenerTest.java | 4 +- .../update/ESBulkProcessorTest.java | 2 +- .../entity/EbeanAspectMigrationsDaoTest.java | 2 +- .../entity/EbeanEntityServiceTest.java | 8 +- .../ESGraphQueryDAOTest.java | 5 +- .../SearchGraphServiceTestBase.java} | 44 ++- .../TimeFilterUtilsTest.java | 6 +- .../SearchGraphServiceElasticSearchTest.java | 49 +++ .../SearchGraphServiceOpenSearchTest.java | 48 +++ ...eTest.java => LineageServiceTestBase.java} | 112 +++--- ...ceTest.java => SearchServiceTestBase.java} | 70 ++-- ...rviceTest.java => TestEntityTestBase.java} | 66 ++-- .../elasticsearch/ElasticSearchSuite.java | 32 ++ .../GoldenElasticSearchTest.java | 44 +++ .../IndexBuilderElasticSearchTest.java | 30 ++ .../LineageDataFixtureElasticSearchTest.java | 43 ++ .../LineageServiceElasticSearchTest.java | 66 ++++ .../SampleDataFixtureElasticSearchTest.java 
| 45 +++ .../SearchDAOElasticSearchTest.java | 35 ++ .../SearchServiceElasticSearchTest.java | 65 ++++ ...ystemMetadataServiceElasticSearchTest.java | 47 +++ .../TestEntityElasticSearchTest.java | 65 ++++ ...eseriesAspectServiceElasticSearchTest.java | 46 +++ .../elasticsearch/query/ESSearchDAOTest.java | 312 --------------- .../GoldenTestBase.java} | 65 ++-- .../LineageDataFixtureTestBase.java} | 39 +- .../SampleDataFixtureTestBase.java} | 366 +++++++++--------- .../IndexBuilderTestBase.java} | 65 ++-- .../indexbuilder/MappingsBuilderTest.java | 4 +- .../opensearch/GoldenOpenSearchTest.java | 44 +++ .../IndexBuilderOpenSearchTest.java | 30 ++ .../LineageDataFixtureOpenSearchTest.java | 43 ++ .../LineageServiceOpenSearchTest.java | 65 ++++ .../search/opensearch/OpenSearchSuite.java | 31 ++ .../SampleDataFixtureOpenSearchTest.java | 44 +++ .../opensearch/SearchDAOOpenSearchTest.java | 33 ++ .../SearchServiceOpenSearchTest.java | 65 ++++ .../SystemMetadataServiceOpenSearchTest.java | 47 +++ .../opensearch/TestEntityOpenSearchTest.java | 65 ++++ ...TimeseriesAspectServiceOpenSearchTest.java | 46 +++ .../BrowseDAOTest.java} | 19 +- .../search/query/SearchDAOTestBase.java | 307 +++++++++++++++ .../request/AggregationQueryBuilderTest.java | 6 +- .../AutocompleteRequestHandlerTest.java | 18 +- .../request/CustomizedQueryHandlerTest.java | 16 +- .../query/request/SearchQueryBuilderTest.java | 26 +- .../request/SearchRequestHandlerTest.java | 29 +- .../metadata/search/utils/ESUtilsTest.java | 2 +- ...ava => SystemMetadataServiceTestBase.java} | 41 +- .../timeline/EbeanTimelineServiceTest.java | 2 +- .../TimeseriesAspectServiceTestBase.java} | 43 +- .../test/fixtures/elasticsearch/Utils.java | 22 -- .../test/DataGenerator.java | 5 +- .../test/fixtures/search}/EntityExporter.java | 18 +- .../test/fixtures/search}/FixtureReader.java | 10 +- .../test/fixtures/search}/FixtureWriter.java | 24 +- .../fixtures/search}/LineageExporter.java | 14 +- .../SampleDataFixtureConfiguration.java} | 21 +- .../fixtures/search/SearchFixtureUtils.java} | 51 ++- .../SearchLineageFixtureConfiguration.java} | 16 +- .../test/models/Anonymized.java | 2 +- .../test/models/DatasetAnonymized.java | 2 +- .../test/models/GraphAnonymized.java | 4 +- .../search/ElasticsearchTestContainer.java | 42 ++ .../test/search/OpenSearchTestContainer.java | 43 ++ .../test/search/SearchTestContainer.java | 14 + .../test/search/SearchTestUtils.java} | 60 ++- .../config/SearchCommonTestConfiguration.java | 63 +++ .../SearchTestContainerConfiguration.java | 88 +++++ .../src/test/resources/testng-other.xml | 14 + .../src/test/resources/testng-search.xml | 16 + metadata-io/src/test/resources/testng.xml | 14 + metadata-jobs/mae-consumer/build.gradle | 1 + .../kafka/elasticsearch/ElasticEvent.java | 2 +- .../elasticsearch/ElasticsearchConnector.java | 10 +- .../kafka/elasticsearch/JsonElasticEvent.java | 12 +- .../kafka/elasticsearch/MCEElasticEvent.java | 12 +- metadata-models/build.gradle | 1 + metadata-service/auth-impl/build.gradle | 1 + .../common/RestHighLevelClientFactory.java | 6 +- .../factory/graphql/GraphQLEngineFactory.java | 2 +- .../MostPopularCandidateSourceFactory.java | 2 +- .../RecentlyEditedCandidateSourceFactory.java | 2 +- ...ecentlySearchedCandidateSourceFactory.java | 2 +- .../RecentlyViewedCandidateSourceFactory.java | 2 +- .../BaseElasticSearchComponentsFactory.java | 2 +- .../ElasticSearchBulkProcessorFactory.java | 4 +- .../ElasticSearchIndexBuilderFactory.java | 2 +- .../gms/factory/telemetry/DailyReport.java | 2 +- 
.../telemetry/ScheduledAnalyticsFactory.java | 2 +- ...ElasticSearchBulkProcessorFactoryTest.java | 2 +- .../telemetry/TelemetryUtilsTest.java | 2 +- .../controller/HealthCheckController.java | 10 +- .../OpenAPIAnalyticsTestConfiguration.java | 2 +- .../openapi/util/OpenApiEntitiesUtilTest.java | 2 +- .../elastic/OperationsController.java | 2 +- .../operations/OperationsResource.java | 2 +- .../operations/OperationsResourceTest.java | 5 +- metadata-service/services/build.gradle | 2 - .../RecentlySearchedSource.java | 24 +- .../systemmetadata/SystemMetadataService.java | 2 +- .../gms/servlet/ConfigSearchExport.java | 20 +- .../java/com/datahub/gms/util/CSVWriter.java | 4 +- metadata-utils/build.gradle | 1 + .../linkedin/metadata/utils/SearchUtil.java | 4 +- smoke-test/cypress-dev.sh | 3 +- smoke-test/run-quickstart.sh | 9 + smoke-test/set-cypress-creds.sh | 3 +- .../tests/cypress/cypress/e2e/login/login.js | 2 +- 186 files changed, 2923 insertions(+), 1595 deletions(-) delete mode 100644 metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java rename metadata-io/src/test/java/com/linkedin/metadata/graph/{elastic => search}/ESGraphQueryDAOTest.java (98%) rename metadata-io/src/test/java/com/linkedin/metadata/graph/{elastic/ElasticSearchGraphServiceTest.java => search/SearchGraphServiceTestBase.java} (93%) rename metadata-io/src/test/java/com/linkedin/metadata/graph/{elastic => search}/TimeFilterUtilsTest.java (82%) create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/graph/search/elasticsearch/SearchGraphServiceElasticSearchTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/graph/search/opensearch/SearchGraphServiceOpenSearchTest.java rename metadata-io/src/test/java/com/linkedin/metadata/search/{LineageSearchServiceTest.java => LineageServiceTestBase.java} (94%) rename metadata-io/src/test/java/com/linkedin/metadata/search/{SearchServiceTest.java => SearchServiceTestBase.java} (92%) rename metadata-io/src/test/java/com/linkedin/metadata/search/{elasticsearch/ElasticSearchServiceTest.java => TestEntityTestBase.java} (86%) create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchSuite.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/GoldenElasticSearchTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/IndexBuilderElasticSearchTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/LineageDataFixtureElasticSearchTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/LineageServiceElasticSearchTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/SampleDataFixtureElasticSearchTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/SearchDAOElasticSearchTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/SearchServiceElasticSearchTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/SystemMetadataServiceElasticSearchTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/TestEntityElasticSearchTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/TimeseriesAspectServiceElasticSearchTest.java delete mode 100644 
metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAOTest.java rename metadata-io/src/test/java/com/linkedin/metadata/search/{elasticsearch/fixtures/ElasticSearchGoldenTest.java => fixtures/GoldenTestBase.java} (74%) rename metadata-io/src/test/java/com/linkedin/metadata/search/{elasticsearch/fixtures/SearchLineageDataFixtureTests.java => fixtures/LineageDataFixtureTestBase.java} (52%) rename metadata-io/src/test/java/com/linkedin/metadata/search/{elasticsearch/fixtures/SampleDataFixtureTests.java => fixtures/SampleDataFixtureTestBase.java} (81%) rename metadata-io/src/test/java/com/linkedin/metadata/search/{elasticsearch/indexbuilder/ESIndexBuilderTest.java => indexbuilder/IndexBuilderTestBase.java} (85%) rename metadata-io/src/test/java/com/linkedin/metadata/search/{elasticsearch => }/indexbuilder/MappingsBuilderTest.java (98%) create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/GoldenOpenSearchTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/IndexBuilderOpenSearchTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/LineageDataFixtureOpenSearchTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/LineageServiceOpenSearchTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/OpenSearchSuite.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/SampleDataFixtureOpenSearchTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/SearchDAOOpenSearchTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/SearchServiceOpenSearchTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/SystemMetadataServiceOpenSearchTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/TestEntityOpenSearchTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/TimeseriesAspectServiceOpenSearchTest.java rename metadata-io/src/test/java/com/linkedin/metadata/search/{elasticsearch/query/ESBrowseDAOTest.java => query/BrowseDAOTest.java} (86%) create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/query/SearchDAOTestBase.java rename metadata-io/src/test/java/com/linkedin/metadata/search/{elasticsearch => }/query/request/AggregationQueryBuilderTest.java (94%) rename metadata-io/src/test/java/com/linkedin/metadata/search/{elasticsearch => }/query/request/AutocompleteRequestHandlerTest.java (88%) rename metadata-io/src/test/java/com/linkedin/metadata/search/{elasticsearch => }/query/request/CustomizedQueryHandlerTest.java (93%) rename metadata-io/src/test/java/com/linkedin/metadata/search/{elasticsearch => }/query/request/SearchQueryBuilderTest.java (95%) rename metadata-io/src/test/java/com/linkedin/metadata/search/{elasticsearch => }/query/request/SearchRequestHandlerTest.java (95%) rename metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/{ElasticSearchSystemMetadataServiceTest.java => SystemMetadataServiceTestBase.java} (84%) rename metadata-io/src/test/java/com/linkedin/metadata/timeseries/{elastic/ElasticSearchTimeseriesAspectServiceTest.java => search/TimeseriesAspectServiceTestBase.java} (97%) delete mode 100644 metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/Utils.java rename 
metadata-io/src/test/java/io/{datahub => datahubproject}/test/DataGenerator.java (99%) rename metadata-io/src/test/java/io/{datahub/test/fixtures/elasticsearch => datahubproject/test/fixtures/search}/EntityExporter.java (81%) rename metadata-io/src/test/java/io/{datahub/test/fixtures/elasticsearch => datahubproject/test/fixtures/search}/FixtureReader.java (93%) rename metadata-io/src/test/java/io/{datahub/test/fixtures/elasticsearch => datahubproject/test/fixtures/search}/FixtureWriter.java (75%) rename metadata-io/src/test/java/io/{datahub/test/fixtures/elasticsearch => datahubproject/test/fixtures/search}/LineageExporter.java (95%) rename metadata-io/src/test/java/{com/linkedin/metadata/ESSampleDataFixture.java => io/datahubproject/test/fixtures/search/SampleDataFixtureConfiguration.java} (94%) rename metadata-io/src/test/java/{com/linkedin/metadata/ESTestFixtureUtils.java => io/datahubproject/test/fixtures/search/SearchFixtureUtils.java} (67%) rename metadata-io/src/test/java/{com/linkedin/metadata/ESSearchLineageFixture.java => io/datahubproject/test/fixtures/search/SearchLineageFixtureConfiguration.java} (95%) rename metadata-io/src/test/java/io/{datahub => datahubproject}/test/models/Anonymized.java (97%) rename metadata-io/src/test/java/io/{datahub => datahubproject}/test/models/DatasetAnonymized.java (97%) rename metadata-io/src/test/java/io/{datahub => datahubproject}/test/models/GraphAnonymized.java (82%) create mode 100644 metadata-io/src/test/java/io/datahubproject/test/search/ElasticsearchTestContainer.java create mode 100644 metadata-io/src/test/java/io/datahubproject/test/search/OpenSearchTestContainer.java create mode 100644 metadata-io/src/test/java/io/datahubproject/test/search/SearchTestContainer.java rename metadata-io/src/test/java/{com/linkedin/metadata/ESTestUtils.java => io/datahubproject/test/search/SearchTestUtils.java} (74%) create mode 100644 metadata-io/src/test/java/io/datahubproject/test/search/config/SearchCommonTestConfiguration.java create mode 100644 metadata-io/src/test/java/io/datahubproject/test/search/config/SearchTestContainerConfiguration.java create mode 100644 metadata-io/src/test/resources/testng-other.xml create mode 100644 metadata-io/src/test/resources/testng-search.xml create mode 100644 metadata-io/src/test/resources/testng.xml diff --git a/build.gradle b/build.gradle index 1b6b82d51c2d4..07a0e6ad1f49f 100644 --- a/build.gradle +++ b/build.gradle @@ -8,7 +8,7 @@ buildscript { ext.openTelemetryVersion = '1.18.0' ext.neo4jVersion = '4.4.9' ext.testContainersVersion = '1.17.4' - ext.elasticsearchVersion = '7.10.2' + ext.elasticsearchVersion = '2.9.0' // ES 7.10, Opensearch 1.x, 2.x ext.jacksonVersion = '2.15.2' ext.jettyVersion = '9.4.46.v20220331' ext.playVersion = '2.8.18' @@ -90,15 +90,15 @@ project.ext.externalDependency = [ 'ebean': 'io.ebean:ebean:' + ebeanVersion, 'ebeanAgent': 'io.ebean:ebean-agent:' + ebeanVersion, 'ebeanDdl': 'io.ebean:ebean-ddl-generator:' + ebeanVersion, - 'elasticSearchRest': 'org.elasticsearch.client:elasticsearch-rest-high-level-client:' + elasticsearchVersion, - 'elasticSearchTransport': 'org.elasticsearch.client:transport:' + elasticsearchVersion, + 'elasticSearchRest': 'org.opensearch.client:opensearch-rest-high-level-client:' + elasticsearchVersion, + 'elasticSearchJava': 'org.opensearch.client:opensearch-java:2.6.0', 'findbugsAnnotations': 'com.google.code.findbugs:annotations:3.0.1', 'graphqlJava': 'com.graphql-java:graphql-java:19.5', 'graphqlJavaScalars': 
'com.graphql-java:graphql-java-extended-scalars:19.1', 'gson': 'com.google.code.gson:gson:2.8.9', 'guice': 'com.google.inject:guice:4.2.3', 'guava': 'com.google.guava:guava:32.1.2-jre', - 'h2': 'com.h2database:h2:2.1.214', + 'h2': 'com.h2database:h2:2.2.224', 'hadoopCommon':'org.apache.hadoop:hadoop-common:2.7.2', 'hadoopMapreduceClient':'org.apache.hadoop:hadoop-mapreduce-client-core:2.7.2', "hadoopClient": "org.apache.hadoop:hadoop-client:$hadoop3Version", @@ -202,13 +202,15 @@ project.ext.externalDependency = [ 'springActuator': "org.springframework.boot:spring-boot-starter-actuator:$springBootVersion", 'swaggerAnnotations': 'io.swagger.core.v3:swagger-annotations:2.1.12', 'swaggerCli': 'io.swagger.codegen.v3:swagger-codegen-cli:3.0.41', - 'testng': 'org.testng:testng:7.3.0', + 'testngJava8': 'org.testng:testng:7.5.1', + 'testng': 'org.testng:testng:7.8.0', 'testContainers': 'org.testcontainers:testcontainers:' + testContainersVersion, 'testContainersJunit': 'org.testcontainers:junit-jupiter:' + testContainersVersion, 'testContainersPostgresql':'org.testcontainers:postgresql:' + testContainersVersion, 'testContainersElasticsearch': 'org.testcontainers:elasticsearch:' + testContainersVersion, 'testContainersCassandra': 'org.testcontainers:cassandra:' + testContainersVersion, 'testContainersKafka': 'org.testcontainers:kafka:' + testContainersVersion, + 'testContainersOpenSearch': 'org.opensearch:opensearch-testcontainers:2.0.0', 'typesafeConfig':'com.typesafe:config:1.4.1', 'wiremock':'com.github.tomakehurst:wiremock:2.10.0', 'zookeeper': 'org.apache.zookeeper:zookeeper:3.4.14', @@ -257,7 +259,6 @@ subprojects { plugins.withType(JavaPlugin) { dependencies { - testImplementation externalDependency.testng constraints { implementation('io.netty:netty-all:4.1.86.Final') implementation('org.apache.commons:commons-compress:1.21') @@ -268,12 +269,6 @@ subprojects { } } - tasks.withType(Test) { - if (!name.startsWith('integ')) { - useTestNG() - } - } - checkstyle { configDirectory = file("${project.rootDir}/gradle/checkstyle") sourceSets = [ getProject().sourceSets.main, getProject().sourceSets.test ] @@ -292,6 +287,8 @@ subprojects { javaLauncher = javaToolchains.launcherFor { languageVersion = JavaLanguageVersion.of(11) } + // https://docs.gradle.org/current/userguide/performance.html + maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 } afterEvaluate { diff --git a/datahub-graphql-core/build.gradle b/datahub-graphql-core/build.gradle index 89ba8f17b6aeb..fba0031351b58 100644 --- a/datahub-graphql-core/build.gradle +++ b/datahub-graphql-core/build.gradle @@ -24,6 +24,7 @@ dependencies { annotationProcessor externalDependency.lombok testImplementation externalDependency.mockito + testImplementation externalDependency.testng } graphqlCodegen { diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/service/AnalyticsService.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/service/AnalyticsService.java index 44b1779f8b006..4135a7b0da148 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/service/AnalyticsService.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/service/AnalyticsService.java @@ -20,25 +20,25 @@ import javax.annotation.Nonnull; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.action.search.SearchResponse; -import 
org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.QueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.aggregations.AggregationBuilder; -import org.elasticsearch.search.aggregations.AggregationBuilders; -import org.elasticsearch.search.aggregations.Aggregations; -import org.elasticsearch.search.aggregations.BucketOrder; -import org.elasticsearch.search.aggregations.bucket.MultiBucketsAggregation; -import org.elasticsearch.search.aggregations.bucket.filter.Filter; -import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval; -import org.elasticsearch.search.aggregations.bucket.histogram.Histogram; -import org.elasticsearch.search.aggregations.bucket.terms.Terms; -import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder; -import org.elasticsearch.search.aggregations.metrics.Cardinality; -import org.elasticsearch.search.builder.SearchSourceBuilder; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.action.search.SearchResponse; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.RestHighLevelClient; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.query.QueryBuilder; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.search.aggregations.AggregationBuilder; +import org.opensearch.search.aggregations.AggregationBuilders; +import org.opensearch.search.aggregations.Aggregations; +import org.opensearch.search.aggregations.BucketOrder; +import org.opensearch.search.aggregations.bucket.MultiBucketsAggregation; +import org.opensearch.search.aggregations.bucket.filter.Filter; +import org.opensearch.search.aggregations.bucket.histogram.DateHistogramInterval; +import org.opensearch.search.aggregations.bucket.histogram.Histogram; +import org.opensearch.search.aggregations.bucket.terms.Terms; +import org.opensearch.search.aggregations.bucket.terms.TermsAggregationBuilder; +import org.opensearch.search.aggregations.metrics.Cardinality; +import org.opensearch.search.builder.SearchSourceBuilder; @Slf4j diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/ResolverUtilsTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/ResolverUtilsTest.java index c391615db9268..7cd548a4790ba 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/ResolverUtilsTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/ResolverUtilsTest.java @@ -15,7 +15,6 @@ import com.linkedin.metadata.query.filter.CriterionArray; import com.linkedin.metadata.query.filter.Filter; import graphql.schema.DataFetchingEnvironment; -import junit.framework.TestCase; import org.testng.annotations.Test; import org.mockito.Mockito; @@ -24,9 +23,10 @@ import java.util.stream.Collectors; import static com.linkedin.datahub.graphql.resolvers.ResolverUtils.*; +import static org.testng.AssertJUnit.assertEquals; -public class ResolverUtilsTest extends TestCase { +public class ResolverUtilsTest { @Test public void testCriterionFromFilter() throws Exception { diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/auth/ListAccessTokensResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/auth/ListAccessTokensResolverTest.java index 
8c23335b7e9d3..54b8d23bab301 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/auth/ListAccessTokensResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/auth/ListAccessTokensResolverTest.java @@ -10,15 +10,15 @@ import com.linkedin.metadata.Constants; import graphql.schema.DataFetchingEnvironment; import java.util.Collections; -import junit.framework.TestCase; import org.mockito.Mockito; +import org.testng.annotations.Test; import static com.linkedin.datahub.graphql.resolvers.ResolverUtils.*; -public class ListAccessTokensResolverTest extends TestCase { +public class ListAccessTokensResolverTest { -// @Test + @Test public void testGetSuccess() throws Exception { final DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class); final QueryContext mockAllowContext = TestUtils.getMockAllowContext(); @@ -36,13 +36,13 @@ public void testGetSuccess() throws Exception { Mockito.when(mockEnv.getArgument(Mockito.eq("input"))).thenReturn(input); final EntityClient mockClient = Mockito.mock(EntityClient.class); - Mockito.when(mockClient.filter( + Mockito.when(Mockito.eq(mockClient.filter( Mockito.eq(Constants.ACCESS_TOKEN_ENTITY_NAME), Mockito.eq(buildFilter(filters, Collections.emptyList())), Mockito.notNull(), Mockito.eq(input.getStart()), Mockito.eq(input.getCount()), - Mockito.eq(getAuthentication(mockEnv)))) + Mockito.eq(getAuthentication(mockEnv))))) .thenReturn(null); final ListAccessTokensResolver resolver = new ListAccessTokensResolver(mockClient); diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/ingest/source/GetIngestionSourceResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/ingest/source/GetIngestionSourceResolverTest.java index 2d9f43029c479..ebafd1782e000 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/ingest/source/GetIngestionSourceResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/ingest/source/GetIngestionSourceResolverTest.java @@ -14,11 +14,12 @@ import com.linkedin.r2.RemoteInvocationException; import graphql.schema.DataFetchingEnvironment; import java.util.HashSet; + import org.mockito.Mockito; import org.testng.annotations.Test; -import static org.testng.Assert.*; import static com.linkedin.datahub.graphql.resolvers.ingest.IngestTestUtils.*; +import static org.testng.Assert.assertThrows; public class GetIngestionSourceResolverTest { diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/mutate/SiblingsUtilsTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/mutate/SiblingsUtilsTest.java index d8325e9a74740..1adf7b1200574 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/mutate/SiblingsUtilsTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/mutate/SiblingsUtilsTest.java @@ -6,7 +6,6 @@ import com.linkedin.common.urn.UrnUtils; import com.linkedin.datahub.graphql.resolvers.mutate.util.SiblingsUtils; import com.linkedin.metadata.entity.EntityService; -import junit.framework.TestCase; import org.mockito.Mockito; import org.testng.annotations.Test; @@ -14,8 +13,9 @@ import java.util.Optional; import static com.linkedin.metadata.Constants.SIBLINGS_ASPECT_NAME; +import static org.testng.AssertJUnit.assertEquals; -public class SiblingsUtilsTest extends TestCase { +public class 
SiblingsUtilsTest { private static final String TEST_DATASET_URN1 = "urn:li:dataset:(urn:li:dataPlatform:hive,fct_cypress_users_created,PROD)"; private static final String TEST_DATASET_URN2 = "urn:li:dataset:(urn:li:dataPlatform:hive,fct_cypress_users_created2,PROD)"; diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/utils/DateUtilTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/utils/DateUtilTest.java index 989ebc18e9f6c..0a58ff88586c6 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/utils/DateUtilTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/utils/DateUtilTest.java @@ -1,12 +1,13 @@ package com.linkedin.datahub.graphql.utils; import com.linkedin.datahub.graphql.util.DateUtil; -import junit.framework.TestCase; import org.joda.time.DateTime; import org.mockito.Mockito; import org.testng.annotations.Test; -public class DateUtilTest extends TestCase { +import static org.testng.AssertJUnit.assertEquals; + +public class DateUtilTest { private DateTime setTimeParts(int dayOfMonth, boolean zeroTime) { DateTime result = new DateTime() diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeCleanupConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeCleanupConfig.java index 0fb8b0eb6e20f..23ea81009fa1d 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeCleanupConfig.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeCleanupConfig.java @@ -5,7 +5,7 @@ import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import io.ebean.Database; import javax.annotation.Nonnull; -import org.elasticsearch.client.RestHighLevelClient; +import org.opensearch.client.RestHighLevelClient; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.ApplicationContext; import org.springframework.context.annotation.Bean; diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocodecleanup/DeleteLegacySearchIndicesStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocodecleanup/DeleteLegacySearchIndicesStep.java index 15bbe40d1e566..9a64d5fe1810c 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocodecleanup/DeleteLegacySearchIndicesStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocodecleanup/DeleteLegacySearchIndicesStep.java @@ -7,9 +7,9 @@ import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import java.util.function.Function; import lombok.RequiredArgsConstructor; -import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; +import org.opensearch.action.admin.indices.delete.DeleteIndexRequest; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.RestHighLevelClient; // Do we need SQL-tech specific migration paths? 
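Most of the Java hunks in this patch look like the ones above: only the package prefix moves from `org.elasticsearch.*` to `org.opensearch.*`, because the `org.opensearch.client:opensearch-rest-high-level-client` artifact (wired in as `elasticSearchRest` in the root `build.gradle` change earlier in this diff) mirrors the Elasticsearch 7.10-era high-level REST API. As a rough illustration only, not code from this patch, the sketch below deletes an index the way `DeleteLegacySearchIndicesStep` does, using the swapped-in OpenSearch classes; the endpoint and index name are placeholders.

```java
import org.apache.http.HttpHost;
import org.opensearch.action.admin.indices.delete.DeleteIndexRequest;
import org.opensearch.client.RequestOptions;
import org.opensearch.client.RestClient;
import org.opensearch.client.RestHighLevelClient;

public class LegacyIndexCleanupSketch {
  public static void main(String[] args) throws Exception {
    // Placeholder endpoint; a real deployment would take this from configuration.
    try (RestHighLevelClient client = new RestHighLevelClient(
        RestClient.builder(new HttpHost("localhost", 9200, "http")))) {
      // Same call shape as the Elasticsearch 7.10 high-level client, which is
      // why the production code in these hunks only needed import changes.
      boolean acknowledged = client.indices()
          .delete(new DeleteIndexRequest("legacy_index_to_remove"), RequestOptions.DEFAULT)
          .isAcknowledged();
      System.out.println("Index deletion acknowledged: " + acknowledged);
    }
  }
}
```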
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocodecleanup/NoCodeCleanupUpgrade.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocodecleanup/NoCodeCleanupUpgrade.java index 2b5e23c5f8269..a5d8d6ce9b666 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocodecleanup/NoCodeCleanupUpgrade.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocodecleanup/NoCodeCleanupUpgrade.java @@ -9,7 +9,7 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; -import org.elasticsearch.client.RestHighLevelClient; +import org.opensearch.client.RestHighLevelClient; public class NoCodeCleanupUpgrade implements Upgrade { diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/elasticsearch/steps/BuildIndicesPostStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/elasticsearch/steps/BuildIndicesPostStep.java index 465a5fe342667..2feca1f27e625 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/elasticsearch/steps/BuildIndicesPostStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/elasticsearch/steps/BuildIndicesPostStep.java @@ -16,8 +16,8 @@ import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.admin.indices.settings.put.UpdateSettingsRequest; -import org.elasticsearch.client.RequestOptions; +import org.opensearch.action.admin.indices.settings.put.UpdateSettingsRequest; +import org.opensearch.client.RequestOptions; import static com.linkedin.datahub.upgrade.system.elasticsearch.util.IndexUtils.INDEX_BLOCKS_WRITE_SETTING; import static com.linkedin.datahub.upgrade.system.elasticsearch.util.IndexUtils.getAllReindexConfigs; diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/elasticsearch/steps/BuildIndicesPreStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/elasticsearch/steps/BuildIndicesPreStep.java index 6f2f3a8bd727c..82b9428c89fb8 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/elasticsearch/steps/BuildIndicesPreStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/elasticsearch/steps/BuildIndicesPreStep.java @@ -19,10 +19,10 @@ import com.linkedin.metadata.shared.ElasticSearchIndexed; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.ElasticsearchStatusException; -import org.elasticsearch.action.admin.indices.settings.put.UpdateSettingsRequest; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.indices.ResizeRequest; +import org.opensearch.OpenSearchStatusException; +import org.opensearch.action.admin.indices.settings.put.UpdateSettingsRequest; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.indices.ResizeRequest; import static com.linkedin.datahub.upgrade.system.elasticsearch.util.IndexUtils.INDEX_BLOCKS_WRITE_SETTING; import static com.linkedin.datahub.upgrade.system.elasticsearch.util.IndexUtils.getAllReindexConfigs; @@ -97,7 +97,7 @@ private boolean blockWrites(String indexName) throws InterruptedException, IOExc ack = _esComponents.getSearchClient().indices() .putSettings(request, RequestOptions.DEFAULT).isAcknowledged(); log.info("Updated index {} with new settings. 
Settings: {}, Acknowledged: {}", indexName, indexSettings, ack); - } catch (ElasticsearchStatusException | IOException ese) { + } catch (OpenSearchStatusException | IOException ese) { // Cover first run case, indices won't exist so settings updates won't work nor will the rest of the preConfigure steps. // Since no data are in there they are skippable. // Have to hack around HighLevelClient not sending the actual Java type nor having an easy way to extract it :( diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/elasticsearch/steps/CleanIndicesStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/elasticsearch/steps/CleanIndicesStep.java index f60aa283c0140..bb042bac6df95 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/elasticsearch/steps/CleanIndicesStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/elasticsearch/steps/CleanIndicesStep.java @@ -9,7 +9,7 @@ import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; import com.linkedin.metadata.shared.ElasticSearchIndexed; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.client.RestHighLevelClient; +import org.opensearch.client.RestHighLevelClient; import java.util.List; import java.util.function.Function; diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/elasticsearch/util/IndexUtils.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/elasticsearch/util/IndexUtils.java index fa414798ccfea..4b04feac62cbf 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/elasticsearch/util/IndexUtils.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/elasticsearch/util/IndexUtils.java @@ -4,12 +4,12 @@ import com.linkedin.metadata.shared.ElasticSearchIndexed; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.NotImplementedException; -import org.elasticsearch.action.admin.indices.alias.get.GetAliasesRequest; -import org.elasticsearch.action.admin.indices.settings.get.GetSettingsRequest; -import org.elasticsearch.action.admin.indices.settings.get.GetSettingsResponse; -import org.elasticsearch.client.GetAliasesResponse; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; +import org.opensearch.action.admin.indices.alias.get.GetAliasesRequest; +import org.opensearch.action.admin.indices.settings.get.GetSettingsRequest; +import org.opensearch.action.admin.indices.settings.get.GetSettingsResponse; +import org.opensearch.client.GetAliasesResponse; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.RestHighLevelClient; import java.io.IOException; import java.util.ArrayList; diff --git a/docker/build.gradle b/docker/build.gradle index ae101fe1defc5..0faea626e982d 100644 --- a/docker/build.gradle +++ b/docker/build.gradle @@ -38,6 +38,16 @@ task quickstart(type: Exec, dependsOn: ':metadata-ingestion:install') { // environment "ACTIONS_VERSION", 'alpine3.17-slim' // environment "DATAHUB_ACTIONS_IMAGE", 'nginx' + // Elastic + // environment "DATAHUB_SEARCH_IMAGE", 'elasticsearch' + // environment "DATAHUB_SEARCH_TAG", '7.10.1' + + // OpenSearch + environment "DATAHUB_SEARCH_IMAGE", 'opensearchproject/opensearch' + environment "DATAHUB_SEARCH_TAG", '2.9.0' + environment "XPACK_SECURITY_ENABLED", 'plugins.security.disabled=true' + environment "USE_AWS_ELASTICSEARCH", 'true' + def cmd = [ 'source ../metadata-ingestion/venv/bin/activate 
&& ', 'datahub docker quickstart', diff --git a/docker/docker-compose-with-cassandra.yml b/docker/docker-compose-with-cassandra.yml index 08f8cc1ec9c45..9543e67da07f2 100644 --- a/docker/docker-compose-with-cassandra.yml +++ b/docker/docker-compose-with-cassandra.yml @@ -96,6 +96,9 @@ services: context: ../ dockerfile: docker/elasticsearch-setup/Dockerfile env_file: elasticsearch-setup/env/docker.env + environment: + - ELASTICSEARCH_USE_SSL=${ELASTICSEARCH_USE_SSL:-false} + - USE_AWS_ELASTICSEARCH=${USE_AWS_ELASTICSEARCH:-false} depends_on: elasticsearch: condition: service_healthy @@ -117,13 +120,13 @@ services: elasticsearch: container_name: elasticsearch hostname: elasticsearch - image: elasticsearch:7.10.1 + image: ${DATAHUB_SEARCH_IMAGE:-elasticsearch}:${DATAHUB_SEARCH_TAG:-7.10.1} ports: - 9200:9200 env_file: elasticsearch/env/docker.env environment: - discovery.type=single-node - - xpack.security.enabled=false + - ${XPACK_SECURITY_ENABLED:-xpack.security.enabled=false} healthcheck: test: curl -sS --fail http://elasticsearch:9200/_cluster/health?wait_for_status=yellow&timeout=0s start_period: 5s diff --git a/docker/docker-compose-without-neo4j.yml b/docker/docker-compose-without-neo4j.yml index 0b2e4f76b8fa9..022362782f742 100644 --- a/docker/docker-compose-without-neo4j.yml +++ b/docker/docker-compose-without-neo4j.yml @@ -81,6 +81,9 @@ services: context: ../ dockerfile: docker/elasticsearch-setup/Dockerfile env_file: elasticsearch-setup/env/docker.env + environment: + - ELASTICSEARCH_USE_SSL=${ELASTICSEARCH_USE_SSL:-false} + - USE_AWS_ELASTICSEARCH=${USE_AWS_ELASTICSEARCH:-false} depends_on: elasticsearch: condition: service_healthy @@ -104,13 +107,13 @@ services: elasticsearch: container_name: elasticsearch hostname: elasticsearch - image: elasticsearch:7.10.1 + image: ${DATAHUB_SEARCH_IMAGE:-elasticsearch}:${DATAHUB_SEARCH_TAG:-7.10.1} ports: - ${DATAHUB_MAPPED_ELASTIC_PORT:-9200}:9200 env_file: elasticsearch/env/docker.env environment: - discovery.type=single-node - - xpack.security.enabled=false + - ${XPACK_SECURITY_ENABLED:-xpack.security.enabled=false} deploy: resources: limits: diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index d07ea5fa88f8b..a486689e050a2 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -83,6 +83,9 @@ services: context: ../ dockerfile: docker/elasticsearch-setup/Dockerfile env_file: elasticsearch-setup/env/docker.env + environment: + - ELASTICSEARCH_USE_SSL=${ELASTICSEARCH_USE_SSL:-false} + - USE_AWS_ELASTICSEARCH=${USE_AWS_ELASTICSEARCH:-false} depends_on: elasticsearch: condition: service_healthy @@ -109,13 +112,13 @@ services: elasticsearch: container_name: elasticsearch hostname: elasticsearch - image: elasticsearch:7.10.1 + image: ${DATAHUB_SEARCH_IMAGE:-elasticsearch}:${DATAHUB_SEARCH_TAG:-7.10.1} ports: - ${DATAHUB_MAPPED_ELASTIC_PORT:-9200}:9200 env_file: elasticsearch/env/docker.env environment: - discovery.type=single-node - - xpack.security.enabled=false + - ${XPACK_SECURITY_ENABLED:-xpack.security.enabled=false} deploy: resources: limits: diff --git a/docker/elasticsearch/env/docker.env b/docker/elasticsearch/env/docker.env index 4b1f0215ea6c8..46b5836dedd28 100644 --- a/docker/elasticsearch/env/docker.env +++ b/docker/elasticsearch/env/docker.env @@ -1 +1,2 @@ ES_JAVA_OPTS="-Xms256m -Xmx512m -Dlog4j2.formatMsgNoLookups=true" +OPENSEARCH_JAVA_OPTS="-Xms512m -Xmx512m -Dlog4j2.formatMsgNoLookups=true" \ No newline at end of file diff --git a/docker/quickstart/docker-compose-m1.quickstart.yml 
b/docker/quickstart/docker-compose-m1.quickstart.yml index 38418bc8c41b9..89e9aaa0defd6 100644 --- a/docker/quickstart/docker-compose-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-m1.quickstart.yml @@ -161,8 +161,9 @@ services: memory: 1G environment: - discovery.type=single-node - - xpack.security.enabled=false + - ${XPACK_SECURITY_ENABLED:-xpack.security.enabled=false} - ES_JAVA_OPTS=-Xms256m -Xmx512m -Dlog4j2.formatMsgNoLookups=true + - OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m -Dlog4j2.formatMsgNoLookups=true healthcheck: interval: 1s retries: 3 @@ -170,7 +171,7 @@ services: test: curl -sS --fail http://elasticsearch:$${DATAHUB_MAPPED_ELASTIC_PORT:-9200}/_cluster/health?wait_for_status=yellow&timeout=0s timeout: 5s hostname: elasticsearch - image: elasticsearch:7.10.1 + image: ${DATAHUB_SEARCH_IMAGE:-elasticsearch}:${DATAHUB_SEARCH_TAG:-7.10.1} ports: - ${DATAHUB_MAPPED_ELASTIC_PORT:-9200}:9200 volumes: @@ -181,6 +182,8 @@ services: elasticsearch: condition: service_healthy environment: + - ELASTICSEARCH_USE_SSL=${ELASTICSEARCH_USE_SSL:-false} + - USE_AWS_ELASTICSEARCH=${USE_AWS_ELASTICSEARCH:-false} - ELASTICSEARCH_HOST=elasticsearch - ELASTICSEARCH_PORT=9200 - ELASTICSEARCH_PROTOCOL=http diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml index cf879faa6a3f0..f6284edc83648 100644 --- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml @@ -154,8 +154,9 @@ services: memory: 1G environment: - discovery.type=single-node - - xpack.security.enabled=false + - ${XPACK_SECURITY_ENABLED:-xpack.security.enabled=false} - ES_JAVA_OPTS=-Xms256m -Xmx512m -Dlog4j2.formatMsgNoLookups=true + - OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m -Dlog4j2.formatMsgNoLookups=true healthcheck: interval: 1s retries: 3 @@ -163,7 +164,7 @@ services: test: curl -sS --fail http://elasticsearch:$${DATAHUB_MAPPED_ELASTIC_PORT:-9200}/_cluster/health?wait_for_status=yellow&timeout=0s timeout: 5s hostname: elasticsearch - image: elasticsearch:7.10.1 + image: ${DATAHUB_SEARCH_IMAGE:-elasticsearch}:${DATAHUB_SEARCH_TAG:-7.10.1} ports: - ${DATAHUB_MAPPED_ELASTIC_PORT:-9200}:9200 volumes: @@ -174,6 +175,8 @@ services: elasticsearch: condition: service_healthy environment: + - ELASTICSEARCH_USE_SSL=${ELASTICSEARCH_USE_SSL:-false} + - USE_AWS_ELASTICSEARCH=${USE_AWS_ELASTICSEARCH:-false} - ELASTICSEARCH_HOST=elasticsearch - ELASTICSEARCH_PORT=9200 - ELASTICSEARCH_PROTOCOL=http diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml index 007830078d2b4..4e3503e35c0db 100644 --- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml @@ -154,8 +154,9 @@ services: memory: 1G environment: - discovery.type=single-node - - xpack.security.enabled=false + - ${XPACK_SECURITY_ENABLED:-xpack.security.enabled=false} - ES_JAVA_OPTS=-Xms256m -Xmx512m -Dlog4j2.formatMsgNoLookups=true + - OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m -Dlog4j2.formatMsgNoLookups=true healthcheck: interval: 1s retries: 3 @@ -163,7 +164,7 @@ services: test: curl -sS --fail http://elasticsearch:$${DATAHUB_MAPPED_ELASTIC_PORT:-9200}/_cluster/health?wait_for_status=yellow&timeout=0s timeout: 5s hostname: elasticsearch - image: elasticsearch:7.10.1 + image: ${DATAHUB_SEARCH_IMAGE:-elasticsearch}:${DATAHUB_SEARCH_TAG:-7.10.1} ports: - 
${DATAHUB_MAPPED_ELASTIC_PORT:-9200}:9200 volumes: @@ -174,6 +175,8 @@ services: elasticsearch: condition: service_healthy environment: + - ELASTICSEARCH_USE_SSL=${ELASTICSEARCH_USE_SSL:-false} + - USE_AWS_ELASTICSEARCH=${USE_AWS_ELASTICSEARCH:-false} - ELASTICSEARCH_HOST=elasticsearch - ELASTICSEARCH_PORT=9200 - ELASTICSEARCH_PROTOCOL=http diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml index 390543b92123f..e2f52064389e0 100644 --- a/docker/quickstart/docker-compose.quickstart.yml +++ b/docker/quickstart/docker-compose.quickstart.yml @@ -161,8 +161,9 @@ services: memory: 1G environment: - discovery.type=single-node - - xpack.security.enabled=false + - ${XPACK_SECURITY_ENABLED:-xpack.security.enabled=false} - ES_JAVA_OPTS=-Xms256m -Xmx512m -Dlog4j2.formatMsgNoLookups=true + - OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m -Dlog4j2.formatMsgNoLookups=true healthcheck: interval: 1s retries: 3 @@ -170,7 +171,7 @@ services: test: curl -sS --fail http://elasticsearch:$${DATAHUB_MAPPED_ELASTIC_PORT:-9200}/_cluster/health?wait_for_status=yellow&timeout=0s timeout: 5s hostname: elasticsearch - image: elasticsearch:7.10.1 + image: ${DATAHUB_SEARCH_IMAGE:-elasticsearch}:${DATAHUB_SEARCH_TAG:-7.10.1} ports: - ${DATAHUB_MAPPED_ELASTIC_PORT:-9200}:9200 volumes: @@ -181,6 +182,8 @@ services: elasticsearch: condition: service_healthy environment: + - ELASTICSEARCH_USE_SSL=${ELASTICSEARCH_USE_SSL:-false} + - USE_AWS_ELASTICSEARCH=${USE_AWS_ELASTICSEARCH:-false} - ELASTICSEARCH_HOST=elasticsearch - ELASTICSEARCH_PORT=9200 - ELASTICSEARCH_PROTOCOL=http diff --git a/li-utils/build.gradle b/li-utils/build.gradle index 8f526cffba094..1d5222e39185a 100644 --- a/li-utils/build.gradle +++ b/li-utils/build.gradle @@ -28,6 +28,7 @@ dependencies { testImplementation externalDependency.commonsIo testImplementation project(':test-models') testImplementation project(path: ':test-models', configuration: 'testDataTemplate') + testImplementation externalDependency.testngJava8 } idea { diff --git a/metadata-events/mxe-utils-avro-1.7/build.gradle b/metadata-events/mxe-utils-avro-1.7/build.gradle index 82249d393578c..3b137965d6c19 100644 --- a/metadata-events/mxe-utils-avro-1.7/build.gradle +++ b/metadata-events/mxe-utils-avro-1.7/build.gradle @@ -5,6 +5,7 @@ dependencies { api project(':metadata-models') api spec.product.pegasus.dataAvro1_6 + testImplementation externalDependency.testng testImplementation project(':test-models') testImplementation project(path: ':test-models', configuration: 'testDataTemplate') diff --git a/metadata-integration/java/datahub-client/build.gradle b/metadata-integration/java/datahub-client/build.gradle index fc72fc4257491..95de3cdb3c526 100644 --- a/metadata-integration/java/datahub-client/build.gradle +++ b/metadata-integration/java/datahub-client/build.gradle @@ -49,6 +49,7 @@ dependencies { annotationProcessor externalDependency.lombok // VisibleForTesting compileOnly externalDependency.guava + testImplementation externalDependency.testngJava8 testImplementation externalDependency.mockito testImplementation externalDependency.mockServer testImplementation externalDependency.mockServerClient diff --git a/metadata-integration/java/datahub-client/src/main/resources/MetadataChangeProposal.avsc b/metadata-integration/java/datahub-client/src/main/resources/MetadataChangeProposal.avsc index 6a723090fda07..64216636af26d 100644 --- a/metadata-integration/java/datahub-client/src/main/resources/MetadataChangeProposal.avsc +++ 
b/metadata-integration/java/datahub-client/src/main/resources/MetadataChangeProposal.avsc @@ -143,6 +143,11 @@ "type" : [ "string", "null" ], "doc" : "The last run id that produced the metadata. Populated in case of batch-ingestion.", "default" : "no-run-id-provided" + }, { + "name" : "pipelineName", + "type" : [ "null", "string" ], + "doc" : "The ingestion pipeline id that produced the metadata. Populated in case of batch ingestion.", + "default" : null }, { "name" : "registryName", "type" : [ "null", "string" ], diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufDatasetTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufDatasetTest.java index 748990752f45b..bbb8e532f1033 100644 --- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufDatasetTest.java +++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufDatasetTest.java @@ -26,7 +26,7 @@ import datahub.protobuf.model.ProtobufField; import datahub.protobuf.visitors.ProtobufModelVisitor; import datahub.protobuf.visitors.VisitContext; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.Set; diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufUtilsTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufUtilsTest.java index 58e78435a43a5..3a00edca8284a 100644 --- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufUtilsTest.java +++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufUtilsTest.java @@ -3,7 +3,7 @@ import com.google.protobuf.DescriptorProtos; import com.google.protobuf.ExtensionRegistry; import datahub.protobuf.model.ProtobufGraph; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufEnumTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufEnumTest.java index 3696f5795e1f9..7c98077690d66 100644 --- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufEnumTest.java +++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufEnumTest.java @@ -5,7 +5,7 @@ import com.google.protobuf.DescriptorProtos.FileDescriptorProto; import com.linkedin.schema.EnumType; import com.linkedin.schema.SchemaFieldDataType; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.List; import java.util.Set; diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufFieldTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufFieldTest.java index a21acf7f6c113..543b815f7f72b 100644 --- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufFieldTest.java +++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufFieldTest.java @@ -17,7 +17,7 @@ import com.linkedin.schema.SchemaMetadata; import com.linkedin.schema.StringType; import datahub.protobuf.ProtobufDataset; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.Arrays; diff --git 
a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufGraphTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufGraphTest.java index a7e6dd035160c..80ffafff3f451 100644 --- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufGraphTest.java +++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufGraphTest.java @@ -1,7 +1,7 @@ package datahub.protobuf.model; import com.google.protobuf.DescriptorProtos.FileDescriptorSet; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.HashSet; diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufMessageTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufMessageTest.java index 035c16552aeb5..e961b6ffd2d61 100644 --- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufMessageTest.java +++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufMessageTest.java @@ -5,7 +5,7 @@ import com.linkedin.schema.MapType; import com.linkedin.schema.RecordType; import com.linkedin.schema.SchemaFieldDataType; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.List; import java.util.Set; diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufOneOfFieldTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufOneOfFieldTest.java index f9b168437643b..438e0a79206bd 100644 --- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufOneOfFieldTest.java +++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufOneOfFieldTest.java @@ -6,7 +6,7 @@ import com.google.protobuf.DescriptorProtos.OneofDescriptorProto; import com.linkedin.schema.SchemaFieldDataType; import com.linkedin.schema.UnionType; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.List; import java.util.Set; diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/VisitContextTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/VisitContextTest.java index 9645c6b66ef5f..ceebefb3a207e 100644 --- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/VisitContextTest.java +++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/VisitContextTest.java @@ -5,7 +5,7 @@ import datahub.protobuf.model.ProtobufElement; import datahub.protobuf.model.ProtobufGraph; import org.jgrapht.GraphPath; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.List; diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DatasetVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DatasetVisitorTest.java index 165823d8e4925..fb51f42a6c759 100644 --- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DatasetVisitorTest.java +++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DatasetVisitorTest.java @@ -2,7 +2,7 @@ import com.linkedin.common.urn.DatasetUrn; import 
com.linkedin.data.template.RecordTemplate; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.net.URISyntaxException; diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DescriptionVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DescriptionVisitorTest.java index c5c20f8928ec3..4edc65b29d663 100644 --- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DescriptionVisitorTest.java +++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DescriptionVisitorTest.java @@ -1,7 +1,7 @@ package datahub.protobuf.visitors.dataset; import datahub.protobuf.model.ProtobufGraph; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.List; diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DomainVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DomainVisitorTest.java index 0420953a647cb..b3fa2c8fd081b 100644 --- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DomainVisitorTest.java +++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DomainVisitorTest.java @@ -2,7 +2,7 @@ import com.linkedin.common.urn.Urn; import datahub.protobuf.model.ProtobufGraph; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.List; diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/InstitutionalMemoryVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/InstitutionalMemoryVisitorTest.java index a313681c5a5a0..09fc0a3765436 100644 --- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/InstitutionalMemoryVisitorTest.java +++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/InstitutionalMemoryVisitorTest.java @@ -2,7 +2,7 @@ import com.linkedin.common.InstitutionalMemoryMetadata; import com.linkedin.common.url.Url; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.List; diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/KafkaTopicPropertyVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/KafkaTopicPropertyVisitorTest.java index 84e7eb19f893b..971500b5f43a2 100644 --- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/KafkaTopicPropertyVisitorTest.java +++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/KafkaTopicPropertyVisitorTest.java @@ -2,7 +2,7 @@ import com.linkedin.data.template.StringMap; import com.linkedin.dataset.DatasetProperties; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.List; diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/OwnershipVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/OwnershipVisitorTest.java index cf2649e86dc43..b087c683f9ffe 100644 --- 
a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/OwnershipVisitorTest.java +++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/OwnershipVisitorTest.java @@ -6,7 +6,7 @@ import com.linkedin.common.OwnershipType; import com.linkedin.common.urn.Urn; import datahub.protobuf.model.ProtobufGraph; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.List; diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/PropertyVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/PropertyVisitorTest.java index 2316416729bef..dc3647cdf34c8 100644 --- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/PropertyVisitorTest.java +++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/PropertyVisitorTest.java @@ -2,7 +2,7 @@ import com.linkedin.data.template.StringMap; import com.linkedin.dataset.DatasetProperties; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.List; diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/TermAssociationVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/TermAssociationVisitorTest.java index 04fd52cf82e84..c140a798ef6e6 100644 --- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/TermAssociationVisitorTest.java +++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/TermAssociationVisitorTest.java @@ -2,7 +2,7 @@ import com.linkedin.common.GlossaryTermAssociation; import com.linkedin.common.urn.GlossaryTermUrn; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.List; diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/ProtobufExtensionFieldVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/ProtobufExtensionFieldVisitorTest.java index 0a1928310bfc2..57a8cf1d63cd2 100644 --- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/ProtobufExtensionFieldVisitorTest.java +++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/ProtobufExtensionFieldVisitorTest.java @@ -15,7 +15,7 @@ import com.linkedin.schema.StringType; import com.linkedin.util.Pair; import datahub.protobuf.ProtobufDataset; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.net.URISyntaxException; diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/SchemaFieldVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/SchemaFieldVisitorTest.java index 6c855e70d7f37..1da29b5320637 100644 --- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/SchemaFieldVisitorTest.java +++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/SchemaFieldVisitorTest.java @@ -7,7 +7,7 @@ import com.linkedin.schema.UnionType; import com.linkedin.util.Pair; import datahub.protobuf.ProtobufDataset; -import org.junit.Test; 
+import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.List; diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/tag/TagVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/tag/TagVisitorTest.java index 6fe1098f5e99a..84ab1312a7d8a 100644 --- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/tag/TagVisitorTest.java +++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/tag/TagVisitorTest.java @@ -3,7 +3,7 @@ import com.linkedin.tag.TagProperties; import datahub.protobuf.visitors.tags.TagVisitor; import datahub.event.MetadataChangeProposalWrapper; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.List; diff --git a/metadata-io/build.gradle b/metadata-io/build.gradle index a2c643516dce6..ad54cf6524398 100644 --- a/metadata-io/build.gradle +++ b/metadata-io/build.gradle @@ -31,7 +31,7 @@ dependencies { api externalDependency.datastaxOssCore api externalDependency.datastaxOssQueryBuilder api externalDependency.elasticSearchRest - api externalDependency.elasticSearchTransport + api externalDependency.elasticSearchJava implementation externalDependency.javatuples api externalDependency.javaxValidation runtimeOnly externalDependency.jna @@ -64,6 +64,7 @@ dependencies { testImplementation externalDependency.testContainers testImplementation externalDependency.testContainersJunit testImplementation externalDependency.testContainersElasticsearch + testImplementation externalDependency.testContainersOpenSearch testImplementation externalDependency.testContainersCassandra testImplementation externalDependency.lombok testImplementation externalDependency.springBootTest @@ -101,14 +102,20 @@ dependencies { } test { - // https://docs.gradle.org/current/userguide/performance.html - maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 + doFirst { + // override, testng controlling parallelization + // increasing >1 will merely run all tests extra times + maxParallelForks = 1 + } + useTestNG() { + suites 'src/test/resources/testng.xml' + } testLogging.showStandardStreams = true testLogging.exceptionFormat = 'full' } tasks.withType(Test) { - enableAssertions = false + enableAssertions = false } project.compileJava { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java index 8df7a9600ca94..946931a54f4ec 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java @@ -45,15 +45,15 @@ import lombok.Value; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.tuple.Pair; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.QueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.SearchHit; -import org.elasticsearch.search.builder.SearchSourceBuilder; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.action.search.SearchResponse; +import org.opensearch.client.RequestOptions; 
+import org.opensearch.client.RestHighLevelClient; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.query.QueryBuilder; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.search.SearchHit; +import org.opensearch.search.builder.SearchSourceBuilder; import static com.linkedin.metadata.graph.elastic.ElasticSearchGraphService.*; @@ -297,12 +297,12 @@ private List getLineageRelationships(@Nonnull List ent // Get search query for given list of edges and source urns @VisibleForTesting - static QueryBuilder getQueryForLineage( - @Nonnull List urns, - @Nonnull List lineageEdges, - @Nonnull GraphFilters graphFilters, - @Nullable Long startTimeMillis, - @Nullable Long endTimeMillis) { + public static QueryBuilder getQueryForLineage( + @Nonnull List urns, + @Nonnull List lineageEdges, + @Nonnull GraphFilters graphFilters, + @Nullable Long startTimeMillis, + @Nullable Long endTimeMillis) { BoolQueryBuilder query = QueryBuilders.boolQuery(); if (lineageEdges.isEmpty()) { return query; @@ -361,10 +361,10 @@ static QueryBuilder getQueryForLineage( * physically stored inside the Graph Store. */ @VisibleForTesting - static void addEdgeToPaths( - @Nonnull final Map existingPaths, - @Nonnull final Urn parentUrn, - @Nonnull final Urn childUrn) { + public static void addEdgeToPaths( + @Nonnull final Map existingPaths, + @Nonnull final Urn parentUrn, + @Nonnull final Urn childUrn) { // Collect all full-paths to this child node. This is what will be returned. UrnArrayArray pathsToParent = existingPaths.get(parentUrn); if (pathsToParent != null && pathsToParent.size() > 0) { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphWriteDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphWriteDAO.java index 8d2fcaa857541..f8b0e8a291e7a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphWriteDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphWriteDAO.java @@ -10,11 +10,11 @@ import javax.annotation.Nullable; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.delete.DeleteRequest; -import org.elasticsearch.action.update.UpdateRequest; -import org.elasticsearch.common.xcontent.XContentType; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.reindex.BulkByScrollResponse; +import org.opensearch.action.delete.DeleteRequest; +import org.opensearch.action.update.UpdateRequest; +import org.opensearch.common.xcontent.XContentType; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.reindex.BulkByScrollResponse; import static com.linkedin.metadata.graph.elastic.ESGraphQueryDAO.buildQuery; import static com.linkedin.metadata.graph.elastic.ElasticSearchGraphService.INDEX_NAME; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java index 346befca22559..02e36af343b07 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java @@ -45,8 +45,8 @@ import javax.annotation.Nullable; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.index.query.QueryBuilders; 
+import org.opensearch.action.search.SearchResponse; +import org.opensearch.index.query.QueryBuilders; @Slf4j diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/TimeFilterUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/TimeFilterUtils.java index 66422c5997d17..1df938f902e0f 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/TimeFilterUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/TimeFilterUtils.java @@ -1,9 +1,9 @@ package com.linkedin.metadata.graph.elastic; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.QueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.query.QueryBuilder; +import org.opensearch.index.query.QueryBuilders; import static com.linkedin.metadata.graph.elastic.ESGraphQueryDAO.*; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/MostPopularSource.java b/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/MostPopularSource.java index ea1f6cead80a9..6985ceb00afd2 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/MostPopularSource.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/MostPopularSource.java @@ -28,17 +28,17 @@ import javax.annotation.Nonnull; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.client.indices.GetIndexRequest; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.aggregations.AggregationBuilder; -import org.elasticsearch.search.aggregations.AggregationBuilders; -import org.elasticsearch.search.aggregations.bucket.terms.ParsedTerms; -import org.elasticsearch.search.builder.SearchSourceBuilder; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.action.search.SearchResponse; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.RestHighLevelClient; +import org.opensearch.client.indices.GetIndexRequest; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.search.aggregations.AggregationBuilder; +import org.opensearch.search.aggregations.AggregationBuilders; +import org.opensearch.search.aggregations.bucket.terms.ParsedTerms; +import org.opensearch.search.builder.SearchSourceBuilder; @Slf4j diff --git a/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlyEditedSource.java b/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlyEditedSource.java index 402b579b13879..dc30d4c80abc0 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlyEditedSource.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlyEditedSource.java @@ -28,18 +28,18 @@ import javax.annotation.Nonnull; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.search.SearchRequest; -import 
org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.client.indices.GetIndexRequest; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.aggregations.AggregationBuilder; -import org.elasticsearch.search.aggregations.AggregationBuilders; -import org.elasticsearch.search.aggregations.BucketOrder; -import org.elasticsearch.search.aggregations.bucket.terms.ParsedTerms; -import org.elasticsearch.search.builder.SearchSourceBuilder; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.action.search.SearchResponse; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.RestHighLevelClient; +import org.opensearch.client.indices.GetIndexRequest; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.search.aggregations.AggregationBuilder; +import org.opensearch.search.aggregations.AggregationBuilders; +import org.opensearch.search.aggregations.BucketOrder; +import org.opensearch.search.aggregations.bucket.terms.ParsedTerms; +import org.opensearch.search.builder.SearchSourceBuilder; @Slf4j diff --git a/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlyViewedSource.java b/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlyViewedSource.java index 6ef207dada497..0836c569ed5d1 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlyViewedSource.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlyViewedSource.java @@ -28,18 +28,18 @@ import javax.annotation.Nonnull; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.client.indices.GetIndexRequest; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.aggregations.AggregationBuilder; -import org.elasticsearch.search.aggregations.AggregationBuilders; -import org.elasticsearch.search.aggregations.BucketOrder; -import org.elasticsearch.search.aggregations.bucket.terms.ParsedTerms; -import org.elasticsearch.search.builder.SearchSourceBuilder; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.action.search.SearchResponse; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.RestHighLevelClient; +import org.opensearch.client.indices.GetIndexRequest; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.search.aggregations.AggregationBuilder; +import org.opensearch.search.aggregations.AggregationBuilders; +import org.opensearch.search.aggregations.BucketOrder; +import org.opensearch.search.aggregations.bucket.terms.ParsedTerms; +import org.opensearch.search.builder.SearchSourceBuilder; @Slf4j diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java index 
32adce458770d..bf4dffe9e5fb8 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java @@ -27,7 +27,7 @@ import com.linkedin.metadata.shared.ElasticSearchIndexed; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.search.SearchResponse; +import org.opensearch.action.search.SearchResponse; @Slf4j diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java index 14f67ddcbf337..10c2fd725dca9 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java @@ -30,36 +30,36 @@ import lombok.Getter; import lombok.extern.slf4j.Slf4j; import org.apache.http.client.config.RequestConfig; -import org.elasticsearch.ElasticsearchException; -import org.elasticsearch.action.admin.cluster.node.tasks.list.ListTasksRequest; -import org.elasticsearch.action.admin.indices.alias.IndicesAliasesRequest; -import org.elasticsearch.action.admin.indices.alias.IndicesAliasesRequest.AliasActions; -import org.elasticsearch.action.admin.indices.alias.get.GetAliasesRequest; -import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest; -import org.elasticsearch.action.admin.indices.settings.get.GetSettingsRequest; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.client.GetAliasesResponse; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.client.core.CountRequest; -import org.elasticsearch.client.indices.CreateIndexRequest; -import org.elasticsearch.client.indices.GetIndexRequest; -import org.elasticsearch.client.indices.GetIndexResponse; -import org.elasticsearch.client.indices.GetMappingsRequest; -import org.elasticsearch.client.indices.PutMappingRequest; -import org.elasticsearch.client.tasks.TaskSubmissionResponse; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.common.unit.TimeValue; -import org.elasticsearch.index.query.QueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.index.reindex.ReindexRequest; -import org.elasticsearch.action.admin.indices.settings.put.UpdateSettingsRequest; -import org.elasticsearch.search.SearchHit; -import org.elasticsearch.search.builder.SearchSourceBuilder; -import org.elasticsearch.search.sort.SortBuilders; -import org.elasticsearch.search.sort.SortOrder; -import org.elasticsearch.tasks.TaskInfo; +import org.opensearch.OpenSearchException; +import org.opensearch.action.admin.cluster.node.tasks.list.ListTasksRequest; +import org.opensearch.action.admin.indices.alias.IndicesAliasesRequest; +import org.opensearch.action.admin.indices.alias.IndicesAliasesRequest.AliasActions; +import org.opensearch.action.admin.indices.alias.get.GetAliasesRequest; +import org.opensearch.action.admin.indices.delete.DeleteIndexRequest; +import org.opensearch.action.admin.indices.settings.get.GetSettingsRequest; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.action.search.SearchResponse; +import 
org.opensearch.client.GetAliasesResponse;
+import org.opensearch.client.RequestOptions;
+import org.opensearch.client.RestHighLevelClient;
+import org.opensearch.client.core.CountRequest;
+import org.opensearch.client.indices.CreateIndexRequest;
+import org.opensearch.client.indices.GetIndexRequest;
+import org.opensearch.client.indices.GetIndexResponse;
+import org.opensearch.client.indices.GetMappingsRequest;
+import org.opensearch.client.indices.PutMappingRequest;
+import org.opensearch.client.tasks.TaskSubmissionResponse;
+import org.opensearch.common.settings.Settings;
+import org.opensearch.common.unit.TimeValue;
+import org.opensearch.index.query.QueryBuilder;
+import org.opensearch.index.query.QueryBuilders;
+import org.opensearch.index.reindex.ReindexRequest;
+import org.opensearch.action.admin.indices.settings.put.UpdateSettingsRequest;
+import org.opensearch.search.SearchHit;
+import org.opensearch.search.builder.SearchSourceBuilder;
+import org.opensearch.search.sort.SortBuilders;
+import org.opensearch.search.sort.SortOrder;
+import org.opensearch.tasks.TaskInfo;
@Slf4j
@@ -117,7 +117,7 @@ public ESIndexBuilder(RestHighLevelClient searchClient, int numShards, int numRe
RetryConfig config = RetryConfig.custom()
.maxAttempts(Math.max(1, numRetries))
.waitDuration(Duration.ofSeconds(10))
- .retryOnException(e -> e instanceof ElasticsearchException)
+ .retryOnException(e -> e instanceof OpenSearchException)
.failAfterMaxAttempts(true)
.build();
@@ -153,7 +153,8 @@ public ReindexConfig buildReindexState(String indexName, Map map
Settings currentSettings = _searchClient.indices()
.getSettings(new GetSettingsRequest().indices(indexName), RequestOptions.DEFAULT)
.getIndexToSettings()
- .valuesIt()
+ .values()
+ .iterator()
.next();
builder.currentSettings(currentSettings);
@@ -170,6 +171,15 @@ public ReindexConfig buildReindexState(String indexName, Map map
return builder.build();
}
+ /**
+ * Builds index with given name, mappings and settings
+ * Deprecated: Use the `buildIndex(ReindexConfig indexState) to enforce conventions via ReindexConfig class
+ * earlier in the process.
+ * @param indexName index name
+ * @param mappings ES mappings
+ * @param settings ES settings
+ * @throws IOException ES error
+ */
@Deprecated
public void buildIndex(String indexName, Map mappings, Map settings) throws IOException {
buildIndex(buildReindexState(indexName, mappings, settings));
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ReindexConfig.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ReindexConfig.java
index a0c0bd85c04c6..4f5f2926d3da0 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ReindexConfig.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ReindexConfig.java
@@ -10,7 +10,7 @@
import lombok.Getter;
import lombok.experimental.Accessors;
import lombok.extern.slf4j.Slf4j;
-import org.elasticsearch.common.settings.Settings;
+import org.opensearch.common.settings.Settings;
import java.util.List;
import java.util.Map;
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java
index 3cb3c441afd68..5fd0a80d23c50 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java
@@ -38,21 +38,21 @@
import lombok.Value;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang.StringUtils;
-import org.elasticsearch.action.search.SearchRequest;
-import org.elasticsearch.action.search.SearchResponse;
-import org.elasticsearch.client.RequestOptions;
-import org.elasticsearch.client.RestHighLevelClient;
-import org.elasticsearch.index.query.BoolQueryBuilder;
-import org.elasticsearch.index.query.QueryBuilder;
-import org.elasticsearch.index.query.QueryBuilders;
-import org.elasticsearch.search.SearchHit;
-import org.elasticsearch.search.aggregations.AggregationBuilder;
-import org.elasticsearch.search.aggregations.AggregationBuilders;
-import org.elasticsearch.search.aggregations.bucket.terms.IncludeExclude;
-import org.elasticsearch.search.aggregations.bucket.terms.ParsedTerms;
-import org.elasticsearch.search.aggregations.bucket.terms.Terms;
-import org.elasticsearch.search.builder.SearchSourceBuilder;
-import org.elasticsearch.search.sort.SortOrder;
+import org.opensearch.action.search.SearchRequest;
+import org.opensearch.action.search.SearchResponse;
+import org.opensearch.client.RequestOptions;
+import org.opensearch.client.RestHighLevelClient;
+import org.opensearch.index.query.BoolQueryBuilder;
+import org.opensearch.index.query.QueryBuilder;
+import org.opensearch.index.query.QueryBuilders;
+import org.opensearch.search.SearchHit;
+import org.opensearch.search.aggregations.AggregationBuilder;
+import org.opensearch.search.aggregations.AggregationBuilders;
+import org.opensearch.search.aggregations.bucket.terms.IncludeExclude;
+import org.opensearch.search.aggregations.bucket.terms.ParsedTerms;
+import org.opensearch.search.aggregations.bucket.terms.Terms;
+import org.opensearch.search.builder.SearchSourceBuilder;
+import org.opensearch.search.sort.SortOrder;
import static com.linkedin.metadata.utils.SearchUtil.filterSoftDeletedByDefault;
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java
b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java index f3864d99ba5e9..cbaf70ca22617 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java @@ -35,20 +35,20 @@ import javax.annotation.Nullable; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.client.Request; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.Response; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.client.core.CountRequest; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.common.xcontent.LoggingDeprecationHandler; -import org.elasticsearch.common.xcontent.NamedXContentRegistry; -import org.elasticsearch.common.xcontent.XContentParser; -import org.elasticsearch.common.xcontent.XContentType; -import org.elasticsearch.search.SearchModule; -import org.elasticsearch.search.builder.SearchSourceBuilder; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.action.search.SearchResponse; +import org.opensearch.client.Request; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.Response; +import org.opensearch.client.RestHighLevelClient; +import org.opensearch.client.core.CountRequest; +import org.opensearch.common.settings.Settings; +import org.opensearch.common.xcontent.LoggingDeprecationHandler; +import org.opensearch.core.xcontent.NamedXContentRegistry; +import org.opensearch.core.xcontent.XContentParser; +import org.opensearch.common.xcontent.XContentType; +import org.opensearch.search.SearchModule; +import org.opensearch.search.builder.SearchSourceBuilder; import static com.linkedin.metadata.Constants.*; import static com.linkedin.metadata.models.registry.template.util.TemplateUtil.*; @@ -63,7 +63,7 @@ public class ESSearchDAO { private static final NamedXContentRegistry X_CONTENT_REGISTRY; static { - SearchModule searchModule = new SearchModule(Settings.EMPTY, false, Collections.emptyList()); + SearchModule searchModule = new SearchModule(Settings.EMPTY, Collections.emptyList()); X_CONTENT_REGISTRY = new NamedXContentRegistry(searchModule.getNamedXContents()); } @@ -137,7 +137,7 @@ private AggregationMetadata transformAggregationMetadata(@Nonnull AggregationMet } @VisibleForTesting - SearchResult transformIndexIntoEntityName(SearchResult result) { + public SearchResult transformIndexIntoEntityName(SearchResult result) { return result.setMetadata(result.getMetadata().setAggregations(transformIndexIntoEntityName(result.getMetadata().getAggregations()))); } private ScrollResult transformIndexIntoEntityName(ScrollResult result) { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilder.java index d95bbcf893628..e2bdea84eda0e 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilder.java @@ -11,8 +11,8 @@ import javax.annotation.Nonnull; import javax.annotation.Nullable; import 
lombok.extern.slf4j.Slf4j; -import org.elasticsearch.search.aggregations.AggregationBuilder; -import org.elasticsearch.search.aggregations.AggregationBuilders; +import org.opensearch.search.aggregations.AggregationBuilder; +import org.opensearch.search.aggregations.AggregationBuilders; import static com.linkedin.metadata.utils.SearchUtil.*; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/AutocompleteRequestHandler.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/AutocompleteRequestHandler.java index f4be46e58f3b8..bba3a9fa4232d 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/AutocompleteRequestHandler.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/AutocompleteRequestHandler.java @@ -24,15 +24,15 @@ import javax.annotation.Nonnull; import javax.annotation.Nullable; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.MultiMatchQueryBuilder; -import org.elasticsearch.index.query.QueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.SearchHit; -import org.elasticsearch.search.builder.SearchSourceBuilder; -import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.action.search.SearchResponse; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.query.MultiMatchQueryBuilder; +import org.opensearch.index.query.QueryBuilder; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.search.SearchHit; +import org.opensearch.search.builder.SearchSourceBuilder; +import org.opensearch.search.fetch.subphase.highlight.HighlightBuilder; import static com.linkedin.metadata.models.SearchableFieldSpecExtractor.PRIMARY_URN_SEARCH_PROPERTIES; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/PITAwareSearchRequest.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/PITAwareSearchRequest.java index c0b1ac028e9d4..79c00fc7cdd20 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/PITAwareSearchRequest.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/PITAwareSearchRequest.java @@ -1,7 +1,7 @@ package com.linkedin.metadata.search.elasticsearch.query.request; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.action.support.IndicesOptions; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.action.support.IndicesOptions; public class PITAwareSearchRequest extends SearchRequest { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java index b01c736ec23ae..ce88f31449c35 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java @@ -35,24 +35,24 @@ import com.linkedin.metadata.search.utils.ESUtils; import 
lombok.extern.slf4j.Slf4j; -import org.elasticsearch.common.lucene.search.function.CombineFunction; -import org.elasticsearch.common.lucene.search.function.FieldValueFactorFunction; -import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.common.xcontent.LoggingDeprecationHandler; -import org.elasticsearch.common.xcontent.NamedXContentRegistry; -import org.elasticsearch.common.xcontent.XContentParser; -import org.elasticsearch.common.xcontent.XContentType; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.Operator; -import org.elasticsearch.index.query.QueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.index.query.QueryStringQueryBuilder; -import org.elasticsearch.index.query.SimpleQueryStringBuilder; -import org.elasticsearch.index.query.functionscore.FieldValueFactorFunctionBuilder; -import org.elasticsearch.index.query.functionscore.FunctionScoreQueryBuilder; -import org.elasticsearch.index.query.functionscore.ScoreFunctionBuilders; -import org.elasticsearch.search.SearchModule; +import org.opensearch.common.lucene.search.function.CombineFunction; +import org.opensearch.common.lucene.search.function.FieldValueFactorFunction; +import org.opensearch.common.lucene.search.function.FunctionScoreQuery; +import org.opensearch.common.settings.Settings; +import org.opensearch.common.xcontent.LoggingDeprecationHandler; +import org.opensearch.core.xcontent.NamedXContentRegistry; +import org.opensearch.common.xcontent.XContentType; +import org.opensearch.core.xcontent.XContentParser; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.query.Operator; +import org.opensearch.index.query.QueryBuilder; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.index.query.QueryStringQueryBuilder; +import org.opensearch.index.query.SimpleQueryStringBuilder; +import org.opensearch.index.query.functionscore.FieldValueFactorFunctionBuilder; +import org.opensearch.index.query.functionscore.FunctionScoreQueryBuilder; +import org.opensearch.index.query.functionscore.ScoreFunctionBuilders; +import org.opensearch.search.SearchModule; import static com.linkedin.metadata.models.SearchableFieldSpecExtractor.PRIMARY_URN_SEARCH_PROPERTIES; import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.*; @@ -69,7 +69,7 @@ public class SearchQueryBuilder { } private static final NamedXContentRegistry X_CONTENT_REGISTRY; static { - SearchModule searchModule = new SearchModule(Settings.EMPTY, false, Collections.emptyList()); + SearchModule searchModule = new SearchModule(Settings.EMPTY, Collections.emptyList()); X_CONTENT_REGISTRY = new NamedXContentRegistry(searchModule.getNamedXContents()); } @@ -135,7 +135,7 @@ private QueryBuilder buildInternalQuery(@Nullable QueryConfiguration customQuery * @return A set of SearchFieldConfigs containing the searchable fields from the input entities. 
*/ @VisibleForTesting - Set getStandardFields(@Nonnull Collection entitySpecs) { + public Set getStandardFields(@Nonnull Collection entitySpecs) { Set fields = new HashSet<>(); // Always present final float urnBoost = Float.parseFloat((String) PRIMARY_URN_SEARCH_PROPERTIES.get("boostScore")); @@ -168,7 +168,7 @@ Set getStandardFields(@Nonnull Collection entityS } @VisibleForTesting - Set getFieldsFromEntitySpec(EntitySpec entitySpec) { + public Set getFieldsFromEntitySpec(EntitySpec entitySpec) { Set fields = new HashSet<>(); List searchableFieldSpecs = entitySpec.getSearchableFieldSpecs(); for (SearchableFieldSpec fieldSpec : searchableFieldSpecs) { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java index dbd933d59d7f3..5fcc10b7af5cf 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java @@ -54,23 +54,23 @@ import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang.StringUtils; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.common.text.Text; -import org.elasticsearch.common.unit.TimeValue; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.QueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.SearchHit; -import org.elasticsearch.search.aggregations.Aggregation; -import org.elasticsearch.search.aggregations.AggregationBuilders; -import org.elasticsearch.search.aggregations.Aggregations; -import org.elasticsearch.search.aggregations.bucket.terms.ParsedTerms; -import org.elasticsearch.search.aggregations.bucket.terms.Terms; -import org.elasticsearch.search.builder.SearchSourceBuilder; -import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder; -import org.elasticsearch.search.fetch.subphase.highlight.HighlightField; -import org.elasticsearch.search.suggest.term.TermSuggestion; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.action.search.SearchResponse; +import org.opensearch.common.text.Text; +import org.opensearch.common.unit.TimeValue; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.query.QueryBuilder; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.search.SearchHit; +import org.opensearch.search.aggregations.Aggregation; +import org.opensearch.search.aggregations.AggregationBuilders; +import org.opensearch.search.aggregations.Aggregations; +import org.opensearch.search.aggregations.bucket.terms.ParsedTerms; +import org.opensearch.search.aggregations.bucket.terms.Terms; +import org.opensearch.search.builder.SearchSourceBuilder; +import org.opensearch.search.fetch.subphase.highlight.HighlightBuilder; +import org.opensearch.search.fetch.subphase.highlight.HighlightField; +import org.opensearch.search.suggest.term.TermSuggestion; import static com.linkedin.metadata.search.utils.ESUtils.NAME_SUGGESTION; import static com.linkedin.metadata.search.utils.ESUtils.toFacetField; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/BulkListener.java 
b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/BulkListener.java
index 297453bdce517..be64df3179a9d 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/BulkListener.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/BulkListener.java
@@ -2,11 +2,11 @@
import com.linkedin.metadata.utils.metrics.MetricUtils;
import lombok.extern.slf4j.Slf4j;
-import org.elasticsearch.action.DocWriteRequest;
-import org.elasticsearch.action.bulk.BulkProcessor;
-import org.elasticsearch.action.bulk.BulkRequest;
-import org.elasticsearch.action.bulk.BulkResponse;
-import org.elasticsearch.action.support.WriteRequest;
+import org.opensearch.action.DocWriteRequest;
+import org.opensearch.action.bulk.BulkProcessor;
+import org.opensearch.action.bulk.BulkRequest;
+import org.opensearch.action.bulk.BulkResponse;
+import org.opensearch.action.support.WriteRequest;
import java.util.Arrays;
import java.util.HashMap;
@@ -76,7 +76,7 @@ private static String buildMetricName(DocWriteRequest.OpType opType, String stat
public static String buildBulkRequestSummary(BulkRequest request) {
return request.requests().stream().map(req -> String.format(
"Failed to perform bulk request: index [%s], optype: [%s], type [%s], id [%s]",
- req.index(), req.opType(), req.type(), req.id())
+ req.index(), req.opType(), req.opType(), req.id())
).collect(Collectors.joining(";"));
}
}
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/ESBulkProcessor.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/ESBulkProcessor.java
index a7ece47a7f5d6..a1e5b363d8a78 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/ESBulkProcessor.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/ESBulkProcessor.java
@@ -7,19 +7,19 @@
import lombok.NonNull;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
-import org.elasticsearch.action.DocWriteRequest;
-import org.elasticsearch.action.bulk.BackoffPolicy;
-import org.elasticsearch.action.bulk.BulkProcessor;
-import org.elasticsearch.action.bulk.BulkResponse;
-import org.elasticsearch.action.support.WriteRequest;
-import org.elasticsearch.client.RequestOptions;
-import org.elasticsearch.client.RestHighLevelClient;
-import org.elasticsearch.client.tasks.TaskSubmissionResponse;
-import org.elasticsearch.common.Nullable;
-import org.elasticsearch.common.unit.TimeValue;
-import org.elasticsearch.index.query.QueryBuilder;
-import org.elasticsearch.index.reindex.BulkByScrollResponse;
-import org.elasticsearch.index.reindex.DeleteByQueryRequest;
+import org.opensearch.action.DocWriteRequest;
+import org.opensearch.action.bulk.BackoffPolicy;
+import org.opensearch.action.bulk.BulkProcessor;
+import org.opensearch.action.bulk.BulkResponse;
+import org.opensearch.action.support.WriteRequest;
+import org.opensearch.client.RequestOptions;
+import org.opensearch.client.RestHighLevelClient;
+import org.opensearch.client.tasks.TaskSubmissionResponse;
+import org.opensearch.common.Nullable;
+import org.opensearch.common.unit.TimeValue;
+import org.opensearch.index.query.QueryBuilder;
+import org.opensearch.index.reindex.BulkByScrollResponse;
+import org.opensearch.index.reindex.DeleteByQueryRequest;
import java.io.Closeable;
import java.io.IOException;
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/ESWriteDAO.java
b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/ESWriteDAO.java index 1a63f2d4d0312..edcdf5654028c 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/ESWriteDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/ESWriteDAO.java @@ -6,15 +6,15 @@ import javax.annotation.Nonnull; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.delete.DeleteRequest; -import org.elasticsearch.action.update.UpdateRequest; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.client.indices.GetIndexRequest; -import org.elasticsearch.client.indices.GetIndexResponse; -import org.elasticsearch.common.xcontent.XContentType; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.script.Script; +import org.opensearch.action.delete.DeleteRequest; +import org.opensearch.action.update.UpdateRequest; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.RestHighLevelClient; +import org.opensearch.client.indices.GetIndexRequest; +import org.opensearch.client.indices.GetIndexResponse; +import org.opensearch.common.xcontent.XContentType; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.script.Script; @Slf4j diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java index 12c081a5c25a6..9a7d9a1b4c420 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java @@ -16,21 +16,21 @@ import javax.annotation.Nonnull; import javax.annotation.Nullable; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.client.RequestOptions; +import org.opensearch.client.RequestOptions; import org.apache.commons.lang.StringUtils; -import org.elasticsearch.common.unit.TimeValue; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.QueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.builder.PointInTimeBuilder; -import org.elasticsearch.search.builder.SearchSourceBuilder; -import org.elasticsearch.search.sort.FieldSortBuilder; -import org.elasticsearch.search.sort.ScoreSortBuilder; -import org.elasticsearch.search.sort.SortOrder; -import org.elasticsearch.search.suggest.SuggestBuilder; -import org.elasticsearch.search.suggest.SuggestBuilders; -import org.elasticsearch.search.suggest.SuggestionBuilder; -import org.elasticsearch.search.suggest.term.TermSuggestionBuilder; +import org.opensearch.common.unit.TimeValue; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.query.QueryBuilder; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.search.builder.PointInTimeBuilder; +import org.opensearch.search.builder.SearchSourceBuilder; +import org.opensearch.search.sort.FieldSortBuilder; +import org.opensearch.search.sort.ScoreSortBuilder; +import org.opensearch.search.sort.SortOrder; +import org.opensearch.search.suggest.SuggestBuilder; +import org.opensearch.search.suggest.SuggestBuilders; +import org.opensearch.search.suggest.SuggestionBuilder; +import org.opensearch.search.suggest.term.TermSuggestionBuilder; import static 
com.linkedin.metadata.search.elasticsearch.query.request.SearchFieldConfig.KEYWORD_FIELDS; import static com.linkedin.metadata.search.elasticsearch.query.request.SearchFieldConfig.PATH_HIERARCHY_FIELDS; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ESSystemMetadataDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ESSystemMetadataDAO.java index c7e8d0940c530..5eb03eb23d01a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ESSystemMetadataDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ESSystemMetadataDAO.java @@ -13,26 +13,26 @@ import javax.annotation.Nullable; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.delete.DeleteRequest; -import org.elasticsearch.action.delete.DeleteResponse; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.action.update.UpdateRequest; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.client.tasks.GetTaskRequest; -import org.elasticsearch.client.tasks.GetTaskResponse; -import org.elasticsearch.common.xcontent.XContentType; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.index.reindex.BulkByScrollResponse; -import org.elasticsearch.search.aggregations.AggregationBuilders; -import org.elasticsearch.search.aggregations.PipelineAggregatorBuilders; -import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder; -import org.elasticsearch.search.aggregations.pipeline.BucketSortPipelineAggregationBuilder; -import org.elasticsearch.search.builder.SearchSourceBuilder; -import org.elasticsearch.search.sort.FieldSortBuilder; -import org.elasticsearch.search.sort.SortOrder; +import org.opensearch.action.delete.DeleteRequest; +import org.opensearch.action.delete.DeleteResponse; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.action.search.SearchResponse; +import org.opensearch.action.update.UpdateRequest; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.RestHighLevelClient; +import org.opensearch.client.tasks.GetTaskRequest; +import org.opensearch.client.tasks.GetTaskResponse; +import org.opensearch.common.xcontent.XContentType; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.index.reindex.BulkByScrollResponse; +import org.opensearch.search.aggregations.AggregationBuilders; +import org.opensearch.search.aggregations.PipelineAggregatorBuilders; +import org.opensearch.search.aggregations.bucket.terms.TermsAggregationBuilder; +import org.opensearch.search.aggregations.pipeline.BucketSortPipelineAggregationBuilder; +import org.opensearch.search.builder.SearchSourceBuilder; +import org.opensearch.search.sort.FieldSortBuilder; +import org.opensearch.search.sort.SortOrder; import static com.linkedin.metadata.systemmetadata.ElasticSearchSystemMetadataService.INDEX_NAME; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java index 3fcb62424853a..dd8e19861ccd2 100644 --- 
a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java @@ -31,14 +31,14 @@ import javax.annotation.Nullable; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.client.tasks.GetTaskResponse; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.SearchHits; -import org.elasticsearch.search.aggregations.bucket.filter.ParsedFilter; -import org.elasticsearch.search.aggregations.bucket.terms.ParsedStringTerms; -import org.elasticsearch.search.aggregations.bucket.terms.Terms; -import org.elasticsearch.search.aggregations.metrics.ParsedMax; +import org.opensearch.action.search.SearchResponse; +import org.opensearch.client.tasks.GetTaskResponse; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.search.SearchHits; +import org.opensearch.search.aggregations.bucket.filter.ParsedFilter; +import org.opensearch.search.aggregations.bucket.terms.ParsedStringTerms; +import org.opensearch.search.aggregations.bucket.terms.Terms; +import org.opensearch.search.aggregations.metrics.ParsedMax; @Slf4j diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java index 01fe41718d7f0..43ba87f474d6a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java @@ -49,26 +49,26 @@ import javax.annotation.Nonnull; import javax.annotation.Nullable; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.action.update.UpdateRequest; -import org.elasticsearch.client.Request; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.Response; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.client.core.CountRequest; -import org.elasticsearch.client.core.CountResponse; -import org.elasticsearch.client.tasks.TaskSubmissionResponse; -import org.elasticsearch.common.unit.TimeValue; -import org.elasticsearch.common.xcontent.XContentType; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.QueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.SearchHit; -import org.elasticsearch.search.SearchHits; -import org.elasticsearch.search.builder.SearchSourceBuilder; -import org.elasticsearch.search.sort.SortBuilders; -import org.elasticsearch.search.sort.SortOrder; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.action.search.SearchResponse; +import org.opensearch.action.update.UpdateRequest; +import org.opensearch.client.Request; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.Response; +import org.opensearch.client.RestHighLevelClient; +import org.opensearch.client.core.CountRequest; +import org.opensearch.client.core.CountResponse; +import org.opensearch.client.tasks.TaskSubmissionResponse; +import org.opensearch.common.unit.TimeValue; +import 
org.opensearch.common.xcontent.XContentType; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.query.QueryBuilder; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.search.SearchHit; +import org.opensearch.search.SearchHits; +import org.opensearch.search.builder.SearchSourceBuilder; +import org.opensearch.search.sort.SortBuilders; +import org.opensearch.search.sort.SortOrder; import static com.linkedin.metadata.Constants.*; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/indexbuilder/TimeseriesAspectIndexBuilders.java b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/indexbuilder/TimeseriesAspectIndexBuilders.java index 6c5dbf2582c05..b0751a9c6f9ea 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/indexbuilder/TimeseriesAspectIndexBuilders.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/indexbuilder/TimeseriesAspectIndexBuilders.java @@ -17,7 +17,7 @@ import javax.annotation.Nullable; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.index.query.QueryBuilder; +import org.opensearch.index.query.QueryBuilder; @Slf4j diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/query/ESAggregatedStatsDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/query/ESAggregatedStatsDAO.java index 5389d602ae5c1..316d25d1f37f4 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/query/ESAggregatedStatsDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/query/ESAggregatedStatsDAO.java @@ -30,24 +30,24 @@ import javax.annotation.Nonnull; import javax.annotation.Nullable; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.aggregations.AggregationBuilder; -import org.elasticsearch.search.aggregations.AggregationBuilders; -import org.elasticsearch.search.aggregations.Aggregations; -import org.elasticsearch.search.aggregations.BucketOrder; -import org.elasticsearch.search.aggregations.PipelineAggregatorBuilders; -import org.elasticsearch.search.aggregations.bucket.MultiBucketsAggregation; -import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval; -import org.elasticsearch.search.aggregations.metrics.ParsedCardinality; -import org.elasticsearch.search.aggregations.metrics.ParsedSum; -import org.elasticsearch.search.aggregations.pipeline.MaxBucketPipelineAggregationBuilder; -import org.elasticsearch.search.aggregations.pipeline.ParsedBucketMetricValue; -import org.elasticsearch.search.builder.SearchSourceBuilder; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.action.search.SearchResponse; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.RestHighLevelClient; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.search.aggregations.AggregationBuilder; +import org.opensearch.search.aggregations.AggregationBuilders; +import org.opensearch.search.aggregations.Aggregations; +import 
org.opensearch.search.aggregations.BucketOrder; +import org.opensearch.search.aggregations.PipelineAggregatorBuilders; +import org.opensearch.search.aggregations.bucket.MultiBucketsAggregation; +import org.opensearch.search.aggregations.bucket.histogram.DateHistogramInterval; +import org.opensearch.search.aggregations.metrics.ParsedCardinality; +import org.opensearch.search.aggregations.metrics.ParsedSum; +import org.opensearch.search.aggregations.pipeline.MaxBucketPipelineAggregationBuilder; +import org.opensearch.search.aggregations.pipeline.ParsedBucketMetricValue; +import org.opensearch.search.builder.SearchSourceBuilder; @Slf4j diff --git a/metadata-io/src/test/java/com/linkedin/metadata/AspectUtilsTest.java b/metadata-io/src/test/java/com/linkedin/metadata/AspectUtilsTest.java index 46d08bc8887b9..54fb2bc8b1f65 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/AspectUtilsTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/AspectUtilsTest.java @@ -39,7 +39,7 @@ public AspectUtilsTest() throws EntityRegistryException { @Test public void testAdditionalChanges() { - Database server = EbeanTestUtils.createTestServer(); + Database server = EbeanTestUtils.createTestServer(AspectUtilsTest.class.getSimpleName()); EbeanAspectDao aspectDao = new EbeanAspectDao(server); aspectDao.setConnectionValidated(true); EventProducer mockProducer = mock(EventProducer.class); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java b/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java deleted file mode 100644 index 327447341badf..0000000000000 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java +++ /dev/null @@ -1,153 +0,0 @@ -package com.linkedin.metadata; - -import com.fasterxml.jackson.dataformat.yaml.YAMLMapper; -import com.linkedin.metadata.config.search.CustomConfiguration; -import com.linkedin.metadata.config.search.ElasticSearchConfiguration; -import com.linkedin.metadata.config.search.ExactMatchConfiguration; -import com.linkedin.metadata.config.search.PartialConfiguration; -import com.linkedin.metadata.config.search.SearchConfiguration; -import com.linkedin.metadata.config.search.WordGramConfiguration; -import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; -import com.linkedin.metadata.models.registry.ConfigEntityRegistry; -import com.linkedin.metadata.models.registry.EntityRegistry; -import com.linkedin.metadata.models.registry.EntityRegistryException; -import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; -import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; -import com.linkedin.metadata.version.GitVersion; -import java.util.Optional; -import org.apache.http.HttpHost; -import org.apache.http.impl.nio.reactor.IOReactorConfig; -import org.elasticsearch.action.support.WriteRequest; -import org.elasticsearch.client.RestClient; -import org.elasticsearch.client.RestClientBuilder; -import org.elasticsearch.client.RestHighLevelClient; -import org.springframework.beans.factory.annotation.Qualifier; -import org.springframework.boot.test.context.TestConfiguration; -import org.springframework.context.annotation.Bean; -import org.springframework.context.annotation.Primary; -import org.springframework.context.annotation.Scope; -import org.testcontainers.elasticsearch.ElasticsearchContainer; - -import javax.annotation.Nonnull; - -import java.util.Map; - - -@TestConfiguration -public class ESTestConfiguration { - private static final int 
HTTP_PORT = 9200; - public static final int REFRESH_INTERVAL_SECONDS = 5; - - public static void syncAfterWrite(ESBulkProcessor bulkProcessor) throws InterruptedException { - bulkProcessor.flush(); - Thread.sleep(1000); - } - - @Bean - public SearchConfiguration searchConfiguration() { - SearchConfiguration searchConfiguration = new SearchConfiguration(); - searchConfiguration.setMaxTermBucketSize(20); - - ExactMatchConfiguration exactMatchConfiguration = new ExactMatchConfiguration(); - exactMatchConfiguration.setExclusive(false); - exactMatchConfiguration.setExactFactor(10.0f); - exactMatchConfiguration.setWithPrefix(true); - exactMatchConfiguration.setPrefixFactor(6.0f); - exactMatchConfiguration.setCaseSensitivityFactor(0.7f); - exactMatchConfiguration.setEnableStructured(true); - - WordGramConfiguration wordGramConfiguration = new WordGramConfiguration(); - wordGramConfiguration.setTwoGramFactor(1.2f); - wordGramConfiguration.setThreeGramFactor(1.5f); - wordGramConfiguration.setFourGramFactor(1.8f); - - PartialConfiguration partialConfiguration = new PartialConfiguration(); - partialConfiguration.setFactor(0.4f); - partialConfiguration.setUrnFactor(0.5f); - - searchConfiguration.setExactMatch(exactMatchConfiguration); - searchConfiguration.setWordGram(wordGramConfiguration); - searchConfiguration.setPartial(partialConfiguration); - return searchConfiguration; - } - - @Bean - public CustomSearchConfiguration customSearchConfiguration() throws Exception { - CustomConfiguration customConfiguration = new CustomConfiguration(); - customConfiguration.setEnabled(true); - customConfiguration.setFile("search_config_builder_test.yml"); - return customConfiguration.resolve(new YAMLMapper()); - } - - @Scope("singleton") - @Bean(name = "testElasticsearchContainer") - @Nonnull - public ElasticsearchContainer elasticsearchContainer() { - ESTestUtils.ES_CONTAINER.start(); - return ESTestUtils.ES_CONTAINER; - } - - @Primary - @Scope("singleton") - @Bean(name = "elasticSearchRestHighLevelClient") - @Nonnull - public RestHighLevelClient getElasticsearchClient(@Qualifier("testElasticsearchContainer") ElasticsearchContainer esContainer) { - // A helper method to create an ElasticseachContainer defaulting to the current image and version, with the ability - // within firewalled environments to override with an environment variable to point to the offline repository. - // A helper method to construct a standard rest client for Elastic search. - final RestClientBuilder builder = - RestClient.builder(new HttpHost( - "localhost", - esContainer.getMappedPort(HTTP_PORT), "http") - ).setHttpClientConfigCallback(httpAsyncClientBuilder -> - httpAsyncClientBuilder.setDefaultIOReactorConfig(IOReactorConfig.custom().setIoThreadCount(1).build())); - - builder.setRequestConfigCallback(requestConfigBuilder -> requestConfigBuilder. - setConnectionRequestTimeout(30000)); - - return new RestHighLevelClient(builder); - } - - /* - Cannot use the factory class without circular dependencies - */ - @Primary - @Bean(name = "elasticSearchBulkProcessor") - @Nonnull - public ESBulkProcessor getBulkProcessor(@Qualifier("elasticSearchRestHighLevelClient") RestHighLevelClient searchClient) { - return ESBulkProcessor.builder(searchClient) - .async(true) - /* - * Force a refresh as part of this request. This refresh policy does not scale for high indexing or search throughput but is useful - * to present a consistent view to for indices with very low traffic. And it is wonderful for tests! 
- */ - .writeRequestRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE) - .bulkRequestsLimit(10000) - .bulkFlushPeriod(REFRESH_INTERVAL_SECONDS - 1) - .retryInterval(1L) - .numRetries(1) - .build(); - } - - @Primary - @Bean(name = "elasticSearchIndexBuilder") - @Nonnull - protected ESIndexBuilder getIndexBuilder(@Qualifier("elasticSearchRestHighLevelClient") RestHighLevelClient searchClient) { - GitVersion gitVersion = new GitVersion("0.0.0-test", "123456", Optional.empty()); - return new ESIndexBuilder(searchClient, 1, 1, 3, 1, Map.of(), - false, false, - new ElasticSearchConfiguration(), gitVersion); - } - - @Bean(name = "entityRegistry") - public EntityRegistry entityRegistry() throws EntityRegistryException { - return new ConfigEntityRegistry( - ESTestConfiguration.class.getClassLoader().getResourceAsStream("entity-registry.yml")); - } - - @Bean(name = "longTailEntityRegistry") - public EntityRegistry longTailEntityRegistry() throws EntityRegistryException { - return new ConfigEntityRegistry( - ESTestConfiguration.class.getClassLoader().getResourceAsStream("entity-registry.yml")); - } -} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/EbeanTestUtils.java b/metadata-io/src/test/java/com/linkedin/metadata/EbeanTestUtils.java index 180166e963fca..c6eefede8a860 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/EbeanTestUtils.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/EbeanTestUtils.java @@ -2,7 +2,7 @@ import io.ebean.Database; import io.ebean.DatabaseFactory; -import io.ebean.config.ServerConfig; +import io.ebean.config.DatabaseConfig; import io.ebean.datasource.DataSourceConfig; import javax.annotation.Nonnull; @@ -13,19 +13,19 @@ private EbeanTestUtils() { } @Nonnull - public static Database createTestServer() { - return DatabaseFactory.create(createTestingH2ServerConfig()); + public static Database createTestServer(String instanceId) { + return DatabaseFactory.create(createTestingH2ServerConfig(instanceId)); } @Nonnull - private static ServerConfig createTestingH2ServerConfig() { + private static DatabaseConfig createTestingH2ServerConfig(String instanceId) { DataSourceConfig dataSourceConfig = new DataSourceConfig(); dataSourceConfig.setUsername("tester"); dataSourceConfig.setPassword(""); - dataSourceConfig.setUrl("jdbc:h2:mem:test;IGNORECASE=TRUE;mode=mysql;"); + dataSourceConfig.setUrl(String.format("jdbc:h2:mem:%s;IGNORECASE=TRUE;mode=mysql;", instanceId)); dataSourceConfig.setDriver("org.h2.Driver"); - ServerConfig serverConfig = new ServerConfig(); + DatabaseConfig serverConfig = new DatabaseConfig(); serverConfig.setName("gma"); serverConfig.setDataSourceConfig(dataSourceConfig); serverConfig.setDdlGenerate(true); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/elasticsearch/update/BulkListenerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/elasticsearch/update/BulkListenerTest.java index 154131ceb6fee..10a73cbe532a2 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/elasticsearch/update/BulkListenerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/elasticsearch/update/BulkListenerTest.java @@ -1,8 +1,8 @@ package com.linkedin.metadata.elasticsearch.update; import com.linkedin.metadata.search.elasticsearch.update.BulkListener; -import org.elasticsearch.action.bulk.BulkRequest; -import org.elasticsearch.action.support.WriteRequest; +import org.opensearch.action.bulk.BulkRequest; +import org.opensearch.action.support.WriteRequest; import org.mockito.Mockito; import 
org.testng.annotations.Test; diff --git a/metadata-io/src/test/java/com/linkedin/metadata/elasticsearch/update/ESBulkProcessorTest.java b/metadata-io/src/test/java/com/linkedin/metadata/elasticsearch/update/ESBulkProcessorTest.java index 5c882e5158f90..2d84c9f3444de 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/elasticsearch/update/ESBulkProcessorTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/elasticsearch/update/ESBulkProcessorTest.java @@ -1,7 +1,7 @@ package com.linkedin.metadata.elasticsearch.update; import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; -import org.elasticsearch.client.RestHighLevelClient; +import org.opensearch.client.RestHighLevelClient; import org.mockito.Mockito; import org.testng.annotations.Test; diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanAspectMigrationsDaoTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanAspectMigrationsDaoTest.java index 9e453e6e75677..38b2ed4ed199a 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanAspectMigrationsDaoTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanAspectMigrationsDaoTest.java @@ -22,7 +22,7 @@ public EbeanAspectMigrationsDaoTest() throws EntityRegistryException { @BeforeMethod public void setupTest() { - Database server = EbeanTestUtils.createTestServer(); + Database server = EbeanTestUtils.createTestServer(EbeanAspectMigrationsDaoTest.class.getSimpleName()); _mockProducer = mock(EventProducer.class); EbeanAspectDao dao = new EbeanAspectDao(server); dao.setConnectionValidated(true); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java index 90f9baa4ca4c2..e8a7d8740d328 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java @@ -22,7 +22,7 @@ import com.linkedin.metadata.utils.PegasusUtils; import com.linkedin.mxe.MetadataChangeProposal; import com.linkedin.mxe.SystemMetadata; -import io.datahub.test.DataGenerator; +import io.datahubproject.test.DataGenerator; import io.ebean.Database; import io.ebean.Transaction; import io.ebean.TxScope; @@ -61,7 +61,8 @@ public EbeanEntityServiceTest() throws EntityRegistryException { @BeforeMethod public void setupTest() { - Database server = EbeanTestUtils.createTestServer(); + Database server = EbeanTestUtils.createTestServer(EbeanEntityServiceTest.class.getSimpleName()); + _mockProducer = mock(EventProducer.class); _aspectDao = new EbeanAspectDao(server); @@ -239,6 +240,7 @@ public void testNestedTransactions() throws AssertionError { System.out.println("done"); } + @Test public void dataGeneratorThreadingTest() { DataGenerator dataGenerator = new DataGenerator(_entityServiceImpl); @@ -262,7 +264,7 @@ public void dataGeneratorThreadingTest() { * This test is designed to detect multi-threading persistence exceptions like duplicate key, * exceptions that exceed retry limits or unnecessary versions. 
*/ - @Test + @Test // ensure same thread as h2 public void multiThreadingTest() { DataGenerator dataGenerator = new DataGenerator(_entityServiceImpl); Database server = ((EbeanAspectDao) _entityServiceImpl._aspectDao).getServer(); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAOTest.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/ESGraphQueryDAOTest.java similarity index 98% rename from metadata-io/src/test/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAOTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/graph/search/ESGraphQueryDAOTest.java index 3ba2c858fb1a3..baed3ade0d207 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAOTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/ESGraphQueryDAOTest.java @@ -1,4 +1,4 @@ -package com.linkedin.metadata.graph.elastic; +package com.linkedin.metadata.graph.search; import com.google.common.collect.ImmutableList; import com.google.common.io.Resources; @@ -8,6 +8,7 @@ import com.linkedin.common.urn.UrnUtils; import com.linkedin.metadata.Constants; import com.linkedin.metadata.graph.GraphFilters; +import com.linkedin.metadata.graph.elastic.ESGraphQueryDAO; import com.linkedin.metadata.models.registry.LineageRegistry; import com.linkedin.metadata.query.filter.RelationshipDirection; import java.net.URL; @@ -16,7 +17,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import org.elasticsearch.index.query.QueryBuilder; +import org.opensearch.index.query.QueryBuilder; import org.testng.Assert; import org.testng.annotations.Test; diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java similarity index 93% rename from metadata-io/src/test/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphServiceTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java index 1717e466359d3..0ce43c9d31571 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java @@ -1,12 +1,11 @@ -package com.linkedin.metadata.graph.elastic; +package com.linkedin.metadata.graph.search; -import com.linkedin.metadata.config.search.GraphQueryConfiguration; import com.linkedin.common.FabricType; import com.linkedin.common.urn.DataPlatformUrn; import com.linkedin.common.urn.DatasetUrn; import com.linkedin.common.urn.TagUrn; import com.linkedin.common.urn.Urn; -import com.linkedin.metadata.ESTestConfiguration; +import com.linkedin.metadata.config.search.GraphQueryConfiguration; import com.linkedin.metadata.graph.Edge; import com.linkedin.metadata.graph.EntityLineageResult; import com.linkedin.metadata.graph.GraphService; @@ -14,6 +13,9 @@ import com.linkedin.metadata.graph.LineageDirection; import com.linkedin.metadata.graph.RelatedEntitiesResult; import com.linkedin.metadata.graph.RelatedEntity; +import com.linkedin.metadata.graph.elastic.ESGraphQueryDAO; +import com.linkedin.metadata.graph.elastic.ESGraphWriteDAO; +import com.linkedin.metadata.graph.elastic.ElasticSearchGraphService; import com.linkedin.metadata.models.registry.LineageRegistry; import com.linkedin.metadata.models.registry.SnapshotEntityRegistry; import com.linkedin.metadata.query.filter.Filter; @@ 
-23,18 +25,17 @@ import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; -import java.util.Arrays; -import java.util.Collections; -import org.elasticsearch.client.RestHighLevelClient; +import io.datahubproject.test.search.SearchTestUtils; import org.junit.Assert; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.context.annotation.Import; +import org.opensearch.client.RestHighLevelClient; import org.testng.SkipException; import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; import javax.annotation.Nonnull; +import java.util.Arrays; +import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.List; @@ -43,15 +44,16 @@ import static com.linkedin.metadata.search.utils.QueryUtils.*; import static org.testng.Assert.assertEquals; -@Import(ESTestConfiguration.class) -public class ElasticSearchGraphServiceTest extends GraphServiceTestBase { +abstract public class SearchGraphServiceTestBase extends GraphServiceTestBase { - @Autowired - private RestHighLevelClient _searchClient; - @Autowired - private ESBulkProcessor _bulkProcessor; - @Autowired - private ESIndexBuilder _esIndexBuilder; + @Nonnull + abstract protected RestHighLevelClient getSearchClient(); + + @Nonnull + abstract protected ESBulkProcessor getBulkProcessor(); + + @Nonnull + abstract protected ESIndexBuilder getIndexBuilder(); private final IndexConvention _indexConvention = new IndexConventionImpl(null); private final String _indexName = _indexConvention.getIndexName(INDEX_NAME); @@ -74,10 +76,10 @@ public void wipe() throws Exception { @Nonnull private ElasticSearchGraphService buildService() { LineageRegistry lineageRegistry = new LineageRegistry(SnapshotEntityRegistry.getInstance()); - ESGraphQueryDAO readDAO = new ESGraphQueryDAO(_searchClient, lineageRegistry, _indexConvention, GraphQueryConfiguration.testDefaults); - ESGraphWriteDAO writeDAO = new ESGraphWriteDAO(_indexConvention, _bulkProcessor, 1); - return new ElasticSearchGraphService(lineageRegistry, _bulkProcessor, _indexConvention, writeDAO, readDAO, - _esIndexBuilder); + ESGraphQueryDAO readDAO = new ESGraphQueryDAO(getSearchClient(), lineageRegistry, _indexConvention, GraphQueryConfiguration.testDefaults); + ESGraphWriteDAO writeDAO = new ESGraphWriteDAO(_indexConvention, getBulkProcessor(), 1); + return new ElasticSearchGraphService(lineageRegistry, getBulkProcessor(), _indexConvention, writeDAO, readDAO, + getIndexBuilder()); } @Override @@ -88,7 +90,7 @@ protected GraphService getGraphService() { @Override protected void syncAfterWrite() throws Exception { - ESTestConfiguration.syncAfterWrite(_bulkProcessor); + SearchTestUtils.syncAfterWrite(getBulkProcessor()); } @Override diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/elastic/TimeFilterUtilsTest.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/TimeFilterUtilsTest.java similarity index 82% rename from metadata-io/src/test/java/com/linkedin/metadata/graph/elastic/TimeFilterUtilsTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/graph/search/TimeFilterUtilsTest.java index 988a7ccc70741..989f9ae197239 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/elastic/TimeFilterUtilsTest.java +++ 
b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/TimeFilterUtilsTest.java @@ -1,9 +1,11 @@ -package com.linkedin.metadata.graph.elastic; +package com.linkedin.metadata.graph.search; import com.google.common.io.Resources; import java.net.URL; import java.nio.charset.StandardCharsets; -import org.elasticsearch.index.query.QueryBuilder; + +import com.linkedin.metadata.graph.elastic.TimeFilterUtils; +import org.opensearch.index.query.QueryBuilder; import org.testng.Assert; import org.testng.annotations.Test; diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/search/elasticsearch/SearchGraphServiceElasticSearchTest.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/elasticsearch/SearchGraphServiceElasticSearchTest.java new file mode 100644 index 0000000000000..7b550311bf823 --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/elasticsearch/SearchGraphServiceElasticSearchTest.java @@ -0,0 +1,49 @@ +package com.linkedin.metadata.graph.search.elasticsearch; + +import com.linkedin.metadata.graph.search.SearchGraphServiceTestBase; +import com.linkedin.metadata.search.elasticsearch.ElasticSearchSuite; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; + +import io.datahubproject.test.search.config.SearchTestContainerConfiguration; +import org.jetbrains.annotations.NotNull; +import org.opensearch.client.RestHighLevelClient; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Import; +import org.testng.AssertJUnit; +import org.testng.annotations.Test; + +@Import({ElasticSearchSuite.class, SearchTestContainerConfiguration.class}) +public class SearchGraphServiceElasticSearchTest extends SearchGraphServiceTestBase { + + @Autowired + private RestHighLevelClient _searchClient; + @Autowired + private ESBulkProcessor _bulkProcessor; + @Autowired + private ESIndexBuilder _esIndexBuilder; + + @NotNull + @Override + protected RestHighLevelClient getSearchClient() { + return _searchClient; + } + + @NotNull + @Override + protected ESBulkProcessor getBulkProcessor() { + return _bulkProcessor; + } + + @NotNull + @Override + protected ESIndexBuilder getIndexBuilder() { + return _esIndexBuilder; + } + + @Test + public void initTest() { + AssertJUnit.assertNotNull(_searchClient); + } + +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/search/opensearch/SearchGraphServiceOpenSearchTest.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/opensearch/SearchGraphServiceOpenSearchTest.java new file mode 100644 index 0000000000000..eabfb523fb910 --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/opensearch/SearchGraphServiceOpenSearchTest.java @@ -0,0 +1,48 @@ +package com.linkedin.metadata.graph.search.opensearch; + +import com.linkedin.metadata.graph.search.SearchGraphServiceTestBase; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; +import com.linkedin.metadata.search.opensearch.OpenSearchSuite; +import io.datahubproject.test.search.config.SearchTestContainerConfiguration; +import org.jetbrains.annotations.NotNull; +import org.opensearch.client.RestHighLevelClient; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Import; +import org.testng.AssertJUnit; 
+import org.testng.annotations.Test; + +@Import({OpenSearchSuite.class, SearchTestContainerConfiguration.class}) +public class SearchGraphServiceOpenSearchTest extends SearchGraphServiceTestBase { + + @Autowired + private RestHighLevelClient _searchClient; + @Autowired + private ESBulkProcessor _bulkProcessor; + @Autowired + private ESIndexBuilder _esIndexBuilder; + + @NotNull + @Override + protected RestHighLevelClient getSearchClient() { + return _searchClient; + } + + @NotNull + @Override + protected ESBulkProcessor getBulkProcessor() { + return _bulkProcessor; + } + + @NotNull + @Override + protected ESIndexBuilder getIndexBuilder() { + return _esIndexBuilder; + } + + @Test + public void initTest() { + AssertJUnit.assertNotNull(_searchClient); + } + +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/LineageSearchServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/LineageServiceTestBase.java similarity index 94% rename from metadata-io/src/test/java/com/linkedin/metadata/search/LineageSearchServiceTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/search/LineageServiceTestBase.java index faff9f780e31c..461a146022446 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/LineageSearchServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/LineageServiceTestBase.java @@ -1,8 +1,5 @@ package com.linkedin.metadata.search; -import com.linkedin.metadata.config.cache.EntityDocCountCacheConfiguration; -import com.linkedin.metadata.config.cache.SearchLineageCacheConfiguration; -import com.linkedin.metadata.config.search.SearchConfiguration; import com.datahub.test.Snapshot; import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; @@ -16,8 +13,10 @@ import com.linkedin.common.urn.UrnUtils; import com.linkedin.data.schema.annotation.PathSpecBasedSchemaAnnotationVisitor; import com.linkedin.data.template.LongMap; -import com.linkedin.metadata.ESTestConfiguration; import com.linkedin.metadata.TestEntityUtil; +import com.linkedin.metadata.config.cache.EntityDocCountCacheConfiguration; +import com.linkedin.metadata.config.cache.SearchLineageCacheConfiguration; +import com.linkedin.metadata.config.search.SearchConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; import com.linkedin.metadata.graph.EntityLineageResult; import com.linkedin.metadata.graph.GraphService; @@ -47,47 +46,60 @@ import com.linkedin.metadata.search.utils.QueryUtils; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; -import java.net.URISyntaxException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import org.elasticsearch.client.RestHighLevelClient; import org.junit.Assert; import org.mockito.Mockito; -import org.springframework.beans.factory.annotation.Autowired; +import org.opensearch.client.RestHighLevelClient; import org.springframework.cache.CacheManager; import org.springframework.cache.concurrent.ConcurrentMapCacheManager; -import org.springframework.context.annotation.Import; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; import org.testng.annotations.BeforeClass; import 
org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; -import static com.linkedin.metadata.Constants.*; -import static com.linkedin.metadata.ESTestConfiguration.*; -import static org.mockito.ArgumentMatchers.*; -import static org.mockito.Mockito.*; -import static org.testng.Assert.*; - -@Import(ESTestConfiguration.class) -public class LineageSearchServiceTest extends AbstractTestNGSpringContextTests { - - @Autowired - private RestHighLevelClient _searchClient; - @Autowired - private ESBulkProcessor _bulkProcessor; - @Autowired - private ESIndexBuilder _esIndexBuilder; - @Autowired - private SearchConfiguration _searchConfiguration; - @Autowired - private CustomSearchConfiguration _customSearchConfiguration; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.net.URISyntaxException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static com.linkedin.metadata.Constants.DATASET_ENTITY_NAME; +import static com.linkedin.metadata.Constants.ELASTICSEARCH_IMPLEMENTATION_ELASTICSEARCH; +import static io.datahubproject.test.search.SearchTestUtils.syncAfterWrite; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyInt; +import static org.mockito.ArgumentMatchers.anySet; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.reset; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNull; +import static org.testng.Assert.assertTrue; + +abstract public class LineageServiceTestBase extends AbstractTestNGSpringContextTests { + + @Nonnull + abstract protected RestHighLevelClient getSearchClient(); + + @Nonnull + abstract protected ESBulkProcessor getBulkProcessor(); + + @Nonnull + abstract protected ESIndexBuilder getIndexBuilder(); + + @Nonnull + abstract protected SearchConfiguration getSearchConfiguration(); + + @Nonnull + abstract protected CustomSearchConfiguration getCustomSearchConfiguration(); private EntityRegistry _entityRegistry; private IndexConvention _indexConvention; @@ -142,18 +154,18 @@ private void resetService(boolean withCache, boolean withLightingCache) { public void wipe() throws Exception { _elasticSearchService.clear(); clearCache(false); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); } @Nonnull private ElasticSearchService buildEntitySearchService() { EntityIndexBuilders indexBuilders = - new EntityIndexBuilders(_esIndexBuilder, _entityRegistry, + new EntityIndexBuilders(getIndexBuilder(), _entityRegistry, _indexConvention, _settingsBuilder); - ESSearchDAO searchDAO = new ESSearchDAO(_entityRegistry, _searchClient, _indexConvention, false, - ELASTICSEARCH_IMPLEMENTATION_ELASTICSEARCH, _searchConfiguration, null); - ESBrowseDAO browseDAO = new ESBrowseDAO(_entityRegistry, _searchClient, _indexConvention, _searchConfiguration, _customSearchConfiguration); - ESWriteDAO writeDAO = new ESWriteDAO(_entityRegistry, _searchClient, _indexConvention, _bulkProcessor, 1); + ESSearchDAO searchDAO = new ESSearchDAO(_entityRegistry, getSearchClient(), _indexConvention, false, + ELASTICSEARCH_IMPLEMENTATION_ELASTICSEARCH, getSearchConfiguration(), null); + 
ESBrowseDAO browseDAO = new ESBrowseDAO(_entityRegistry, getSearchClient(), _indexConvention, getSearchConfiguration(), getCustomSearchConfiguration()); + ESWriteDAO writeDAO = new ESWriteDAO(_entityRegistry, getSearchClient(), _indexConvention, getBulkProcessor(), 1); return new ElasticSearchService(indexBuilders, searchDAO, browseDAO, writeDAO); } @@ -198,7 +210,7 @@ public void testSearchService() throws Exception { document.set("textFieldOverride", JsonNodeFactory.instance.textNode("textFieldOverride")); document.set("browsePaths", JsonNodeFactory.instance.textNode("/a/b/c")); _elasticSearchService.upsertDocument(ENTITY_NAME, document.toString(), urn.toString()); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); when(_graphService.getLineage(eq(TEST_URN), eq(LineageDirection.DOWNSTREAM), anyInt(), anyInt(), anyInt(), eq(null), eq(null))).thenReturn(mockResult(Collections.emptyList())); @@ -232,7 +244,7 @@ public void testSearchService() throws Exception { document2.set("textFieldOverride", JsonNodeFactory.instance.textNode("textFieldOverride2")); document2.set("browsePaths", JsonNodeFactory.instance.textNode("/b/c")); _elasticSearchService.upsertDocument(ENTITY_NAME, document2.toString(), urn2.toString()); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); searchResult = searchAcrossLineage(null, TEST1); assertEquals(searchResult.getNumEntities().intValue(), 1); @@ -306,7 +318,7 @@ public void testSearchService() throws Exception { // Cleanup _elasticSearchService.deleteDocument(ENTITY_NAME, urn.toString()); _elasticSearchService.deleteDocument(ENTITY_NAME, urn2.toString()); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); when(_graphService.getLineage(eq(TEST_URN), eq(LineageDirection.DOWNSTREAM), anyInt(), anyInt(), anyInt())).thenReturn( @@ -350,7 +362,7 @@ public void testScrollAcrossLineage() throws Exception { document.set("textFieldOverride", JsonNodeFactory.instance.textNode("textFieldOverride")); document.set("browsePaths", JsonNodeFactory.instance.textNode("/a/b/c")); _elasticSearchService.upsertDocument(ENTITY_NAME, document.toString(), urn.toString()); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); when(_graphService.getLineage(eq(TEST_URN), eq(LineageDirection.DOWNSTREAM), anyInt(), anyInt(), anyInt(), eq(null), eq(null))).thenReturn(mockResult(Collections.emptyList())); @@ -383,7 +395,7 @@ public void testScrollAcrossLineage() throws Exception { // Cleanup _elasticSearchService.deleteDocument(ENTITY_NAME, urn.toString()); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); when(_graphService.getLineage(eq(TEST_URN), eq(LineageDirection.DOWNSTREAM), anyInt(), anyInt(), anyInt())).thenReturn( @@ -424,7 +436,7 @@ public void testLightningSearchService() throws Exception { document.set("textFieldOverride", JsonNodeFactory.instance.textNode("textFieldOverride")); document.set("browsePaths", JsonNodeFactory.instance.textNode("/a/b/c")); _elasticSearchService.upsertDocument(ENTITY_NAME, document.toString(), urn.toString()); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); when(_graphService.getLineage(eq(TEST_URN), eq(LineageDirection.DOWNSTREAM), anyInt(), anyInt(), anyInt(), eq(null), eq(null))).thenReturn(mockResult(Collections.emptyList())); @@ -461,7 +473,7 @@ public void testLightningSearchService() throws Exception { document2.set("textFieldOverride", JsonNodeFactory.instance.textNode("textFieldOverride2")); document2.set("browsePaths", 
JsonNodeFactory.instance.textNode("/b/c")); _elasticSearchService.upsertDocument(ENTITY_NAME, document2.toString(), urn2.toString()); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); searchResult = searchAcrossLineage(null, testStar); assertEquals(searchResult.getNumEntities().intValue(), 1); @@ -616,7 +628,7 @@ public void testLightningSearchService() throws Exception { // Cleanup _elasticSearchService.deleteDocument(ENTITY_NAME, urn.toString()); _elasticSearchService.deleteDocument(ENTITY_NAME, urn2.toString()); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); when(_graphService.getLineage(eq(TEST_URN), eq(LineageDirection.DOWNSTREAM), anyInt(), anyInt(), anyInt())).thenReturn( diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTestBase.java similarity index 92% rename from metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTestBase.java index ad836664d7f6d..c0144d36843f5 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTestBase.java @@ -1,7 +1,5 @@ package com.linkedin.metadata.search; -import com.linkedin.metadata.config.cache.EntityDocCountCacheConfiguration; -import com.linkedin.metadata.config.search.SearchConfiguration; import com.datahub.test.Snapshot; import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; @@ -9,7 +7,8 @@ import com.linkedin.common.urn.TestEntityUrn; import com.linkedin.common.urn.Urn; import com.linkedin.data.template.StringArray; -import com.linkedin.metadata.ESTestConfiguration; +import com.linkedin.metadata.config.cache.EntityDocCountCacheConfiguration; +import com.linkedin.metadata.config.search.SearchConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.models.registry.SnapshotEntityRegistry; @@ -33,11 +32,9 @@ import com.linkedin.metadata.search.ranker.SimpleRanker; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; -import org.elasticsearch.client.RestHighLevelClient; -import org.springframework.beans.factory.annotation.Autowired; +import org.opensearch.client.RestHighLevelClient; import org.springframework.cache.CacheManager; import org.springframework.cache.concurrent.ConcurrentMapCacheManager; -import org.springframework.context.annotation.Import; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeMethod; @@ -45,23 +42,28 @@ import javax.annotation.Nonnull; -import static com.linkedin.metadata.Constants.*; -import static com.linkedin.metadata.ESTestConfiguration.syncAfterWrite; +import static com.linkedin.metadata.Constants.ELASTICSEARCH_IMPLEMENTATION_ELASTICSEARCH; +import static io.datahubproject.test.search.SearchTestUtils.syncAfterWrite; import static org.testng.Assert.assertEquals; -@Import(ESTestConfiguration.class) -public class SearchServiceTest extends AbstractTestNGSpringContextTests { - - @Autowired - private RestHighLevelClient _searchClient; - @Autowired - private ESBulkProcessor _bulkProcessor; 
- @Autowired - private ESIndexBuilder _esIndexBuilder; - @Autowired - private SearchConfiguration _searchConfiguration; - @Autowired - private CustomSearchConfiguration _customSearchConfiguration; + +abstract public class SearchServiceTestBase extends AbstractTestNGSpringContextTests { + + @Nonnull + abstract protected RestHighLevelClient getSearchClient(); + + @Nonnull + abstract protected ESBulkProcessor getBulkProcessor(); + + @Nonnull + abstract protected ESIndexBuilder getIndexBuilder(); + + @Nonnull + abstract protected SearchConfiguration getSearchConfiguration(); + + @Nonnull + abstract protected CustomSearchConfiguration getCustomSearchConfiguration(); + private EntityRegistry _entityRegistry; private IndexConvention _indexConvention; private SettingsBuilder _settingsBuilder; @@ -100,19 +102,19 @@ private void resetSearchService() { @BeforeMethod public void wipe() throws Exception { _elasticSearchService.clear(); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); } @Nonnull private ElasticSearchService buildEntitySearchService() { EntityIndexBuilders indexBuilders = - new EntityIndexBuilders(_esIndexBuilder, _entityRegistry, + new EntityIndexBuilders(getIndexBuilder(), _entityRegistry, _indexConvention, _settingsBuilder); - ESSearchDAO searchDAO = new ESSearchDAO(_entityRegistry, _searchClient, _indexConvention, false, - ELASTICSEARCH_IMPLEMENTATION_ELASTICSEARCH, _searchConfiguration, null); - ESBrowseDAO browseDAO = new ESBrowseDAO(_entityRegistry, _searchClient, _indexConvention, _searchConfiguration, _customSearchConfiguration); - ESWriteDAO writeDAO = new ESWriteDAO(_entityRegistry, _searchClient, _indexConvention, - _bulkProcessor, 1); + ESSearchDAO searchDAO = new ESSearchDAO(_entityRegistry, getSearchClient(), _indexConvention, false, + ELASTICSEARCH_IMPLEMENTATION_ELASTICSEARCH, getSearchConfiguration(), null); + ESBrowseDAO browseDAO = new ESBrowseDAO(_entityRegistry, getSearchClient(), _indexConvention, getSearchConfiguration(), getCustomSearchConfiguration()); + ESWriteDAO writeDAO = new ESWriteDAO(_entityRegistry, getSearchClient(), _indexConvention, + getBulkProcessor(), 1); return new ElasticSearchService(indexBuilders, searchDAO, browseDAO, writeDAO); } @@ -139,7 +141,7 @@ public void testSearchService() throws Exception { document.set("textFieldOverride", JsonNodeFactory.instance.textNode("textFieldOverride")); document.set("browsePaths", JsonNodeFactory.instance.textNode("/a/b/c")); _elasticSearchService.upsertDocument(ENTITY_NAME, document.toString(), urn.toString()); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", null, null, 0, 10, new SearchFlags().setFulltext(true)); @@ -154,7 +156,7 @@ public void testSearchService() throws Exception { document2.set("textFieldOverride", JsonNodeFactory.instance.textNode("textFieldOverride2")); document2.set("browsePaths", JsonNodeFactory.instance.textNode("/b/c")); _elasticSearchService.upsertDocument(ENTITY_NAME, document2.toString(), urn2.toString()); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "'test2'", null, null, 0, 10, new SearchFlags().setFulltext(true)); @@ -167,7 +169,7 @@ public void testSearchService() throws Exception { _elasticSearchService.deleteDocument(ENTITY_NAME, urn.toString()); _elasticSearchService.deleteDocument(ENTITY_NAME, urn2.toString()); - syncAfterWrite(_bulkProcessor); + 
syncAfterWrite(getBulkProcessor()); searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "'test2'", null, null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 0); @@ -233,7 +235,7 @@ public void testAdvancedSearchOr() throws Exception { document3.set("platform", JsonNodeFactory.instance.textNode("snowflake")); _elasticSearchService.upsertDocument(ENTITY_NAME, document3.toString(), urn3.toString()); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", filterWithCondition, null, 0, 10, new SearchFlags().setFulltext(true)); @@ -304,7 +306,7 @@ public void testAdvancedSearchSoftDelete() throws Exception { document.set("removed", JsonNodeFactory.instance.booleanNode(false)); _elasticSearchService.upsertDocument(ENTITY_NAME, document3.toString(), urn3.toString()); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", filterWithCondition, null, 0, 10, new SearchFlags().setFulltext(true)); @@ -369,7 +371,7 @@ public void testAdvancedSearchNegated() throws Exception { document.set("removed", JsonNodeFactory.instance.booleanNode(false)); _elasticSearchService.upsertDocument(ENTITY_NAME, document3.toString(), urn3.toString()); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", filterWithCondition, null, 0, 10, new SearchFlags().setFulltext(true)); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/TestEntityTestBase.java similarity index 86% rename from metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchServiceTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/search/TestEntityTestBase.java index 9a6d2dc6fc1fa..d358c03c612d0 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/TestEntityTestBase.java @@ -1,19 +1,18 @@ -package com.linkedin.metadata.search.elasticsearch; +package com.linkedin.metadata.search; -import com.linkedin.metadata.config.search.SearchConfiguration; import com.datahub.test.Snapshot; import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; import com.linkedin.common.urn.TestEntityUrn; import com.linkedin.common.urn.Urn; -import com.linkedin.metadata.ESTestConfiguration; import com.linkedin.data.schema.annotation.PathSpecBasedSchemaAnnotationVisitor; import com.linkedin.metadata.browse.BrowseResult; +import com.linkedin.metadata.config.search.SearchConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.models.registry.SnapshotEntityRegistry; import com.linkedin.metadata.query.SearchFlags; -import com.linkedin.metadata.search.SearchResult; +import com.linkedin.metadata.search.elasticsearch.ElasticSearchService; import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; import com.linkedin.metadata.search.elasticsearch.indexbuilder.EntityIndexBuilders; import com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder; @@ -23,10 
+22,7 @@ import com.linkedin.metadata.search.elasticsearch.update.ESWriteDAO; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; -import java.util.List; -import org.elasticsearch.client.RestHighLevelClient; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.context.annotation.Import; +import org.opensearch.client.RestHighLevelClient; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; import org.testcontainers.shaded.com.google.common.collect.ImmutableMap; import org.testng.annotations.BeforeClass; @@ -34,24 +30,28 @@ import org.testng.annotations.Test; import javax.annotation.Nonnull; +import java.util.List; -import static com.linkedin.metadata.Constants.*; -import static com.linkedin.metadata.ESTestConfiguration.syncAfterWrite; +import static com.linkedin.metadata.Constants.ELASTICSEARCH_IMPLEMENTATION_ELASTICSEARCH; +import static io.datahubproject.test.search.SearchTestUtils.syncAfterWrite; import static org.testng.Assert.assertEquals; -@Import(ESTestConfiguration.class) -public class ElasticSearchServiceTest extends AbstractTestNGSpringContextTests { +abstract public class TestEntityTestBase extends AbstractTestNGSpringContextTests { - @Autowired - private RestHighLevelClient _searchClient; - @Autowired - private ESBulkProcessor _bulkProcessor; - @Autowired - private ESIndexBuilder _esIndexBuilder; - @Autowired - private SearchConfiguration _searchConfiguration; - @Autowired - private CustomSearchConfiguration _customSearchConfiguration; + @Nonnull + abstract protected RestHighLevelClient getSearchClient(); + + @Nonnull + abstract protected ESBulkProcessor getBulkProcessor(); + + @Nonnull + abstract protected ESIndexBuilder getIndexBuilder(); + + @Nonnull + abstract protected SearchConfiguration getSearchConfiguration(); + + @Nonnull + abstract protected CustomSearchConfiguration getCustomSearchConfiguration(); private EntityRegistry _entityRegistry; private IndexConvention _indexConvention; @@ -83,12 +83,12 @@ public void wipe() throws Exception { @Nonnull private ElasticSearchService buildService() { EntityIndexBuilders indexBuilders = - new EntityIndexBuilders(_esIndexBuilder, _entityRegistry, _indexConvention, _settingsBuilder); - ESSearchDAO searchDAO = new ESSearchDAO(_entityRegistry, _searchClient, _indexConvention, false, - ELASTICSEARCH_IMPLEMENTATION_ELASTICSEARCH, _searchConfiguration, null); - ESBrowseDAO browseDAO = new ESBrowseDAO(_entityRegistry, _searchClient, _indexConvention, _searchConfiguration, _customSearchConfiguration); + new EntityIndexBuilders(getIndexBuilder(), _entityRegistry, _indexConvention, _settingsBuilder); + ESSearchDAO searchDAO = new ESSearchDAO(_entityRegistry, getSearchClient(), _indexConvention, false, + ELASTICSEARCH_IMPLEMENTATION_ELASTICSEARCH, getSearchConfiguration(), null); + ESBrowseDAO browseDAO = new ESBrowseDAO(_entityRegistry, getSearchClient(), _indexConvention, getSearchConfiguration(), getCustomSearchConfiguration()); ESWriteDAO writeDAO = - new ESWriteDAO(_entityRegistry, _searchClient, _indexConvention, _bulkProcessor, 1); + new ESWriteDAO(_entityRegistry, getSearchClient(), _indexConvention, getBulkProcessor(), 1); return new ElasticSearchService(indexBuilders, searchDAO, browseDAO, writeDAO); } @@ -109,7 +109,7 @@ public void testElasticSearchServiceStructuredQuery() throws Exception { document.set("browsePaths", JsonNodeFactory.instance.textNode("/a/b/c")); 
document.set("foreignKey", JsonNodeFactory.instance.textNode("urn:li:tag:Node.Value")); _elasticSearchService.upsertDocument(ENTITY_NAME, document.toString(), urn.toString()); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); searchResult = _elasticSearchService.search(List.of(ENTITY_NAME), "test", null, null, 0, 10, new SearchFlags().setFulltext(false)); assertEquals(searchResult.getNumEntities().intValue(), 1); @@ -134,7 +134,7 @@ public void testElasticSearchServiceStructuredQuery() throws Exception { document2.set("textFieldOverride", JsonNodeFactory.instance.textNode("textFieldOverride2")); document2.set("browsePaths", JsonNodeFactory.instance.textNode("/b/c")); _elasticSearchService.upsertDocument(ENTITY_NAME, document2.toString(), urn2.toString()); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); searchResult = _elasticSearchService.search(List.of(ENTITY_NAME), "test2", null, null, 0, 10, new SearchFlags().setFulltext(false)); assertEquals(searchResult.getNumEntities().intValue(), 1); @@ -152,7 +152,7 @@ public void testElasticSearchServiceStructuredQuery() throws Exception { _elasticSearchService.deleteDocument(ENTITY_NAME, urn.toString()); _elasticSearchService.deleteDocument(ENTITY_NAME, urn2.toString()); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); searchResult = _elasticSearchService.search(List.of(ENTITY_NAME), "test2", null, null, 0, 10, new SearchFlags().setFulltext(false)); assertEquals(searchResult.getNumEntities().intValue(), 0); browseResult = _elasticSearchService.browse(ENTITY_NAME, "", null, 0, 10); @@ -174,7 +174,7 @@ public void testElasticSearchServiceFulltext() throws Exception { document.set("browsePaths", JsonNodeFactory.instance.textNode("/a/b/c")); document.set("foreignKey", JsonNodeFactory.instance.textNode("urn:li:tag:Node.Value")); _elasticSearchService.upsertDocument(ENTITY_NAME, document.toString(), urn.toString()); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); searchResult = _elasticSearchService.search(List.of(ENTITY_NAME), "test", null, null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 1); @@ -191,7 +191,7 @@ public void testElasticSearchServiceFulltext() throws Exception { document2.set("textFieldOverride", JsonNodeFactory.instance.textNode("textFieldOverride2")); document2.set("browsePaths", JsonNodeFactory.instance.textNode("/b/c")); _elasticSearchService.upsertDocument(ENTITY_NAME, document2.toString(), urn2.toString()); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); searchResult = _elasticSearchService.search(List.of(ENTITY_NAME), "test2", null, null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 1); @@ -203,7 +203,7 @@ public void testElasticSearchServiceFulltext() throws Exception { _elasticSearchService.deleteDocument(ENTITY_NAME, urn.toString()); _elasticSearchService.deleteDocument(ENTITY_NAME, urn2.toString()); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); searchResult = _elasticSearchService.search(List.of(ENTITY_NAME), "test2", null, null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 0); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchSuite.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchSuite.java new file mode 100644 index 
0000000000000..750423a024dcc
--- /dev/null
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchSuite.java
@@ -0,0 +1,32 @@
+package com.linkedin.metadata.search.elasticsearch;
+
+import io.datahubproject.test.search.ElasticsearchTestContainer;
+import org.springframework.boot.test.context.TestConfiguration;
+import org.springframework.context.annotation.Bean;
+import org.springframework.test.context.testng.AbstractTestNGSpringContextTests;
+import org.testcontainers.containers.GenericContainer;
+import org.testng.annotations.AfterSuite;
+
+
+@TestConfiguration
+public class ElasticSearchSuite extends AbstractTestNGSpringContextTests {
+
+  private static final ElasticsearchTestContainer ELASTICSEARCH_TEST_CONTAINER;
+  private static GenericContainer container;
+  static {
+    ELASTICSEARCH_TEST_CONTAINER = new ElasticsearchTestContainer();
+  }
+
+  @AfterSuite
+  public void after() {
+    ELASTICSEARCH_TEST_CONTAINER.stopContainer();
+  }
+
+  @Bean(name = "testSearchContainer")
+  public GenericContainer testSearchContainer() {
+    if (container == null) {
+      container = ELASTICSEARCH_TEST_CONTAINER.startContainer();
+    }
+    return container;
+  }
+}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/GoldenElasticSearchTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/GoldenElasticSearchTest.java
new file mode 100644
index 0000000000000..cfacd4c15409a
--- /dev/null
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/GoldenElasticSearchTest.java
@@ -0,0 +1,44 @@
+package com.linkedin.metadata.search.elasticsearch;
+
+import com.linkedin.metadata.search.fixtures.GoldenTestBase;
+import io.datahubproject.test.fixtures.search.SampleDataFixtureConfiguration;
+import com.linkedin.metadata.models.registry.EntityRegistry;
+import com.linkedin.metadata.search.SearchService;
+import io.datahubproject.test.search.config.SearchTestContainerConfiguration;
+import org.jetbrains.annotations.NotNull;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Qualifier;
+import org.springframework.context.annotation.Import;
+import org.testng.annotations.Test;
+
+import static org.testng.AssertJUnit.assertNotNull;
+
+@Import({ElasticSearchSuite.class, SampleDataFixtureConfiguration.class, SearchTestContainerConfiguration.class})
+public class GoldenElasticSearchTest extends GoldenTestBase {
+
+  @Autowired
+  @Qualifier("longTailSearchService")
+  protected SearchService searchService;
+
+  @Autowired
+  @Qualifier("entityRegistry")
+  private EntityRegistry entityRegistry;
+
+
+  @NotNull
+  @Override
+  protected EntityRegistry getEntityRegistry() {
+    return entityRegistry;
+  }
+
+  @NotNull
+  @Override
+  protected SearchService getSearchService() {
+    return searchService;
+  }
+
+  @Test
+  public void initTest() {
+    assertNotNull(searchService);
+  }
+}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/IndexBuilderElasticSearchTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/IndexBuilderElasticSearchTest.java
new file mode 100644
index 0000000000000..20f4ee52f0e62
--- /dev/null
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/IndexBuilderElasticSearchTest.java
@@ -0,0 +1,30 @@
+package com.linkedin.metadata.search.elasticsearch;
+
+import com.linkedin.metadata.search.indexbuilder.IndexBuilderTestBase;
+import 
io.datahubproject.test.search.config.SearchTestContainerConfiguration; +import org.jetbrains.annotations.NotNull; +import org.opensearch.client.RestHighLevelClient; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Import; +import org.testng.annotations.Test; + +import static org.testng.AssertJUnit.assertNotNull; + + +@Import({ElasticSearchSuite.class, SearchTestContainerConfiguration.class}) +public class IndexBuilderElasticSearchTest extends IndexBuilderTestBase { + + @Autowired + private RestHighLevelClient _searchClient; + + @NotNull + @Override + protected RestHighLevelClient getSearchClient() { + return _searchClient; + } + + @Test + public void initTest() { + assertNotNull(_searchClient); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/LineageDataFixtureElasticSearchTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/LineageDataFixtureElasticSearchTest.java new file mode 100644 index 0000000000000..0cb49bc555421 --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/LineageDataFixtureElasticSearchTest.java @@ -0,0 +1,43 @@ +package com.linkedin.metadata.search.elasticsearch; + +import com.linkedin.metadata.search.fixtures.LineageDataFixtureTestBase; +import io.datahubproject.test.fixtures.search.SearchLineageFixtureConfiguration; +import com.linkedin.metadata.search.LineageSearchService; +import com.linkedin.metadata.search.SearchService; +import io.datahubproject.test.search.config.SearchTestContainerConfiguration; +import org.jetbrains.annotations.NotNull; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.context.annotation.Import; +import org.testng.AssertJUnit; +import org.testng.annotations.Test; + + +@Import({ElasticSearchSuite.class, SearchLineageFixtureConfiguration.class, SearchTestContainerConfiguration.class}) +public class LineageDataFixtureElasticSearchTest extends LineageDataFixtureTestBase { + + @Autowired + @Qualifier("searchLineageSearchService") + protected SearchService searchService; + + @Autowired + @Qualifier("searchLineageLineageSearchService") + protected LineageSearchService lineageService; + + @NotNull + @Override + protected LineageSearchService getLineageService() { + return lineageService; + } + + @NotNull + @Override + protected SearchService getSearchService() { + return searchService; + } + + @Test + public void initTest() { + AssertJUnit.assertNotNull(lineageService); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/LineageServiceElasticSearchTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/LineageServiceElasticSearchTest.java new file mode 100644 index 0000000000000..613ec5a26ff66 --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/LineageServiceElasticSearchTest.java @@ -0,0 +1,66 @@ +package com.linkedin.metadata.search.elasticsearch; + +import com.linkedin.metadata.config.search.SearchConfiguration; +import com.linkedin.metadata.search.LineageServiceTestBase; +import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; +import io.datahubproject.test.search.config.SearchCommonTestConfiguration; +import 
io.datahubproject.test.search.config.SearchTestContainerConfiguration; +import org.jetbrains.annotations.NotNull; +import org.opensearch.client.RestHighLevelClient; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Import; +import org.testng.AssertJUnit; +import org.testng.annotations.Test; + + +@Import({ElasticSearchSuite.class, SearchCommonTestConfiguration.class, SearchTestContainerConfiguration.class}) +public class LineageServiceElasticSearchTest extends LineageServiceTestBase { + + @Autowired + private RestHighLevelClient _searchClient; + @Autowired + private ESBulkProcessor _bulkProcessor; + @Autowired + private ESIndexBuilder _esIndexBuilder; + @Autowired + private SearchConfiguration _searchConfiguration; + @Autowired + private CustomSearchConfiguration _customSearchConfiguration; + + @NotNull + @Override + protected RestHighLevelClient getSearchClient() { + return _searchClient; + } + + @NotNull + @Override + protected ESBulkProcessor getBulkProcessor() { + return _bulkProcessor; + } + + @NotNull + @Override + protected ESIndexBuilder getIndexBuilder() { + return _esIndexBuilder; + } + + @NotNull + @Override + protected SearchConfiguration getSearchConfiguration() { + return _searchConfiguration; + } + + @NotNull + @Override + protected CustomSearchConfiguration getCustomSearchConfiguration() { + return _customSearchConfiguration; + } + + @Test + public void initTest() { + AssertJUnit.assertNotNull(_searchClient); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/SampleDataFixtureElasticSearchTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/SampleDataFixtureElasticSearchTest.java new file mode 100644 index 0000000000000..855f46d239118 --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/SampleDataFixtureElasticSearchTest.java @@ -0,0 +1,45 @@ +package com.linkedin.metadata.search.elasticsearch; + +import com.linkedin.entity.client.EntityClient; +import com.linkedin.metadata.models.registry.EntityRegistry; +import com.linkedin.metadata.search.SearchService; +import com.linkedin.metadata.search.fixtures.SampleDataFixtureTestBase; +import io.datahubproject.test.fixtures.search.SampleDataFixtureConfiguration; + +import io.datahubproject.test.search.config.SearchTestContainerConfiguration; +import lombok.Getter; +import org.opensearch.client.RestHighLevelClient; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.context.annotation.Import; +import org.testng.annotations.Test; + +import static org.testng.AssertJUnit.assertNotNull; + + +/** + * Runs sample data fixture tests for Elasticsearch test container + */ +@Getter +@Import({ElasticSearchSuite.class, SampleDataFixtureConfiguration.class, SearchTestContainerConfiguration.class}) +public class SampleDataFixtureElasticSearchTest extends SampleDataFixtureTestBase { + @Autowired + private RestHighLevelClient searchClient; + + @Autowired + @Qualifier("sampleDataSearchService") + protected SearchService searchService; + + @Autowired + @Qualifier("sampleDataEntityClient") + protected EntityClient entityClient; + + @Autowired + @Qualifier("entityRegistry") + private EntityRegistry entityRegistry; + + @Test + public void initTest() { + assertNotNull(searchClient); + } +} diff --git 
a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/SearchDAOElasticSearchTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/SearchDAOElasticSearchTest.java
new file mode 100644
index 0000000000000..1a6a20cd9df9d
--- /dev/null
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/SearchDAOElasticSearchTest.java
@@ -0,0 +1,35 @@
+package com.linkedin.metadata.search.elasticsearch;
+
+import com.linkedin.metadata.config.search.SearchConfiguration;
+import com.linkedin.metadata.search.query.SearchDAOTestBase;
+import io.datahubproject.test.fixtures.search.SampleDataFixtureConfiguration;
+import com.linkedin.metadata.utils.elasticsearch.IndexConvention;
+
+import io.datahubproject.test.search.config.SearchTestContainerConfiguration;
+import lombok.Getter;
+import org.opensearch.client.RestHighLevelClient;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.context.annotation.Import;
+
+import org.springframework.beans.factory.annotation.Qualifier;
+import org.testng.annotations.Test;
+
+import static org.testng.AssertJUnit.assertNotNull;
+
+
+@Getter
+@Import({ElasticSearchSuite.class, SampleDataFixtureConfiguration.class, SearchTestContainerConfiguration.class})
+public class SearchDAOElasticSearchTest extends SearchDAOTestBase {
+  @Autowired
+  private RestHighLevelClient searchClient;
+  @Autowired
+  private SearchConfiguration searchConfiguration;
+  @Autowired
+  @Qualifier("sampleDataIndexConvention")
+  IndexConvention indexConvention;
+
+  @Test
+  public void initTest() {
+    assertNotNull(searchClient);
+  }
+}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/SearchServiceElasticSearchTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/SearchServiceElasticSearchTest.java
new file mode 100644
index 0000000000000..a9e9feac28007
--- /dev/null
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/SearchServiceElasticSearchTest.java
@@ -0,0 +1,65 @@
+package com.linkedin.metadata.search.elasticsearch;
+
+import com.linkedin.metadata.config.search.SearchConfiguration;
+import com.linkedin.metadata.search.SearchServiceTestBase;
+import io.datahubproject.test.search.config.SearchCommonTestConfiguration;
+import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration;
+import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder;
+import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor;
+import io.datahubproject.test.search.config.SearchTestContainerConfiguration;
+import org.jetbrains.annotations.NotNull;
+import org.opensearch.client.RestHighLevelClient;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.context.annotation.Import;
+import org.testng.AssertJUnit;
+import org.testng.annotations.Test;
+
+@Import({ElasticSearchSuite.class, SearchCommonTestConfiguration.class, SearchTestContainerConfiguration.class})
+public class SearchServiceElasticSearchTest extends SearchServiceTestBase {
+
+  @Autowired
+  private RestHighLevelClient _searchClient;
+  @Autowired
+  private ESBulkProcessor _bulkProcessor;
+  @Autowired
+  private ESIndexBuilder _esIndexBuilder;
+  @Autowired
+  private SearchConfiguration _searchConfiguration;
+  @Autowired
+  private CustomSearchConfiguration _customSearchConfiguration;
+
+  @NotNull
+  @Override
+  protected RestHighLevelClient getSearchClient() {
+    return _searchClient;
+  }
+
+  @NotNull
+  @Override
+  protected ESBulkProcessor getBulkProcessor() {
+    return _bulkProcessor;
+  }
+
+  @NotNull
+  @Override
+  protected ESIndexBuilder getIndexBuilder() {
+    return _esIndexBuilder;
+  }
+
+  @NotNull
+  @Override
+  protected SearchConfiguration getSearchConfiguration() {
+    return _searchConfiguration;
+  }
+
+  @NotNull
+  @Override
+  protected CustomSearchConfiguration getCustomSearchConfiguration() {
+    return _customSearchConfiguration;
+  }
+
+  @Test
+  public void initTest() {
+    AssertJUnit.assertNotNull(_searchClient);
+  }
+}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/SystemMetadataServiceElasticSearchTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/SystemMetadataServiceElasticSearchTest.java
new file mode 100644
index 0000000000000..7365887fb9b2e
--- /dev/null
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/SystemMetadataServiceElasticSearchTest.java
@@ -0,0 +1,47 @@
+package com.linkedin.metadata.search.elasticsearch;
+
+import com.linkedin.metadata.systemmetadata.SystemMetadataServiceTestBase;
+import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder;
+import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor;
+import io.datahubproject.test.search.config.SearchTestContainerConfiguration;
+import org.jetbrains.annotations.NotNull;
+import org.opensearch.client.RestHighLevelClient;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.context.annotation.Import;
+import org.testng.AssertJUnit;
+import org.testng.annotations.Test;
+
+
+@Import({ElasticSearchSuite.class, SearchTestContainerConfiguration.class})
+public class SystemMetadataServiceElasticSearchTest extends SystemMetadataServiceTestBase {
+
+  @Autowired
+  private RestHighLevelClient _searchClient;
+  @Autowired
+  private ESBulkProcessor _bulkProcessor;
+  @Autowired
+  private ESIndexBuilder _esIndexBuilder;
+
+  @NotNull
+  @Override
+  protected RestHighLevelClient getSearchClient() {
+    return _searchClient;
+  }
+
+  @NotNull
+  @Override
+  protected ESBulkProcessor getBulkProcessor() {
+    return _bulkProcessor;
+  }
+
+  @NotNull
+  @Override
+  protected ESIndexBuilder getIndexBuilder() {
+    return _esIndexBuilder;
+  }
+
+  @Test
+  public void initTest() {
+    AssertJUnit.assertNotNull(_searchClient);
+  }
+}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/TestEntityElasticSearchTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/TestEntityElasticSearchTest.java
new file mode 100644
index 0000000000000..bec610b20dca1
--- /dev/null
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/TestEntityElasticSearchTest.java
@@ -0,0 +1,65 @@
+package com.linkedin.metadata.search.elasticsearch;
+
+import com.linkedin.metadata.config.search.SearchConfiguration;
+import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration;
+import com.linkedin.metadata.search.TestEntityTestBase;
+import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder;
+import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor;
+import io.datahubproject.test.search.config.SearchCommonTestConfiguration;
+import io.datahubproject.test.search.config.SearchTestContainerConfiguration;
+import org.jetbrains.annotations.NotNull;
+import org.opensearch.client.RestHighLevelClient;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.context.annotation.Import;
+import org.testng.AssertJUnit; +import org.testng.annotations.Test; + +@Import({ElasticSearchSuite.class, SearchCommonTestConfiguration.class, SearchTestContainerConfiguration.class}) +public class TestEntityElasticSearchTest extends TestEntityTestBase { + + @Autowired + private RestHighLevelClient _searchClient; + @Autowired + private ESBulkProcessor _bulkProcessor; + @Autowired + private ESIndexBuilder _esIndexBuilder; + @Autowired + private SearchConfiguration _searchConfiguration; + @Autowired + private CustomSearchConfiguration _customSearchConfiguration; + + @NotNull + @Override + protected RestHighLevelClient getSearchClient() { + return _searchClient; + } + + @NotNull + @Override + protected ESBulkProcessor getBulkProcessor() { + return _bulkProcessor; + } + + @NotNull + @Override + protected ESIndexBuilder getIndexBuilder() { + return _esIndexBuilder; + } + + @NotNull + @Override + protected SearchConfiguration getSearchConfiguration() { + return _searchConfiguration; + } + + @NotNull + @Override + protected CustomSearchConfiguration getCustomSearchConfiguration() { + return _customSearchConfiguration; + } + + @Test + public void initTest() { + AssertJUnit.assertNotNull(_searchClient); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/TimeseriesAspectServiceElasticSearchTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/TimeseriesAspectServiceElasticSearchTest.java new file mode 100644 index 0000000000000..5b85904edc923 --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/TimeseriesAspectServiceElasticSearchTest.java @@ -0,0 +1,46 @@ +package com.linkedin.metadata.search.elasticsearch; + +import com.linkedin.metadata.timeseries.search.TimeseriesAspectServiceTestBase; +import io.datahubproject.test.search.config.SearchTestContainerConfiguration; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; +import org.jetbrains.annotations.NotNull; +import org.opensearch.client.RestHighLevelClient; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Import; +import org.testng.AssertJUnit; +import org.testng.annotations.Test; + +@Import({ElasticSearchSuite.class, SearchTestContainerConfiguration.class}) +public class TimeseriesAspectServiceElasticSearchTest extends TimeseriesAspectServiceTestBase { + + @Autowired + private RestHighLevelClient _searchClient; + @Autowired + private ESBulkProcessor _bulkProcessor; + @Autowired + private ESIndexBuilder _esIndexBuilder; + + @NotNull + @Override + protected RestHighLevelClient getSearchClient() { + return _searchClient; + } + + @NotNull + @Override + protected ESBulkProcessor getBulkProcessor() { + return _bulkProcessor; + } + + @NotNull + @Override + protected ESIndexBuilder getIndexBuilder() { + return _esIndexBuilder; + } + + @Test + public void initTest() { + AssertJUnit.assertNotNull(_searchClient); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAOTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAOTest.java deleted file mode 100644 index b506051e9bb5d..0000000000000 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAOTest.java +++ /dev/null @@ -1,312 +0,0 @@ -package com.linkedin.metadata.search.elasticsearch.query; - -import 
com.linkedin.metadata.config.search.SearchConfiguration; -import com.datahub.test.Snapshot; -import com.google.common.collect.ImmutableList; -import com.linkedin.data.template.LongMap; -import com.linkedin.data.template.StringArray; -import com.linkedin.metadata.ESSampleDataFixture; -import com.linkedin.metadata.models.registry.EntityRegistry; -import com.linkedin.metadata.models.registry.SnapshotEntityRegistry; -import com.linkedin.metadata.query.filter.Condition; -import com.linkedin.metadata.query.filter.ConjunctiveCriterion; -import com.linkedin.metadata.query.filter.ConjunctiveCriterionArray; -import com.linkedin.metadata.query.filter.CriterionArray; -import com.linkedin.metadata.query.filter.Filter; -import com.linkedin.metadata.search.AggregationMetadata; -import com.linkedin.metadata.search.AggregationMetadataArray; -import com.linkedin.metadata.search.FilterValueArray; -import com.linkedin.metadata.search.SearchEntityArray; -import com.linkedin.metadata.search.SearchResult; -import com.linkedin.metadata.search.SearchResultMetadata; -import com.linkedin.metadata.utils.SearchUtil; -import com.linkedin.metadata.utils.elasticsearch.IndexConvention; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import org.elasticsearch.client.RestHighLevelClient; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.context.annotation.Import; -import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; -import org.testng.annotations.Test; - -import com.linkedin.metadata.query.filter.Criterion; -import org.springframework.beans.factory.annotation.Qualifier; - -import static com.linkedin.metadata.Constants.*; -import static com.linkedin.metadata.utils.SearchUtil.*; -import static org.testng.Assert.*; - - -@Import(ESSampleDataFixture.class) -public class ESSearchDAOTest extends AbstractTestNGSpringContextTests { - @Autowired - private RestHighLevelClient _searchClient; - @Autowired - private SearchConfiguration _searchConfiguration; - @Autowired - @Qualifier("sampleDataIndexConvention") - IndexConvention _indexConvention; - EntityRegistry _entityRegistry = new SnapshotEntityRegistry(new Snapshot()); - - - - @Test - public void testTransformFilterForEntitiesNoChange() { - Criterion c = new Criterion().setValue("urn:li:tag:abc").setValues( - new StringArray(ImmutableList.of("urn:li:tag:abc", "urn:li:tag:def")) - ).setNegated(false).setCondition(Condition.EQUAL).setField("tags.keyword"); - - Filter f = new Filter().setOr( - new ConjunctiveCriterionArray(new ConjunctiveCriterion().setAnd(new CriterionArray(c)))); - - Filter transformedFilter = SearchUtil.transformFilterForEntities(f, _indexConvention); - assertEquals(f, transformedFilter); - } - - @Test - public void testTransformFilterForEntitiesNullFilter() { - Filter transformedFilter = SearchUtil.transformFilterForEntities(null, _indexConvention); - assertNotNull(_indexConvention); - assertEquals(null, transformedFilter); - } - - @Test - public void testTransformFilterForEntitiesWithChanges() { - - Criterion c = new Criterion().setValue("dataset").setValues( - new StringArray(ImmutableList.of("dataset")) - ).setNegated(false).setCondition(Condition.EQUAL).setField("_entityType"); - - Filter f = new Filter().setOr( - new ConjunctiveCriterionArray(new ConjunctiveCriterion().setAnd(new CriterionArray(c)))); - Filter originalF = null; - try { - originalF = f.copy(); - } catch (CloneNotSupportedException e) { - fail(e.getMessage()); - } - 
assertEquals(f, originalF); - - Filter transformedFilter = SearchUtil.transformFilterForEntities(f, _indexConvention); - assertNotEquals(originalF, transformedFilter); - - Criterion expectedNewCriterion = new Criterion().setValue("smpldat_datasetindex_v2").setValues( - new StringArray(ImmutableList.of("smpldat_datasetindex_v2")) - ).setNegated(false).setCondition(Condition.EQUAL).setField("_index"); - - Filter expectedNewFilter = new Filter().setOr( - new ConjunctiveCriterionArray(new ConjunctiveCriterion().setAnd(new CriterionArray(expectedNewCriterion)))); - - assertEquals(expectedNewFilter, transformedFilter); - } - - @Test - public void testTransformFilterForEntitiesWithUnderscore() { - - Criterion c = new Criterion().setValue("data_job").setValues( - new StringArray(ImmutableList.of("data_job")) - ).setNegated(false).setCondition(Condition.EQUAL).setField("_entityType"); - - Filter f = new Filter().setOr( - new ConjunctiveCriterionArray(new ConjunctiveCriterion().setAnd(new CriterionArray(c)))); - Filter originalF = null; - try { - originalF = f.copy(); - } catch (CloneNotSupportedException e) { - fail(e.getMessage()); - } - assertEquals(f, originalF); - - Filter transformedFilter = SearchUtil.transformFilterForEntities(f, _indexConvention); - assertNotEquals(originalF, transformedFilter); - - Criterion expectedNewCriterion = new Criterion().setValue("smpldat_datajobindex_v2").setValues( - new StringArray(ImmutableList.of("smpldat_datajobindex_v2")) - ).setNegated(false).setCondition(Condition.EQUAL).setField("_index"); - - Filter expectedNewFilter = new Filter().setOr( - new ConjunctiveCriterionArray(new ConjunctiveCriterion().setAnd(new CriterionArray(expectedNewCriterion)))); - - assertEquals(transformedFilter, expectedNewFilter); - } - - @Test - public void testTransformFilterForEntitiesWithSomeChanges() { - - Criterion criterionChanged = new Criterion().setValue("dataset").setValues( - new StringArray(ImmutableList.of("dataset")) - ).setNegated(false).setCondition(Condition.EQUAL).setField("_entityType"); - Criterion criterionUnchanged = new Criterion().setValue("urn:li:tag:abc").setValues( - new StringArray(ImmutableList.of("urn:li:tag:abc", "urn:li:tag:def")) - ).setNegated(false).setCondition(Condition.EQUAL).setField("tags.keyword"); - - Filter f = new Filter().setOr( - new ConjunctiveCriterionArray(new ConjunctiveCriterion().setAnd(new CriterionArray(criterionChanged, criterionUnchanged)))); - Filter originalF = null; - try { - originalF = f.copy(); - } catch (CloneNotSupportedException e) { - fail(e.getMessage()); - } - assertEquals(f, originalF); - - Filter transformedFilter = SearchUtil.transformFilterForEntities(f, _indexConvention); - assertNotEquals(originalF, transformedFilter); - - Criterion expectedNewCriterion = new Criterion().setValue("smpldat_datasetindex_v2").setValues( - new StringArray(ImmutableList.of("smpldat_datasetindex_v2")) - ).setNegated(false).setCondition(Condition.EQUAL).setField("_index"); - - Filter expectedNewFilter = new Filter().setOr( - new ConjunctiveCriterionArray(new ConjunctiveCriterion().setAnd(new CriterionArray(expectedNewCriterion, criterionUnchanged)))); - - assertEquals(expectedNewFilter, transformedFilter); - } - - @Test - public void testTransformIndexIntoEntityNameSingle() { - ESSearchDAO searchDAO = new ESSearchDAO(_entityRegistry, _searchClient, _indexConvention, false, - ELASTICSEARCH_IMPLEMENTATION_ELASTICSEARCH, _searchConfiguration, null); - // Empty aggregations - final SearchResultMetadata searchResultMetadata = - new 
SearchResultMetadata().setAggregations(new AggregationMetadataArray()); - SearchResult result = new SearchResult().setEntities(new SearchEntityArray(new ArrayList<>())) - .setMetadata(searchResultMetadata) - .setFrom(0) - .setPageSize(100) - .setNumEntities(30); - SearchResult expectedResult = null; - try { - expectedResult = result.copy(); - } catch (CloneNotSupportedException e) { - fail(e.getMessage()); - } - assertEquals(expectedResult, searchDAO.transformIndexIntoEntityName(result)); - - // one facet, do not transform - Map aggMap = Map.of("urn:li:corpuser:datahub", Long.valueOf(3)); - - List aggregationMetadataList = new ArrayList<>(); - aggregationMetadataList.add(new AggregationMetadata().setName("owners") - .setDisplayName("Owned by") - .setAggregations(new LongMap(aggMap)) - .setFilterValues(new FilterValueArray(SearchUtil.convertToFilters(aggMap, Collections.emptySet()))) - ); - searchResultMetadata.setAggregations(new AggregationMetadataArray(aggregationMetadataList)); - result.setMetadata(searchResultMetadata); - - try { - expectedResult = result.copy(); - } catch (CloneNotSupportedException e) { - fail(e.getMessage()); - } - assertEquals(searchDAO.transformIndexIntoEntityName(result), expectedResult); - - // one facet, transform - Map entityTypeMap = Map.of("smpldat_datasetindex_v2", Long.valueOf(3)); - - aggregationMetadataList = List.of(new AggregationMetadata().setName("_entityType") - .setDisplayName("Type") - .setAggregations(new LongMap(entityTypeMap)) - .setFilterValues(new FilterValueArray(SearchUtil.convertToFilters(entityTypeMap, Collections.emptySet()))) - ); - searchResultMetadata.setAggregations(new AggregationMetadataArray(aggregationMetadataList)); - result.setMetadata(searchResultMetadata); - - Map expectedEntityTypeMap = Map.of("dataset", Long.valueOf(3)); - - List expectedAggregationMetadataList = List.of( - new AggregationMetadata().setName("_entityType") - .setDisplayName("Type") - .setAggregations(new LongMap(expectedEntityTypeMap)) - .setFilterValues(new FilterValueArray(SearchUtil.convertToFilters(expectedEntityTypeMap, Collections.emptySet()))) - ); - expectedResult.setMetadata(new SearchResultMetadata().setAggregations(new AggregationMetadataArray(expectedAggregationMetadataList))); - assertEquals(searchDAO.transformIndexIntoEntityName(result), expectedResult); - } - - @Test - public void testTransformIndexIntoEntityNameNested() { - ESSearchDAO searchDAO = new ESSearchDAO(_entityRegistry, _searchClient, _indexConvention, false, - ELASTICSEARCH_IMPLEMENTATION_ELASTICSEARCH, _searchConfiguration, null); - // One nested facet - Map entityTypeMap = Map.of( - String.format("smpldat_datasetindex_v2%surn:li:corpuser:datahub", AGGREGATION_SEPARATOR_CHAR), Long.valueOf(3), - String.format("smpldat_datasetindex_v2%surn:li:corpuser:bfoo", AGGREGATION_SEPARATOR_CHAR), Long.valueOf(7), - "smpldat_datasetindex_v2", Long.valueOf(20) - ); - List aggregationMetadataList = List.of(new AggregationMetadata().setName("_entityType␞owners") - .setDisplayName("Type␞Owned By") - .setAggregations(new LongMap(entityTypeMap)) - .setFilterValues(new FilterValueArray(SearchUtil.convertToFilters(entityTypeMap, Collections.emptySet()))) - ); - SearchResult result = new SearchResult().setEntities(new SearchEntityArray(new ArrayList<>())) - .setMetadata(new SearchResultMetadata().setAggregations( - new AggregationMetadataArray(aggregationMetadataList) - )) - .setFrom(0) - .setPageSize(100) - .setNumEntities(50); - - Map expectedEntityTypeMap = Map.of( - 
String.format("dataset%surn:li:corpuser:datahub", AGGREGATION_SEPARATOR_CHAR), Long.valueOf(3), - String.format("dataset%surn:li:corpuser:bfoo", AGGREGATION_SEPARATOR_CHAR), Long.valueOf(7), - "dataset", Long.valueOf(20) - ); - - List expectedAggregationMetadataList = List.of(new AggregationMetadata().setName("_entityType␞owners") - .setDisplayName("Type␞Owned By") - .setAggregations(new LongMap(expectedEntityTypeMap)) - .setFilterValues(new FilterValueArray(SearchUtil.convertToFilters(expectedEntityTypeMap, Collections.emptySet()))) - ); - SearchResult expectedResult = new SearchResult().setEntities(new SearchEntityArray(new ArrayList<>())) - .setMetadata(new SearchResultMetadata().setAggregations( - new AggregationMetadataArray(expectedAggregationMetadataList))) - .setFrom(0) - .setPageSize(100) - .setNumEntities(50); - assertEquals(searchDAO.transformIndexIntoEntityName(result), expectedResult); - - // One nested facet, opposite order - entityTypeMap = Map.of( - String.format("urn:li:corpuser:datahub%ssmpldat_datasetindex_v2", AGGREGATION_SEPARATOR_CHAR), Long.valueOf(3), - String.format("urn:li:corpuser:datahub%ssmpldat_chartindex_v2", AGGREGATION_SEPARATOR_CHAR), Long.valueOf(7), - "urn:li:corpuser:datahub", Long.valueOf(20) - ); - aggregationMetadataList = List.of(new AggregationMetadata().setName("owners␞_entityType") - .setDisplayName("Owned By␞Type") - .setAggregations(new LongMap(entityTypeMap)) - .setFilterValues(new FilterValueArray(SearchUtil.convertToFilters(entityTypeMap, Collections.emptySet()))) - ); - result = new SearchResult().setEntities(new SearchEntityArray(new ArrayList<>())) - .setMetadata(new SearchResultMetadata().setAggregations( - new AggregationMetadataArray(aggregationMetadataList) - )) - .setFrom(0) - .setPageSize(100) - .setNumEntities(50); - - expectedEntityTypeMap = Map.of( - String.format("urn:li:corpuser:datahub%sdataset", AGGREGATION_SEPARATOR_CHAR), Long.valueOf(3), - String.format("urn:li:corpuser:datahub%schart", AGGREGATION_SEPARATOR_CHAR), Long.valueOf(7), - "urn:li:corpuser:datahub", Long.valueOf(20) - ); - - expectedAggregationMetadataList = List.of(new AggregationMetadata().setName("owners␞_entityType") - .setDisplayName("Owned By␞Type") - .setAggregations(new LongMap(expectedEntityTypeMap)) - .setFilterValues(new FilterValueArray(SearchUtil.convertToFilters(expectedEntityTypeMap, Collections.emptySet()))) - ); - expectedResult = new SearchResult().setEntities(new SearchEntityArray(new ArrayList<>())) - .setMetadata(new SearchResultMetadata().setAggregations( - new AggregationMetadataArray(expectedAggregationMetadataList))) - .setFrom(0) - .setPageSize(100) - .setNumEntities(50); - assertEquals(searchDAO.transformIndexIntoEntityName(result), expectedResult); - } - - -} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/GoldenTestBase.java similarity index 74% rename from metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/GoldenTestBase.java index d720c95fef84d..ed81f3cebd027 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/GoldenTestBase.java @@ -1,60 +1,47 @@ -package com.linkedin.metadata.search.elasticsearch.fixtures; +package 
com.linkedin.metadata.search.fixtures; import com.linkedin.common.urn.Urn; import com.linkedin.datahub.graphql.generated.EntityType; import com.linkedin.datahub.graphql.resolvers.EntityTypeMapper; -import com.linkedin.entity.client.EntityClient; -import com.linkedin.metadata.ESSampleDataFixture; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.search.MatchedFieldArray; import com.linkedin.metadata.search.SearchEntityArray; import com.linkedin.metadata.search.SearchResult; import com.linkedin.metadata.search.SearchService; -import org.elasticsearch.client.RestHighLevelClient; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.beans.factory.annotation.Qualifier; -import org.springframework.context.annotation.Import; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; import org.testng.annotations.Test; +import javax.annotation.Nonnull; import java.util.List; import java.util.stream.Collectors; import java.util.stream.Stream; -import static com.linkedin.metadata.ESTestUtils.*; +import static io.datahubproject.test.search.SearchTestUtils.searchAcrossCustomEntities; +import static io.datahubproject.test.search.SearchTestUtils.searchAcrossEntities; import static org.testng.Assert.assertTrue; -import static org.testng.AssertJUnit.*; +import static org.testng.AssertJUnit.assertNotNull; -@Import(ESSampleDataFixture.class) -public class ElasticSearchGoldenTest extends AbstractTestNGSpringContextTests { +abstract public class GoldenTestBase extends AbstractTestNGSpringContextTests { private static final List SEARCHABLE_LONGTAIL_ENTITIES = Stream.of(EntityType.CHART, EntityType.CONTAINER, EntityType.DASHBOARD, EntityType.DATASET, EntityType.DOMAIN, EntityType.TAG ).map(EntityTypeMapper::getName) .collect(Collectors.toList()); - @Autowired - private RestHighLevelClient _searchClient; - @Autowired - @Qualifier("longTailSearchService") - protected SearchService searchService; + @Nonnull + abstract protected EntityRegistry getEntityRegistry(); - @Autowired - @Qualifier("longTailEntityClient") - protected EntityClient entityClient; - - @Autowired - @Qualifier("longTailEntityRegistry") - private EntityRegistry entityRegistry; + @Nonnull + abstract protected SearchService getSearchService(); @Test public void testNameMatchPetProfiles() { /* Searching for "pet profiles" should return "pet_profiles" as the first 2 search results */ - assertNotNull(searchService); - assertNotNull(entityRegistry); - SearchResult searchResult = searchAcrossCustomEntities(searchService, "pet profiles", SEARCHABLE_LONGTAIL_ENTITIES); + assertNotNull(getSearchService()); + assertNotNull(getEntityRegistry()); + SearchResult searchResult = searchAcrossCustomEntities(getSearchService(), "pet profiles", SEARCHABLE_LONGTAIL_ENTITIES); assertTrue(searchResult.getEntities().size() >= 2); Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); Urn secondResultUrn = searchResult.getEntities().get(1).getEntity(); @@ -68,8 +55,8 @@ public void testNameMatchPetProfile() { /* Searching for "pet profile" should return "pet_profiles" as the first 2 search results */ - assertNotNull(searchService); - SearchResult searchResult = searchAcrossEntities(searchService, "pet profile", SEARCHABLE_LONGTAIL_ENTITIES); + assertNotNull(getSearchService()); + SearchResult searchResult = searchAcrossEntities(getSearchService(), "pet profile", SEARCHABLE_LONGTAIL_ENTITIES); assertTrue(searchResult.getEntities().size() >= 2); Urn 
firstResultUrn = searchResult.getEntities().get(0).getEntity(); Urn secondResultUrn = searchResult.getEntities().get(1).getEntity(); @@ -84,8 +71,8 @@ public void testGlossaryTerms() { Searching for "ReturnRate" should return all tables that have the glossary term applied before anything else */ - assertNotNull(searchService); - SearchResult searchResult = searchAcrossEntities(searchService, "ReturnRate", SEARCHABLE_LONGTAIL_ENTITIES); + assertNotNull(getSearchService()); + SearchResult searchResult = searchAcrossEntities(getSearchService(), "ReturnRate", SEARCHABLE_LONGTAIL_ENTITIES); SearchEntityArray entities = searchResult.getEntities(); assertTrue(searchResult.getEntities().size() >= 4); MatchedFieldArray firstResultMatchedFields = entities.get(0).getMatchedFields(); @@ -105,8 +92,8 @@ public void testNameMatchPartiallyQualified() { Searching for "analytics.pet_details" (partially qualified) should return the fully qualified table name as the first search results before any others */ - assertNotNull(searchService); - SearchResult searchResult = searchAcrossEntities(searchService, "analytics.pet_details", SEARCHABLE_LONGTAIL_ENTITIES); + assertNotNull(getSearchService()); + SearchResult searchResult = searchAcrossEntities(getSearchService(), "analytics.pet_details", SEARCHABLE_LONGTAIL_ENTITIES); assertTrue(searchResult.getEntities().size() >= 2); Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); Urn secondResultUrn = searchResult.getEntities().get(1).getEntity(); @@ -121,8 +108,8 @@ public void testNameMatchCollaborativeActionitems() { Searching for "collaborative actionitems" should return "collaborative_actionitems" as the first search result, followed by "collaborative_actionitems_old" */ - assertNotNull(searchService); - SearchResult searchResult = searchAcrossEntities(searchService, "collaborative actionitems", SEARCHABLE_LONGTAIL_ENTITIES); + assertNotNull(getSearchService()); + SearchResult searchResult = searchAcrossEntities(getSearchService(), "collaborative actionitems", SEARCHABLE_LONGTAIL_ENTITIES); assertTrue(searchResult.getEntities().size() >= 2); Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); Urn secondResultUrn = searchResult.getEntities().get(1).getEntity(); @@ -144,13 +131,17 @@ public void testNameMatchCustomerOrders() { Searching for "customer orders" should return "customer_orders" as the first search result, not suffixed by anything */ - assertNotNull(searchService); - SearchResult searchResult = searchAcrossEntities(searchService, "customer orders", SEARCHABLE_LONGTAIL_ENTITIES); + assertNotNull(getSearchService()); + SearchResult searchResult = searchAcrossEntities(getSearchService(), "customer orders", SEARCHABLE_LONGTAIL_ENTITIES); assertTrue(searchResult.getEntities().size() >= 2); Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); // Checks that the table name is not suffixed with anything - assertTrue(firstResultUrn.toString().contains("customer_orders,")); + assertTrue(firstResultUrn.toString().contains("customer_orders,"), + "Expected firstResultUrn to contain `customer_orders,` but results are " + + searchResult.getEntities().stream() + .map(e -> String.format("(Score: %s Urn: %s)", e.getScore(), e.getEntity().getId())) + .collect(Collectors.joining(", "))); Double firstResultScore = searchResult.getEntities().get(0).getScore(); Double secondResultScore = searchResult.getEntities().get(1).getScore(); diff --git 
a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SearchLineageDataFixtureTests.java b/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/LineageDataFixtureTestBase.java similarity index 52% rename from metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SearchLineageDataFixtureTests.java rename to metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/LineageDataFixtureTestBase.java index 55f7d4618f479..eaf8feedeb6ed 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SearchLineageDataFixtureTests.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/LineageDataFixtureTestBase.java @@ -1,50 +1,43 @@ -package com.linkedin.metadata.search.elasticsearch.fixtures; +package com.linkedin.metadata.search.fixtures; import com.linkedin.common.urn.Urn; -import com.linkedin.metadata.ESSearchLineageFixture; -import com.linkedin.metadata.ESTestUtils; import com.linkedin.metadata.search.LineageSearchResult; import com.linkedin.metadata.search.LineageSearchService; import com.linkedin.metadata.search.SearchResult; import com.linkedin.metadata.search.SearchService; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.beans.factory.annotation.Qualifier; -import org.springframework.context.annotation.Import; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; import org.testng.annotations.Test; +import javax.annotation.Nonnull; import java.net.URISyntaxException; -import static com.linkedin.metadata.ESTestUtils.lineage; +import static io.datahubproject.test.search.SearchTestUtils.lineage; +import static io.datahubproject.test.search.SearchTestUtils.searchAcrossEntities; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertNotNull; +abstract public class LineageDataFixtureTestBase extends AbstractTestNGSpringContextTests { -@Import(ESSearchLineageFixture.class) -public class SearchLineageDataFixtureTests extends AbstractTestNGSpringContextTests { + @Nonnull + abstract protected LineageSearchService getLineageService(); - @Autowired - @Qualifier("searchLineageSearchService") - protected SearchService searchService; - - @Autowired - @Qualifier("searchLineageLineageSearchService") - protected LineageSearchService lineageService; + @Nonnull + abstract protected SearchService getSearchService(); @Test public void testFixtureInitialization() { - assertNotNull(searchService); - SearchResult noResult = ESTestUtils.searchAcrossEntities(searchService, "no results"); + assertNotNull(getSearchService()); + SearchResult noResult = searchAcrossEntities(getSearchService(), "no results"); assertEquals(noResult.getEntities().size(), 0); - SearchResult result = ESTestUtils.searchAcrossEntities(searchService, "e3859789eed1cef55288b44f016ee08290d9fd08973e565c112d8"); + SearchResult result = searchAcrossEntities(getSearchService(), "e3859789eed1cef55288b44f016ee08290d9fd08973e565c112d8"); assertEquals(result.getEntities().size(), 1); assertEquals(result.getEntities().get(0).getEntity().toString(), "urn:li:dataset:(urn:li:dataPlatform:9cf8c96,e3859789eed1cef55288b44f016ee08290d9fd08973e565c112d8,PROD)"); - LineageSearchResult lineageResult = lineage(lineageService, result.getEntities().get(0).getEntity(), 1); + LineageSearchResult lineageResult = lineage(getLineageService(), result.getEntities().get(0).getEntity(), 1); assertEquals(lineageResult.getEntities().size(), 10); } @@ -54,15 +47,15 @@ 
public void testDatasetLineage() throws URISyntaxException { "urn:li:dataset:(urn:li:dataPlatform:9cf8c96,e3859789eed1cef55288b44f016ee08290d9fd08973e565c112d8,PROD)"); // 1 hops - LineageSearchResult lineageResult = lineage(lineageService, testUrn, 1); + LineageSearchResult lineageResult = lineage(getLineageService(), testUrn, 1); assertEquals(lineageResult.getEntities().size(), 10); // 2 hops - lineageResult = lineage(lineageService, testUrn, 2); + lineageResult = lineage(getLineageService(), testUrn, 2); assertEquals(lineageResult.getEntities().size(), 5); // 3 hops - lineageResult = lineage(lineageService, testUrn, 3); + lineageResult = lineage(getLineageService(), testUrn, 3); assertEquals(lineageResult.getEntities().size(), 12); } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java b/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureTestBase.java similarity index 81% rename from metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java rename to metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureTestBase.java index 450378b247cea..1660504810296 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureTestBase.java @@ -1,4 +1,4 @@ -package com.linkedin.metadata.search.elasticsearch.fixtures; +package com.linkedin.metadata.search.fixtures; import com.datahub.authentication.Actor; import com.datahub.authentication.ActorType; @@ -12,7 +12,6 @@ import com.linkedin.datahub.graphql.types.corpuser.CorpUserType; import com.linkedin.datahub.graphql.types.dataset.DatasetType; import com.linkedin.entity.client.EntityClient; -import com.linkedin.metadata.ESSampleDataFixture; import com.linkedin.metadata.models.EntitySpec; import com.linkedin.metadata.models.SearchableFieldSpec; import com.linkedin.metadata.models.registry.EntityRegistry; @@ -28,22 +27,19 @@ import com.linkedin.metadata.search.SearchEntity; import com.linkedin.metadata.search.SearchResult; import com.linkedin.metadata.search.SearchService; - import com.linkedin.metadata.search.elasticsearch.query.request.SearchFieldConfig; import com.linkedin.r2.RemoteInvocationException; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.client.indices.AnalyzeRequest; -import org.elasticsearch.client.indices.AnalyzeResponse; -import org.elasticsearch.client.indices.GetMappingsRequest; -import org.elasticsearch.client.indices.GetMappingsResponse; import org.junit.Assert; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.beans.factory.annotation.Qualifier; -import org.springframework.context.annotation.Import; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.RestHighLevelClient; +import org.opensearch.client.indices.AnalyzeRequest; +import org.opensearch.client.indices.AnalyzeResponse; +import org.opensearch.client.indices.GetMappingsRequest; +import org.opensearch.client.indices.GetMappingsResponse; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; import org.testng.annotations.Test; +import javax.annotation.Nonnull; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; @@ -54,36 +50,36 @@ import java.util.stream.IntStream; 
import java.util.stream.Stream; -import static com.linkedin.metadata.Constants.*; -import static com.linkedin.metadata.ESTestUtils.*; +import static com.linkedin.metadata.Constants.DATASET_ENTITY_NAME; +import static com.linkedin.metadata.Constants.DATA_JOB_ENTITY_NAME; import static com.linkedin.metadata.search.elasticsearch.query.request.SearchQueryBuilder.STRUCTURED_QUERY_PREFIX; -import static com.linkedin.metadata.utils.SearchUtil.*; +import static com.linkedin.metadata.utils.SearchUtil.AGGREGATION_SEPARATOR_CHAR; +import static io.datahubproject.test.search.SearchTestUtils.autocomplete; +import static io.datahubproject.test.search.SearchTestUtils.scroll; +import static io.datahubproject.test.search.SearchTestUtils.search; +import static io.datahubproject.test.search.SearchTestUtils.searchAcrossEntities; +import static io.datahubproject.test.search.SearchTestUtils.searchStructured; import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertNotNull; import static org.testng.Assert.assertSame; import static org.testng.Assert.assertTrue; -import static org.testng.Assert.assertFalse; - -@Import(ESSampleDataFixture.class) -public class SampleDataFixtureTests extends AbstractTestNGSpringContextTests { - private static final Authentication AUTHENTICATION = +abstract public class SampleDataFixtureTestBase extends AbstractTestNGSpringContextTests { + protected static final Authentication AUTHENTICATION = new Authentication(new Actor(ActorType.USER, "test"), ""); - @Autowired - private RestHighLevelClient _searchClient; + @Nonnull + abstract protected EntityRegistry getEntityRegistry(); - @Autowired - @Qualifier("sampleDataSearchService") - protected SearchService searchService; + @Nonnull + abstract protected SearchService getSearchService(); - @Autowired - @Qualifier("sampleDataEntityClient") - protected EntityClient entityClient; + @Nonnull + abstract protected EntityClient getEntityClient(); - @Autowired - @Qualifier("entityRegistry") - private EntityRegistry entityRegistry; + @Nonnull + abstract protected RestHighLevelClient getSearchClient(); @Test public void testSearchFieldConfig() throws IOException { @@ -91,29 +87,29 @@ public void testSearchFieldConfig() throws IOException { For every field in every entity fixture, ensure proper detection of field types and analyzers */ Map fixtureEntities = new HashMap<>(); - fixtureEntities.put(entityRegistry.getEntitySpec("dataset"), "smpldat_datasetindex_v2"); - fixtureEntities.put(entityRegistry.getEntitySpec("chart"), "smpldat_chartindex_v2"); - fixtureEntities.put(entityRegistry.getEntitySpec("container"), "smpldat_containerindex_v2"); - fixtureEntities.put(entityRegistry.getEntitySpec("corpgroup"), "smpldat_corpgroupindex_v2"); - fixtureEntities.put(entityRegistry.getEntitySpec("corpuser"), "smpldat_corpuserindex_v2"); - fixtureEntities.put(entityRegistry.getEntitySpec("dashboard"), "smpldat_dashboardindex_v2"); - fixtureEntities.put(entityRegistry.getEntitySpec("dataflow"), "smpldat_dataflowindex_v2"); - fixtureEntities.put(entityRegistry.getEntitySpec("datajob"), "smpldat_datajobindex_v2"); - fixtureEntities.put(entityRegistry.getEntitySpec("domain"), "smpldat_domainindex_v2"); - fixtureEntities.put(entityRegistry.getEntitySpec("glossarynode"), "smpldat_glossarynodeindex_v2"); - fixtureEntities.put(entityRegistry.getEntitySpec("glossaryterm"), "smpldat_glossarytermindex_v2"); - fixtureEntities.put(entityRegistry.getEntitySpec("mlfeature"), "smpldat_mlfeatureindex_v2"); - 
fixtureEntities.put(entityRegistry.getEntitySpec("mlfeaturetable"), "smpldat_mlfeaturetableindex_v2"); - fixtureEntities.put(entityRegistry.getEntitySpec("mlmodelgroup"), "smpldat_mlmodelgroupindex_v2"); - fixtureEntities.put(entityRegistry.getEntitySpec("mlmodel"), "smpldat_mlmodelindex_v2"); - fixtureEntities.put(entityRegistry.getEntitySpec("mlprimarykey"), "smpldat_mlprimarykeyindex_v2"); - fixtureEntities.put(entityRegistry.getEntitySpec("tag"), "smpldat_tagindex_v2"); + fixtureEntities.put(getEntityRegistry().getEntitySpec("dataset"), "smpldat_datasetindex_v2"); + fixtureEntities.put(getEntityRegistry().getEntitySpec("chart"), "smpldat_chartindex_v2"); + fixtureEntities.put(getEntityRegistry().getEntitySpec("container"), "smpldat_containerindex_v2"); + fixtureEntities.put(getEntityRegistry().getEntitySpec("corpgroup"), "smpldat_corpgroupindex_v2"); + fixtureEntities.put(getEntityRegistry().getEntitySpec("corpuser"), "smpldat_corpuserindex_v2"); + fixtureEntities.put(getEntityRegistry().getEntitySpec("dashboard"), "smpldat_dashboardindex_v2"); + fixtureEntities.put(getEntityRegistry().getEntitySpec("dataflow"), "smpldat_dataflowindex_v2"); + fixtureEntities.put(getEntityRegistry().getEntitySpec("datajob"), "smpldat_datajobindex_v2"); + fixtureEntities.put(getEntityRegistry().getEntitySpec("domain"), "smpldat_domainindex_v2"); + fixtureEntities.put(getEntityRegistry().getEntitySpec("glossarynode"), "smpldat_glossarynodeindex_v2"); + fixtureEntities.put(getEntityRegistry().getEntitySpec("glossaryterm"), "smpldat_glossarytermindex_v2"); + fixtureEntities.put(getEntityRegistry().getEntitySpec("mlfeature"), "smpldat_mlfeatureindex_v2"); + fixtureEntities.put(getEntityRegistry().getEntitySpec("mlfeaturetable"), "smpldat_mlfeaturetableindex_v2"); + fixtureEntities.put(getEntityRegistry().getEntitySpec("mlmodelgroup"), "smpldat_mlmodelgroupindex_v2"); + fixtureEntities.put(getEntityRegistry().getEntitySpec("mlmodel"), "smpldat_mlmodelindex_v2"); + fixtureEntities.put(getEntityRegistry().getEntitySpec("mlprimarykey"), "smpldat_mlprimarykeyindex_v2"); + fixtureEntities.put(getEntityRegistry().getEntitySpec("tag"), "smpldat_tagindex_v2"); for (Map.Entry entry : fixtureEntities.entrySet()) { EntitySpec entitySpec = entry.getKey(); GetMappingsRequest req = new GetMappingsRequest().indices(entry.getValue()); - GetMappingsResponse resp = _searchClient.indices().getMapping(req, RequestOptions.DEFAULT); + GetMappingsResponse resp = getSearchClient().indices().getMapping(req, RequestOptions.DEFAULT); Map> mappings = (Map>) resp.mappings() .get(entry.getValue()).sourceAsMap().get("properties"); @@ -182,7 +178,7 @@ public void testSearchFieldConfig() throws IOException { public void testDatasetHasTags() throws IOException { GetMappingsRequest req = new GetMappingsRequest() .indices("smpldat_datasetindex_v2"); - GetMappingsResponse resp = _searchClient.indices().getMapping(req, RequestOptions.DEFAULT); + GetMappingsResponse resp = getSearchClient().indices().getMapping(req, RequestOptions.DEFAULT); Map> mappings = (Map>) resp.mappings() .get("smpldat_datasetindex_v2").sourceAsMap().get("properties"); assertTrue(mappings.containsKey("hasTags")); @@ -191,11 +187,11 @@ public void testDatasetHasTags() throws IOException { @Test public void testFixtureInitialization() { - assertNotNull(searchService); - SearchResult noResult = searchAcrossEntities(searchService, "no results"); + assertNotNull(getSearchService()); + SearchResult noResult = searchAcrossEntities(getSearchService(), "no results"); assertEquals(0, 
noResult.getEntities().size()); - final SearchResult result = searchAcrossEntities(searchService, "test"); + final SearchResult result = searchAcrossEntities(getSearchService(), "test"); Map expectedTypes = Map.of( "dataset", 13, @@ -209,7 +205,7 @@ public void testFixtureInitialization() { Map> actualTypes = new HashMap<>(); for (String key : expectedTypes.keySet()) { actualTypes.put(key, result.getEntities().stream() - .map(SearchEntity::getEntity).filter(entity -> key.equals(entity.getEntityType())).collect(Collectors.toList())); + .map(SearchEntity::getEntity).filter(entity -> key.equals(entity.getEntityType())).collect(Collectors.toList())); } expectedTypes.forEach((key, value) -> @@ -241,7 +237,7 @@ public void testDataPlatform() { .build(); expected.forEach((key, value) -> { - SearchResult result = searchAcrossEntities(searchService, key); + SearchResult result = searchAcrossEntities(getSearchService(), key); assertEquals(result.getEntities().size(), value.intValue(), String.format("Unexpected data platform `%s` hits.", key)); // max is 100 without pagination }); @@ -257,14 +253,14 @@ public void testUrn() { "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_BOOL_LIST_feature)", "urn:li:mlModel:(urn:li:dataPlatform:science,scienceModel,PROD)" ).forEach(query -> - assertTrue(searchAcrossEntities(searchService, query).getEntities().size() >= 1, - String.format("Unexpected >1 urn result for `%s`", query)) + assertTrue(searchAcrossEntities(getSearchService(), query).getEntities().size() >= 1, + String.format("Unexpected >1 urn result for `%s`", query)) ); } @Test public void testExactTable() { - SearchResult results = searchAcrossEntities(searchService, "stg_customers"); + SearchResult results = searchAcrossEntities(getSearchService(), "stg_customers"); assertEquals(results.getEntities().size(), 1, "Unexpected single urn result for `stg_customers`"); assertEquals(results.getEntities().get(0).getEntity().toString(), "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.stg_customers,PROD)"); @@ -281,7 +277,7 @@ public void testStemming() { testSets.forEach(testSet -> { Integer expectedResults = null; for (String testQuery : testSet) { - SearchResult results = searchAcrossEntities(searchService, testQuery); + SearchResult results = searchAcrossEntities(getSearchService(), testQuery); assertTrue(results.hasEntities() && !results.getEntities().isEmpty(), String.format("Expected search results for `%s`", testQuery)); @@ -299,7 +295,7 @@ public void testStemmingOverride() throws IOException { Set testSet = Set.of("customer", "customers"); Set results = testSet.stream() - .map(test -> searchAcrossEntities(searchService, test)) + .map(test -> searchAcrossEntities(getSearchService(), test)) .collect(Collectors.toSet()); results.forEach(r -> assertTrue(r.hasEntities() && !r.getEntities().isEmpty(), "Expected search results")); @@ -352,7 +348,7 @@ public void testDelimitedSynonym() throws IOException { "customer acquisition cost" ); List resultCounts = testSet.stream().map(q -> { - SearchResult result = searchAcrossEntities(searchService, q); + SearchResult result = searchAcrossEntities(getSearchService(), q); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), "Expected search results for: " + q); return result.getEntities().size(); @@ -363,26 +359,26 @@ public void testDelimitedSynonym() throws IOException { public void testNegateAnalysis() throws IOException { String queryWithMinus = "logging_events -bckp"; AnalyzeRequest request = 
AnalyzeRequest.withIndexAnalyzer( - "smpldat_datasetindex_v2", - "query_word_delimited", queryWithMinus + "smpldat_datasetindex_v2", + "query_word_delimited", queryWithMinus ); assertEquals(getTokens(request) - .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), - List.of("logging_events -bckp", "logging_ev", "-bckp", "log", "event", "bckp")); + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + List.of("logging_events -bckp", "logging_ev", "-bckp", "log", "event", "bckp")); request = AnalyzeRequest.withIndexAnalyzer( - "smpldat_datasetindex_v2", - "word_gram_3", queryWithMinus + "smpldat_datasetindex_v2", + "word_gram_3", queryWithMinus ); assertEquals(getTokens(request) - .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("logging events -bckp")); + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("logging events -bckp")); request = AnalyzeRequest.withIndexAnalyzer( - "smpldat_datasetindex_v2", - "word_gram_4", queryWithMinus + "smpldat_datasetindex_v2", + "word_gram_4", queryWithMinus ); assertEquals(getTokens(request) - .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of()); + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of()); } @@ -391,49 +387,49 @@ public void testWordGram() throws IOException { String text = "hello.cat_cool_customer"; AnalyzeRequest request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", text); assertEquals(getTokens(request) - .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat", "cat cool", "cool customer")); + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat", "cat cool", "cool customer")); request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", text); assertEquals(getTokens(request) - .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat cool", "cat cool customer")); + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat cool", "cat cool customer")); request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", text); assertEquals(getTokens(request) - .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat cool customer")); + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat cool customer")); String testMoreSeparators = "quick.brown:fox jumped-LAZY_Dog"; request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", testMoreSeparators); assertEquals(getTokens(request) - .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), - List.of("quick brown", "brown fox", "fox jumped", "jumped lazy", "lazy dog")); + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + List.of("quick brown", "brown fox", "fox jumped", "jumped lazy", "lazy dog")); request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", testMoreSeparators); assertEquals(getTokens(request) - .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), - List.of("quick brown fox", "brown fox jumped", "fox jumped lazy", "jumped lazy dog")); + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + List.of("quick brown fox", "brown fox jumped", "fox jumped lazy", "jumped lazy dog")); request = 
AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", testMoreSeparators); assertEquals(getTokens(request) - .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), - List.of("quick brown fox jumped", "brown fox jumped lazy", "fox jumped lazy dog")); + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + List.of("quick brown fox jumped", "brown fox jumped lazy", "fox jumped lazy dog")); String textWithQuotesAndDuplicateWord = "\"my_db.my_exact_table\""; request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", textWithQuotesAndDuplicateWord); assertEquals(getTokens(request) - .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db", "db my", "my exact", "exact table")); + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db", "db my", "my exact", "exact table")); request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", textWithQuotesAndDuplicateWord); assertEquals(getTokens(request) - .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db my", "db my exact", "my exact table")); + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db my", "db my exact", "my exact table")); request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", textWithQuotesAndDuplicateWord); assertEquals(getTokens(request) - .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db my exact", "db my exact table")); + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db my exact", "db my exact table")); String textWithParens = "(hi) there"; request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", textWithParens); assertEquals(getTokens(request) - .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hi there")); + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hi there")); String oneWordText = "hello"; for (String analyzer : List.of("word_gram_2", "word_gram_3", "word_gram_4")) { request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", analyzer, oneWordText); assertEquals(getTokens(request) - .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of()); + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of()); } } @@ -463,7 +459,7 @@ public void testUrnSynonym() throws IOException { "big query" ); List results = testSet.stream().map(query -> { - SearchResult result = searchAcrossEntities(searchService, query); + SearchResult result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), "Expected search results for: " + query); return result; }).collect(Collectors.toList()); @@ -504,9 +500,9 @@ public void testTokenizationWithNumber() throws IOException { ); List tokens = getTokens(request).map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()); assertEquals(tokens, List.of( - "harshal-playground-306419", "harshal", "playground", "306419", - "test_schema", "test", "schema", - "austin311_deriv", "austin311", "deriv"), + "harshal-playground-306419", "harshal", "playground", "306419", + "test_schema", "test", "schema", + "austin311_deriv", "austin311", "deriv"), String.format("Unexpected tokens. 
Found %s", tokens)); request = AnalyzeRequest.withIndexAnalyzer( @@ -622,7 +618,7 @@ public void testChartAutoComplete() throws InterruptedException, IOException { List.of("B", "Ba", "Baz", "Baz ", "Baz C", "Baz Ch", "Baz Cha", "Baz Char", "Baz Chart", "Baz Chart ") .forEach(query -> { try { - AutoCompleteResults result = autocomplete(new ChartType(entityClient), query); + AutoCompleteResults result = autocomplete(new ChartType(getEntityClient()), query); assertTrue(result.getEntities().size() == 2, String.format("Expected 2 results for `%s` found %s", query, result.getEntities().size())); } catch (Exception e) { @@ -637,7 +633,7 @@ public void testDatasetAutoComplete() { "excess_deaths_de", "excess_deaths_der", "excess_deaths_derived") .forEach(query -> { try { - AutoCompleteResults result = autocomplete(new DatasetType(entityClient), query); + AutoCompleteResults result = autocomplete(new DatasetType(getEntityClient()), query); assertTrue(result.getEntities().size() >= 1, String.format("Expected >= 1 results for `%s` found %s", query, result.getEntities().size())); } catch (Exception e) { @@ -652,7 +648,7 @@ public void testContainerAutoComplete() { "container-autocomp-test") .forEach(query -> { try { - AutoCompleteResults result = autocomplete(new ContainerType(entityClient), query); + AutoCompleteResults result = autocomplete(new ContainerType(getEntityClient()), query); assertTrue(result.getEntities().size() >= 1, String.format("Expected >= 1 results for `%s` found %s", query, result.getEntities().size())); } catch (Exception e) { @@ -666,7 +662,7 @@ public void testGroupAutoComplete() { List.of("T", "Te", "Tes", "Test ", "Test G", "Test Gro", "Test Group ") .forEach(query -> { try { - AutoCompleteResults result = autocomplete(new CorpGroupType(entityClient), query); + AutoCompleteResults result = autocomplete(new CorpGroupType(getEntityClient()), query); assertTrue(result.getEntities().size() == 1, String.format("Expected 1 results for `%s` found %s", query, result.getEntities().size())); } catch (Exception e) { @@ -680,7 +676,7 @@ public void testUserAutoComplete() { List.of("D", "Da", "Dat", "Data ", "Data H", "Data Hu", "Data Hub", "Data Hub ") .forEach(query -> { try { - AutoCompleteResults result = autocomplete(new CorpUserType(entityClient, null), query); + AutoCompleteResults result = autocomplete(new CorpUserType(getEntityClient(), null), query); assertTrue(result.getEntities().size() >= 1, String.format("Expected at least 1 results for `%s` found %s", query, result.getEntities().size())); } catch (Exception e) { @@ -702,7 +698,7 @@ public void testSmokeTestQueries() { ); Map results = expectedFulltextMinimums.entrySet().stream() - .collect(Collectors.toMap(Map.Entry::getKey, entry -> searchAcrossEntities(searchService, entry.getKey()))); + .collect(Collectors.toMap(Map.Entry::getKey, entry -> searchAcrossEntities(getSearchService(), entry.getKey()))); results.forEach((key, value) -> { Integer actualCount = value.getEntities().size(); @@ -719,7 +715,7 @@ public void testSmokeTestQueries() { ); results = expectedStructuredMinimums.entrySet().stream() - .collect(Collectors.toMap(Map.Entry::getKey, entry -> searchStructured(searchService, entry.getKey()))); + .collect(Collectors.toMap(Map.Entry::getKey, entry -> searchStructured(getSearchService(), entry.getKey()))); results.forEach((key, value) -> { Integer actualCount = value.getEntities().size(); @@ -772,7 +768,7 @@ public void testUnderscore() throws IOException { @Test public void testFacets() { Set expectedFacets = 
Set.of("entity", "typeNames", "platform", "origin", "tags"); - SearchResult testResult = searchAcrossEntities(searchService, "cypress"); + SearchResult testResult = searchAcrossEntities(getSearchService(), "cypress"); expectedFacets.forEach(facet -> { assertTrue(testResult.getMetadata().getAggregations().stream().anyMatch(agg -> agg.getName().equals(facet)), String.format("Failed to find facet `%s` in %s", facet, @@ -780,7 +776,7 @@ public void testFacets() { .map(AggregationMetadata::getName).collect(Collectors.toList()))); }); AggregationMetadata entityAggMeta = testResult.getMetadata().getAggregations().stream().filter( - aggMeta -> aggMeta.getName().equals("entity")).findFirst().get(); + aggMeta -> aggMeta.getName().equals("entity")).findFirst().get(); Map expectedEntityTypeCounts = new HashMap<>(); expectedEntityTypeCounts.put("container", 0L); expectedEntityTypeCounts.put("corpuser", 0L); @@ -805,28 +801,28 @@ public void testFacets() { @Test public void testNestedAggregation() { Set expectedFacets = Set.of("platform"); - SearchResult testResult = searchAcrossEntities(searchService, "cypress", List.copyOf(expectedFacets)); + SearchResult testResult = searchAcrossEntities(getSearchService(), "cypress", List.copyOf(expectedFacets)); assertEquals(testResult.getMetadata().getAggregations().size(), 1); expectedFacets.forEach(facet -> { assertTrue(testResult.getMetadata().getAggregations().stream().anyMatch(agg -> agg.getName().equals(facet)), - String.format("Failed to find facet `%s` in %s", facet, - testResult.getMetadata().getAggregations().stream() - .map(AggregationMetadata::getName).collect(Collectors.toList()))); + String.format("Failed to find facet `%s` in %s", facet, + testResult.getMetadata().getAggregations().stream() + .map(AggregationMetadata::getName).collect(Collectors.toList()))); }); expectedFacets = Set.of("platform", "typeNames", "_entityType", "entity"); - SearchResult testResult2 = searchAcrossEntities(searchService, "cypress", List.copyOf(expectedFacets)); + SearchResult testResult2 = searchAcrossEntities(getSearchService(), "cypress", List.copyOf(expectedFacets)); assertEquals(testResult2.getMetadata().getAggregations().size(), 4); expectedFacets.forEach(facet -> { assertTrue(testResult2.getMetadata().getAggregations().stream().anyMatch(agg -> agg.getName().equals(facet)), - String.format("Failed to find facet `%s` in %s", facet, - testResult2.getMetadata().getAggregations().stream() - .map(AggregationMetadata::getName).collect(Collectors.toList()))); + String.format("Failed to find facet `%s` in %s", facet, + testResult2.getMetadata().getAggregations().stream() + .map(AggregationMetadata::getName).collect(Collectors.toList()))); }); AggregationMetadata entityTypeAggMeta = testResult2.getMetadata().getAggregations().stream().filter( - aggMeta -> aggMeta.getName().equals("_entityType")).findFirst().get(); + aggMeta -> aggMeta.getName().equals("_entityType")).findFirst().get(); AggregationMetadata entityAggMeta = testResult2.getMetadata().getAggregations().stream().filter( - aggMeta -> aggMeta.getName().equals("entity")).findFirst().get(); + aggMeta -> aggMeta.getName().equals("entity")).findFirst().get(); assertEquals(entityTypeAggMeta.getAggregations(), entityAggMeta.getAggregations()); Map expectedEntityTypeCounts = new HashMap<>(); expectedEntityTypeCounts.put("container", 0L); @@ -849,24 +845,24 @@ public void testNestedAggregation() { assertEquals(entityTypeAggMeta.getAggregations(), expectedEntityTypeCounts); expectedFacets = Set.of("platform", "typeNames", 
"entity"); - SearchResult testResult3 = searchAcrossEntities(searchService, "cypress", List.copyOf(expectedFacets)); + SearchResult testResult3 = searchAcrossEntities(getSearchService(), "cypress", List.copyOf(expectedFacets)); assertEquals(testResult3.getMetadata().getAggregations().size(), 4); expectedFacets.forEach(facet -> { assertTrue(testResult3.getMetadata().getAggregations().stream().anyMatch(agg -> agg.getName().equals(facet)), - String.format("Failed to find facet `%s` in %s", facet, - testResult3.getMetadata().getAggregations().stream() - .map(AggregationMetadata::getName).collect(Collectors.toList()))); + String.format("Failed to find facet `%s` in %s", facet, + testResult3.getMetadata().getAggregations().stream() + .map(AggregationMetadata::getName).collect(Collectors.toList()))); }); AggregationMetadata entityTypeAggMeta3 = testResult3.getMetadata().getAggregations().stream().filter( - aggMeta -> aggMeta.getName().equals("_entityType")).findFirst().get(); + aggMeta -> aggMeta.getName().equals("_entityType")).findFirst().get(); AggregationMetadata entityAggMeta3 = testResult3.getMetadata().getAggregations().stream().filter( - aggMeta -> aggMeta.getName().equals("entity")).findFirst().get(); + aggMeta -> aggMeta.getName().equals("entity")).findFirst().get(); assertEquals(entityTypeAggMeta3.getAggregations(), entityAggMeta3.getAggregations()); assertEquals(entityTypeAggMeta3.getAggregations(), expectedEntityTypeCounts); String singleNestedFacet = String.format("_entityType%sowners", AGGREGATION_SEPARATOR_CHAR); expectedFacets = Set.of(singleNestedFacet); - SearchResult testResultSingleNested = searchAcrossEntities(searchService, "cypress", List.copyOf(expectedFacets)); + SearchResult testResultSingleNested = searchAcrossEntities(getSearchService(), "cypress", List.copyOf(expectedFacets)); assertEquals(testResultSingleNested.getMetadata().getAggregations().size(), 1); Map expectedNestedFacetCounts = new HashMap<>(); expectedNestedFacetCounts.put("datajob␞urn:li:corpuser:datahub", 2L); @@ -885,17 +881,17 @@ public void testNestedAggregation() { assertEquals(testResultSingleNested.getMetadata().getAggregations().get(0).getAggregations(), expectedNestedFacetCounts); expectedFacets = Set.of("platform", singleNestedFacet, "typeNames", "origin"); - SearchResult testResultNested = searchAcrossEntities(searchService, "cypress", List.copyOf(expectedFacets)); + SearchResult testResultNested = searchAcrossEntities(getSearchService(), "cypress", List.copyOf(expectedFacets)); assertEquals(testResultNested.getMetadata().getAggregations().size(), 4); expectedFacets.forEach(facet -> { assertTrue(testResultNested.getMetadata().getAggregations().stream().anyMatch(agg -> agg.getName().equals(facet)), - String.format("Failed to find facet `%s` in %s", facet, - testResultNested.getMetadata().getAggregations().stream() - .map(AggregationMetadata::getName).collect(Collectors.toList()))); + String.format("Failed to find facet `%s` in %s", facet, + testResultNested.getMetadata().getAggregations().stream() + .map(AggregationMetadata::getName).collect(Collectors.toList()))); }); List expectedNestedAgg = testResultNested.getMetadata().getAggregations().stream().filter( - agg -> agg.getName().equals(singleNestedFacet)).collect(Collectors.toList()); + agg -> agg.getName().equals(singleNestedFacet)).collect(Collectors.toList()); assertEquals(expectedNestedAgg.size(), 1); AggregationMetadata nestedAgg = expectedNestedAgg.get(0); assertEquals(nestedAgg.getDisplayName(), String.format("Type%sOwned By", 
AGGREGATION_SEPARATOR_CHAR)); @@ -959,7 +955,7 @@ public void testScrollAcrossEntities() throws IOException { int totalResults = 0; String scrollId = null; do { - ScrollResult result = scroll(searchService, query, batchSize, scrollId); + ScrollResult result = scroll(getSearchService(), query, batchSize, scrollId); int numResults = result.hasEntities() ? result.getEntities().size() : 0; assertTrue(numResults <= batchSize); totalResults += numResults; @@ -972,13 +968,13 @@ public void testScrollAcrossEntities() throws IOException { @Test public void testSearchAcrossMultipleEntities() { String query = "logging_events"; - SearchResult result = search(searchService, query); + SearchResult result = search(getSearchService(), query); assertEquals((int) result.getNumEntities(), 8); - result = search(searchService, List.of(DATASET_ENTITY_NAME, DATA_JOB_ENTITY_NAME), query); + result = search(getSearchService(), List.of(DATASET_ENTITY_NAME, DATA_JOB_ENTITY_NAME), query); assertEquals((int) result.getNumEntities(), 8); - result = search(searchService, List.of(DATASET_ENTITY_NAME), query); + result = search(getSearchService(), List.of(DATASET_ENTITY_NAME), query); assertEquals((int) result.getNumEntities(), 4); - result = search(searchService, List.of(DATA_JOB_ENTITY_NAME), query); + result = search(getSearchService(), List.of(DATA_JOB_ENTITY_NAME), query); assertEquals((int) result.getNumEntities(), 4); } @@ -1046,7 +1042,7 @@ public void testFragmentUrns() { ); testSet.forEach(query -> { - SearchResult result = searchAcrossEntities(searchService, query); + SearchResult result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected partial urn search results", query)); @@ -1064,7 +1060,7 @@ public void testPlatformTest() { List results = testFields.stream() .map(fieldName -> { final String query = String.format("%s:%s", fieldName, testPlatform.replaceAll(":", "\\\\:")); - SearchResult result = searchStructured(searchService, query); + SearchResult result = searchStructured(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); assertTrue(result.getEntities().stream().noneMatch(e -> e.getMatchedFields().isEmpty()), @@ -1095,7 +1091,7 @@ public void testPlatformTest() { // Test field variations with/without .keyword List entityClientResults = testFilters.stream().map(filter -> { try { - return entityClient.search("dataset", "*", filter, null, 0, 100, + return getEntityClient().search("dataset", "*", filter, null, 0, 100, AUTHENTICATION, new SearchFlags().setFulltext(fulltextFlag)); } catch (RemoteInvocationException e) { throw new RuntimeException(e); @@ -1112,7 +1108,7 @@ public void testPlatformTest() { @Test public void testStructQueryFieldMatch() { String query = STRUCTURED_QUERY_PREFIX + "name: customers"; - SearchResult result = searchAcrossEntities(searchService, query); + SearchResult result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); @@ -1125,7 +1121,7 @@ public void testStructQueryFieldMatch() { @Test public void testStructQueryFieldPrefixMatch() { String query = STRUCTURED_QUERY_PREFIX + "name: customers*"; - SearchResult result = searchAcrossEntities(searchService, query); + SearchResult result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && 
!result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); @@ -1138,7 +1134,7 @@ public void testStructQueryFieldPrefixMatch() { @Test public void testStructQueryCustomPropertiesKeyPrefix() { String query = STRUCTURED_QUERY_PREFIX + "customProperties: node_type=*"; - SearchResult result = searchAcrossEntities(searchService, query); + SearchResult result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); @@ -1151,7 +1147,7 @@ public void testStructQueryCustomPropertiesKeyPrefix() { @Test public void testStructQueryCustomPropertiesMatch() { String query = STRUCTURED_QUERY_PREFIX + "customProperties: node_type=model"; - SearchResult result = searchAcrossEntities(searchService, query); + SearchResult result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); @@ -1169,7 +1165,7 @@ public void testCustomPropertiesQuoted() { ); Map results = expectedResults.entrySet().stream() - .collect(Collectors.toMap(Map.Entry::getKey, entry -> searchAcrossEntities(searchService, entry.getKey()))); + .collect(Collectors.toMap(Map.Entry::getKey, entry -> searchAcrossEntities(getSearchService(), entry.getKey()))); results.forEach((key, value) -> { Integer actualCount = value.getEntities().size(); @@ -1183,7 +1179,7 @@ public void testCustomPropertiesQuoted() { @Test public void testStructQueryFieldPaths() { String query = STRUCTURED_QUERY_PREFIX + "fieldPaths: customer_id"; - SearchResult result = searchAcrossEntities(searchService, query); + SearchResult result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); @@ -1196,7 +1192,7 @@ public void testStructQueryFieldPaths() { @Test public void testStructQueryBoolean() { String query = STRUCTURED_QUERY_PREFIX + "editedFieldTags:urn\\:li\\:tag\\:Legacy OR tags:urn\\:li\\:tag\\:testTag"; - SearchResult result = searchAcrossEntities(searchService, query); + SearchResult result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); @@ -1206,7 +1202,7 @@ public void testStructQueryBoolean() { assertEquals(result.getEntities().size(), 2); query = STRUCTURED_QUERY_PREFIX + "editedFieldTags:urn\\:li\\:tag\\:Legacy"; - result = searchAcrossEntities(searchService, query); + result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); @@ -1216,7 +1212,7 @@ public void testStructQueryBoolean() { assertEquals(result.getEntities().size(), 1); query = STRUCTURED_QUERY_PREFIX + "tags:urn\\:li\\:tag\\:testTag"; - result = searchAcrossEntities(searchService, query); + result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); @@ -1229,7 +1225,7 @@ public void testStructQueryBoolean() { @Test public void testStructQueryBrowsePaths() { String query = STRUCTURED_QUERY_PREFIX + "browsePaths:*/dbt/*"; - SearchResult result = searchAcrossEntities(searchService, query); + SearchResult result = searchAcrossEntities(getSearchService(), query); 
assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); @@ -1242,7 +1238,7 @@ public void testStructQueryBrowsePaths() { @Test public void testOr() { String query = "stg_customers | logging_events"; - SearchResult result = searchAcrossEntities(searchService, query); + SearchResult result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); assertTrue(result.getEntities().stream().noneMatch(e -> e.getMatchedFields().isEmpty()), @@ -1250,7 +1246,7 @@ public void testOr() { assertEquals(result.getEntities().size(), 9); query = "stg_customers"; - result = searchAcrossEntities(searchService, query); + result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); assertTrue(result.getEntities().stream().noneMatch(e -> e.getMatchedFields().isEmpty()), @@ -1258,7 +1254,7 @@ public void testOr() { assertEquals(result.getEntities().size(), 1); query = "logging_events"; - result = searchAcrossEntities(searchService, query); + result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); assertTrue(result.getEntities().stream().noneMatch(e -> e.getMatchedFields().isEmpty()), @@ -1269,7 +1265,7 @@ public void testOr() { @Test public void testNegate() { String query = "logging_events -bckp"; - SearchResult result = searchAcrossEntities(searchService, query); + SearchResult result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); assertTrue(result.getEntities().stream().noneMatch(e -> e.getMatchedFields().isEmpty()), @@ -1277,7 +1273,7 @@ public void testNegate() { assertEquals(result.getEntities().size(), 7); query = "logging_events"; - result = searchAcrossEntities(searchService, query); + result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); assertTrue(result.getEntities().stream().noneMatch(e -> e.getMatchedFields().isEmpty()), @@ -1288,7 +1284,7 @@ public void testNegate() { @Test public void testPrefix() { String query = "bigquery"; - SearchResult result = searchAcrossEntities(searchService, query); + SearchResult result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); assertTrue(result.getEntities().stream().noneMatch(e -> e.getMatchedFields().isEmpty()), @@ -1296,7 +1292,7 @@ public void testPrefix() { assertEquals(result.getEntities().size(), 8); query = "big*"; - result = searchAcrossEntities(searchService, query); + result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); assertTrue(result.getEntities().stream().noneMatch(e -> e.getMatchedFields().isEmpty()), @@ -1307,7 +1303,7 @@ public void testPrefix() { @Test public void testParens() { String query = "dbt | (bigquery + covid19)"; - SearchResult result = searchAcrossEntities(searchService, query); + SearchResult result = 
searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); assertTrue(result.getEntities().stream().noneMatch(e -> e.getMatchedFields().isEmpty()), @@ -1315,7 +1311,7 @@ public void testParens() { assertEquals(result.getEntities().size(), 11); query = "dbt"; - result = searchAcrossEntities(searchService, query); + result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); assertTrue(result.getEntities().stream().noneMatch(e -> e.getMatchedFields().isEmpty()), @@ -1323,7 +1319,7 @@ public void testParens() { assertEquals(result.getEntities().size(), 9); query = "bigquery + covid19"; - result = searchAcrossEntities(searchService, query); + result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); assertTrue(result.getEntities().stream().noneMatch(e -> e.getMatchedFields().isEmpty()), @@ -1331,7 +1327,7 @@ public void testParens() { assertEquals(result.getEntities().size(), 2); query = "bigquery"; - result = searchAcrossEntities(searchService, query); + result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); assertTrue(result.getEntities().stream().noneMatch(e -> e.getMatchedFields().isEmpty()), @@ -1339,7 +1335,7 @@ public void testParens() { assertEquals(result.getEntities().size(), 8); query = "covid19"; - result = searchAcrossEntities(searchService, query); + result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); assertTrue(result.getEntities().stream().noneMatch(e -> e.getMatchedFields().isEmpty()), @@ -1349,55 +1345,55 @@ public void testParens() { @Test public void testGram() { String query = "jaffle shop customers"; - SearchResult result = searchAcrossEntities(searchService, query); + SearchResult result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), - String.format("%s - Expected search results", query)); + String.format("%s - Expected search results", query)); assertEquals(result.getEntities().get(0).getEntity().toString(), - "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.customers,PROD)", - "Expected exact match in 1st position"); + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.customers,PROD)", + "Expected exact match in 1st position"); query = "shop customers source"; - result = searchAcrossEntities(searchService, query); + result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), - String.format("%s - Expected search results", query)); + String.format("%s - Expected search results", query)); assertEquals(result.getEntities().get(0).getEntity().toString(), - "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.customers_source,PROD)", - "Expected ngram match in 1st position"); + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.customers_source,PROD)", + "Expected ngram match in 1st position"); query = "jaffle shop stg customers"; - result = 
searchAcrossEntities(searchService, query); + result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), - String.format("%s - Expected search results", query)); + String.format("%s - Expected search results", query)); assertEquals(result.getEntities().get(0).getEntity().toString(), - "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.stg_customers,PROD)", - "Expected ngram match in 1st position"); + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.stg_customers,PROD)", + "Expected ngram match in 1st position"); query = "jaffle shop transformers customers"; - result = searchAcrossEntities(searchService, query); + result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), - String.format("%s - Expected search results", query)); + String.format("%s - Expected search results", query)); assertEquals(result.getEntities().get(0).getEntity().toString(), - "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.transformers_customers,PROD)", - "Expected ngram match in 1st position"); + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.transformers_customers,PROD)", + "Expected ngram match in 1st position"); query = "shop raw customers"; - result = searchAcrossEntities(searchService, query); + result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), - String.format("%s - Expected search results", query)); + String.format("%s - Expected search results", query)); assertEquals(result.getEntities().get(0).getEntity().toString(), - "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_customers,PROD)", - "Expected ngram match in 1st position"); + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_customers,PROD)", + "Expected ngram match in 1st position"); } @Test public void testPrefixVsExact() { String query = "\"customers\""; - SearchResult result = searchAcrossEntities(searchService, query); + SearchResult result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); @@ -1415,7 +1411,7 @@ public void testPrefixVsExact() { public void testPrefixVsExactCaseSensitivity() { List insensitiveExactMatches = List.of("testExactMatchCase", "testexactmatchcase", "TESTEXACTMATCHCASE"); for (String query : insensitiveExactMatches) { - SearchResult result = searchAcrossEntities(searchService, query); + SearchResult result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), String.format("%s - Expected search results", query)); @@ -1432,33 +1428,33 @@ public void testPrefixVsExactCaseSensitivity() { @Test public void testColumnExactMatch() { String query = "unit_data"; - SearchResult result = searchAcrossEntities(searchService, query); + SearchResult result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), - String.format("%s - Expected search results", query)); + String.format("%s - Expected search results", query)); assertTrue(result.getEntities().stream().noneMatch(e -> e.getMatchedFields().isEmpty()), - String.format("%s - Expected search results to include matched fields", query)); + String.format("%s - Expected search results to include matched 
fields", query)); assertTrue(result.getEntities().size() > 2, - String.format("%s - Expected search results to have at least two results", query)); + String.format("%s - Expected search results to have at least two results", query)); assertEquals(result.getEntities().get(0).getEntity().toString(), - "urn:li:dataset:(urn:li:dataPlatform:testOnly," + query + ",PROD)", - "Expected table name exact match first"); + "urn:li:dataset:(urn:li:dataPlatform:testOnly," + query + ",PROD)", + "Expected table name exact match first"); query = "special_column_only_present_here_info"; - result = searchAcrossEntities(searchService, query); + result = searchAcrossEntities(getSearchService(), query); assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), - String.format("%s - Expected search results", query)); + String.format("%s - Expected search results", query)); assertTrue(result.getEntities().stream().noneMatch(e -> e.getMatchedFields().isEmpty()), - String.format("%s - Expected search results to include matched fields", query)); + String.format("%s - Expected search results to include matched fields", query)); assertTrue(result.getEntities().size() > 2, - String.format("%s - Expected search results to have at least two results", query)); + String.format("%s - Expected search results to have at least two results", query)); assertEquals(result.getEntities().get(0).getEntity().toString(), - "urn:li:dataset:(urn:li:dataPlatform:testOnly," + "important_units" + ",PROD)", - "Expected table with column name exact match first"); + "urn:li:dataset:(urn:li:dataPlatform:testOnly," + "important_units" + ",PROD)", + "Expected table with column name exact match first"); } private Stream getTokens(AnalyzeRequest request) throws IOException { - return _searchClient.indices().analyze(request, RequestOptions.DEFAULT).getTokens().stream(); + return getSearchClient().indices().analyze(request, RequestOptions.DEFAULT).getTokens().stream(); } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/indexbuilder/IndexBuilderTestBase.java similarity index 85% rename from metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilderTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/search/indexbuilder/IndexBuilderTestBase.java index 2416280cb8f93..4472af339c074 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/indexbuilder/IndexBuilderTestBase.java @@ -1,43 +1,40 @@ -package com.linkedin.metadata.search.elasticsearch.indexbuilder; +package com.linkedin.metadata.search.indexbuilder; -import com.linkedin.metadata.config.search.ElasticSearchConfiguration; import com.google.common.collect.ImmutableMap; -import com.linkedin.metadata.ESTestConfiguration; +import com.linkedin.metadata.config.search.ElasticSearchConfiguration; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; import com.linkedin.metadata.systemmetadata.SystemMetadataMappingsBuilder; import com.linkedin.metadata.version.GitVersion; -import java.util.Optional; -import org.elasticsearch.ElasticsearchException; -import org.elasticsearch.action.admin.indices.alias.get.GetAliasesRequest; -import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest; -import org.elasticsearch.client.RestHighLevelClient; -import 
org.elasticsearch.client.indices.GetIndexRequest; -import org.elasticsearch.client.IndicesClient; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.indices.GetIndexResponse; -import org.elasticsearch.cluster.metadata.AliasMetadata; -import org.elasticsearch.rest.RestStatus; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.context.annotation.Import; +import org.opensearch.OpenSearchException; +import org.opensearch.action.admin.indices.alias.get.GetAliasesRequest; +import org.opensearch.action.admin.indices.delete.DeleteIndexRequest; +import org.opensearch.client.IndicesClient; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.RestHighLevelClient; +import org.opensearch.client.indices.GetIndexRequest; +import org.opensearch.client.indices.GetIndexResponse; +import org.opensearch.cluster.metadata.AliasMetadata; +import org.opensearch.rest.RestStatus; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; +import javax.annotation.Nonnull; import java.io.IOException; import java.util.Arrays; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.stream.Collectors; -import static org.testng.Assert.assertTrue; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertNotEquals; +import static org.testng.Assert.*; + +abstract public class IndexBuilderTestBase extends AbstractTestNGSpringContextTests { -@Import(ESTestConfiguration.class) -public class ESIndexBuilderTest extends AbstractTestNGSpringContextTests { + @Nonnull + abstract protected RestHighLevelClient getSearchClient(); - @Autowired - private RestHighLevelClient _searchClient; private static IndicesClient _indexClient; private static final String TEST_INDEX_NAME = "esindex_builder_test"; private static ESIndexBuilder testDefaultBuilder; @@ -45,9 +42,9 @@ public class ESIndexBuilderTest extends AbstractTestNGSpringContextTests { @BeforeClass public void setup() { - _indexClient = _searchClient.indices(); + _indexClient = getSearchClient().indices(); GitVersion gitVersion = new GitVersion("0.0.0-test", "123456", Optional.empty()); - testDefaultBuilder = new ESIndexBuilder(_searchClient, 1, 0, 0, + testDefaultBuilder = new ESIndexBuilder(getSearchClient(), 1, 0, 0, 0, Map.of(), false, false, new ElasticSearchConfiguration(), gitVersion); } @@ -65,7 +62,7 @@ public static void wipe() throws Exception { }); _indexClient.delete(new DeleteIndexRequest(TEST_INDEX_NAME), RequestOptions.DEFAULT); - } catch (ElasticsearchException exception) { + } catch (OpenSearchException exception) { if (exception.status() != RestStatus.NOT_FOUND) { throw exception; } @@ -79,7 +76,7 @@ public static GetIndexResponse getTestIndex() throws IOException { @Test public void testESIndexBuilderCreation() throws Exception { GitVersion gitVersion = new GitVersion("0.0.0-test", "123456", Optional.empty()); - ESIndexBuilder customIndexBuilder = new ESIndexBuilder(_searchClient, 2, 0, 1, + ESIndexBuilder customIndexBuilder = new ESIndexBuilder(getSearchClient(), 2, 0, 1, 0, Map.of(), false, false, new ElasticSearchConfiguration(), gitVersion); customIndexBuilder.buildIndex(TEST_INDEX_NAME, Map.of(), Map.of()); @@ -93,7 +90,7 @@ public void testESIndexBuilderCreation() throws Exception { @Test public void testMappingReindex() throws Exception { GitVersion gitVersion 
= new GitVersion("0.0.0-test", "123456", Optional.empty()); - ESIndexBuilder enabledMappingReindex = new ESIndexBuilder(_searchClient, 1, 0, 0, + ESIndexBuilder enabledMappingReindex = new ESIndexBuilder(getSearchClient(), 1, 0, 0, 0, Map.of(), false, true, new ElasticSearchConfiguration(), gitVersion); @@ -111,7 +108,7 @@ public void testMappingReindex() throws Exception { Map newProps = ((Map) SystemMetadataMappingsBuilder.getMappings().get("properties")) .entrySet().stream() .map(m -> !m.getKey().equals("urn") ? m - : Map.entry("urn", ImmutableMap.builder().put("type", "wildcard").build())) + : Map.entry("urn", ImmutableMap.builder().put("type", "text").build())) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); enabledMappingReindex.buildIndex(TEST_INDEX_NAME, Map.of("properties", newProps), Map.of()); @@ -134,7 +131,7 @@ public void testSettingsNumberOfShardsReindex() throws Exception { String expectedShards = "5"; GitVersion gitVersion = new GitVersion("0.0.0-test", "123456", Optional.empty()); - ESIndexBuilder changedShardBuilder = new ESIndexBuilder(_searchClient, + ESIndexBuilder changedShardBuilder = new ESIndexBuilder(getSearchClient(), Integer.parseInt(expectedShards), testDefaultBuilder.getNumReplicas(), testDefaultBuilder.getNumRetries(), @@ -162,7 +159,7 @@ public void testSettingsNumberOfShardsReindex() throws Exception { public void testSettingsNoReindex() throws Exception { GitVersion gitVersion = new GitVersion("0.0.0-test", "123456", Optional.empty()); List noReindexBuilders = List.of( - new ESIndexBuilder(_searchClient, + new ESIndexBuilder(getSearchClient(), testDefaultBuilder.getNumShards(), testDefaultBuilder.getNumReplicas() + 1, testDefaultBuilder.getNumRetries(), @@ -170,7 +167,7 @@ public void testSettingsNoReindex() throws Exception { Map.of(), true, false, new ElasticSearchConfiguration(), gitVersion), - new ESIndexBuilder(_searchClient, + new ESIndexBuilder(getSearchClient(), testDefaultBuilder.getNumShards(), testDefaultBuilder.getNumReplicas(), testDefaultBuilder.getNumRetries(), @@ -178,7 +175,7 @@ public void testSettingsNoReindex() throws Exception { Map.of(), true, false, new ElasticSearchConfiguration(), gitVersion), - new ESIndexBuilder(_searchClient, + new ESIndexBuilder(getSearchClient(), testDefaultBuilder.getNumShards() + 1, testDefaultBuilder.getNumReplicas(), testDefaultBuilder.getNumRetries(), @@ -186,7 +183,7 @@ public void testSettingsNoReindex() throws Exception { Map.of(), false, false, new ElasticSearchConfiguration(), gitVersion), - new ESIndexBuilder(_searchClient, + new ESIndexBuilder(getSearchClient(), testDefaultBuilder.getNumShards(), testDefaultBuilder.getNumReplicas() + 1, testDefaultBuilder.getNumRetries(), diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/indexbuilder/MappingsBuilderTest.java similarity index 98% rename from metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/search/indexbuilder/MappingsBuilderTest.java index 0b33185549299..0d2ce236d9f54 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/indexbuilder/MappingsBuilderTest.java @@ -1,8 +1,10 @@ -package com.linkedin.metadata.search.elasticsearch.indexbuilder; +package 
com.linkedin.metadata.search.indexbuilder;
 
 import com.google.common.collect.ImmutableMap;
 import com.linkedin.metadata.TestEntitySpecBuilder;
 import java.util.Map;
+
+import com.linkedin.metadata.search.elasticsearch.indexbuilder.MappingsBuilder;
 import org.testng.annotations.Test;
 
 import static org.testng.Assert.assertEquals;
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/GoldenOpenSearchTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/GoldenOpenSearchTest.java
new file mode 100644
index 0000000000000..3896ba749e85e
--- /dev/null
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/GoldenOpenSearchTest.java
@@ -0,0 +1,44 @@
+package com.linkedin.metadata.search.opensearch;
+
+import com.linkedin.metadata.models.registry.EntityRegistry;
+import com.linkedin.metadata.search.SearchService;
+import com.linkedin.metadata.search.fixtures.GoldenTestBase;
+import io.datahubproject.test.fixtures.search.SampleDataFixtureConfiguration;
+import io.datahubproject.test.search.config.SearchTestContainerConfiguration;
+import org.jetbrains.annotations.NotNull;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Qualifier;
+import org.springframework.context.annotation.Import;
+import org.testng.annotations.Test;
+
+import static org.testng.AssertJUnit.assertNotNull;
+
+@Import({OpenSearchSuite.class, SampleDataFixtureConfiguration.class, SearchTestContainerConfiguration.class})
+public class GoldenOpenSearchTest extends GoldenTestBase {
+
+  @Autowired
+  @Qualifier("longTailSearchService")
+  protected SearchService searchService;
+
+  @Autowired
+  @Qualifier("entityRegistry")
+  private EntityRegistry entityRegistry;
+
+
+  @NotNull
+  @Override
+  protected EntityRegistry getEntityRegistry() {
+    return entityRegistry;
+  }
+
+  @NotNull
+  @Override
+  protected SearchService getSearchService() {
+    return searchService;
+  }
+
+  @Test
+  public void initTest() {
+    assertNotNull(searchService);
+  }
+}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/IndexBuilderOpenSearchTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/IndexBuilderOpenSearchTest.java
new file mode 100644
index 0000000000000..312b56364bd91
--- /dev/null
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/IndexBuilderOpenSearchTest.java
@@ -0,0 +1,30 @@
+package com.linkedin.metadata.search.opensearch;
+
+import com.linkedin.metadata.search.indexbuilder.IndexBuilderTestBase;
+import io.datahubproject.test.search.config.SearchTestContainerConfiguration;
+import org.jetbrains.annotations.NotNull;
+import org.opensearch.client.RestHighLevelClient;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.context.annotation.Import;
+import org.testng.annotations.Test;
+
+import static org.testng.AssertJUnit.assertNotNull;
+
+
+@Import({OpenSearchSuite.class, SearchTestContainerConfiguration.class})
+public class IndexBuilderOpenSearchTest extends IndexBuilderTestBase {
+
+  @Autowired
+  private RestHighLevelClient _searchClient;
+
+  @NotNull
+  @Override
+  protected RestHighLevelClient getSearchClient() {
+    return _searchClient;
+  }
+
+  @Test
+  public void initTest() {
+    assertNotNull(_searchClient);
+  }
+}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/LineageDataFixtureOpenSearchTest.java
b/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/LineageDataFixtureOpenSearchTest.java
new file mode 100644
index 0000000000000..6fc0677ad6e39
--- /dev/null
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/LineageDataFixtureOpenSearchTest.java
@@ -0,0 +1,43 @@
+package com.linkedin.metadata.search.opensearch;
+
+import com.linkedin.metadata.search.LineageSearchService;
+import com.linkedin.metadata.search.SearchService;
+import com.linkedin.metadata.search.fixtures.LineageDataFixtureTestBase;
+import io.datahubproject.test.fixtures.search.SearchLineageFixtureConfiguration;
+import io.datahubproject.test.search.config.SearchTestContainerConfiguration;
+import org.jetbrains.annotations.NotNull;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Qualifier;
+import org.springframework.context.annotation.Import;
+import org.testng.AssertJUnit;
+import org.testng.annotations.Test;
+
+
+@Import({OpenSearchSuite.class, SearchLineageFixtureConfiguration.class, SearchTestContainerConfiguration.class})
+public class LineageDataFixtureOpenSearchTest extends LineageDataFixtureTestBase {
+
+  @Autowired
+  @Qualifier("searchLineageSearchService")
+  protected SearchService searchService;
+
+  @Autowired
+  @Qualifier("searchLineageLineageSearchService")
+  protected LineageSearchService lineageService;
+
+  @NotNull
+  @Override
+  protected LineageSearchService getLineageService() {
+    return lineageService;
+  }
+
+  @NotNull
+  @Override
+  protected SearchService getSearchService() {
+    return searchService;
+  }
+
+  @Test
+  public void initTest() {
+    AssertJUnit.assertNotNull(lineageService);
+  }
+}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/LineageServiceOpenSearchTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/LineageServiceOpenSearchTest.java
new file mode 100644
index 0000000000000..1a6242c2211fd
--- /dev/null
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/LineageServiceOpenSearchTest.java
@@ -0,0 +1,65 @@
+package com.linkedin.metadata.search.opensearch;
+
+import com.linkedin.metadata.config.search.SearchConfiguration;
+import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration;
+import com.linkedin.metadata.search.LineageServiceTestBase;
+import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder;
+import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor;
+import io.datahubproject.test.search.config.SearchCommonTestConfiguration;
+import io.datahubproject.test.search.config.SearchTestContainerConfiguration;
+import org.jetbrains.annotations.NotNull;
+import org.opensearch.client.RestHighLevelClient;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.context.annotation.Import;
+import org.testng.AssertJUnit;
+import org.testng.annotations.Test;
+
+@Import({OpenSearchSuite.class, SearchCommonTestConfiguration.class, SearchTestContainerConfiguration.class})
+public class LineageServiceOpenSearchTest extends LineageServiceTestBase {
+
+  @Autowired
+  private RestHighLevelClient _searchClient;
+  @Autowired
+  private ESBulkProcessor _bulkProcessor;
+  @Autowired
+  private ESIndexBuilder _esIndexBuilder;
+  @Autowired
+  private SearchConfiguration _searchConfiguration;
+  @Autowired
+  private CustomSearchConfiguration _customSearchConfiguration;
+
+  @NotNull
+  @Override
+  protected RestHighLevelClient getSearchClient() {
+    return
_searchClient;
+  }
+
+  @NotNull
+  @Override
+  protected ESBulkProcessor getBulkProcessor() {
+    return _bulkProcessor;
+  }
+
+  @NotNull
+  @Override
+  protected ESIndexBuilder getIndexBuilder() {
+    return _esIndexBuilder;
+  }
+
+  @NotNull
+  @Override
+  protected SearchConfiguration getSearchConfiguration() {
+    return _searchConfiguration;
+  }
+
+  @NotNull
+  @Override
+  protected CustomSearchConfiguration getCustomSearchConfiguration() {
+    return _customSearchConfiguration;
+  }
+
+  @Test
+  public void initTest() {
+    AssertJUnit.assertNotNull(_searchClient);
+  }
+}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/OpenSearchSuite.java b/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/OpenSearchSuite.java
new file mode 100644
index 0000000000000..559c623c97d5a
--- /dev/null
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/OpenSearchSuite.java
@@ -0,0 +1,31 @@
+package com.linkedin.metadata.search.opensearch;
+
+import io.datahubproject.test.search.OpenSearchTestContainer;
+import org.springframework.boot.test.context.TestConfiguration;
+import org.springframework.context.annotation.Bean;
+import org.springframework.test.context.testng.AbstractTestNGSpringContextTests;
+import org.testcontainers.containers.GenericContainer;
+import org.testng.annotations.AfterSuite;
+
+@TestConfiguration
+public class OpenSearchSuite extends AbstractTestNGSpringContextTests {
+
+  private static final OpenSearchTestContainer OPENSEARCH_TEST_CONTAINER;
+  private static GenericContainer container;
+  static {
+    OPENSEARCH_TEST_CONTAINER = new OpenSearchTestContainer();
+  }
+
+  @AfterSuite
+  public void after() {
+    OPENSEARCH_TEST_CONTAINER.stopContainer();
+  }
+
+  @Bean(name = "testSearchContainer")
+  public GenericContainer testSearchContainer() {
+    if (container == null) {
+      container = OPENSEARCH_TEST_CONTAINER.startContainer();
+    }
+    return container;
+  }
+}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/SampleDataFixtureOpenSearchTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/SampleDataFixtureOpenSearchTest.java
new file mode 100644
index 0000000000000..081eb5f70fc85
--- /dev/null
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/SampleDataFixtureOpenSearchTest.java
@@ -0,0 +1,44 @@
+package com.linkedin.metadata.search.opensearch;
+
+import com.linkedin.entity.client.EntityClient;
+import com.linkedin.metadata.models.registry.EntityRegistry;
+import com.linkedin.metadata.search.SearchService;
+import com.linkedin.metadata.search.fixtures.SampleDataFixtureTestBase;
+import io.datahubproject.test.fixtures.search.SampleDataFixtureConfiguration;
+import io.datahubproject.test.search.config.SearchTestContainerConfiguration;
+import lombok.Getter;
+import org.opensearch.client.RestHighLevelClient;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Qualifier;
+import org.springframework.context.annotation.Import;
+import org.testng.annotations.Test;
+
+import static org.testng.AssertJUnit.assertNotNull;
+
+
+/**
+ * Runs sample data fixture tests for Opensearch test container
+ */
+@Getter
+@Import({OpenSearchSuite.class, SampleDataFixtureConfiguration.class, SearchTestContainerConfiguration.class})
+public class SampleDataFixtureOpenSearchTest extends SampleDataFixtureTestBase {
+  @Autowired
+  private RestHighLevelClient searchClient;
+
+  @Autowired
+  @Qualifier("sampleDataSearchService")
protected SearchService searchService; + + @Autowired + @Qualifier("sampleDataEntityClient") + protected EntityClient entityClient; + + @Autowired + @Qualifier("entityRegistry") + private EntityRegistry entityRegistry; + + @Test + public void initTest() { + assertNotNull(searchClient); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/SearchDAOOpenSearchTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/SearchDAOOpenSearchTest.java new file mode 100644 index 0000000000000..0b166975da0d1 --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/SearchDAOOpenSearchTest.java @@ -0,0 +1,33 @@ +package com.linkedin.metadata.search.opensearch; + +import com.linkedin.metadata.config.search.SearchConfiguration; +import com.linkedin.metadata.search.query.SearchDAOTestBase; +import com.linkedin.metadata.utils.elasticsearch.IndexConvention; +import io.datahubproject.test.fixtures.search.SampleDataFixtureConfiguration; +import io.datahubproject.test.search.config.SearchTestContainerConfiguration; +import lombok.Getter; +import org.opensearch.client.RestHighLevelClient; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.context.annotation.Import; +import org.testng.annotations.Test; + +import static org.testng.AssertJUnit.assertNotNull; + + +@Getter +@Import({OpenSearchSuite.class, SampleDataFixtureConfiguration.class, SearchTestContainerConfiguration.class}) +public class SearchDAOOpenSearchTest extends SearchDAOTestBase { + @Autowired + private RestHighLevelClient searchClient; + @Autowired + private SearchConfiguration searchConfiguration; + @Autowired + @Qualifier("sampleDataIndexConvention") + IndexConvention indexConvention; + + @Test + public void initTest() { + assertNotNull(searchClient); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/SearchServiceOpenSearchTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/SearchServiceOpenSearchTest.java new file mode 100644 index 0000000000000..8a55ba7b37ef9 --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/SearchServiceOpenSearchTest.java @@ -0,0 +1,65 @@ +package com.linkedin.metadata.search.opensearch; + +import com.linkedin.metadata.config.search.SearchConfiguration; +import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; +import com.linkedin.metadata.search.SearchServiceTestBase; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; +import io.datahubproject.test.search.config.SearchCommonTestConfiguration; +import io.datahubproject.test.search.config.SearchTestContainerConfiguration; +import org.jetbrains.annotations.NotNull; +import org.opensearch.client.RestHighLevelClient; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Import; +import org.testng.AssertJUnit; +import org.testng.annotations.Test; + +@Import({OpenSearchSuite.class, SearchCommonTestConfiguration.class, SearchTestContainerConfiguration.class}) +public class SearchServiceOpenSearchTest extends SearchServiceTestBase { + + @Autowired + private RestHighLevelClient _searchClient; + @Autowired + private ESBulkProcessor _bulkProcessor; + @Autowired + private ESIndexBuilder _esIndexBuilder; + @Autowired + private 
SearchConfiguration _searchConfiguration; + @Autowired + private CustomSearchConfiguration _customSearchConfiguration; + + @NotNull + @Override + protected RestHighLevelClient getSearchClient() { + return _searchClient; + } + + @NotNull + @Override + protected ESBulkProcessor getBulkProcessor() { + return _bulkProcessor; + } + + @NotNull + @Override + protected ESIndexBuilder getIndexBuilder() { + return _esIndexBuilder; + } + + @NotNull + @Override + protected SearchConfiguration getSearchConfiguration() { + return _searchConfiguration; + } + + @NotNull + @Override + protected CustomSearchConfiguration getCustomSearchConfiguration() { + return _customSearchConfiguration; + } + + @Test + public void initTest() { + AssertJUnit.assertNotNull(_searchClient); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/SystemMetadataServiceOpenSearchTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/SystemMetadataServiceOpenSearchTest.java new file mode 100644 index 0000000000000..f0bb8e1c12479 --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/SystemMetadataServiceOpenSearchTest.java @@ -0,0 +1,47 @@ +package com.linkedin.metadata.search.opensearch; + +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; +import com.linkedin.metadata.systemmetadata.SystemMetadataServiceTestBase; +import io.datahubproject.test.search.config.SearchTestContainerConfiguration; +import org.jetbrains.annotations.NotNull; +import org.opensearch.client.RestHighLevelClient; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Import; +import org.testng.AssertJUnit; +import org.testng.annotations.Test; + + +@Import({OpenSearchSuite.class, SearchTestContainerConfiguration.class}) +public class SystemMetadataServiceOpenSearchTest extends SystemMetadataServiceTestBase { + + @Autowired + private RestHighLevelClient _searchClient; + @Autowired + private ESBulkProcessor _bulkProcessor; + @Autowired + private ESIndexBuilder _esIndexBuilder; + + @NotNull + @Override + protected RestHighLevelClient getSearchClient() { + return _searchClient; + } + + @NotNull + @Override + protected ESBulkProcessor getBulkProcessor() { + return _bulkProcessor; + } + + @NotNull + @Override + protected ESIndexBuilder getIndexBuilder() { + return _esIndexBuilder; + } + + @Test + public void initTest() { + AssertJUnit.assertNotNull(_searchClient); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/TestEntityOpenSearchTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/TestEntityOpenSearchTest.java new file mode 100644 index 0000000000000..467f7fb43be1b --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/TestEntityOpenSearchTest.java @@ -0,0 +1,65 @@ +package com.linkedin.metadata.search.opensearch; + +import com.linkedin.metadata.config.search.SearchConfiguration; +import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; +import com.linkedin.metadata.search.TestEntityTestBase; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; +import io.datahubproject.test.search.config.SearchCommonTestConfiguration; +import io.datahubproject.test.search.config.SearchTestContainerConfiguration; +import 
org.jetbrains.annotations.NotNull; +import org.opensearch.client.RestHighLevelClient; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Import; +import org.testng.AssertJUnit; +import org.testng.annotations.Test; + +@Import({OpenSearchSuite.class, SearchCommonTestConfiguration.class, SearchTestContainerConfiguration.class}) +public class TestEntityOpenSearchTest extends TestEntityTestBase { + + @Autowired + private RestHighLevelClient _searchClient; + @Autowired + private ESBulkProcessor _bulkProcessor; + @Autowired + private ESIndexBuilder _esIndexBuilder; + @Autowired + private SearchConfiguration _searchConfiguration; + @Autowired + private CustomSearchConfiguration _customSearchConfiguration; + + @NotNull + @Override + protected RestHighLevelClient getSearchClient() { + return _searchClient; + } + + @NotNull + @Override + protected ESBulkProcessor getBulkProcessor() { + return _bulkProcessor; + } + + @NotNull + @Override + protected ESIndexBuilder getIndexBuilder() { + return _esIndexBuilder; + } + + @NotNull + @Override + protected SearchConfiguration getSearchConfiguration() { + return _searchConfiguration; + } + + @NotNull + @Override + protected CustomSearchConfiguration getCustomSearchConfiguration() { + return _customSearchConfiguration; + } + + @Test + public void initTest() { + AssertJUnit.assertNotNull(_searchClient); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/TimeseriesAspectServiceOpenSearchTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/TimeseriesAspectServiceOpenSearchTest.java new file mode 100644 index 0000000000000..3333b9f0942f5 --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/opensearch/TimeseriesAspectServiceOpenSearchTest.java @@ -0,0 +1,46 @@ +package com.linkedin.metadata.search.opensearch; + +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; +import com.linkedin.metadata.timeseries.search.TimeseriesAspectServiceTestBase; +import io.datahubproject.test.search.config.SearchTestContainerConfiguration; +import org.jetbrains.annotations.NotNull; +import org.opensearch.client.RestHighLevelClient; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Import; +import org.testng.AssertJUnit; +import org.testng.annotations.Test; + +@Import({OpenSearchSuite.class, SearchTestContainerConfiguration.class}) +public class TimeseriesAspectServiceOpenSearchTest extends TimeseriesAspectServiceTestBase { + + @Autowired + private RestHighLevelClient _searchClient; + @Autowired + private ESBulkProcessor _bulkProcessor; + @Autowired + private ESIndexBuilder _esIndexBuilder; + + @NotNull + @Override + protected RestHighLevelClient getSearchClient() { + return _searchClient; + } + + @NotNull + @Override + protected ESBulkProcessor getBulkProcessor() { + return _bulkProcessor; + } + + @NotNull + @Override + protected ESIndexBuilder getIndexBuilder() { + return _esIndexBuilder; + } + + @Test + public void initTest() { + AssertJUnit.assertNotNull(_searchClient); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAOTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/BrowseDAOTest.java similarity index 86% rename from 
metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAOTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/search/query/BrowseDAOTest.java index 0a5f71345751b..91e7747afb4a1 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAOTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/BrowseDAOTest.java @@ -1,7 +1,8 @@ -package com.linkedin.metadata.search.elasticsearch.query; +package com.linkedin.metadata.search.query; import com.linkedin.common.urn.Urn; -import com.linkedin.metadata.ESTestConfiguration; +import com.linkedin.metadata.search.elasticsearch.query.ESBrowseDAO; +import io.datahubproject.test.search.config.SearchCommonTestConfiguration; import com.linkedin.metadata.config.search.SearchConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; import com.linkedin.metadata.entity.TestEntityRegistry; @@ -11,11 +12,11 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.search.SearchHit; -import org.elasticsearch.search.SearchHits; +import org.opensearch.action.search.SearchResponse; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.RestHighLevelClient; +import org.opensearch.search.SearchHit; +import org.opensearch.search.SearchHits; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.annotation.Import; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; @@ -28,8 +29,8 @@ import static org.mockito.Mockito.when; import static org.testng.Assert.assertEquals; -@Import(ESTestConfiguration.class) -public class ESBrowseDAOTest extends AbstractTestNGSpringContextTests { +@Import(SearchCommonTestConfiguration.class) +public class BrowseDAOTest extends AbstractTestNGSpringContextTests { private RestHighLevelClient _mockClient; private ESBrowseDAO _browseDAO; diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/SearchDAOTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/SearchDAOTestBase.java new file mode 100644 index 0000000000000..2dbc142d45071 --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/SearchDAOTestBase.java @@ -0,0 +1,307 @@ +package com.linkedin.metadata.search.query; + +import com.datahub.test.Snapshot; +import com.google.common.collect.ImmutableList; +import com.linkedin.data.template.LongMap; +import com.linkedin.data.template.StringArray; +import com.linkedin.metadata.config.search.SearchConfiguration; +import com.linkedin.metadata.models.registry.EntityRegistry; +import com.linkedin.metadata.models.registry.SnapshotEntityRegistry; +import com.linkedin.metadata.query.filter.Condition; +import com.linkedin.metadata.query.filter.ConjunctiveCriterion; +import com.linkedin.metadata.query.filter.ConjunctiveCriterionArray; +import com.linkedin.metadata.query.filter.Criterion; +import com.linkedin.metadata.query.filter.CriterionArray; +import com.linkedin.metadata.query.filter.Filter; +import com.linkedin.metadata.search.AggregationMetadata; +import com.linkedin.metadata.search.AggregationMetadataArray; +import com.linkedin.metadata.search.FilterValueArray; +import com.linkedin.metadata.search.SearchEntityArray; +import 
com.linkedin.metadata.search.SearchResult; +import com.linkedin.metadata.search.SearchResultMetadata; +import com.linkedin.metadata.search.elasticsearch.query.ESSearchDAO; +import com.linkedin.metadata.utils.SearchUtil; +import com.linkedin.metadata.utils.elasticsearch.IndexConvention; +import org.opensearch.client.RestHighLevelClient; +import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import static com.linkedin.metadata.Constants.ELASTICSEARCH_IMPLEMENTATION_ELASTICSEARCH; +import static com.linkedin.metadata.utils.SearchUtil.AGGREGATION_SEPARATOR_CHAR; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotEquals; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.fail; + +abstract public class SearchDAOTestBase extends AbstractTestNGSpringContextTests { + + abstract protected RestHighLevelClient getSearchClient(); + + abstract protected SearchConfiguration getSearchConfiguration(); + + abstract protected IndexConvention getIndexConvention(); + + EntityRegistry _entityRegistry = new SnapshotEntityRegistry(new Snapshot()); + + + @Test + public void testTransformFilterForEntitiesNoChange() { + Criterion c = new Criterion().setValue("urn:li:tag:abc").setValues( + new StringArray(ImmutableList.of("urn:li:tag:abc", "urn:li:tag:def")) + ).setNegated(false).setCondition(Condition.EQUAL).setField("tags.keyword"); + + Filter f = new Filter().setOr( + new ConjunctiveCriterionArray(new ConjunctiveCriterion().setAnd(new CriterionArray(c)))); + + Filter transformedFilter = SearchUtil.transformFilterForEntities(f, getIndexConvention()); + assertEquals(f, transformedFilter); + } + + @Test + public void testTransformFilterForEntitiesNullFilter() { + Filter transformedFilter = SearchUtil.transformFilterForEntities(null, getIndexConvention()); + assertNotNull(getIndexConvention()); + assertEquals(null, transformedFilter); + } + + @Test + public void testTransformFilterForEntitiesWithChanges() { + + Criterion c = new Criterion().setValue("dataset").setValues( + new StringArray(ImmutableList.of("dataset")) + ).setNegated(false).setCondition(Condition.EQUAL).setField("_entityType"); + + Filter f = new Filter().setOr( + new ConjunctiveCriterionArray(new ConjunctiveCriterion().setAnd(new CriterionArray(c)))); + Filter originalF = null; + try { + originalF = f.copy(); + } catch (CloneNotSupportedException e) { + fail(e.getMessage()); + } + assertEquals(f, originalF); + + Filter transformedFilter = SearchUtil.transformFilterForEntities(f, getIndexConvention()); + assertNotEquals(originalF, transformedFilter); + + Criterion expectedNewCriterion = new Criterion().setValue("smpldat_datasetindex_v2").setValues( + new StringArray(ImmutableList.of("smpldat_datasetindex_v2")) + ).setNegated(false).setCondition(Condition.EQUAL).setField("_index"); + + Filter expectedNewFilter = new Filter().setOr( + new ConjunctiveCriterionArray(new ConjunctiveCriterion().setAnd(new CriterionArray(expectedNewCriterion)))); + + assertEquals(expectedNewFilter, transformedFilter); + } + + @Test + public void testTransformFilterForEntitiesWithUnderscore() { + + Criterion c = new Criterion().setValue("data_job").setValues( + new StringArray(ImmutableList.of("data_job")) + ).setNegated(false).setCondition(Condition.EQUAL).setField("_entityType"); + + Filter f = new Filter().setOr( + new 
ConjunctiveCriterionArray(new ConjunctiveCriterion().setAnd(new CriterionArray(c)))); + Filter originalF = null; + try { + originalF = f.copy(); + } catch (CloneNotSupportedException e) { + fail(e.getMessage()); + } + assertEquals(f, originalF); + + Filter transformedFilter = SearchUtil.transformFilterForEntities(f, getIndexConvention()); + assertNotEquals(originalF, transformedFilter); + + Criterion expectedNewCriterion = new Criterion().setValue("smpldat_datajobindex_v2").setValues( + new StringArray(ImmutableList.of("smpldat_datajobindex_v2")) + ).setNegated(false).setCondition(Condition.EQUAL).setField("_index"); + + Filter expectedNewFilter = new Filter().setOr( + new ConjunctiveCriterionArray(new ConjunctiveCriterion().setAnd(new CriterionArray(expectedNewCriterion)))); + + assertEquals(transformedFilter, expectedNewFilter); + } + + @Test + public void testTransformFilterForEntitiesWithSomeChanges() { + + Criterion criterionChanged = new Criterion().setValue("dataset").setValues( + new StringArray(ImmutableList.of("dataset")) + ).setNegated(false).setCondition(Condition.EQUAL).setField("_entityType"); + Criterion criterionUnchanged = new Criterion().setValue("urn:li:tag:abc").setValues( + new StringArray(ImmutableList.of("urn:li:tag:abc", "urn:li:tag:def")) + ).setNegated(false).setCondition(Condition.EQUAL).setField("tags.keyword"); + + Filter f = new Filter().setOr( + new ConjunctiveCriterionArray(new ConjunctiveCriterion().setAnd(new CriterionArray(criterionChanged, criterionUnchanged)))); + Filter originalF = null; + try { + originalF = f.copy(); + } catch (CloneNotSupportedException e) { + fail(e.getMessage()); + } + assertEquals(f, originalF); + + Filter transformedFilter = SearchUtil.transformFilterForEntities(f, getIndexConvention()); + assertNotEquals(originalF, transformedFilter); + + Criterion expectedNewCriterion = new Criterion().setValue("smpldat_datasetindex_v2").setValues( + new StringArray(ImmutableList.of("smpldat_datasetindex_v2")) + ).setNegated(false).setCondition(Condition.EQUAL).setField("_index"); + + Filter expectedNewFilter = new Filter().setOr( + new ConjunctiveCriterionArray(new ConjunctiveCriterion().setAnd(new CriterionArray(expectedNewCriterion, criterionUnchanged)))); + + assertEquals(expectedNewFilter, transformedFilter); + } + + @Test + public void testTransformIndexIntoEntityNameSingle() { + ESSearchDAO searchDAO = new ESSearchDAO(_entityRegistry, getSearchClient(), getIndexConvention(), false, + ELASTICSEARCH_IMPLEMENTATION_ELASTICSEARCH, getSearchConfiguration(), null); + // Empty aggregations + final SearchResultMetadata searchResultMetadata = + new SearchResultMetadata().setAggregations(new AggregationMetadataArray()); + SearchResult result = new SearchResult().setEntities(new SearchEntityArray(new ArrayList<>())) + .setMetadata(searchResultMetadata) + .setFrom(0) + .setPageSize(100) + .setNumEntities(30); + SearchResult expectedResult = null; + try { + expectedResult = result.copy(); + } catch (CloneNotSupportedException e) { + fail(e.getMessage()); + } + assertEquals(expectedResult, searchDAO.transformIndexIntoEntityName(result)); + + // one facet, do not transform + Map aggMap = Map.of("urn:li:corpuser:datahub", Long.valueOf(3)); + + List aggregationMetadataList = new ArrayList<>(); + aggregationMetadataList.add(new AggregationMetadata().setName("owners") + .setDisplayName("Owned by") + .setAggregations(new LongMap(aggMap)) + .setFilterValues(new FilterValueArray(SearchUtil.convertToFilters(aggMap, Collections.emptySet()))) + ); + 
searchResultMetadata.setAggregations(new AggregationMetadataArray(aggregationMetadataList)); + result.setMetadata(searchResultMetadata); + + try { + expectedResult = result.copy(); + } catch (CloneNotSupportedException e) { + fail(e.getMessage()); + } + assertEquals(searchDAO.transformIndexIntoEntityName(result), expectedResult); + + // one facet, transform + Map entityTypeMap = Map.of("smpldat_datasetindex_v2", Long.valueOf(3)); + + aggregationMetadataList = List.of(new AggregationMetadata().setName("_entityType") + .setDisplayName("Type") + .setAggregations(new LongMap(entityTypeMap)) + .setFilterValues(new FilterValueArray(SearchUtil.convertToFilters(entityTypeMap, Collections.emptySet()))) + ); + searchResultMetadata.setAggregations(new AggregationMetadataArray(aggregationMetadataList)); + result.setMetadata(searchResultMetadata); + + Map expectedEntityTypeMap = Map.of("dataset", Long.valueOf(3)); + + List expectedAggregationMetadataList = List.of( + new AggregationMetadata().setName("_entityType") + .setDisplayName("Type") + .setAggregations(new LongMap(expectedEntityTypeMap)) + .setFilterValues(new FilterValueArray(SearchUtil.convertToFilters(expectedEntityTypeMap, Collections.emptySet()))) + ); + expectedResult.setMetadata(new SearchResultMetadata().setAggregations(new AggregationMetadataArray(expectedAggregationMetadataList))); + assertEquals(searchDAO.transformIndexIntoEntityName(result), expectedResult); + } + + @Test + public void testTransformIndexIntoEntityNameNested() { + ESSearchDAO searchDAO = new ESSearchDAO(_entityRegistry, getSearchClient(), getIndexConvention(), false, + ELASTICSEARCH_IMPLEMENTATION_ELASTICSEARCH, getSearchConfiguration(), null); + // One nested facet + Map entityTypeMap = Map.of( + String.format("smpldat_datasetindex_v2%surn:li:corpuser:datahub", AGGREGATION_SEPARATOR_CHAR), Long.valueOf(3), + String.format("smpldat_datasetindex_v2%surn:li:corpuser:bfoo", AGGREGATION_SEPARATOR_CHAR), Long.valueOf(7), + "smpldat_datasetindex_v2", Long.valueOf(20) + ); + List aggregationMetadataList = List.of(new AggregationMetadata().setName("_entityType␞owners") + .setDisplayName("Type␞Owned By") + .setAggregations(new LongMap(entityTypeMap)) + .setFilterValues(new FilterValueArray(SearchUtil.convertToFilters(entityTypeMap, Collections.emptySet()))) + ); + SearchResult result = new SearchResult().setEntities(new SearchEntityArray(new ArrayList<>())) + .setMetadata(new SearchResultMetadata().setAggregations( + new AggregationMetadataArray(aggregationMetadataList) + )) + .setFrom(0) + .setPageSize(100) + .setNumEntities(50); + + Map expectedEntityTypeMap = Map.of( + String.format("dataset%surn:li:corpuser:datahub", AGGREGATION_SEPARATOR_CHAR), Long.valueOf(3), + String.format("dataset%surn:li:corpuser:bfoo", AGGREGATION_SEPARATOR_CHAR), Long.valueOf(7), + "dataset", Long.valueOf(20) + ); + + List expectedAggregationMetadataList = List.of(new AggregationMetadata().setName("_entityType␞owners") + .setDisplayName("Type␞Owned By") + .setAggregations(new LongMap(expectedEntityTypeMap)) + .setFilterValues(new FilterValueArray(SearchUtil.convertToFilters(expectedEntityTypeMap, Collections.emptySet()))) + ); + SearchResult expectedResult = new SearchResult().setEntities(new SearchEntityArray(new ArrayList<>())) + .setMetadata(new SearchResultMetadata().setAggregations( + new AggregationMetadataArray(expectedAggregationMetadataList))) + .setFrom(0) + .setPageSize(100) + .setNumEntities(50); + assertEquals(searchDAO.transformIndexIntoEntityName(result), expectedResult); + + // One 
nested facet, opposite order + entityTypeMap = Map.of( + String.format("urn:li:corpuser:datahub%ssmpldat_datasetindex_v2", AGGREGATION_SEPARATOR_CHAR), Long.valueOf(3), + String.format("urn:li:corpuser:datahub%ssmpldat_chartindex_v2", AGGREGATION_SEPARATOR_CHAR), Long.valueOf(7), + "urn:li:corpuser:datahub", Long.valueOf(20) + ); + aggregationMetadataList = List.of(new AggregationMetadata().setName("owners␞_entityType") + .setDisplayName("Owned By␞Type") + .setAggregations(new LongMap(entityTypeMap)) + .setFilterValues(new FilterValueArray(SearchUtil.convertToFilters(entityTypeMap, Collections.emptySet()))) + ); + result = new SearchResult().setEntities(new SearchEntityArray(new ArrayList<>())) + .setMetadata(new SearchResultMetadata().setAggregations( + new AggregationMetadataArray(aggregationMetadataList) + )) + .setFrom(0) + .setPageSize(100) + .setNumEntities(50); + + expectedEntityTypeMap = Map.of( + String.format("urn:li:corpuser:datahub%sdataset", AGGREGATION_SEPARATOR_CHAR), Long.valueOf(3), + String.format("urn:li:corpuser:datahub%schart", AGGREGATION_SEPARATOR_CHAR), Long.valueOf(7), + "urn:li:corpuser:datahub", Long.valueOf(20) + ); + + expectedAggregationMetadataList = List.of(new AggregationMetadata().setName("owners␞_entityType") + .setDisplayName("Owned By␞Type") + .setAggregations(new LongMap(expectedEntityTypeMap)) + .setFilterValues(new FilterValueArray(SearchUtil.convertToFilters(expectedEntityTypeMap, Collections.emptySet()))) + ); + expectedResult = new SearchResult().setEntities(new SearchEntityArray(new ArrayList<>())) + .setMetadata(new SearchResultMetadata().setAggregations( + new AggregationMetadataArray(expectedAggregationMetadataList))) + .setFrom(0) + .setPageSize(100) + .setNumEntities(50); + assertEquals(searchDAO.transformIndexIntoEntityName(result), expectedResult); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java similarity index 94% rename from metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilderTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java index 36c8bb8f9a676..66e7b62741f4c 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java @@ -1,4 +1,4 @@ -package com.linkedin.metadata.search.elasticsearch.query.request; +package com.linkedin.metadata.search.query.request; import com.google.common.collect.ImmutableSet; import com.linkedin.metadata.config.search.SearchConfiguration; @@ -9,7 +9,9 @@ import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; -import org.elasticsearch.search.aggregations.AggregationBuilder; + +import com.linkedin.metadata.search.elasticsearch.query.request.AggregationQueryBuilder; +import org.opensearch.search.aggregations.AggregationBuilder; import org.testng.Assert; import org.testng.annotations.Test; diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/AutocompleteRequestHandlerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AutocompleteRequestHandlerTest.java similarity index 88% rename from 
metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/AutocompleteRequestHandlerTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AutocompleteRequestHandlerTest.java index be91cb0288950..34b98f38254cd 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/AutocompleteRequestHandlerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AutocompleteRequestHandlerTest.java @@ -1,15 +1,17 @@ -package com.linkedin.metadata.search.elasticsearch.query.request; +package com.linkedin.metadata.search.query.request; import com.linkedin.metadata.TestEntitySpecBuilder; import java.util.List; import java.util.Map; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.MatchPhrasePrefixQueryBuilder; -import org.elasticsearch.index.query.MatchQueryBuilder; -import org.elasticsearch.index.query.MultiMatchQueryBuilder; -import org.elasticsearch.search.builder.SearchSourceBuilder; -import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder; + +import com.linkedin.metadata.search.elasticsearch.query.request.AutocompleteRequestHandler; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.query.MatchPhrasePrefixQueryBuilder; +import org.opensearch.index.query.MatchQueryBuilder; +import org.opensearch.index.query.MultiMatchQueryBuilder; +import org.opensearch.search.builder.SearchSourceBuilder; +import org.opensearch.search.fetch.subphase.highlight.HighlightBuilder; import org.testng.annotations.Test; import static org.testng.Assert.assertEquals; diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/CustomizedQueryHandlerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/CustomizedQueryHandlerTest.java similarity index 93% rename from metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/CustomizedQueryHandlerTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/search/query/request/CustomizedQueryHandlerTest.java index 3dad9c59c6b53..6b6664ffdf30e 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/CustomizedQueryHandlerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/CustomizedQueryHandlerTest.java @@ -1,4 +1,4 @@ -package com.linkedin.metadata.search.elasticsearch.query.request; +package com.linkedin.metadata.search.query.request; import com.linkedin.metadata.config.search.CustomConfiguration; import com.linkedin.metadata.config.search.SearchConfiguration; @@ -7,12 +7,14 @@ import com.linkedin.metadata.config.search.custom.QueryConfiguration; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.dataformat.yaml.YAMLMapper; -import org.elasticsearch.common.lucene.search.function.CombineFunction; -import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery; -import org.elasticsearch.index.query.MatchAllQueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.index.query.functionscore.FunctionScoreQueryBuilder; -import org.elasticsearch.index.query.functionscore.ScoreFunctionBuilders; +import com.linkedin.metadata.search.elasticsearch.query.request.CustomizedQueryHandler; +import 
com.linkedin.metadata.search.elasticsearch.query.request.SearchQueryBuilder; +import org.opensearch.common.lucene.search.function.CombineFunction; +import org.opensearch.common.lucene.search.function.FunctionScoreQuery; +import org.opensearch.index.query.MatchAllQueryBuilder; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.index.query.functionscore.FunctionScoreQueryBuilder; +import org.opensearch.index.query.functionscore.ScoreFunctionBuilders; import org.testng.annotations.Test; import java.io.IOException; diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchQueryBuilderTest.java similarity index 95% rename from metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchQueryBuilderTest.java index 8e73b0ceeae8d..9c0815efdc8b4 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchQueryBuilderTest.java @@ -1,8 +1,10 @@ -package com.linkedin.metadata.search.elasticsearch.query.request; +package com.linkedin.metadata.search.query.request; import com.linkedin.data.schema.DataSchema; import com.linkedin.data.schema.PathSpec; -import com.linkedin.metadata.ESTestConfiguration; +import com.linkedin.metadata.search.elasticsearch.query.request.SearchFieldConfig; +import com.linkedin.metadata.search.elasticsearch.query.request.SearchQueryBuilder; +import io.datahubproject.test.search.config.SearchCommonTestConfiguration; import com.linkedin.metadata.config.search.CustomConfiguration; import com.linkedin.metadata.config.search.ExactMatchConfiguration; import com.linkedin.metadata.config.search.PartialConfiguration; @@ -26,15 +28,15 @@ import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.util.Pair; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.MatchAllQueryBuilder; -import org.elasticsearch.index.query.MatchPhrasePrefixQueryBuilder; -import org.elasticsearch.index.query.MatchPhraseQueryBuilder; -import org.elasticsearch.index.query.QueryBuilder; -import org.elasticsearch.index.query.QueryStringQueryBuilder; -import org.elasticsearch.index.query.SimpleQueryStringBuilder; -import org.elasticsearch.index.query.TermQueryBuilder; -import org.elasticsearch.index.query.functionscore.FunctionScoreQueryBuilder; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.query.MatchAllQueryBuilder; +import org.opensearch.index.query.MatchPhrasePrefixQueryBuilder; +import org.opensearch.index.query.MatchPhraseQueryBuilder; +import org.opensearch.index.query.QueryBuilder; +import org.opensearch.index.query.QueryStringQueryBuilder; +import org.opensearch.index.query.SimpleQueryStringBuilder; +import org.opensearch.index.query.TermQueryBuilder; +import org.opensearch.index.query.functionscore.FunctionScoreQueryBuilder; import org.mockito.Mockito; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.annotation.Import; @@ -50,7 +52,7 @@ import static org.testng.Assert.assertNull; import static org.testng.Assert.assertTrue; -@Import(ESTestConfiguration.class) +@Import(SearchCommonTestConfiguration.class) public class 
SearchQueryBuilderTest extends AbstractTestNGSpringContextTests { @Autowired diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java similarity index 95% rename from metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java index db56e2d34881b..90c6c523c588f 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java @@ -1,11 +1,12 @@ -package com.linkedin.metadata.search.elasticsearch.query.request; +package com.linkedin.metadata.search.query.request; import com.linkedin.metadata.config.search.ExactMatchConfiguration; import com.linkedin.metadata.config.search.PartialConfiguration; import com.linkedin.metadata.config.search.SearchConfiguration; import com.google.common.collect.ImmutableList; import com.linkedin.data.template.StringArray; -import com.linkedin.metadata.ESTestConfiguration; +import com.linkedin.metadata.search.elasticsearch.query.request.SearchRequestHandler; +import io.datahubproject.test.search.config.SearchCommonTestConfiguration; import com.linkedin.metadata.TestEntitySpecBuilder; import com.linkedin.metadata.config.search.WordGramConfiguration; import java.util.ArrayList; @@ -28,17 +29,17 @@ import com.linkedin.metadata.query.filter.Criterion; import com.linkedin.metadata.query.filter.CriterionArray; import com.linkedin.metadata.query.filter.Filter; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.ExistsQueryBuilder; -import org.elasticsearch.index.query.MatchQueryBuilder; -import org.elasticsearch.index.query.MultiMatchQueryBuilder; -import org.elasticsearch.index.query.TermsQueryBuilder; -import org.elasticsearch.search.aggregations.AggregationBuilder; -import org.elasticsearch.search.aggregations.AggregationBuilders; -import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder; -import org.elasticsearch.search.builder.SearchSourceBuilder; -import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.query.ExistsQueryBuilder; +import org.opensearch.index.query.MatchQueryBuilder; +import org.opensearch.index.query.MultiMatchQueryBuilder; +import org.opensearch.index.query.TermsQueryBuilder; +import org.opensearch.search.aggregations.AggregationBuilder; +import org.opensearch.search.aggregations.AggregationBuilders; +import org.opensearch.search.aggregations.bucket.terms.TermsAggregationBuilder; +import org.opensearch.search.builder.SearchSourceBuilder; +import org.opensearch.search.fetch.subphase.highlight.HighlightBuilder; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.annotation.Import; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; @@ -48,7 +49,7 @@ import static org.testng.Assert.*; -@Import(ESTestConfiguration.class) +@Import(SearchCommonTestConfiguration.class) public class SearchRequestHandlerTest extends 
AbstractTestNGSpringContextTests { @Autowired private EntityRegistry entityRegistry; diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/utils/ESUtilsTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/utils/ESUtilsTest.java index 4f364c246818f..ddd75a152c333 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/utils/ESUtilsTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/utils/ESUtilsTest.java @@ -4,7 +4,7 @@ import com.linkedin.data.template.StringArray; import com.linkedin.metadata.query.filter.Condition; import com.linkedin.metadata.query.filter.Criterion; -import org.elasticsearch.index.query.QueryBuilder; +import org.opensearch.index.query.QueryBuilder; import org.testng.Assert; import org.testng.annotations.Test; diff --git a/metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/SystemMetadataServiceTestBase.java similarity index 84% rename from metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataServiceTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/SystemMetadataServiceTestBase.java index 6e116df5b2906..e6a9bd7d198f7 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/SystemMetadataServiceTestBase.java @@ -1,6 +1,5 @@ package com.linkedin.metadata.systemmetadata; -import com.linkedin.metadata.ESTestConfiguration; import com.linkedin.metadata.run.AspectRowSummary; import com.linkedin.metadata.run.IngestionRunSummary; import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; @@ -9,9 +8,7 @@ import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; import com.linkedin.mxe.SystemMetadata; -import org.elasticsearch.client.RestHighLevelClient; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.context.annotation.Import; +import org.opensearch.client.RestHighLevelClient; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeMethod; @@ -20,18 +17,20 @@ import javax.annotation.Nonnull; import java.util.List; -import static com.linkedin.metadata.ESTestConfiguration.syncAfterWrite; +import static io.datahubproject.test.search.SearchTestUtils.syncAfterWrite; import static org.testng.Assert.assertEquals; -@Import(ESTestConfiguration.class) -public class ElasticSearchSystemMetadataServiceTest extends AbstractTestNGSpringContextTests { +abstract public class SystemMetadataServiceTestBase extends AbstractTestNGSpringContextTests { + + @Nonnull + abstract protected RestHighLevelClient getSearchClient(); + + @Nonnull + abstract protected ESBulkProcessor getBulkProcessor(); + + @Nonnull + abstract protected ESIndexBuilder getIndexBuilder(); - @Autowired - private RestHighLevelClient _searchClient; - @Autowired - private ESBulkProcessor _bulkProcessor; - @Autowired - private ESIndexBuilder _esIndexBuilder; private final IndexConvention _indexConvention = new IndexConventionImpl("es_system_metadata_service_test"); private ElasticSearchSystemMetadataService _client; @@ -49,8 +48,8 @@ public void wipe() throws Exception { @Nonnull private ElasticSearchSystemMetadataService 
buildService() { - ESSystemMetadataDAO dao = new ESSystemMetadataDAO(_searchClient, _indexConvention, _bulkProcessor, 1); - return new ElasticSearchSystemMetadataService(_bulkProcessor, _indexConvention, dao, _esIndexBuilder); + ESSystemMetadataDAO dao = new ESSystemMetadataDAO(getSearchClient(), _indexConvention, getBulkProcessor(), 1); + return new ElasticSearchSystemMetadataService(getBulkProcessor(), _indexConvention, dao, getIndexBuilder()); } @Test @@ -70,7 +69,7 @@ public void testListRuns() throws Exception { _client.insert(metadata2, "urn:li:chart:2", "chartKey"); _client.insert(metadata2, "urn:li:chart:2", "Ownership"); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); List runs = _client.listRuns(0, 20, false); @@ -99,7 +98,7 @@ public void testOverwriteRuns() throws Exception { _client.insert(metadata2, "urn:li:chart:2", "chartKey"); _client.insert(metadata2, "urn:li:chart:2", "Ownership"); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); List runs = _client.listRuns(0, 20, false); @@ -128,7 +127,7 @@ public void testFindByRunId() throws Exception { _client.insert(metadata2, "urn:li:chart:2", "chartKey"); _client.insert(metadata2, "urn:li:chart:2", "Ownership"); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); List rows = _client.findByRunId("abc-456", false, 0, ESUtils.MAX_RESULT_SIZE); @@ -156,11 +155,11 @@ public void testDelete() throws Exception { _client.insert(metadata2, "urn:li:chart:2", "chartKey"); _client.insert(metadata2, "urn:li:chart:2", "Ownership"); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); _client.deleteUrn("urn:li:chart:1"); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); List rows = _client.findByRunId("abc-456", false, 0, ESUtils.MAX_RESULT_SIZE); @@ -172,7 +171,7 @@ public void testDelete() throws Exception { public void testInsertNullData() throws Exception { _client.insert(null, "urn:li:chart:1", "chartKey"); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); List runs = _client.listRuns(0, 20, false); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/timeline/EbeanTimelineServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/timeline/EbeanTimelineServiceTest.java index 2703dd7fe6cbe..9e89328715510 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/timeline/EbeanTimelineServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/timeline/EbeanTimelineServiceTest.java @@ -27,7 +27,7 @@ public EbeanTimelineServiceTest() throws EntityRegistryException { @BeforeMethod public void setupTest() { - Database server = EbeanTestUtils.createTestServer(); + Database server = EbeanTestUtils.createTestServer(EbeanTimelineServiceTest.class.getSimpleName()); _aspectDao = new EbeanAspectDao(server); _aspectDao.setConnectionValidated(true); _entityTimelineService = new TimelineServiceImpl(_aspectDao, _testEntityRegistry); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceTestBase.java similarity index 97% rename from metadata-io/src/test/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectServiceTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceTestBase.java index d65234bf89d49..cc60ba8679e1f 100644 --- 
a/metadata-io/src/test/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceTestBase.java @@ -1,4 +1,4 @@ -package com.linkedin.metadata.timeseries.elastic; +package com.linkedin.metadata.timeseries.search; import com.datahub.test.BatchType; import com.datahub.test.ComplexNestedRecord; @@ -16,7 +16,6 @@ import com.linkedin.data.template.StringArrayArray; import com.linkedin.data.template.StringMap; import com.linkedin.data.template.StringMapArray; -import com.linkedin.metadata.ESTestConfiguration; import com.linkedin.metadata.aspect.EnvelopedAspect; import com.linkedin.metadata.models.AspectSpec; import com.linkedin.metadata.models.DataSchemaFactory; @@ -32,6 +31,7 @@ import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import com.linkedin.metadata.search.utils.QueryUtils; +import com.linkedin.metadata.timeseries.elastic.ElasticSearchTimeseriesAspectService; import com.linkedin.metadata.timeseries.elastic.indexbuilder.TimeseriesAspectIndexBuilders; import com.linkedin.metadata.timeseries.transformer.TimeseriesAspectTransformer; import com.linkedin.metadata.utils.GenericRecordUtils; @@ -45,9 +45,7 @@ import com.linkedin.timeseries.GroupingBucket; import com.linkedin.timeseries.GroupingBucketType; import com.linkedin.timeseries.TimeWindowSize; -import org.elasticsearch.client.RestHighLevelClient; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.context.annotation.Import; +import org.opensearch.client.RestHighLevelClient; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -60,15 +58,15 @@ import java.util.stream.Collectors; import java.util.stream.Stream; -import static com.linkedin.metadata.Constants.*; -import static com.linkedin.metadata.ESTestConfiguration.syncAfterWrite; +import static com.linkedin.metadata.Constants.INGESTION_MAX_SERIALIZED_STRING_LENGTH; +import static com.linkedin.metadata.Constants.MAX_JACKSON_STRING_SIZE; +import static io.datahubproject.test.search.SearchTestUtils.syncAfterWrite; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertNotNull; import static org.testng.Assert.assertTrue; import static org.testng.Assert.fail; -@Import(ESTestConfiguration.class) -public class ElasticSearchTimeseriesAspectServiceTest extends AbstractTestNGSpringContextTests { +abstract public class TimeseriesAspectServiceTestBase extends AbstractTestNGSpringContextTests { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); static { int maxSize = Integer.parseInt(System.getenv().getOrDefault(INGESTION_MAX_SERIALIZED_STRING_LENGTH, MAX_JACKSON_STRING_SIZE)); @@ -85,12 +83,15 @@ public class ElasticSearchTimeseriesAspectServiceTest extends AbstractTestNGSpri private static final String ES_FIELD_TIMESTAMP = "timestampMillis"; private static final String ES_FIELD_STAT = "stat"; - @Autowired - private RestHighLevelClient _searchClient; - @Autowired - private ESBulkProcessor _bulkProcessor; - @Autowired - private ESIndexBuilder _esIndexBuilder; + @Nonnull + abstract protected RestHighLevelClient getSearchClient(); + + @Nonnull + abstract protected ESBulkProcessor getBulkProcessor(); + + @Nonnull + abstract protected ESIndexBuilder getIndexBuilder(); + private 
EntityRegistry _entityRegistry; private IndexConvention _indexConvention; private ElasticSearchTimeseriesAspectService _elasticSearchTimeseriesAspectService; @@ -116,9 +117,9 @@ public void setup() { @Nonnull private ElasticSearchTimeseriesAspectService buildService() { - return new ElasticSearchTimeseriesAspectService(_searchClient, _indexConvention, - new TimeseriesAspectIndexBuilders(_esIndexBuilder, _entityRegistry, - _indexConvention), _entityRegistry, _bulkProcessor, 1); + return new ElasticSearchTimeseriesAspectService(getSearchClient(), _indexConvention, + new TimeseriesAspectIndexBuilders(getIndexBuilder(), _entityRegistry, + _indexConvention), _entityRegistry, getBulkProcessor(), 1); } /* @@ -190,7 +191,7 @@ public void testUpsertProfiles() throws Exception { } }); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); } @Test(groups = "upsertUniqueMessageId") @@ -216,7 +217,7 @@ public void testUpsertProfilesWithUniqueMessageIds() throws Exception { } }); - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); List resultAspects = _elasticSearchTimeseriesAspectService.getAspectValues(urn, ENTITY_NAME, ASPECT_NAME, null, null, @@ -860,7 +861,7 @@ public void testCountByFilter() { @Test(groups = {"testCountAfterDelete"}, dependsOnGroups = {"deleteAspectValues1"}) public void testCountByFilterAfterDelete() throws InterruptedException { - syncAfterWrite(_bulkProcessor); + syncAfterWrite(getBulkProcessor()); // Test with filter Criterion hasUrnCriterion = new Criterion().setField("urn").setCondition(Condition.EQUAL).setValue(TEST_URN.toString()); diff --git a/metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/Utils.java b/metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/Utils.java deleted file mode 100644 index f96a6c50af33d..0000000000000 --- a/metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/Utils.java +++ /dev/null @@ -1,22 +0,0 @@ -package io.datahub.test.fixtures.elasticsearch; - -import com.fasterxml.jackson.core.StreamReadConstraints; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; - -import static com.linkedin.metadata.Constants.*; - - -public class Utils { - private Utils() { - - } - final public static String FIXTURE_BASE = "src/test/resources/elasticsearch"; - - final public static ObjectMapper OBJECT_MAPPER = new ObjectMapper() - .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - static { - int maxSize = Integer.parseInt(System.getenv().getOrDefault(INGESTION_MAX_SERIALIZED_STRING_LENGTH, MAX_JACKSON_STRING_SIZE)); - OBJECT_MAPPER.getFactory().setStreamReadConstraints(StreamReadConstraints.builder().maxStringLength(maxSize).build()); - } -} diff --git a/metadata-io/src/test/java/io/datahub/test/DataGenerator.java b/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java similarity index 99% rename from metadata-io/src/test/java/io/datahub/test/DataGenerator.java rename to metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java index 3b374993cde16..cfa9c1258583d 100644 --- a/metadata-io/src/test/java/io/datahub/test/DataGenerator.java +++ b/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java @@ -1,4 +1,4 @@ -package io.datahub.test; +package io.datahubproject.test; import com.linkedin.common.AuditStamp; import com.linkedin.common.GlossaryTermAssociation; @@ -111,7 +111,8 @@ public Stream> generateMCPs(String entityName, long }).map(mcp -> { // Expand with default 
aspects per normal return Stream.concat(Stream.of(mcp), - AspectUtils.getAdditionalChanges(mcp, entityService, true).stream()).collect(Collectors.toList()); + AspectUtils.getAdditionalChanges(mcp, entityService, true).stream()) + .collect(Collectors.toList()); }); } diff --git a/metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/EntityExporter.java b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/EntityExporter.java similarity index 81% rename from metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/EntityExporter.java rename to metadata-io/src/test/java/io/datahubproject/test/fixtures/search/EntityExporter.java index 5c34b9f549d9f..18fbf86f8668d 100644 --- a/metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/EntityExporter.java +++ b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/EntityExporter.java @@ -1,15 +1,15 @@ -package io.datahub.test.fixtures.elasticsearch; +package io.datahubproject.test.fixtures.search; import lombok.Builder; import lombok.NonNull; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.client.indices.GetMappingsRequest; -import org.elasticsearch.client.indices.GetMappingsResponse; -import org.elasticsearch.search.builder.SearchSourceBuilder; -import org.elasticsearch.search.sort.SortBuilders; -import org.elasticsearch.search.sort.SortOrder; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.RestHighLevelClient; +import org.opensearch.client.indices.GetMappingsRequest; +import org.opensearch.client.indices.GetMappingsResponse; +import org.opensearch.search.builder.SearchSourceBuilder; +import org.opensearch.search.sort.SortBuilders; +import org.opensearch.search.sort.SortOrder; import java.io.IOException; import java.util.Set; diff --git a/metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/FixtureReader.java b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/FixtureReader.java similarity index 93% rename from metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/FixtureReader.java rename to metadata-io/src/test/java/io/datahubproject/test/fixtures/search/FixtureReader.java index a0c551b28b507..1b804a2346883 100644 --- a/metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/FixtureReader.java +++ b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/FixtureReader.java @@ -1,12 +1,12 @@ -package io.datahub.test.fixtures.elasticsearch; +package io.datahubproject.test.fixtures.search; import com.fasterxml.jackson.core.JsonProcessingException; import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import lombok.Builder; import lombok.NonNull; import org.apache.commons.io.FilenameUtils; -import org.elasticsearch.action.index.IndexRequest; -import org.elasticsearch.common.xcontent.XContentType; +import org.opensearch.action.index.IndexRequest; +import org.opensearch.common.xcontent.XContentType; import java.io.BufferedInputStream; import java.io.BufferedReader; @@ -23,12 +23,12 @@ import java.util.stream.Stream; import java.util.zip.GZIPInputStream; -import static io.datahub.test.fixtures.elasticsearch.Utils.OBJECT_MAPPER; +import static io.datahubproject.test.fixtures.search.SearchFixtureUtils.OBJECT_MAPPER; @Builder public class FixtureReader { @Builder.Default - private String inputBase = 
Utils.FIXTURE_BASE; + private String inputBase = SearchFixtureUtils.FIXTURE_BASE; @NonNull private ESBulkProcessor bulkProcessor; @NonNull diff --git a/metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/FixtureWriter.java b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/FixtureWriter.java similarity index 75% rename from metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/FixtureWriter.java rename to metadata-io/src/test/java/io/datahubproject/test/fixtures/search/FixtureWriter.java index 36b057bc22a37..0aefa006421fc 100644 --- a/metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/FixtureWriter.java +++ b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/FixtureWriter.java @@ -1,13 +1,13 @@ -package io.datahub.test.fixtures.elasticsearch; +package io.datahubproject.test.fixtures.search; import com.fasterxml.jackson.core.JsonProcessingException; import lombok.Builder; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.search.SearchHit; -import org.elasticsearch.search.SearchHits; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.action.search.SearchResponse; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.RestHighLevelClient; +import org.opensearch.search.SearchHit; +import org.opensearch.search.SearchHits; import javax.annotation.Nullable; import java.io.BufferedWriter; @@ -15,8 +15,6 @@ import java.io.IOException; import java.util.function.BiConsumer; -import static io.datahub.test.fixtures.elasticsearch.Utils.OBJECT_MAPPER; - /** * */ @@ -26,7 +24,7 @@ public class FixtureWriter { private RestHighLevelClient client; @Builder.Default - private String outputBase = Utils.FIXTURE_BASE; + private String outputBase = SearchFixtureUtils.FIXTURE_BASE; public void write(SearchRequest searchRequest, String relativeOutput, boolean append) { write(searchRequest, relativeOutput, append, null, null, null); @@ -53,14 +51,14 @@ public void write(SearchRequest searchRequest, String relativeOutput, boo if (outputType == null) { bw.write(hit.getSourceAsString()); } else { - O doc = OBJECT_MAPPER.readValue(hit.getSourceAsString(), outputType); - bw.write(OBJECT_MAPPER.writeValueAsString(doc)); + O doc = SearchFixtureUtils.OBJECT_MAPPER.readValue(hit.getSourceAsString(), outputType); + bw.write(SearchFixtureUtils.OBJECT_MAPPER.writeValueAsString(doc)); } bw.newLine(); // Fire callback if (callback != null) { - callback.accept(hit, OBJECT_MAPPER.readValue(hit.getSourceAsString(), callbackType)); + callback.accept(hit, SearchFixtureUtils.OBJECT_MAPPER.readValue(hit.getSourceAsString(), callbackType)); } } catch (JsonProcessingException e) { throw new RuntimeException(e); diff --git a/metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/LineageExporter.java b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/LineageExporter.java similarity index 95% rename from metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/LineageExporter.java rename to metadata-io/src/test/java/io/datahubproject/test/fixtures/search/LineageExporter.java index 3b236b36cdce1..5db07ee6fb8bc 100644 --- a/metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/LineageExporter.java +++ b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/LineageExporter.java @@ -1,14 +1,14 
@@ -package io.datahub.test.fixtures.elasticsearch; +package io.datahubproject.test.fixtures.search; import com.google.common.collect.Lists; import lombok.Builder; import lombok.NonNull; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.builder.SearchSourceBuilder; -import org.elasticsearch.search.sort.SortBuilders; -import org.elasticsearch.search.sort.SortOrder; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.search.builder.SearchSourceBuilder; +import org.opensearch.search.sort.SortBuilders; +import org.opensearch.search.sort.SortOrder; import java.net.URLDecoder; import java.net.URLEncoder; diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SampleDataFixtureConfiguration.java similarity index 94% rename from metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java rename to metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SampleDataFixtureConfiguration.java index ef9992db1fb25..45bbd912bc794 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java +++ b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SampleDataFixtureConfiguration.java @@ -1,5 +1,6 @@ -package com.linkedin.metadata; +package io.datahubproject.test.fixtures.search; +import io.datahubproject.test.search.config.SearchCommonTestConfiguration; import com.linkedin.metadata.config.PreProcessHooks; import com.linkedin.metadata.config.cache.EntityDocCountCacheConfiguration; import com.linkedin.metadata.config.search.CustomConfiguration; @@ -30,9 +31,9 @@ import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; import com.linkedin.metadata.version.GitVersion; -import io.datahub.test.fixtures.elasticsearch.FixtureReader; + import java.util.Optional; -import org.elasticsearch.client.RestHighLevelClient; +import org.opensearch.client.RestHighLevelClient; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.boot.test.context.TestConfiguration; @@ -46,15 +47,15 @@ import java.util.Map; import static com.linkedin.metadata.Constants.*; -import static com.linkedin.metadata.ESTestConfiguration.REFRESH_INTERVAL_SECONDS; +import static io.datahubproject.test.search.config.SearchTestContainerConfiguration.REFRESH_INTERVAL_SECONDS; import static org.mockito.ArgumentMatchers.anySet; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @TestConfiguration -@Import(ESTestConfiguration.class) -public class ESSampleDataFixture { +@Import(SearchCommonTestConfiguration.class) +public class SampleDataFixtureConfiguration { /** * Interested in adding more fixtures? Here's what you will need to update? * 1. Create a new indexPrefix and FixtureName. 
Both are needed or else all fixtures will load on top of each other, @@ -118,7 +119,7 @@ protected EntityIndexBuilders entityIndexBuilders( @Bean(name = "longTailEntityIndexBuilders") protected EntityIndexBuilders longTailEntityIndexBuilders( - @Qualifier("longTailEntityRegistry") EntityRegistry longTailEntityRegistry, + @Qualifier("entityRegistry") EntityRegistry longTailEntityRegistry, @Qualifier("longTailIndexConvention") IndexConvention indexConvention ) { return entityIndexBuildersHelper(longTailEntityRegistry, indexConvention); @@ -147,7 +148,7 @@ protected ElasticSearchService entitySearchService( @Bean(name = "longTailEntitySearchService") protected ElasticSearchService longTailEntitySearchService( - @Qualifier("longTailEntityRegistry") EntityRegistry longTailEntityRegistry, + @Qualifier("entityRegistry") EntityRegistry longTailEntityRegistry, @Qualifier("longTailEntityIndexBuilders") EntityIndexBuilders longTailEndexBuilders, @Qualifier("longTailIndexConvention") IndexConvention longTailIndexConvention ) throws IOException { @@ -186,7 +187,7 @@ protected SearchService searchService( @Bean(name = "longTailSearchService") @Nonnull protected SearchService longTailSearchService( - @Qualifier("longTailEntityRegistry") EntityRegistry longTailEntityRegistry, + @Qualifier("entityRegistry") EntityRegistry longTailEntityRegistry, @Qualifier("longTailEntitySearchService") ElasticSearchService longTailEntitySearchService, @Qualifier("longTailEntityIndexBuilders") EntityIndexBuilders longTailIndexBuilders, @Qualifier("longTailPrefix") String longTailPrefix, @@ -248,7 +249,7 @@ protected EntityClient entityClient( protected EntityClient longTailEntityClient( @Qualifier("sampleDataSearchService") SearchService searchService, @Qualifier("sampleDataEntitySearchService") ElasticSearchService entitySearchService, - @Qualifier("longTailEntityRegistry") EntityRegistry longTailEntityRegistry + @Qualifier("entityRegistry") EntityRegistry longTailEntityRegistry ) { return entityClientHelper(searchService, entitySearchService, longTailEntityRegistry); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESTestFixtureUtils.java b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SearchFixtureUtils.java similarity index 67% rename from metadata-io/src/test/java/com/linkedin/metadata/ESTestFixtureUtils.java rename to metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SearchFixtureUtils.java index 914c5be9f5b09..d74dd041f082e 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESTestFixtureUtils.java +++ b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SearchFixtureUtils.java @@ -1,26 +1,45 @@ -package com.linkedin.metadata; +package io.datahubproject.test.fixtures.search; +import com.fasterxml.jackson.core.StreamReadConstraints; +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import io.datahubproject.test.search.ElasticsearchTestContainer; +import io.datahubproject.test.search.config.SearchTestContainerConfiguration; import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; -import io.datahub.test.fixtures.elasticsearch.EntityExporter; -import io.datahub.test.fixtures.elasticsearch.FixtureReader; -import io.datahub.test.fixtures.elasticsearch.FixtureWriter; -import io.datahub.test.fixtures.elasticsearch.LineageExporter; -import io.datahub.test.models.DatasetAnonymized; -import org.elasticsearch.client.RestHighLevelClient; +import 
io.datahubproject.test.models.DatasetAnonymized; +import io.datahubproject.test.search.SearchTestUtils; +import org.opensearch.client.RestHighLevelClient; import org.springframework.boot.test.context.TestConfiguration; -import org.springframework.context.annotation.Import; +import org.springframework.context.annotation.Bean; +import org.testcontainers.containers.GenericContainer; import org.testng.annotations.Ignore; import org.testng.annotations.Test; import java.io.IOException; import java.util.Set; -import static com.linkedin.metadata.ESTestConfiguration.REFRESH_INTERVAL_SECONDS; -import static com.linkedin.metadata.ESTestUtils.environmentRestClientBuilder; +import static com.linkedin.metadata.Constants.INGESTION_MAX_SERIALIZED_STRING_LENGTH; +import static com.linkedin.metadata.Constants.MAX_JACKSON_STRING_SIZE; +/** + * This class is used for extracting and moving search fixture data. + */ @TestConfiguration -@Import(ESTestConfiguration.class) -public class ESTestFixtureUtils { +public class SearchFixtureUtils { + + final public static String FIXTURE_BASE = "src/test/resources/elasticsearch"; + + final public static ObjectMapper OBJECT_MAPPER = new ObjectMapper() + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + static { + int maxSize = Integer.parseInt(System.getenv().getOrDefault(INGESTION_MAX_SERIALIZED_STRING_LENGTH, MAX_JACKSON_STRING_SIZE)); + OBJECT_MAPPER.getFactory().setStreamReadConstraints(StreamReadConstraints.builder().maxStringLength(maxSize).build()); + } + + @Bean(name = "testSearchContainer") + public GenericContainer testSearchContainer() { + return new ElasticsearchTestContainer().startContainer(); + } @Test @Ignore("Fixture capture lineage") @@ -37,7 +56,7 @@ private void extractSearchLineageTestFixture() throws IOException { String rootUrn = "urn:li:dataset:(urn:li:dataPlatform:teradata,teradata.simba.pp_bi_tables.tmis_daily_metrics_final_agg,PROD)"; // Set.of("system_metadata_service_v1", "datasetindex_v2", "graph_service_v1") - try (RestHighLevelClient client = new RestHighLevelClient(environmentRestClientBuilder())) { + try (RestHighLevelClient client = new RestHighLevelClient(SearchTestUtils.environmentRestClientBuilder())) { FixtureWriter fixtureWriter = FixtureWriter.builder() .client(client) .build(); @@ -76,7 +95,7 @@ private void extractEntityTestFixture() throws IOException { String prefix = ""; String commonSuffix = "index_v2"; - try (RestHighLevelClient client = new RestHighLevelClient(environmentRestClientBuilder())) { + try (RestHighLevelClient client = new RestHighLevelClient(SearchTestUtils.environmentRestClientBuilder())) { FixtureWriter fixtureWriter = FixtureWriter.builder() .client(client) .build(); @@ -102,7 +121,7 @@ private void extractEntityTestFixture() throws IOException { * 3. 
Uncomment and run test */ private void reindexTestFixtureData() throws IOException { - ESBulkProcessor bulkProcessor = ESBulkProcessor.builder(new RestHighLevelClient(environmentRestClientBuilder())) + ESBulkProcessor bulkProcessor = ESBulkProcessor.builder(new RestHighLevelClient(SearchTestUtils.environmentRestClientBuilder())) .async(true) .bulkRequestsLimit(1000) .retryInterval(1L) @@ -112,7 +131,7 @@ private void reindexTestFixtureData() throws IOException { FixtureReader reader = FixtureReader.builder() .bulkProcessor(bulkProcessor) .fixtureName("long_tail") - .refreshIntervalSeconds(REFRESH_INTERVAL_SECONDS) + .refreshIntervalSeconds(SearchTestContainerConfiguration.REFRESH_INTERVAL_SECONDS) .build(); reader.read(); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESSearchLineageFixture.java b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SearchLineageFixtureConfiguration.java similarity index 95% rename from metadata-io/src/test/java/com/linkedin/metadata/ESSearchLineageFixture.java rename to metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SearchLineageFixtureConfiguration.java index ade7435bf6652..93d3f108d9e47 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESSearchLineageFixture.java +++ b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SearchLineageFixtureConfiguration.java @@ -1,5 +1,7 @@ -package com.linkedin.metadata; +package io.datahubproject.test.fixtures.search; +import io.datahubproject.test.search.config.SearchCommonTestConfiguration; +import io.datahubproject.test.search.config.SearchTestContainerConfiguration; import com.linkedin.metadata.config.PreProcessHooks; import com.linkedin.metadata.config.cache.EntityDocCountCacheConfiguration; import com.linkedin.metadata.config.cache.SearchLineageCacheConfiguration; @@ -32,9 +34,10 @@ import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; import com.linkedin.metadata.version.GitVersion; -import io.datahub.test.fixtures.elasticsearch.FixtureReader; + import java.util.Optional; -import org.elasticsearch.client.RestHighLevelClient; + +import org.opensearch.client.RestHighLevelClient; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.boot.test.context.TestConfiguration; @@ -48,12 +51,11 @@ import java.util.Map; import static com.linkedin.metadata.Constants.*; -import static com.linkedin.metadata.ESTestConfiguration.REFRESH_INTERVAL_SECONDS; @TestConfiguration -@Import(ESTestConfiguration.class) -public class ESSearchLineageFixture { +@Import(SearchCommonTestConfiguration.class) +public class SearchLineageFixtureConfiguration { @Autowired private ESBulkProcessor _bulkProcessor; @@ -155,7 +157,7 @@ protected LineageSearchService lineageSearchService( .bulkProcessor(_bulkProcessor) .fixtureName(fixtureName) .targetIndexPrefix(prefix) - .refreshIntervalSeconds(REFRESH_INTERVAL_SECONDS) + .refreshIntervalSeconds(SearchTestContainerConfiguration.REFRESH_INTERVAL_SECONDS) .build() .read(); diff --git a/metadata-io/src/test/java/io/datahub/test/models/Anonymized.java b/metadata-io/src/test/java/io/datahubproject/test/models/Anonymized.java similarity index 97% rename from metadata-io/src/test/java/io/datahub/test/models/Anonymized.java rename to metadata-io/src/test/java/io/datahubproject/test/models/Anonymized.java index 1108097dff86a..6036473063059 100644 --- 
a/metadata-io/src/test/java/io/datahub/test/models/Anonymized.java +++ b/metadata-io/src/test/java/io/datahubproject/test/models/Anonymized.java @@ -1,4 +1,4 @@ -package io.datahub.test.models; +package io.datahubproject.test.models; import com.fasterxml.jackson.annotation.JsonSetter; diff --git a/metadata-io/src/test/java/io/datahub/test/models/DatasetAnonymized.java b/metadata-io/src/test/java/io/datahubproject/test/models/DatasetAnonymized.java similarity index 97% rename from metadata-io/src/test/java/io/datahub/test/models/DatasetAnonymized.java rename to metadata-io/src/test/java/io/datahubproject/test/models/DatasetAnonymized.java index 225f52d993931..35813d22067a6 100644 --- a/metadata-io/src/test/java/io/datahub/test/models/DatasetAnonymized.java +++ b/metadata-io/src/test/java/io/datahubproject/test/models/DatasetAnonymized.java @@ -1,4 +1,4 @@ -package io.datahub.test.models; +package io.datahubproject.test.models; import com.fasterxml.jackson.annotation.JsonGetter; diff --git a/metadata-io/src/test/java/io/datahub/test/models/GraphAnonymized.java b/metadata-io/src/test/java/io/datahubproject/test/models/GraphAnonymized.java similarity index 82% rename from metadata-io/src/test/java/io/datahub/test/models/GraphAnonymized.java rename to metadata-io/src/test/java/io/datahubproject/test/models/GraphAnonymized.java index 5e6c5d57e050e..3d2360ae04228 100644 --- a/metadata-io/src/test/java/io/datahub/test/models/GraphAnonymized.java +++ b/metadata-io/src/test/java/io/datahubproject/test/models/GraphAnonymized.java @@ -1,4 +1,4 @@ -package io.datahub.test.models; +package io.datahubproject.test.models; import com.fasterxml.jackson.annotation.JsonSetter; @@ -13,7 +13,7 @@ public static class GraphNode extends Anonymized { @JsonSetter("urn") public void setUrn(String urn) { - this.urn = Anonymized.anonymizeUrn(urn); + this.urn = anonymizeUrn(urn); } } } diff --git a/metadata-io/src/test/java/io/datahubproject/test/search/ElasticsearchTestContainer.java b/metadata-io/src/test/java/io/datahubproject/test/search/ElasticsearchTestContainer.java new file mode 100644 index 0000000000000..233a667d078dd --- /dev/null +++ b/metadata-io/src/test/java/io/datahubproject/test/search/ElasticsearchTestContainer.java @@ -0,0 +1,42 @@ +package io.datahubproject.test.search; + +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.utility.DockerImageName; + + +import static com.linkedin.metadata.DockerTestUtils.checkContainerEngine; + +public class ElasticsearchTestContainer implements SearchTestContainer { + private static final String ELASTIC_VERSION = "7.10.1"; + private static final String ELASTIC_IMAGE_NAME = "docker.elastic.co/elasticsearch/elasticsearch"; + private static final String ENV_ELASTIC_IMAGE_FULL_NAME = System.getenv("ELASTIC_IMAGE_FULL_NAME"); + private static final String ELASTIC_IMAGE_FULL_NAME = ENV_ELASTIC_IMAGE_FULL_NAME != null + ? ENV_ELASTIC_IMAGE_FULL_NAME : ELASTIC_IMAGE_NAME + ":" + ELASTIC_VERSION; + private static final DockerImageName DOCKER_IMAGE_NAME = DockerImageName.parse(ELASTIC_IMAGE_FULL_NAME) + .asCompatibleSubstituteFor(ELASTIC_IMAGE_NAME); + + protected static final GenericContainer ES_CONTAINER; + private boolean isStarted = false; + + // A helper method to create an ElasticsearchContainer defaulting to the current image and version, with the ability + // within firewalled environments to override with an environment variable to point to the offline repository. 
+ static { + ES_CONTAINER = new org.testcontainers.elasticsearch.ElasticsearchContainer(DOCKER_IMAGE_NAME); + checkContainerEngine(ES_CONTAINER.getDockerClient()); + ES_CONTAINER.withEnv("ES_JAVA_OPTS", SEARCH_JAVA_OPTS).withStartupTimeout(STARTUP_TIMEOUT); + } + + @Override + public GenericContainer startContainer() { + if (!isStarted) { + ElasticsearchTestContainer.ES_CONTAINER.start(); + isStarted = true; + } + return ES_CONTAINER; + } + + @Override + public void stopContainer() { + ES_CONTAINER.stop(); + } +} diff --git a/metadata-io/src/test/java/io/datahubproject/test/search/OpenSearchTestContainer.java b/metadata-io/src/test/java/io/datahubproject/test/search/OpenSearchTestContainer.java new file mode 100644 index 0000000000000..d94b88b466f89 --- /dev/null +++ b/metadata-io/src/test/java/io/datahubproject/test/search/OpenSearchTestContainer.java @@ -0,0 +1,43 @@ +package io.datahubproject.test.search; + +import org.opensearch.testcontainers.OpensearchContainer; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.utility.DockerImageName; + + +import static com.linkedin.metadata.DockerTestUtils.checkContainerEngine; + +public class OpenSearchTestContainer implements SearchTestContainer { + private static final String OPENSEARCH_VERSION = "2.9.0"; + private static final String OPENSEARCH_IMAGE_NAME = "opensearchproject/opensearch"; + private static final String ENV_OPENSEARCH_IMAGE_FULL_NAME = System.getenv("OPENSEARCH_IMAGE_FULL_NAME"); + private static final String OPENSEARCH_IMAGE_FULL_NAME = ENV_OPENSEARCH_IMAGE_FULL_NAME != null + ? ENV_OPENSEARCH_IMAGE_FULL_NAME : OPENSEARCH_IMAGE_NAME + ":" + OPENSEARCH_VERSION; + private static final DockerImageName DOCKER_IMAGE_NAME = DockerImageName.parse(OPENSEARCH_IMAGE_FULL_NAME) + .asCompatibleSubstituteFor(OPENSEARCH_IMAGE_NAME); + + protected static final GenericContainer OS_CONTAINER; + private boolean isStarted = false; + + // A helper method to create an OpensearchContainer defaulting to the current image and version, with the ability + within firewalled environments to override with an environment variable to point to the offline repository. 
+ static { + OS_CONTAINER = new OpensearchContainer(DOCKER_IMAGE_NAME); + checkContainerEngine(OS_CONTAINER.getDockerClient()); + OS_CONTAINER.withEnv("OPENSEARCH_JAVA_OPTS", SEARCH_JAVA_OPTS).withStartupTimeout(STARTUP_TIMEOUT); + } + + @Override + public GenericContainer startContainer() { + if (!isStarted) { + OS_CONTAINER.start(); + isStarted = true; + } + return OS_CONTAINER; + } + + @Override + public void stopContainer() { + OS_CONTAINER.stop(); + } +} diff --git a/metadata-io/src/test/java/io/datahubproject/test/search/SearchTestContainer.java b/metadata-io/src/test/java/io/datahubproject/test/search/SearchTestContainer.java new file mode 100644 index 0000000000000..67e1ee368f513 --- /dev/null +++ b/metadata-io/src/test/java/io/datahubproject/test/search/SearchTestContainer.java @@ -0,0 +1,14 @@ +package io.datahubproject.test.search; + +import org.testcontainers.containers.GenericContainer; + +import java.time.Duration; + +public interface SearchTestContainer { + String SEARCH_JAVA_OPTS = "-Xms64m -Xmx384m -XX:MaxDirectMemorySize=368435456"; + Duration STARTUP_TIMEOUT = Duration.ofMinutes(5); // usually < 1min + + GenericContainer startContainer(); + + void stopContainer(); +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java b/metadata-io/src/test/java/io/datahubproject/test/search/SearchTestUtils.java similarity index 74% rename from metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java rename to metadata-io/src/test/java/io/datahubproject/test/search/SearchTestUtils.java index 7e9605cbe3db0..414b9f927fada 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java +++ b/metadata-io/src/test/java/io/datahubproject/test/search/SearchTestUtils.java @@ -1,4 +1,4 @@ -package com.linkedin.metadata; +package io.datahubproject.test.search; import com.datahub.authentication.Authentication; import com.datahub.plugins.auth.authorization.Authorizer; @@ -17,48 +17,32 @@ import com.linkedin.metadata.search.ScrollResult; import com.linkedin.metadata.search.SearchResult; import com.linkedin.metadata.search.SearchService; -import java.time.Duration; -import java.util.List; -import java.util.Optional; -import java.util.stream.Collectors; -import java.util.stream.Stream; -import javax.annotation.Nullable; +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import org.apache.http.HttpHost; import org.apache.http.auth.AuthScope; import org.apache.http.auth.UsernamePasswordCredentials; import org.apache.http.client.CredentialsProvider; import org.apache.http.impl.client.BasicCredentialsProvider; import org.apache.http.impl.nio.client.HttpAsyncClientBuilder; -import org.elasticsearch.client.RestClient; -import org.elasticsearch.client.RestClientBuilder; -import org.testcontainers.elasticsearch.ElasticsearchContainer; -import org.testcontainers.utility.DockerImageName; +import org.opensearch.client.RestClient; +import org.opensearch.client.RestClientBuilder; + +import javax.annotation.Nullable; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; +import java.util.stream.Stream; import static com.linkedin.datahub.graphql.resolvers.search.SearchUtils.AUTO_COMPLETE_ENTITY_TYPES; import static com.linkedin.datahub.graphql.resolvers.search.SearchUtils.SEARCHABLE_ENTITY_TYPES; -import static com.linkedin.metadata.DockerTestUtils.checkContainerEngine; -public class ESTestUtils { - private ESTestUtils() { +public class SearchTestUtils { + private SearchTestUtils() { } - private static final 
String ELASTIC_VERSION = "7.10.1"; - private static final String ELASTIC_IMAGE_NAME = "docker.elastic.co/elasticsearch/elasticsearch"; - private static final String ENV_ELASTIC_IMAGE_FULL_NAME = System.getenv("ELASTIC_IMAGE_FULL_NAME"); - private static final String ELASTIC_IMAGE_FULL_NAME = ENV_ELASTIC_IMAGE_FULL_NAME != null - ? ENV_ELASTIC_IMAGE_FULL_NAME : ELASTIC_IMAGE_NAME + ":" + ELASTIC_VERSION; - private static final DockerImageName DOCKER_IMAGE_NAME = DockerImageName.parse(ELASTIC_IMAGE_FULL_NAME) - .asCompatibleSubstituteFor(ELASTIC_IMAGE_NAME); - - public static final ElasticsearchContainer ES_CONTAINER; - - // A helper method to create an ElasticseachContainer defaulting to the current image and version, with the ability - // within firewalled environments to override with an environment variable to point to the offline repository. - static { - ES_CONTAINER = new ElasticsearchContainer(DOCKER_IMAGE_NAME); - checkContainerEngine(ES_CONTAINER.getDockerClient()); - ES_CONTAINER.withEnv("ES_JAVA_OPTS", "-Xms64m -Xmx384m -XX:MaxDirectMemorySize=368435456") - .withStartupTimeout(Duration.ofMinutes(5)); // usually < 1min + public static void syncAfterWrite(ESBulkProcessor bulkProcessor) throws InterruptedException { + bulkProcessor.flush(); + Thread.sleep(1000); } public final static List SEARCHABLE_ENTITIES; @@ -75,7 +59,7 @@ public static SearchResult searchAcrossEntities(SearchService searchService, Str public static SearchResult searchAcrossEntities(SearchService searchService, String query, @Nullable List facets) { return searchService.searchAcrossEntities(SEARCHABLE_ENTITIES, query, null, null, 0, - 100, new SearchFlags().setFulltext(true).setSkipCache(true), facets); + 100, new SearchFlags().setFulltext(true).setSkipCache(true), facets); } public static SearchResult searchAcrossCustomEntities(SearchService searchService, String query, List searchableEntities) { @@ -89,12 +73,12 @@ public static SearchResult search(SearchService searchService, String query) { public static SearchResult search(SearchService searchService, List entities, String query) { return searchService.search(entities, query, null, null, 0, 100, - new SearchFlags().setFulltext(true).setSkipCache(true)); + new SearchFlags().setFulltext(true).setSkipCache(true)); } public static ScrollResult scroll(SearchService searchService, String query, int batchSize, @Nullable String scrollId) { return searchService.scrollAcrossEntities(SEARCHABLE_ENTITIES, query, null, null, - scrollId, "3m", batchSize, new SearchFlags().setFulltext(true).setSkipCache(true)); + scrollId, "3m", batchSize, new SearchFlags().setFulltext(true).setSkipCache(true)); } public static SearchResult searchStructured(SearchService searchService, String query) { @@ -112,9 +96,9 @@ public static LineageSearchResult lineage(LineageSearchService lineageSearchServ .build()); return lineageSearchService.searchAcrossLineage(root, LineageDirection.DOWNSTREAM, - SEARCHABLE_ENTITY_TYPES.stream().map(EntityTypeMapper::getName).collect(Collectors.toList()), - "*", hops, ResolverUtils.buildFilter(filters, List.of()), null, 0, 100, null, - null, new SearchFlags().setSkipCache(true)); + SEARCHABLE_ENTITY_TYPES.stream().map(EntityTypeMapper::getName).collect(Collectors.toList()), + "*", hops, ResolverUtils.buildFilter(filters, List.of()), null, 0, 100, null, + null, new SearchFlags().setSkipCache(true)); } public static AutoCompleteResults autocomplete(SearchableEntityType searchableEntityType, String query) throws Exception { @@ -160,4 +144,4 @@ public 
HttpAsyncClientBuilder customizeHttpClient( } }); } -} \ No newline at end of file +} diff --git a/metadata-io/src/test/java/io/datahubproject/test/search/config/SearchCommonTestConfiguration.java b/metadata-io/src/test/java/io/datahubproject/test/search/config/SearchCommonTestConfiguration.java new file mode 100644 index 0000000000000..530d3f4d53625 --- /dev/null +++ b/metadata-io/src/test/java/io/datahubproject/test/search/config/SearchCommonTestConfiguration.java @@ -0,0 +1,63 @@ +package io.datahubproject.test.search.config; + +import com.fasterxml.jackson.dataformat.yaml.YAMLMapper; +import com.linkedin.metadata.config.search.CustomConfiguration; +import com.linkedin.metadata.config.search.ExactMatchConfiguration; +import com.linkedin.metadata.config.search.PartialConfiguration; +import com.linkedin.metadata.config.search.SearchConfiguration; +import com.linkedin.metadata.config.search.WordGramConfiguration; +import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; +import com.linkedin.metadata.models.registry.ConfigEntityRegistry; +import com.linkedin.metadata.models.registry.EntityRegistry; +import com.linkedin.metadata.models.registry.EntityRegistryException; +import org.springframework.boot.test.context.TestConfiguration; +import org.springframework.context.annotation.Bean; + +/** + * This is common configuration for search regardless of which + * test container implementation. + */ +@TestConfiguration +public class SearchCommonTestConfiguration { + @Bean + public SearchConfiguration searchConfiguration() { + SearchConfiguration searchConfiguration = new SearchConfiguration(); + searchConfiguration.setMaxTermBucketSize(20); + + ExactMatchConfiguration exactMatchConfiguration = new ExactMatchConfiguration(); + exactMatchConfiguration.setExclusive(false); + exactMatchConfiguration.setExactFactor(10.0f); + exactMatchConfiguration.setWithPrefix(true); + exactMatchConfiguration.setPrefixFactor(6.0f); + exactMatchConfiguration.setCaseSensitivityFactor(0.7f); + exactMatchConfiguration.setEnableStructured(true); + + WordGramConfiguration wordGramConfiguration = new WordGramConfiguration(); + wordGramConfiguration.setTwoGramFactor(1.2f); + wordGramConfiguration.setThreeGramFactor(1.5f); + wordGramConfiguration.setFourGramFactor(1.8f); + + PartialConfiguration partialConfiguration = new PartialConfiguration(); + partialConfiguration.setFactor(0.4f); + partialConfiguration.setUrnFactor(0.5f); + + searchConfiguration.setExactMatch(exactMatchConfiguration); + searchConfiguration.setWordGram(wordGramConfiguration); + searchConfiguration.setPartial(partialConfiguration); + return searchConfiguration; + } + + @Bean + public CustomSearchConfiguration customSearchConfiguration() throws Exception { + CustomConfiguration customConfiguration = new CustomConfiguration(); + customConfiguration.setEnabled(true); + customConfiguration.setFile("search_config_builder_test.yml"); + return customConfiguration.resolve(new YAMLMapper()); + } + + @Bean(name = "entityRegistry") + public EntityRegistry entityRegistry() throws EntityRegistryException { + return new ConfigEntityRegistry( + SearchCommonTestConfiguration.class.getClassLoader().getResourceAsStream("entity-registry.yml")); + } +} diff --git a/metadata-io/src/test/java/io/datahubproject/test/search/config/SearchTestContainerConfiguration.java b/metadata-io/src/test/java/io/datahubproject/test/search/config/SearchTestContainerConfiguration.java new file mode 100644 index 0000000000000..2cfa9f9187825 --- /dev/null +++ 
b/metadata-io/src/test/java/io/datahubproject/test/search/config/SearchTestContainerConfiguration.java @@ -0,0 +1,88 @@ +package io.datahubproject.test.search.config; + +import com.linkedin.metadata.config.search.ElasticSearchConfiguration; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; +import com.linkedin.metadata.version.GitVersion; +import java.util.Optional; + +import org.apache.http.HttpHost; +import org.apache.http.impl.nio.reactor.IOReactorConfig; +import org.opensearch.action.support.WriteRequest; +import org.opensearch.client.RestClient; +import org.opensearch.client.RestClientBuilder; +import org.opensearch.client.RestHighLevelClient; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.boot.test.context.TestConfiguration; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Primary; +import org.testcontainers.containers.GenericContainer; + +import javax.annotation.Nonnull; + +import java.util.Map; + + +/** + * This configuration is for `test containers`; it builds these objects tied to + * the test container instantiated for tests. Could be ES or OpenSearch, etc. + * + * Does your test require a running instance? If not, use {@link io.datahubproject.test.search.config.SearchCommonTestConfiguration} instead. + */ +@TestConfiguration +public class SearchTestContainerConfiguration { + // This port is overridden by the specific test container instance + private static final int HTTP_PORT = 9200; + public static final int REFRESH_INTERVAL_SECONDS = 5; + + @Primary + @Bean(name = "searchRestHighLevelClient") + @Nonnull + public RestHighLevelClient getElasticsearchClient(@Qualifier("testSearchContainer") GenericContainer searchContainer) { + // A helper method to create a search test container defaulting to the current image and version, with the ability + // within firewalled environments to override with an environment variable to point to the offline repository. + // A helper method to construct a standard rest client for search. + final RestClientBuilder builder = + RestClient.builder(new HttpHost( + "localhost", + searchContainer.getMappedPort(HTTP_PORT), "http") + ).setHttpClientConfigCallback(httpAsyncClientBuilder -> + httpAsyncClientBuilder.setDefaultIOReactorConfig(IOReactorConfig.custom().setIoThreadCount(1).build())); + + builder.setRequestConfigCallback(requestConfigBuilder -> requestConfigBuilder. + setConnectionRequestTimeout(30000)); + + return new RestHighLevelClient(builder); + } + + /* + Cannot use the factory class without circular dependencies + */ + @Primary + @Bean(name = "searchBulkProcessor") + @Nonnull + public ESBulkProcessor getBulkProcessor(@Qualifier("searchRestHighLevelClient") RestHighLevelClient searchClient) { + return ESBulkProcessor.builder(searchClient) + .async(true) + /* + * Force a refresh as part of this request. This refresh policy does not scale for high indexing or search throughput but is useful + * to present a consistent view for indices with very low traffic. And it is wonderful for tests! 
+ */ + .writeRequestRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE) + .bulkRequestsLimit(10000) + .bulkFlushPeriod(REFRESH_INTERVAL_SECONDS - 1) + .retryInterval(1L) + .numRetries(1) + .build(); + } + + @Primary + @Bean(name = "searchIndexBuilder") + @Nonnull + protected ESIndexBuilder getIndexBuilder(@Qualifier("searchRestHighLevelClient") RestHighLevelClient searchClient) { + GitVersion gitVersion = new GitVersion("0.0.0-test", "123456", Optional.empty()); + return new ESIndexBuilder(searchClient, 1, 1, 3, 1, Map.of(), + false, false, + new ElasticSearchConfiguration(), gitVersion); + } +} diff --git a/metadata-io/src/test/resources/testng-other.xml b/metadata-io/src/test/resources/testng-other.xml new file mode 100644 index 0000000000000..e214fdb8c1f61 --- /dev/null +++ b/metadata-io/src/test/resources/testng-other.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/metadata-io/src/test/resources/testng-search.xml b/metadata-io/src/test/resources/testng-search.xml new file mode 100644 index 0000000000000..3b32ae34c1f5a --- /dev/null +++ b/metadata-io/src/test/resources/testng-search.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/metadata-io/src/test/resources/testng.xml b/metadata-io/src/test/resources/testng.xml new file mode 100644 index 0000000000000..fdd1c1a6c8921 --- /dev/null +++ b/metadata-io/src/test/resources/testng.xml @@ -0,0 +1,14 @@ + + + + + + + + + \ No newline at end of file diff --git a/metadata-jobs/mae-consumer/build.gradle b/metadata-jobs/mae-consumer/build.gradle index 69fe2255a6916..d36fd0de40d03 100644 --- a/metadata-jobs/mae-consumer/build.gradle +++ b/metadata-jobs/mae-consumer/build.gradle @@ -44,6 +44,7 @@ dependencies { testImplementation externalDependency.mockito implementation externalDependency.awsMskIamAuth + testImplementation externalDependency.testng testImplementation externalDependency.springBootTest testRuntimeOnly externalDependency.logbackClassic } diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/ElasticEvent.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/ElasticEvent.java index 7ba04ecd2389e..b0fade24e26ad 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/ElasticEvent.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/ElasticEvent.java @@ -2,7 +2,7 @@ import com.linkedin.events.metadata.ChangeType; import lombok.Data; -import org.elasticsearch.common.xcontent.XContentBuilder; +import org.opensearch.core.xcontent.XContentBuilder; @Data public abstract class ElasticEvent { diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/ElasticsearchConnector.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/ElasticsearchConnector.java index afa69c9f1750e..bea75f7b282ee 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/ElasticsearchConnector.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/ElasticsearchConnector.java @@ -5,11 +5,11 @@ import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.DocWriteRequest; -import org.elasticsearch.action.delete.DeleteRequest; -import org.elasticsearch.action.index.IndexRequest; -import 
org.elasticsearch.action.update.UpdateRequest; -import org.elasticsearch.common.xcontent.XContentType; +import org.opensearch.action.DocWriteRequest; +import org.opensearch.action.delete.DeleteRequest; +import org.opensearch.action.index.IndexRequest; +import org.opensearch.action.update.UpdateRequest; +import org.opensearch.common.xcontent.XContentType; @Slf4j diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/JsonElasticEvent.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/JsonElasticEvent.java index d42464051d7ec..230cd8433e6ff 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/JsonElasticEvent.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/JsonElasticEvent.java @@ -1,11 +1,11 @@ package com.linkedin.metadata.kafka.elasticsearch; -import org.elasticsearch.common.xcontent.DeprecationHandler; -import org.elasticsearch.common.xcontent.NamedXContentRegistry; -import org.elasticsearch.common.xcontent.XContentBuilder; -import org.elasticsearch.common.xcontent.XContentFactory; -import org.elasticsearch.common.xcontent.XContentParser; -import org.elasticsearch.common.xcontent.XContentType; +import org.opensearch.core.xcontent.DeprecationHandler; +import org.opensearch.core.xcontent.NamedXContentRegistry; +import org.opensearch.core.xcontent.XContentBuilder; +import org.opensearch.common.xcontent.XContentFactory; +import org.opensearch.core.xcontent.XContentParser; +import org.opensearch.common.xcontent.XContentType; import java.io.IOException; import javax.annotation.Nullable; diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/MCEElasticEvent.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/MCEElasticEvent.java index 184efa1573b35..a3d6dca75068b 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/MCEElasticEvent.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/MCEElasticEvent.java @@ -2,12 +2,12 @@ import com.linkedin.data.template.RecordTemplate; import com.datahub.util.RecordUtils; -import org.elasticsearch.common.xcontent.DeprecationHandler; -import org.elasticsearch.common.xcontent.NamedXContentRegistry; -import org.elasticsearch.common.xcontent.XContentBuilder; -import org.elasticsearch.common.xcontent.XContentFactory; -import org.elasticsearch.common.xcontent.XContentParser; -import org.elasticsearch.common.xcontent.XContentType; +import org.opensearch.core.xcontent.DeprecationHandler; +import org.opensearch.core.xcontent.NamedXContentRegistry; +import org.opensearch.core.xcontent.XContentBuilder; +import org.opensearch.common.xcontent.XContentFactory; +import org.opensearch.core.xcontent.XContentParser; +import org.opensearch.common.xcontent.XContentType; import java.io.IOException; import javax.annotation.Nullable; diff --git a/metadata-models/build.gradle b/metadata-models/build.gradle index db01be3ccebdf..53e7765152aef 100644 --- a/metadata-models/build.gradle +++ b/metadata-models/build.gradle @@ -34,6 +34,7 @@ dependencies { swaggerCodegen externalDependency.swaggerCli testImplementation externalDependency.guava + testImplementation externalDependency.testngJava8 } sourceSets { diff --git a/metadata-service/auth-impl/build.gradle b/metadata-service/auth-impl/build.gradle index 1ffeb99e7ad4a..60d622dea5447 100644 --- 
a/metadata-service/auth-impl/build.gradle +++ b/metadata-service/auth-impl/build.gradle @@ -24,4 +24,5 @@ dependencies { annotationProcessor externalDependency.lombok testImplementation externalDependency.mockito + testImplementation externalDependency.testng } \ No newline at end of file diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/RestHighLevelClientFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/RestHighLevelClientFactory.java index 1da66f3192f80..5f50b8f7f0508 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/RestHighLevelClientFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/RestHighLevelClientFactory.java @@ -24,9 +24,9 @@ import org.apache.http.nio.reactor.IOReactorException; import org.apache.http.nio.reactor.IOReactorExceptionHandler; import org.apache.http.ssl.SSLContexts; -import org.elasticsearch.client.RestClient; -import org.elasticsearch.client.RestClientBuilder; -import org.elasticsearch.client.RestHighLevelClient; +import org.opensearch.client.RestClient; +import org.opensearch.client.RestClientBuilder; +import org.opensearch.client.RestHighLevelClient; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.beans.factory.annotation.Value; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/graphql/GraphQLEngineFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/graphql/GraphQLEngineFactory.java index 0be69e5dad58d..d7aee59ca6dd1 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/graphql/GraphQLEngineFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/graphql/GraphQLEngineFactory.java @@ -39,7 +39,7 @@ import com.linkedin.metadata.version.GitVersion; import com.linkedin.usage.UsageClient; import javax.annotation.Nonnull; -import org.elasticsearch.client.RestHighLevelClient; +import org.opensearch.client.RestHighLevelClient; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.beans.factory.annotation.Value; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/recommendation/candidatesource/MostPopularCandidateSourceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/recommendation/candidatesource/MostPopularCandidateSourceFactory.java index c74f5e11cadce..c266b3635b16f 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/recommendation/candidatesource/MostPopularCandidateSourceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/recommendation/candidatesource/MostPopularCandidateSourceFactory.java @@ -7,7 +7,7 @@ import com.linkedin.metadata.recommendation.candidatesource.MostPopularSource; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import javax.annotation.Nonnull; -import org.elasticsearch.client.RestHighLevelClient; +import org.opensearch.client.RestHighLevelClient; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.context.annotation.Bean; diff --git 
a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/recommendation/candidatesource/RecentlyEditedCandidateSourceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/recommendation/candidatesource/RecentlyEditedCandidateSourceFactory.java index 58584a4d957de..109cc8dbc82d1 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/recommendation/candidatesource/RecentlyEditedCandidateSourceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/recommendation/candidatesource/RecentlyEditedCandidateSourceFactory.java @@ -7,7 +7,7 @@ import com.linkedin.metadata.recommendation.candidatesource.RecentlyEditedSource; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import javax.annotation.Nonnull; -import org.elasticsearch.client.RestHighLevelClient; +import org.opensearch.client.RestHighLevelClient; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.context.annotation.Bean; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/recommendation/candidatesource/RecentlySearchedCandidateSourceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/recommendation/candidatesource/RecentlySearchedCandidateSourceFactory.java index b3779a132284f..5209f65a2ec63 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/recommendation/candidatesource/RecentlySearchedCandidateSourceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/recommendation/candidatesource/RecentlySearchedCandidateSourceFactory.java @@ -5,7 +5,7 @@ import com.linkedin.metadata.recommendation.candidatesource.RecentlySearchedSource; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import javax.annotation.Nonnull; -import org.elasticsearch.client.RestHighLevelClient; +import org.opensearch.client.RestHighLevelClient; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.context.annotation.Bean; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/recommendation/candidatesource/RecentlyViewedCandidateSourceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/recommendation/candidatesource/RecentlyViewedCandidateSourceFactory.java index d0505e8d2a3ea..aea40b4d8eb46 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/recommendation/candidatesource/RecentlyViewedCandidateSourceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/recommendation/candidatesource/RecentlyViewedCandidateSourceFactory.java @@ -7,7 +7,7 @@ import com.linkedin.metadata.recommendation.candidatesource.RecentlyViewedSource; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import javax.annotation.Nonnull; -import org.elasticsearch.client.RestHighLevelClient; +import org.opensearch.client.RestHighLevelClient; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.context.annotation.Bean; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/BaseElasticSearchComponentsFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/BaseElasticSearchComponentsFactory.java 
index eeb32ae1ddbf9..620af803723e7 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/BaseElasticSearchComponentsFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/BaseElasticSearchComponentsFactory.java @@ -8,7 +8,7 @@ import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import javax.annotation.Nonnull; import org.springframework.beans.factory.annotation.Value; -import org.elasticsearch.client.RestHighLevelClient; +import org.opensearch.client.RestHighLevelClient; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.context.annotation.Bean; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchBulkProcessorFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchBulkProcessorFactory.java index 956157f70e6bc..fc6f92b2678f3 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchBulkProcessorFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchBulkProcessorFactory.java @@ -6,8 +6,8 @@ import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.support.WriteRequest; -import org.elasticsearch.client.RestHighLevelClient; +import org.opensearch.action.support.WriteRequest; +import org.opensearch.client.RestHighLevelClient; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.beans.factory.annotation.Value; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactory.java index decbc2e12a998..495d77ccbb29f 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactory.java @@ -13,7 +13,7 @@ import javax.annotation.Nullable; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; -import org.elasticsearch.client.RestHighLevelClient; +import org.opensearch.client.RestHighLevelClient; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.beans.factory.annotation.Value; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/telemetry/DailyReport.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/telemetry/DailyReport.java index 2972316856a8d..2610ebd3528cd 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/telemetry/DailyReport.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/telemetry/DailyReport.java @@ -12,7 +12,7 @@ import java.io.IOException; import java.util.Optional; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.client.RestHighLevelClient; +import org.opensearch.client.RestHighLevelClient; import org.joda.time.DateTime; import org.json.JSONObject; import org.springframework.scheduling.annotation.Scheduled; diff --git 
a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/telemetry/ScheduledAnalyticsFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/telemetry/ScheduledAnalyticsFactory.java index c5501067ff393..7cdca996a8131 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/telemetry/ScheduledAnalyticsFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/telemetry/ScheduledAnalyticsFactory.java @@ -6,7 +6,7 @@ import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.version.GitVersion; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.client.RestHighLevelClient; +import org.opensearch.client.RestHighLevelClient; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; import org.springframework.context.annotation.Bean; diff --git a/metadata-service/factories/src/test/java/com/linkedin/gms/factory/search/ElasticSearchBulkProcessorFactoryTest.java b/metadata-service/factories/src/test/java/com/linkedin/gms/factory/search/ElasticSearchBulkProcessorFactoryTest.java index 859c8e18cacff..266039afb45d5 100644 --- a/metadata-service/factories/src/test/java/com/linkedin/gms/factory/search/ElasticSearchBulkProcessorFactoryTest.java +++ b/metadata-service/factories/src/test/java/com/linkedin/gms/factory/search/ElasticSearchBulkProcessorFactoryTest.java @@ -2,7 +2,7 @@ import com.linkedin.gms.factory.config.ConfigurationProvider; import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; -import org.elasticsearch.action.support.WriteRequest; +import org.opensearch.action.support.WriteRequest; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.context.properties.EnableConfigurationProperties; import org.springframework.boot.test.context.SpringBootTest; diff --git a/metadata-service/factories/src/test/java/io/datahubproject/telemetry/TelemetryUtilsTest.java b/metadata-service/factories/src/test/java/io/datahubproject/telemetry/TelemetryUtilsTest.java index 28c47f169a111..fe0d61986b4a6 100644 --- a/metadata-service/factories/src/test/java/io/datahubproject/telemetry/TelemetryUtilsTest.java +++ b/metadata-service/factories/src/test/java/io/datahubproject/telemetry/TelemetryUtilsTest.java @@ -7,8 +7,8 @@ import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; -import static org.junit.Assert.*; import static org.mockito.ArgumentMatchers.*; +import static org.testng.AssertJUnit.assertEquals; public class TelemetryUtilsTest { diff --git a/metadata-service/health-servlet/src/main/java/com/datahub/health/controller/HealthCheckController.java b/metadata-service/health-servlet/src/main/java/com/datahub/health/controller/HealthCheckController.java index 02ca5182cd2be..c200e63e0d497 100644 --- a/metadata-service/health-servlet/src/main/java/com/datahub/health/controller/HealthCheckController.java +++ b/metadata-service/health-servlet/src/main/java/com/datahub/health/controller/HealthCheckController.java @@ -11,11 +11,11 @@ import java.util.concurrent.TimeUnit; import java.util.function.Supplier; -import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequest; -import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.cluster.health.ClusterHealthStatus; +import 
org.opensearch.action.admin.cluster.health.ClusterHealthRequest; +import org.opensearch.action.admin.cluster.health.ClusterHealthResponse; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.RestHighLevelClient; +import org.opensearch.cluster.health.ClusterHealthStatus; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.http.HttpStatus; diff --git a/metadata-service/openapi-analytics-servlet/src/test/java/io/datahubproject/openapi/config/OpenAPIAnalyticsTestConfiguration.java b/metadata-service/openapi-analytics-servlet/src/test/java/io/datahubproject/openapi/config/OpenAPIAnalyticsTestConfiguration.java index 98f0db8fd10ef..83b1b3f87c724 100644 --- a/metadata-service/openapi-analytics-servlet/src/test/java/io/datahubproject/openapi/config/OpenAPIAnalyticsTestConfiguration.java +++ b/metadata-service/openapi-analytics-servlet/src/test/java/io/datahubproject/openapi/config/OpenAPIAnalyticsTestConfiguration.java @@ -7,7 +7,7 @@ import com.datahub.authorization.AuthorizationResult; import com.datahub.authorization.AuthorizerChain; import com.linkedin.metadata.search.elasticsearch.ElasticSearchService; -import org.elasticsearch.action.search.SearchResponse; +import org.opensearch.action.search.SearchResponse; import org.mockito.Mockito; import org.springframework.boot.test.context.TestConfiguration; import org.springframework.context.annotation.Bean; diff --git a/metadata-service/openapi-entity-servlet/src/test/java/io/datahubproject/openapi/util/OpenApiEntitiesUtilTest.java b/metadata-service/openapi-entity-servlet/src/test/java/io/datahubproject/openapi/util/OpenApiEntitiesUtilTest.java index 8f87b041a7e03..e0fec07452302 100644 --- a/metadata-service/openapi-entity-servlet/src/test/java/io/datahubproject/openapi/util/OpenApiEntitiesUtilTest.java +++ b/metadata-service/openapi-entity-servlet/src/test/java/io/datahubproject/openapi/util/OpenApiEntitiesUtilTest.java @@ -17,8 +17,8 @@ import java.util.List; -import static org.junit.Assert.assertNotNull; import static org.testng.AssertJUnit.assertEquals; +import static org.testng.AssertJUnit.assertNotNull; @Import({OpenAPIEntityTestConfiguration.class}) diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/elastic/OperationsController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/elastic/OperationsController.java index 7910982a63133..f29461734ebfc 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/elastic/OperationsController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/elastic/OperationsController.java @@ -13,7 +13,7 @@ import io.swagger.v3.oas.annotations.tags.Tag; import java.util.List; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.client.tasks.GetTaskResponse; +import org.opensearch.client.tasks.GetTaskResponse; import org.json.JSONObject; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/OperationsResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/OperationsResource.java index 17de9ceea35a3..1e6523e774d66 100644 --- 
a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/OperationsResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/OperationsResource.java @@ -35,7 +35,7 @@ import javax.inject.Inject; import javax.inject.Named; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.client.tasks.GetTaskResponse; +import org.opensearch.client.tasks.GetTaskResponse; import org.json.JSONObject; import static com.linkedin.metadata.Constants.*; diff --git a/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/operations/OperationsResourceTest.java b/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/operations/OperationsResourceTest.java index 665bc3cfc277c..470c6e87040ec 100644 --- a/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/operations/OperationsResourceTest.java +++ b/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/operations/OperationsResourceTest.java @@ -3,12 +3,13 @@ import com.linkedin.metadata.timeseries.TimeseriesAspectService; import com.linkedin.util.Pair; import java.util.List; -import junit.framework.TestCase; import mock.MockTimeseriesAspectService; import org.testng.annotations.Test; +import static org.testng.AssertJUnit.*; -public class OperationsResourceTest extends TestCase { + +public class OperationsResourceTest { private static final String TASK_ID = "taskId123"; diff --git a/metadata-service/services/build.gradle b/metadata-service/services/build.gradle index 99345d6f6bc3f..22c62af324c12 100644 --- a/metadata-service/services/build.gradle +++ b/metadata-service/services/build.gradle @@ -63,8 +63,6 @@ dependencies { } test { - // https://docs.gradle.org/current/userguide/performance.html - maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 testLogging.showStandardStreams = true testLogging.exceptionFormat = 'full' } diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlySearchedSource.java b/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlySearchedSource.java index ac17c882c24b6..357a5df2edd44 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlySearchedSource.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlySearchedSource.java @@ -20,18 +20,18 @@ import javax.annotation.Nonnull; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.client.indices.GetIndexRequest; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.aggregations.AggregationBuilder; -import org.elasticsearch.search.aggregations.AggregationBuilders; -import org.elasticsearch.search.aggregations.BucketOrder; -import org.elasticsearch.search.aggregations.bucket.terms.ParsedTerms; -import org.elasticsearch.search.builder.SearchSourceBuilder; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.action.search.SearchResponse; +import org.opensearch.client.RequestOptions; 
+import org.opensearch.client.RestHighLevelClient; +import org.opensearch.client.indices.GetIndexRequest; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.search.aggregations.AggregationBuilder; +import org.opensearch.search.aggregations.AggregationBuilders; +import org.opensearch.search.aggregations.BucketOrder; +import org.opensearch.search.aggregations.bucket.terms.ParsedTerms; +import org.opensearch.search.builder.SearchSourceBuilder; @Slf4j diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/SystemMetadataService.java b/metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/SystemMetadataService.java index e6f2106bd5c3e..ea59885e8b6d5 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/SystemMetadataService.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/SystemMetadataService.java @@ -8,7 +8,7 @@ import java.util.Optional; import javax.annotation.Nonnull; import javax.annotation.Nullable; -import org.elasticsearch.client.tasks.GetTaskResponse; +import org.opensearch.client.tasks.GetTaskResponse; public interface SystemMetadataService { diff --git a/metadata-service/servlet/src/main/java/com/datahub/gms/servlet/ConfigSearchExport.java b/metadata-service/servlet/src/main/java/com/datahub/gms/servlet/ConfigSearchExport.java index 2c26c00e9c4d6..d788222c5d87b 100644 --- a/metadata-service/servlet/src/main/java/com/datahub/gms/servlet/ConfigSearchExport.java +++ b/metadata-service/servlet/src/main/java/com/datahub/gms/servlet/ConfigSearchExport.java @@ -9,16 +9,16 @@ import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.search.elasticsearch.query.request.SearchRequestHandler; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.MatchAllQueryBuilder; -import org.elasticsearch.index.query.MatchPhrasePrefixQueryBuilder; -import org.elasticsearch.index.query.QueryBuilder; -import org.elasticsearch.index.query.SimpleQueryStringBuilder; -import org.elasticsearch.index.query.TermQueryBuilder; -import org.elasticsearch.index.query.functionscore.FieldValueFactorFunctionBuilder; -import org.elasticsearch.index.query.functionscore.FunctionScoreQueryBuilder; -import org.elasticsearch.index.query.functionscore.WeightBuilder; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.query.MatchAllQueryBuilder; +import org.opensearch.index.query.MatchPhrasePrefixQueryBuilder; +import org.opensearch.index.query.QueryBuilder; +import org.opensearch.index.query.SimpleQueryStringBuilder; +import org.opensearch.index.query.TermQueryBuilder; +import org.opensearch.index.query.functionscore.FieldValueFactorFunctionBuilder; +import org.opensearch.index.query.functionscore.FunctionScoreQueryBuilder; +import org.opensearch.index.query.functionscore.WeightBuilder; import org.springframework.web.context.WebApplicationContext; import org.springframework.web.context.support.WebApplicationContextUtils; diff --git a/metadata-service/servlet/src/main/java/com/datahub/gms/util/CSVWriter.java b/metadata-service/servlet/src/main/java/com/datahub/gms/util/CSVWriter.java index e9d1308e857d8..79d4f7077b797 100644 --- 
a/metadata-service/servlet/src/main/java/com/datahub/gms/util/CSVWriter.java +++ b/metadata-service/servlet/src/main/java/com/datahub/gms/util/CSVWriter.java @@ -2,8 +2,8 @@ import lombok.Builder; -import org.elasticsearch.index.query.functionscore.FieldValueFactorFunctionBuilder; -import org.elasticsearch.index.query.functionscore.WeightBuilder; +import org.opensearch.index.query.functionscore.FieldValueFactorFunctionBuilder; +import org.opensearch.index.query.functionscore.WeightBuilder; import java.io.PrintWriter; import java.util.stream.Collectors; diff --git a/metadata-utils/build.gradle b/metadata-utils/build.gradle index 9f8ef70a0e728..1c1c368611488 100644 --- a/metadata-utils/build.gradle +++ b/metadata-utils/build.gradle @@ -26,6 +26,7 @@ dependencies { testImplementation project(':test-models') testImplementation project(path: ':test-models', configuration: 'testDataTemplate') + testImplementation externalDependency.testng constraints { implementation(externalDependency.log4jCore) { diff --git a/metadata-utils/src/main/java/com/linkedin/metadata/utils/SearchUtil.java b/metadata-utils/src/main/java/com/linkedin/metadata/utils/SearchUtil.java index 8b4b500dfc455..69bd3b461eb12 100644 --- a/metadata-utils/src/main/java/com/linkedin/metadata/utils/SearchUtil.java +++ b/metadata-utils/src/main/java/com/linkedin/metadata/utils/SearchUtil.java @@ -18,8 +18,8 @@ import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.query.QueryBuilders; import javax.annotation.Nonnull; import javax.annotation.Nullable; diff --git a/smoke-test/cypress-dev.sh b/smoke-test/cypress-dev.sh index 41dca90acc9fc..93f03d36cbd19 100755 --- a/smoke-test/cypress-dev.sh +++ b/smoke-test/cypress-dev.sh @@ -17,4 +17,5 @@ npm install source ../../set-cypress-creds.sh -npx cypress open +npx cypress open \ + --env "ADMIN_DISPLAYNAME=$CYPRESS_ADMIN_DISPLAYNAME,ADMIN_USERNAME=$CYPRESS_ADMIN_USERNAME,ADMIN_PASSWORD=$CYPRESS_ADMIN_PASSWORD" diff --git a/smoke-test/run-quickstart.sh b/smoke-test/run-quickstart.sh index 050b5d2db95c9..cd747321ad602 100755 --- a/smoke-test/run-quickstart.sh +++ b/smoke-test/run-quickstart.sh @@ -12,7 +12,16 @@ pip install -r requirements.txt mkdir -p ~/.datahub/plugins/frontend/auth/ echo "test_user:test_pass" >> ~/.datahub/plugins/frontend/auth/user.props +DATAHUB_SEARCH_IMAGE="${DATAHUB_SEARCH_IMAGE:=opensearchproject/opensearch}" +DATAHUB_SEARCH_TAG="${DATAHUB_SEARCH_TAG:=2.9.0}" +XPACK_SECURITY_ENABLED="${XPACK_SECURITY_ENABLED:=plugins.security.disabled=true}" +ELASTICSEARCH_USE_SSL="${ELASTICSEARCH_USE_SSL:=false}" +USE_AWS_ELASTICSEARCH="${USE_AWS_ELASTICSEARCH:=true}" + echo "DATAHUB_VERSION = $DATAHUB_VERSION" DATAHUB_TELEMETRY_ENABLED=false \ DOCKER_COMPOSE_BASE="file://$( dirname "$DIR" )" \ +DATAHUB_SEARCH_IMAGE="$DATAHUB_SEARCH_IMAGE" DATAHUB_SEARCH_TAG="$DATAHUB_SEARCH_TAG" \ +XPACK_SECURITY_ENABLED="$XPACK_SECURITY_ENABLED" ELASTICSEARCH_USE_SSL="$ELASTICSEARCH_USE_SSL" \ +USE_AWS_ELASTICSEARCH="$USE_AWS_ELASTICSEARCH" \ datahub docker quickstart --version ${DATAHUB_VERSION} --standalone_consumers --dump-logs-on-failure --kafka-setup diff --git a/smoke-test/set-cypress-creds.sh b/smoke-test/set-cypress-creds.sh index 0512724e9a269..82fe736b0a7e1 100644 --- a/smoke-test/set-cypress-creds.sh +++ b/smoke-test/set-cypress-creds.sh @@ -1,4 +1,5 @@ 
#!/bin/bash export CYPRESS_ADMIN_USERNAME=${ADMIN_USERNAME:-datahub} -export CYPRESS_ADMIN_PASSWORD=${ADMIN_PASSWORD:-datahub} \ No newline at end of file +export CYPRESS_ADMIN_PASSWORD=${ADMIN_PASSWORD:-datahub} +export CYPRESS_ADMIN_DISPLAYNAME=${ADMIN_DISPLAYNAME:-DataHub} \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/e2e/login/login.js b/smoke-test/tests/cypress/cypress/e2e/login/login.js index f86741b5afe01..309eedb10b6da 100644 --- a/smoke-test/tests/cypress/cypress/e2e/login/login.js +++ b/smoke-test/tests/cypress/cypress/e2e/login/login.js @@ -4,6 +4,6 @@ describe('login', () => { cy.get('input[data-testid=username]').type(Cypress.env('ADMIN_USERNAME')); cy.get('input[data-testid=password]').type(Cypress.env('ADMIN_PASSWORD')); cy.contains('Sign In').click(); - cy.contains('Welcome back, DataHub'); + cy.contains('Welcome back, ' + Cypress.env('ADMIN_DISPLAYNAME')); }); }) From 5f06dbf542a75a0a260706ceb99b9eaf27824141 Mon Sep 17 00:00:00 2001 From: Zachary McNellis Date: Thu, 21 Sep 2023 15:29:36 -0400 Subject: [PATCH 18/37] docs(observability): Custom Assertion user guide updates (#8878) --- docs/managed-datahub/observe/custom-assertions.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/managed-datahub/observe/custom-assertions.md b/docs/managed-datahub/observe/custom-assertions.md index d52ac4b38cb4b..e221cf1058fd0 100644 --- a/docs/managed-datahub/observe/custom-assertions.md +++ b/docs/managed-datahub/observe/custom-assertions.md @@ -99,11 +99,10 @@ The **Condition Type**: This defines the conditions under which the Assertion wi - **Is Not Equal To**: The assertion will fail if the query result is not equal to the configured value - **Is Greater Than**: The assertion will fail if the query result is greater than the configured value - **Is Less Than**: The assertion will fail if the query result is less than the configured value -- **Is False**: The assertion will fail if the query result is false (i.e. 0) - **Is outside a range**: The assertion will fail if the query result is outside the configured range -- **Grows More Than**: The assertion will fail if the query result grows more than the configured range. This can be either a percentage (**Percentage**) or an absolute value (**Differential**). -- **Grows Less Than**: The assertion will fail if the query result grows less than the configured percentage. This can be either a percentage (**Percentage**) or an absolute value (**Differential**). -- **Growth is outside a range**: The assertion will fail if the query result growth is outside the configured range. This can be either a percentage (**Percentage**) or an absolute value (**Differential**). +- **Grows More Than**: The assertion will fail if the query result grows more than the configured range. This can be either a percentage (**Percentage**) or a number (**Value**). +- **Grows Less Than**: The assertion will fail if the query result grows less than the configured percentage. This can be either a percentage (**Percentage**) or a number (**Value**). +- **Growth is outside a range**: The assertion will fail if the query result growth is outside the configured range. This can be either a percentage (**Percentage**) or a number (**Value**). Custom Assertions also have an off switch: they can be started or stopped at any time with the click of button. 
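Editorial note on the assertion condition types documented in the patch above: the growth-based conditions ("Grows More Than", "Grows Less Than", "Growth is outside a range") compare the current query result with the previous one, either as a percentage or as an absolute value. The sketch below only illustrates that semantics; the function name, signature, and failure convention are hypothetical and are not DataHub's actual evaluation code.

```python
# Hypothetical sketch of a growth condition check -- not DataHub's implementation.
def grows_more_than(previous: float, current: float, threshold: float, mode: str = "percentage") -> bool:
    """Return True when the assertion should fail, i.e. growth exceeds the threshold."""
    growth = current - previous
    if mode == "percentage":
        if previous == 0:
            return growth > 0  # any growth from zero exceeds a percentage threshold
        return (growth / previous) * 100 > threshold
    # mode == "value": compare absolute growth
    return growth > threshold


# Example: the previous run returned 1000 rows, the current run returns 1300.
print(grows_more_than(1000, 1300, threshold=20))                 # True: 30% growth > 20%
print(grows_more_than(1000, 1300, threshold=500, mode="value"))  # False: growth of 300 <= 500
```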
From 2a0200b0477ce5a0c697876b4619484b3caed9d5 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 21 Sep 2023 14:28:51 -0700 Subject: [PATCH 19/37] feat(ingest): bump acryl-sqlglot (#8882) --- metadata-ingestion/setup.py | 2 +- ...est_select_ambiguous_column_no_schema.json | 31 +++++++++++++++++++ .../unit/sql_parsing/test_sqlglot_lineage.py | 10 ++++++ 3 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_ambiguous_column_no_schema.json diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index b9169186174fa..e748461b156ae 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -138,7 +138,7 @@ def get_long_description(): sqlglot_lib = { # Using an Acryl fork of sqlglot. # https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:hsheth?expand=1 - "acryl-sqlglot==18.0.2.dev15", + "acryl-sqlglot==18.5.2.dev45", } aws_common = { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_ambiguous_column_no_schema.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_ambiguous_column_no_schema.json new file mode 100644 index 0000000000000..10f5ee20b0c1f --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_ambiguous_column_no_schema.json @@ -0,0 +1,31 @@ +{ + "query_type": "SELECT", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:hive,t1,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:hive,t2,PROD)" + ], + "out_tables": [], + "column_lineage": [ + { + "downstream": { + "table": null, + "column": "a" + }, + "upstreams": [] + }, + { + "downstream": { + "table": null, + "column": "b" + }, + "upstreams": [] + }, + { + "downstream": { + "table": null, + "column": "c" + }, + "upstreams": [] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py index 7581d3bac010e..483c1ac4cc7f9 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py @@ -208,6 +208,16 @@ def test_select_from_union(): ) +def test_select_ambiguous_column_no_schema(): + assert_sql_result( + """ + select A, B, C from t1 inner join t2 on t1.id = t2.id + """, + dialect="hive", + expected_file=RESOURCE_DIR / "test_select_ambiguous_column_no_schema.json", + ) + + def test_merge_from_union(): # TODO: We don't support merge statements yet, but the union should still get handled. 
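Editorial note on the new `test_select_ambiguous_column_no_schema` case above: the query's unqualified columns cannot be attributed to a single upstream table because no schema is registered, which is why the golden file records empty `upstreams` lists. A quick way to see the ambiguity outside the DataHub test harness is to inspect the parsed expression with the open-source sqlglot API directly; this assumes the acryl-sqlglot fork keeps sqlglot's standard parsing calls, and the snippet is illustrative rather than part of the patch.

```python
# Illustrative only: show which table each column in the ambiguous query resolves to.
import sqlglot
from sqlglot import exp

statement = sqlglot.parse_one(
    "select A, B, C from t1 inner join t2 on t1.id = t2.id", read="hive"
)

for column in statement.find_all(exp.Column):
    # A, B and C carry no table qualifier; without schema info the parser cannot
    # tell whether they belong to t1 or t2.
    print(column.name, "->", column.table or "<unknown table>")
```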
From 5481e19e0a66de0ae3567198c1de11565edfce5c Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Fri, 22 Sep 2023 03:35:26 +0530 Subject: [PATCH 20/37] feat(ingest): bulk fetch schema info for schema resolver (#8865) Co-authored-by: Harshal Sheth --- .../src/datahub/ingestion/graph/client.py | 426 +++++++++++------- .../ingestion/source/bigquery_v2/bigquery.py | 25 +- .../src/datahub/utilities/sqlglot_lineage.py | 34 ++ 3 files changed, 324 insertions(+), 161 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index b371ab181e133..38e965f7f6587 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -44,14 +44,17 @@ TelemetryClientIdClass, ) from datahub.utilities.perf_timer import PerfTimer -from datahub.utilities.urns.dataset_urn import DatasetUrn from datahub.utilities.urns.urn import Urn, guess_entity_type if TYPE_CHECKING: from datahub.ingestion.source.state.entity_removal_state import ( GenericCheckpointState, ) - from datahub.utilities.sqlglot_lineage import SchemaResolver, SqlParsingResult + from datahub.utilities.sqlglot_lineage import ( + GraphQLSchemaMetadata, + SchemaResolver, + SqlParsingResult, + ) logger = logging.getLogger(__name__) @@ -543,129 +546,110 @@ def get_container_urns_by_filter( logger.debug(f"yielding {x['entity']}") yield x["entity"] - def get_urns_by_filter( + def _bulk_fetch_schema_info_by_filter( self, *, - entity_types: Optional[List[str]] = None, platform: Optional[str] = None, platform_instance: Optional[str] = None, env: Optional[str] = None, query: Optional[str] = None, container: Optional[str] = None, status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED, - batch_size: int = 10000, + batch_size: int = 100, extraFilters: Optional[List[SearchFilterRule]] = None, - ) -> Iterable[str]: - """Fetch all urns that match all of the given filters. + ) -> Iterable[Tuple[str, "GraphQLSchemaMetadata"]]: + """Fetch schema info for datasets that match all of the given filters. - Filters are combined conjunctively. If multiple filters are specified, the results will match all of them. - Note that specifying a platform filter will automatically exclude all entity types that do not have a platform. - The same goes for the env filter. + :return: An iterable of (urn, schema info) tuple that match the filters. + """ + types = [_graphql_entity_type("dataset")] - :param entity_types: List of entity types to include. If None, all entity types will be returned. - :param platform: Platform to filter on. If None, all platforms will be returned. - :param platform_instance: Platform instance to filter on. If None, all platform instances will be returned. - :param env: Environment (e.g. PROD, DEV) to filter on. If None, all environments will be returned. - :param query: Query string to filter on. If None, all entities will be returned. - :param container: A container urn that entities must be within. - This works recursively, so it will include entities within sub-containers as well. - If None, all entities will be returned. - Note that this requires browsePathV2 aspects (added in 0.10.4+). - :param status: Filter on the deletion status of the entity. The default is only return non-soft-deleted entities. - :param extraFilters: Additional filters to apply. If specified, the results will match all of the filters. 
+ # Add the query default of * if no query is specified. + query = query or "*" - :return: An iterable of urns that match the filters. - """ + orFilters = self.generate_filter( + platform, platform_instance, env, container, status, extraFilters + ) - types: Optional[List[str]] = None - if entity_types is not None: - if not entity_types: - raise ValueError( - "entity_types cannot be an empty list; use None for all entities" - ) + graphql_query = textwrap.dedent( + """ + query scrollUrnsWithFilters( + $types: [EntityType!], + $query: String!, + $orFilters: [AndFilterInput!], + $batchSize: Int!, + $scrollId: String) { - types = [_graphql_entity_type(entity_type) for entity_type in entity_types] + scrollAcrossEntities(input: { + query: $query, + count: $batchSize, + scrollId: $scrollId, + types: $types, + orFilters: $orFilters, + searchFlags: { + skipHighlighting: true + skipAggregates: true + } + }) { + nextScrollId + searchResults { + entity { + urn + ... on Dataset { + schemaMetadata(version: 0) { + fields { + fieldPath + nativeDataType + } + } + } + } + } + } + } + """ + ) - # Add the query default of * if no query is specified. - query = query or "*" + variables = { + "types": types, + "query": query, + "orFilters": orFilters, + "batchSize": batch_size, + } + + for entity in self._scroll_across_entities(graphql_query, variables): + if entity.get("schemaMetadata"): + yield entity["urn"], entity["schemaMetadata"] + def generate_filter( + self, + platform: Optional[str], + platform_instance: Optional[str], + env: Optional[str], + container: Optional[str], + status: RemovedStatusFilter, + extraFilters: Optional[List[SearchFilterRule]], + ) -> List[Dict[str, List[SearchFilterRule]]]: andFilters: List[SearchFilterRule] = [] # Platform filter. if platform: - andFilters += [ - { - "field": "platform.keyword", - "values": [make_data_platform_urn(platform)], - "condition": "EQUAL", - } - ] + andFilters.append(self._get_platform_filter(platform)) # Platform instance filter. if platform_instance: - if platform: - # Massage the platform instance into a fully qualified urn, if necessary. - platform_instance = make_dataplatform_instance_urn( - platform, platform_instance - ) - - # Warn if platform_instance is not a fully qualified urn. - # TODO: Change this once we have a first-class data platform instance urn type. - if guess_entity_type(platform_instance) != "dataPlatformInstance": - raise ValueError( - f"Invalid data platform instance urn: {platform_instance}" - ) - - andFilters += [ - { - "field": "platformInstance", - "values": [platform_instance], - "condition": "EQUAL", - } - ] + andFilters.append( + self._get_platform_instance_filter(platform, platform_instance) + ) # Browse path v2 filter. if container: - # Warn if container is not a fully qualified urn. - # TODO: Change this once we have a first-class container urn type. - if guess_entity_type(container) != "container": - raise ValueError(f"Invalid container urn: {container}") - - andFilters += [ - { - "field": "browsePathV2", - "values": [container], - "condition": "CONTAIN", - } - ] + andFilters.append(self._get_container_filter(container)) # Status filter. - if status == RemovedStatusFilter.NOT_SOFT_DELETED: - # Subtle: in some cases (e.g. when the dataset doesn't have a status aspect), the - # removed field is simply not present in the ElasticSearch document. Ideally this - # would be a "removed" : "false" filter, but that doesn't work. Instead, we need to - # use a negated filter. 
- andFilters.append( - { - "field": "removed", - "values": ["true"], - "condition": "EQUAL", - "negated": True, - } - ) - elif status == RemovedStatusFilter.ONLY_SOFT_DELETED: - andFilters.append( - { - "field": "removed", - "values": ["true"], - "condition": "EQUAL", - } - ) - elif status == RemovedStatusFilter.ALL: - # We don't need to add a filter for this case. - pass - else: - raise ValueError(f"Invalid status filter: {status}") + status_filter = self._get_status_filer(status) + if status_filter: + andFilters.append(status_filter) # Extra filters. if extraFilters: @@ -673,33 +657,9 @@ def get_urns_by_filter( orFilters: List[Dict[str, List[SearchFilterRule]]] = [{"and": andFilters}] - # Env filter. + # Env filter if env: - # The env filter is a bit more tricky since it's not always stored - # in the same place in ElasticSearch. - - envOrConditions: List[SearchFilterRule] = [ - # For most entity types, we look at the origin field. - { - "field": "origin", - "value": env, - "condition": "EQUAL", - }, - # For containers, we look at the customProperties field. - # For any containers created after https://github.com/datahub-project/datahub/pull/8027, - # we look for the "env" property. Otherwise, we use the "instance" property. - { - "field": "customProperties", - "value": f"env={env}", - }, - { - "field": "customProperties", - "value": f"instance={env}", - }, - # Note that not all entity types have an env (e.g. dashboards / charts). - # If the env filter is specified, these will be excluded. - ] - + envOrConditions = self._get_env_or_conditions(env) # This matches ALL of the andFilters and at least one of the envOrConditions. orFilters = [ {"and": andFilters["and"] + [extraCondition]} @@ -707,6 +667,52 @@ def get_urns_by_filter( for andFilters in orFilters ] + return orFilters + + def get_urns_by_filter( + self, + *, + entity_types: Optional[List[str]] = None, + platform: Optional[str] = None, + platform_instance: Optional[str] = None, + env: Optional[str] = None, + query: Optional[str] = None, + container: Optional[str] = None, + status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED, + batch_size: int = 10000, + extraFilters: Optional[List[SearchFilterRule]] = None, + ) -> Iterable[str]: + """Fetch all urns that match all of the given filters. + + Filters are combined conjunctively. If multiple filters are specified, the results will match all of them. + Note that specifying a platform filter will automatically exclude all entity types that do not have a platform. + The same goes for the env filter. + + :param entity_types: List of entity types to include. If None, all entity types will be returned. + :param platform: Platform to filter on. If None, all platforms will be returned. + :param platform_instance: Platform instance to filter on. If None, all platform instances will be returned. + :param env: Environment (e.g. PROD, DEV) to filter on. If None, all environments will be returned. + :param query: Query string to filter on. If None, all entities will be returned. + :param container: A container urn that entities must be within. + This works recursively, so it will include entities within sub-containers as well. + If None, all entities will be returned. + Note that this requires browsePathV2 aspects (added in 0.10.4+). + :param status: Filter on the deletion status of the entity. The default is only return non-soft-deleted entities. + :param extraFilters: Additional filters to apply. If specified, the results will match all of the filters. 
+ + :return: An iterable of urns that match the filters. + """ + + types = self._get_types(entity_types) + + # Add the query default of * if no query is specified. + query = query or "*" + + # Env filter. + orFilters = self.generate_filter( + platform, platform_instance, env, container, status, extraFilters + ) + graphql_query = textwrap.dedent( """ query scrollUrnsWithFilters( @@ -738,18 +744,26 @@ def get_urns_by_filter( """ ) + variables = { + "types": types, + "query": query, + "orFilters": orFilters, + "batchSize": batch_size, + } + + for entity in self._scroll_across_entities(graphql_query, variables): + yield entity["urn"] + + def _scroll_across_entities( + self, graphql_query: str, variables_orig: dict + ) -> Iterable[dict]: + variables = variables_orig.copy() first_iter = True scroll_id: Optional[str] = None while first_iter or scroll_id: first_iter = False + variables["scrollId"] = scroll_id - variables = { - "types": types, - "query": query, - "orFilters": orFilters, - "batchSize": batch_size, - "scrollId": scroll_id, - } response = self.execute_graphql( graphql_query, variables=variables, @@ -757,13 +771,116 @@ def get_urns_by_filter( data = response["scrollAcrossEntities"] scroll_id = data["nextScrollId"] for entry in data["searchResults"]: - yield entry["entity"]["urn"] + yield entry["entity"] if scroll_id: logger.debug( f"Scrolling to next scrollAcrossEntities page: {scroll_id}" ) + def _get_env_or_conditions(self, env: str) -> List[SearchFilterRule]: + # The env filter is a bit more tricky since it's not always stored + # in the same place in ElasticSearch. + return [ + # For most entity types, we look at the origin field. + { + "field": "origin", + "value": env, + "condition": "EQUAL", + }, + # For containers, we look at the customProperties field. + # For any containers created after https://github.com/datahub-project/datahub/pull/8027, + # we look for the "env" property. Otherwise, we use the "instance" property. + { + "field": "customProperties", + "value": f"env={env}", + }, + { + "field": "customProperties", + "value": f"instance={env}", + }, + # Note that not all entity types have an env (e.g. dashboards / charts). + # If the env filter is specified, these will be excluded. + ] + + def _get_status_filer( + self, status: RemovedStatusFilter + ) -> Optional[SearchFilterRule]: + if status == RemovedStatusFilter.NOT_SOFT_DELETED: + # Subtle: in some cases (e.g. when the dataset doesn't have a status aspect), the + # removed field is simply not present in the ElasticSearch document. Ideally this + # would be a "removed" : "false" filter, but that doesn't work. Instead, we need to + # use a negated filter. + return { + "field": "removed", + "values": ["true"], + "condition": "EQUAL", + "negated": True, + } + + elif status == RemovedStatusFilter.ONLY_SOFT_DELETED: + return { + "field": "removed", + "values": ["true"], + "condition": "EQUAL", + } + + elif status == RemovedStatusFilter.ALL: + # We don't need to add a filter for this case. + return None + else: + raise ValueError(f"Invalid status filter: {status}") + + def _get_container_filter(self, container: str) -> SearchFilterRule: + # Warn if container is not a fully qualified urn. + # TODO: Change this once we have a first-class container urn type. 
+ if guess_entity_type(container) != "container": + raise ValueError(f"Invalid container urn: {container}") + + return { + "field": "browsePathV2", + "values": [container], + "condition": "CONTAIN", + } + + def _get_platform_instance_filter( + self, platform: Optional[str], platform_instance: str + ) -> SearchFilterRule: + if platform: + # Massage the platform instance into a fully qualified urn, if necessary. + platform_instance = make_dataplatform_instance_urn( + platform, platform_instance + ) + + # Warn if platform_instance is not a fully qualified urn. + # TODO: Change this once we have a first-class data platform instance urn type. + if guess_entity_type(platform_instance) != "dataPlatformInstance": + raise ValueError(f"Invalid data platform instance urn: {platform_instance}") + + return { + "field": "platformInstance", + "values": [platform_instance], + "condition": "EQUAL", + } + + def _get_platform_filter(self, platform: str) -> SearchFilterRule: + return { + "field": "platform.keyword", + "values": [make_data_platform_urn(platform)], + "condition": "EQUAL", + } + + def _get_types(self, entity_types: Optional[List[str]]) -> Optional[List[str]]: + types: Optional[List[str]] = None + if entity_types is not None: + if not entity_types: + raise ValueError( + "entity_types cannot be an empty list; use None for all entities" + ) + + types = [_graphql_entity_type(entity_type) for entity_type in entity_types] + return types + def get_latest_pipeline_checkpoint( self, pipeline_name: str, platform: str ) -> Optional[Checkpoint["GenericCheckpointState"]]: @@ -1033,43 +1150,36 @@ def initialize_schema_resolver_from_datahub( self, platform: str, platform_instance: Optional[str], env: str ) -> Tuple["SchemaResolver", Set[str]]: logger.info("Initializing schema resolver") - - # TODO: Filter on platform instance? 
- logger.info(f"Fetching urns for platform {platform}, env {env}") - with PerfTimer() as timer: - urns = set( - self.get_urns_by_filter( - entity_types=[DatasetUrn.ENTITY_TYPE], - platform=platform, - env=env, - batch_size=3000, - ) - ) - logger.info( - f"Fetched {len(urns)} urns in {timer.elapsed_seconds()} seconds" - ) - schema_resolver = self._make_schema_resolver( platform, platform_instance, env, include_graph=False ) + + logger.info(f"Fetching schemas for platform {platform}, env {env}") + urns = [] + count = 0 with PerfTimer() as timer: - count = 0 - for i, urn in enumerate(urns): - if i % 1000 == 0: - logger.debug(f"Loaded {i} schema metadata") + for urn, schema_info in self._bulk_fetch_schema_info_by_filter( + platform=platform, + platform_instance=platform_instance, + env=env, + ): try: - schema_metadata = self.get_aspect(urn, SchemaMetadataClass) - if schema_metadata: - schema_resolver.add_schema_metadata(urn, schema_metadata) - count += 1 + urns.append(urn) + schema_resolver.add_graphql_schema_metadata(urn, schema_info) + count += 1 except Exception: - logger.warning("Failed to load schema metadata", exc_info=True) + logger.warning("Failed to add schema info", exc_info=True) + + if count % 1000 == 0: + logger.debug( + f"Loaded {count} schema info in {timer.elapsed_seconds()} seconds" + ) logger.info( - f"Loaded {count} schema metadata in {timer.elapsed_seconds()} seconds" + f"Finished loading total {count} schema info in {timer.elapsed_seconds()} seconds" ) logger.info("Finished initializing schema resolver") - return schema_resolver, urns + return schema_resolver, set(urns) def parse_sql_lineage( self, diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index ae49a4ba17c11..8a16b1a4a5f6b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -285,9 +285,7 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): # Maps view ref -> actual sql self.view_definitions: FileBackedDict[str] = FileBackedDict() - self.sql_parser_schema_resolver = SchemaResolver( - platform=self.platform, env=self.config.env - ) + self.sql_parser_schema_resolver = self._init_schema_resolver() self.add_config_to_report() atexit.register(cleanup, config) @@ -446,6 +444,27 @@ def test_connection(config_dict: dict) -> TestConnectionReport: ) return test_report + def _init_schema_resolver(self) -> SchemaResolver: + schema_resolution_required = ( + self.config.lineage_parse_view_ddl or self.config.lineage_use_sql_parser + ) + schema_ingestion_enabled = ( + self.config.include_views and self.config.include_tables + ) + + if schema_resolution_required and not schema_ingestion_enabled: + if self.ctx.graph: + return self.ctx.graph.initialize_schema_resolver_from_datahub( + platform=self.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + )[0] + else: + logger.warning( + "Failed to load schema info from DataHub as DataHubGraph is missing.", + ) + return SchemaResolver(platform=self.platform, env=self.config.env) + def get_dataplatform_instance_aspect( self, dataset_urn: str, project_id: str ) -> MetadataWorkUnit: diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index d677b0874b985..f18235af3d1fd 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py 
+++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -14,6 +14,7 @@ import sqlglot.optimizer.qualify import sqlglot.optimizer.qualify_columns from pydantic import BaseModel +from typing_extensions import TypedDict from datahub.emitter.mce_builder import ( DEFAULT_ENV, @@ -36,6 +37,15 @@ SQL_PARSE_RESULT_CACHE_SIZE = 1000 +class GraphQLSchemaField(TypedDict): + fieldPath: str + nativeDataType: str + + +class GraphQLSchemaMetadata(TypedDict): + fields: List[GraphQLSchemaField] + + class QueryType(enum.Enum): CREATE = "CREATE" SELECT = "SELECT" @@ -330,6 +340,12 @@ def add_schema_metadata( def add_raw_schema_info(self, urn: str, schema_info: SchemaInfo) -> None: self._save_to_cache(urn, schema_info) + def add_graphql_schema_metadata( + self, urn: str, schema_metadata: GraphQLSchemaMetadata + ) -> None: + schema_info = self.convert_graphql_schema_metadata_to_info(schema_metadata) + self._save_to_cache(urn, schema_info) + def _save_to_cache(self, urn: str, schema_info: Optional[SchemaInfo]) -> None: self._schema_cache[urn] = schema_info @@ -356,6 +372,24 @@ def _convert_schema_aspect_to_info( not in DatasetUrn.get_simple_field_path_from_v2_field_path(col.fieldPath) } + @classmethod + def convert_graphql_schema_metadata_to_info( + cls, schema: GraphQLSchemaMetadata + ) -> SchemaInfo: + return { + DatasetUrn.get_simple_field_path_from_v2_field_path(field["fieldPath"]): ( + # The actual types are more of a "nice to have". + field["nativeDataType"] + or "str" + ) + for field in schema["fields"] + # TODO: We can't generate lineage to columns nested within structs yet. + if "." + not in DatasetUrn.get_simple_field_path_from_v2_field_path( + field["fieldPath"] + ) + } + # TODO add a method to load all from graphql def close(self) -> None: From 4be8fd0905b6631ddf7161ab412719bed786882a Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 21 Sep 2023 15:59:56 -0700 Subject: [PATCH 21/37] fix(docs): remove link-checker from CI (#8883) --- docs-website/markdown-link-check-config.json | 37 ++++++++------------ docs-website/package.json | 6 ++-- 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/docs-website/markdown-link-check-config.json b/docs-website/markdown-link-check-config.json index 26e040edde6f7..2f5a51ada324e 100644 --- a/docs-website/markdown-link-check-config.json +++ b/docs-website/markdown-link-check-config.json @@ -1,50 +1,41 @@ { "ignorePatterns": [ { - "pattern": "^http://demo\\.datahubproject\\.io" + "pattern": "^https?://demo\\.datahubproject\\.io" }, { - "pattern": "^http://localhost" + "pattern": "^http://localhost" }, { - "pattern": "^http://www.famfamfam.com" + "pattern": "^/docs" }, { - "pattern": "^http://www.linkedin.com" + "pattern": "^/integrations" }, { - "pattern": "\\.md$" + "pattern": "^https?://www.linkedin.com" }, { - "pattern":"\\.json$" + "pattern": "\\.md(#.*)?$" }, { - "pattern":"\\.txt$" + "pattern": "\\.json$" }, { - "pattern": "\\.java$" + "pattern": "\\.txt$" }, { - "pattern": "\\.md#.*$" + "pattern": "\\.java$" }, { - "pattern": "^https://oauth2.googleapis.com/token" + "pattern": "^https://oauth2.googleapis.com/token" }, { - "pattern": "^https://login.microsoftonline.com/common/oauth2/na$" + "pattern": "^https://login.microsoftonline.com/common/oauth2/na$" }, { - "pattern": "#v(\\d+)-(\\d+)-(\\d+)" - }, - { - "pattern": "^https://github.com/mohdsiddique$" - }, - { - "pattern": "^https://github.com/2x$" - }, - { - "pattern": "^https://github.com/datahub-project/datahub/assets/15873986/2f47d033-6c2b-483a-951d-e6d6b807f0d0%22%3E$" + 
"pattern": "^https://github.com/datahub-project/datahub/assets/15873986/2f47d033-6c2b-483a-951d-e6d6b807f0d0%22%3E$" } ], - "aliveStatusCodes": [200, 206, 0, 999, 400, 401, 403] -} \ No newline at end of file + "aliveStatusCodes": [200, 206, 0, 999] +} diff --git a/docs-website/package.json b/docs-website/package.json index 1722f92169692..eca6e5814d3c6 100644 --- a/docs-website/package.json +++ b/docs-website/package.json @@ -17,8 +17,10 @@ "generate": "rm -rf genDocs genStatic && mkdir genDocs genStatic && yarn _generate-docs && mv docs/* genDocs/ && rmdir docs", "generate-rsync": "mkdir -p genDocs genStatic && yarn _generate-docs && rsync -v --checksum -r -h -i --delete docs/ genDocs && rm -rf docs", "lint": "prettier -w generateDocsDir.ts sidebars.js src/pages/index.js", - "lint-check": "prettier -l generateDocsDir.ts sidebars.js src/pages/index.js && find ./genDocs -name \\*.md -not -path \"./genDocs/python-sdk/models.md\" -print0 | xargs -0 -n1 markdown-link-check -p -q -c markdown-link-check-config.json", - "lint-fix": "prettier --write generateDocsDir.ts sidebars.js src/pages/index.js" + "lint-check": "prettier -l generateDocsDir.ts sidebars.js src/pages/index.js", + "lint-fix": "prettier --write generateDocsDir.ts sidebars.js src/pages/index.js", + "_list-link-check-files": "find ./genDocs -name '*.md' -not \\( -path './genDocs/python-sdk/*' -o -path './genDocs/releases.md' \\)", + "check-links": "yarn run -s _list-link-check-files -print0 | xargs -0 -n1 -t markdown-link-check -q -c markdown-link-check-config.json" }, "dependencies": { "@ant-design/icons": "^4.7.0", From aef49b8fb2478f8a1b902aaee16fee9c07c7beab Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Thu, 21 Sep 2023 22:00:14 -0500 Subject: [PATCH 22/37] feat(entity-client): enable client side cache for entity-client and usage-client (#8877) --- datahub-frontend/app/auth/AuthModule.java | 25 +++- .../app/auth/sso/oidc/OidcCallbackLogic.java | 7 +- .../app/config/ConfigurationProvider.java | 27 ++++ .../controllers/SsoCallbackController.java | 6 +- datahub-frontend/play.gradle | 8 +- .../datahub/graphql/GmsGraphQLEngine.java | 3 + .../datahub/graphql/GmsGraphQLEngineArgs.java | 2 + .../dataset/DatasetStatsSummaryResolver.java | 19 ++- .../dataset/DatasetUsageStatsResolver.java | 8 +- .../dashboard/DashboardStatsSummaryTest.java | 3 +- .../DatasetStatsSummaryResolverTest.java | 17 ++- .../common/steps/GMSDisableWriteModeStep.java | 8 +- .../common/steps/GMSEnableWriteModeStep.java | 9 +- .../upgrade/config/NoCodeUpgradeConfig.java | 10 +- .../upgrade/config/RestoreBackupConfig.java | 10 +- .../datahub/upgrade/nocode/NoCodeUpgrade.java | 12 +- .../upgrade/restorebackup/RestoreBackup.java | 15 +- .../client/SystemJavaEntityClient.java | 39 +++++ ...sInstanceRunEventChangeEventGenerator.java | 12 +- .../EntityChangeEventGenerator.java | 8 +- ...eConsumerApplicationTestConfiguration.java | 4 +- .../kafka/config/EntityHydratorConfig.java | 24 +-- .../event/EntityChangeEventGeneratorHook.java | 17 +-- .../hook/siblings/SiblingAssociationHook.java | 38 ++--- .../kafka/hydrator/EntityHydrator.java | 22 ++- .../EntityChangeEventGeneratorHookTest.java | 28 ++-- .../siblings/SiblingAssociationHookTest.java | 78 +++------- .../spring/MCLSpringTestConfiguration.java | 6 +- .../kafka/MceConsumerApplication.java | 2 +- .../kafka/MetadataChangeEventsProcessor.java | 8 +- .../MetadataChangeProposalsProcessor.java | 12 +- metadata-service/configuration/build.gradle | 1 + 
.../config/cache/CacheConfiguration.java | 2 + .../cache/client/ClientCacheConfig.java | 10 ++ .../client/ClientCacheConfiguration.java | 9 ++ .../cache/client/EntityClientCacheConfig.java | 17 +++ .../cache/client/UsageClientCacheConfig.java | 12 ++ .../spring/YamlPropertySourceFactory.java | 10 +- .../src/main/resources/application.yml | 24 +++ .../factory/auth/AuthorizerChainFactory.java | 2 +- .../auth/DataHubAuthorizerFactory.java | 2 +- .../auth/DataHubTokenServiceFactory.java | 3 +- .../gms/factory/auth/GroupServiceFactory.java | 2 +- .../auth/InviteTokenServiceFactory.java | 2 +- .../auth/NativeUserServiceFactory.java | 2 +- .../gms/factory/auth/PostServiceFactory.java | 2 +- .../gms/factory/auth/RoleServiceFactory.java | 2 +- .../auth/SystemAuthenticationFactory.java | 2 +- .../ElasticSearchGraphServiceFactory.java | 2 +- ...ticSearchSystemMetadataServiceFactory.java | 2 +- .../ElasticsearchSSLContextFactory.java | 2 +- .../factory/common/GraphServiceFactory.java | 2 +- .../common/IndexConventionFactory.java | 2 +- .../common/LocalEbeanServerConfigFactory.java | 2 +- .../factory/common/Neo4jDriverFactory.java | 2 +- .../common/RestHighLevelClientFactory.java | 2 +- .../factory/config/ConfigurationProvider.java | 2 +- .../DataProductServiceFactory.java | 2 +- .../entity/JavaEntityClientFactory.java | 29 +++- .../entity/RestliEntityClientFactory.java | 19 ++- .../entity/RetentionServiceFactory.java | 2 +- .../ConfigEntityRegistryFactory.java | 2 +- .../PluginEntityRegistryFactory.java | 2 +- .../factory/graphql/GraphQLEngineFactory.java | 6 + .../ingestion/IngestionSchedulerFactory.java | 2 +- .../DataHubKafkaEventProducerFactory.java | 2 +- .../kafka/DataHubKafkaProducerFactory.java | 2 +- .../AwsGlueSchemaRegistryFactory.java | 2 +- .../KafkaSchemaRegistryFactory.java | 2 +- .../lineage/LineageServiceFactory.java | 2 +- .../OwnershipTypeServiceFactory.java | 2 +- .../factory/query/QueryServiceFactory.java | 2 +- .../BaseElasticSearchComponentsFactory.java | 2 +- .../CachingEntitySearchServiceFactory.java | 2 +- .../ElasticSearchBulkProcessorFactory.java | 2 +- .../ElasticSearchIndexBuilderFactory.java | 2 +- .../search/ElasticSearchServiceFactory.java | 2 +- .../search/LineageSearchServiceFactory.java | 2 +- .../SearchDocumentTransformerFactory.java | 2 +- .../factory/search/SearchServiceFactory.java | 2 +- .../search/SettingsBuilderFactory.java | 2 +- .../search/views/ViewServiceFactory.java | 2 +- .../settings/SettingsServiceFactory.java | 2 +- .../factory/telemetry/MixpanelApiFactory.java | 2 +- .../MixpanelMessageBuilderFactory.java | 2 +- .../telemetry/TrackingServiceFactory.java | 2 +- ...tyChangeEventGeneratorRegistryFactory.java | 6 +- .../timeline/TimelineServiceFactory.java | 2 +- ...cSearchTimeseriesAspectServiceFactory.java | 2 +- .../gms/factory/usage/UsageClientFactory.java | 15 +- .../IngestRetentionPoliciesStepFactory.java | 2 +- .../openapi/util/OpenApiEntitiesUtilTest.java | 2 +- metadata-service/restli-client/build.gradle | 1 + .../linkedin/common/client/ClientCache.java | 134 +++++++++++++++++ .../entity/client/EntityClientCache.java | 141 ++++++++++++++++++ .../entity/client/SystemEntityClient.java | 91 +++++++++++ .../client/SystemRestliEntityClient.java | 25 ++++ .../java/com/linkedin/usage/UsageClient.java | 33 +++- .../com/linkedin/usage/UsageClientCache.java | 75 ++++++++++ .../metadata/utils/metrics/MetricUtils.java | 5 + 100 files changed, 951 insertions(+), 298 deletions(-) create mode 100644 datahub-frontend/app/config/ConfigurationProvider.java 
create mode 100644 metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java create mode 100644 metadata-service/configuration/src/main/java/com/linkedin/metadata/config/cache/client/ClientCacheConfig.java create mode 100644 metadata-service/configuration/src/main/java/com/linkedin/metadata/config/cache/client/ClientCacheConfiguration.java create mode 100644 metadata-service/configuration/src/main/java/com/linkedin/metadata/config/cache/client/EntityClientCacheConfig.java create mode 100644 metadata-service/configuration/src/main/java/com/linkedin/metadata/config/cache/client/UsageClientCacheConfig.java rename metadata-service/{factories/src/main/java/com/linkedin/gms/factory => configuration/src/main/java/com/linkedin/metadata}/spring/YamlPropertySourceFactory.java (87%) create mode 100644 metadata-service/restli-client/src/main/java/com/linkedin/common/client/ClientCache.java create mode 100644 metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClientCache.java create mode 100644 metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemEntityClient.java create mode 100644 metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java create mode 100644 metadata-service/restli-client/src/main/java/com/linkedin/usage/UsageClientCache.java diff --git a/datahub-frontend/app/auth/AuthModule.java b/datahub-frontend/app/auth/AuthModule.java index eb95078b1a640..98f3b82285eda 100644 --- a/datahub-frontend/app/auth/AuthModule.java +++ b/datahub-frontend/app/auth/AuthModule.java @@ -11,16 +11,19 @@ import com.google.inject.AbstractModule; import com.google.inject.Provides; import com.google.inject.Singleton; -import com.linkedin.entity.client.EntityClient; -import com.linkedin.entity.client.RestliEntityClient; +import com.linkedin.entity.client.SystemEntityClient; +import com.linkedin.entity.client.SystemRestliEntityClient; import com.linkedin.metadata.restli.DefaultRestliClientFactory; import com.linkedin.parseq.retry.backoff.ExponentialBackoff; import com.linkedin.util.Configuration; +import config.ConfigurationProvider; import controllers.SsoCallbackController; + import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; import java.util.List; + import org.apache.commons.codec.digest.DigestUtils; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; @@ -34,6 +37,7 @@ import org.pac4j.play.store.PlayCookieSessionStore; import org.pac4j.play.store.PlaySessionStore; import org.pac4j.play.store.ShiroAesDataEncrypter; +import org.springframework.context.annotation.AnnotationConfigApplicationContext; import play.Environment; import play.cache.SyncCacheApi; import utils.ConfigUtil; @@ -104,7 +108,7 @@ protected void configure() { bind(SsoCallbackController.class).toConstructor(SsoCallbackController.class.getConstructor( SsoManager.class, Authentication.class, - EntityClient.class, + SystemEntityClient.class, AuthServiceClient.class, com.typesafe.config.Config.class)); } catch (NoSuchMethodException | SecurityException e) { @@ -161,10 +165,19 @@ protected Authentication provideSystemAuthentication() { @Provides @Singleton - protected EntityClient provideEntityClient() { - return new RestliEntityClient(buildRestliClient(), + protected ConfigurationProvider provideConfigurationProvider() { + AnnotationConfigApplicationContext context = new AnnotationConfigApplicationContext(ConfigurationProvider.class); + 
return context.getBean(ConfigurationProvider.class); + } + + @Provides + @Singleton + protected SystemEntityClient provideEntityClient(final Authentication systemAuthentication, + final ConfigurationProvider configurationProvider) { + return new SystemRestliEntityClient(buildRestliClient(), new ExponentialBackoff(_configs.getInt(ENTITY_CLIENT_RETRY_INTERVAL)), - _configs.getInt(ENTITY_CLIENT_NUM_RETRIES)); + _configs.getInt(ENTITY_CLIENT_NUM_RETRIES), systemAuthentication, + configurationProvider.getCache().getClient().getEntityClient()); } @Provides diff --git a/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java b/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java index 85139d1db0868..4bde0872fc082 100644 --- a/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java +++ b/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java @@ -13,7 +13,7 @@ import com.linkedin.common.urn.Urn; import com.linkedin.data.template.SetMode; import com.linkedin.entity.Entity; -import com.linkedin.entity.client.EntityClient; +import com.linkedin.entity.client.SystemEntityClient; import com.linkedin.events.metadata.ChangeType; import com.linkedin.identity.CorpGroupInfo; import com.linkedin.identity.CorpUserEditableInfo; @@ -78,13 +78,14 @@ public class OidcCallbackLogic extends DefaultCallbackLogic { private final SsoManager _ssoManager; - private final EntityClient _entityClient; + private final SystemEntityClient _entityClient; private final Authentication _systemAuthentication; private final AuthServiceClient _authClient; private final CookieConfigs _cookieConfigs; public OidcCallbackLogic(final SsoManager ssoManager, final Authentication systemAuthentication, - final EntityClient entityClient, final AuthServiceClient authClient, final CookieConfigs cookieConfigs) { + final SystemEntityClient entityClient, final AuthServiceClient authClient, + final CookieConfigs cookieConfigs) { _ssoManager = ssoManager; _systemAuthentication = systemAuthentication; _entityClient = entityClient; diff --git a/datahub-frontend/app/config/ConfigurationProvider.java b/datahub-frontend/app/config/ConfigurationProvider.java new file mode 100644 index 0000000000000..00a5472ec3476 --- /dev/null +++ b/datahub-frontend/app/config/ConfigurationProvider.java @@ -0,0 +1,27 @@ +package config; + +import com.linkedin.metadata.config.cache.CacheConfiguration; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; +import lombok.Data; + +import org.springframework.boot.context.properties.ConfigurationProperties; +import org.springframework.boot.context.properties.EnableConfigurationProperties; +import org.springframework.context.annotation.PropertySource; + + +/** + * Minimal sharing between metadata-service and frontend + * Initially for use of client caching configuration. + * Does not use the factories module to avoid transitive dependencies. 
+ */ +@EnableConfigurationProperties +@PropertySource(value = "application.yml", factory = YamlPropertySourceFactory.class) +@ConfigurationProperties +@Data +public class ConfigurationProvider { + + /** + * Configuration for caching + */ + private CacheConfiguration cache; +} diff --git a/datahub-frontend/app/controllers/SsoCallbackController.java b/datahub-frontend/app/controllers/SsoCallbackController.java index 5a36d833deceb..7a4b5585cc21a 100644 --- a/datahub-frontend/app/controllers/SsoCallbackController.java +++ b/datahub-frontend/app/controllers/SsoCallbackController.java @@ -3,7 +3,7 @@ import auth.CookieConfigs; import client.AuthServiceClient; import com.datahub.authentication.Authentication; -import com.linkedin.entity.client.EntityClient; +import com.linkedin.entity.client.SystemEntityClient; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; import java.util.concurrent.CompletableFuture; @@ -40,7 +40,7 @@ public class SsoCallbackController extends CallbackController { public SsoCallbackController( @Nonnull SsoManager ssoManager, @Nonnull Authentication systemAuthentication, - @Nonnull EntityClient entityClient, + @Nonnull SystemEntityClient entityClient, @Nonnull AuthServiceClient authClient, @Nonnull com.typesafe.config.Config configs) { _ssoManager = ssoManager; @@ -79,7 +79,7 @@ public class SsoCallbackLogic implements CallbackLogic { private final OidcCallbackLogic _oidcCallbackLogic; SsoCallbackLogic(final SsoManager ssoManager, final Authentication systemAuthentication, - final EntityClient entityClient, final AuthServiceClient authClient, final CookieConfigs cookieConfigs) { + final SystemEntityClient entityClient, final AuthServiceClient authClient, final CookieConfigs cookieConfigs) { _oidcCallbackLogic = new OidcCallbackLogic(ssoManager, systemAuthentication, entityClient, authClient, cookieConfigs); } diff --git a/datahub-frontend/play.gradle b/datahub-frontend/play.gradle index e40f8e3eeb96d..daecba16cbf72 100644 --- a/datahub-frontend/play.gradle +++ b/datahub-frontend/play.gradle @@ -16,9 +16,6 @@ dependencies { implementation project(':datahub-web-react') constraints { - play(externalDependency.springCore) - play(externalDependency.springBeans) - play(externalDependency.springContext) play(externalDependency.jacksonDataBind) play('com.nimbusds:oauth2-oidc-sdk:8.36.2') play('com.nimbusds:nimbus-jose-jwt:8.18') @@ -35,7 +32,12 @@ dependencies { implementation project(":metadata-service:restli-client") implementation project(":metadata-service:auth-config") + implementation project(":metadata-service:configuration") + implementation externalDependency.springCore + implementation externalDependency.springBeans + implementation externalDependency.springContext + implementation externalDependency.springBootAutoconfigure implementation externalDependency.jettyJaas implementation externalDependency.graphqlJava implementation externalDependency.antlr4Runtime diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java index d86234cf59306..3ba0cc1f747e3 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java @@ -302,6 +302,7 @@ import com.linkedin.datahub.graphql.types.test.TestType; import com.linkedin.datahub.graphql.types.view.DataHubViewType; import com.linkedin.entity.client.EntityClient; 
+import com.linkedin.entity.client.SystemEntityClient; import com.linkedin.metadata.config.DataHubConfiguration; import com.linkedin.metadata.config.IngestionConfiguration; import com.linkedin.metadata.config.TestsConfiguration; @@ -364,6 +365,7 @@ public class GmsGraphQLEngine { private final EntityClient entityClient; + private final SystemEntityClient systemEntityClient; private final GraphClient graphClient; private final UsageClient usageClient; private final SiblingGraphService siblingGraphService; @@ -476,6 +478,7 @@ public GmsGraphQLEngine(final GmsGraphQLEngineArgs args) { this.graphQLPlugins.forEach(plugin -> plugin.init(args)); this.entityClient = args.entityClient; + this.systemEntityClient = args.systemEntityClient; this.graphClient = args.graphClient; this.usageClient = args.usageClient; this.siblingGraphService = args.siblingGraphService; diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngineArgs.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngineArgs.java index cbcf42c4f93d9..157fb10ce7078 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngineArgs.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngineArgs.java @@ -11,6 +11,7 @@ import com.linkedin.datahub.graphql.analytics.service.AnalyticsService; import com.linkedin.datahub.graphql.featureflags.FeatureFlags; import com.linkedin.entity.client.EntityClient; +import com.linkedin.entity.client.SystemEntityClient; import com.linkedin.metadata.config.DataHubConfiguration; import com.linkedin.metadata.config.IngestionConfiguration; import com.linkedin.metadata.config.TestsConfiguration; @@ -38,6 +39,7 @@ @Data public class GmsGraphQLEngineArgs { EntityClient entityClient; + SystemEntityClient systemEntityClient; GraphClient graphClient; UsageClient usageClient; AnalyticsService analyticsService; diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetStatsSummaryResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetStatsSummaryResolver.java index f27fd604a746f..23be49c7e7140 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetStatsSummaryResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetStatsSummaryResolver.java @@ -1,13 +1,16 @@ package com.linkedin.datahub.graphql.resolvers.dataset; +import com.datahub.authorization.ResourceSpec; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; import com.linkedin.datahub.graphql.QueryContext; +import com.linkedin.datahub.graphql.authorization.AuthorizationUtils; import com.linkedin.datahub.graphql.generated.CorpUser; import com.linkedin.datahub.graphql.generated.DatasetStatsSummary; import com.linkedin.datahub.graphql.generated.Entity; +import com.linkedin.metadata.authorization.PoliciesConfig; import com.linkedin.usage.UsageClient; import com.linkedin.usage.UsageTimeRange; import com.linkedin.usage.UserUsageCounts; @@ -15,6 +18,7 @@ import graphql.schema.DataFetchingEnvironment; import java.util.List; import java.util.Objects; +import java.util.Optional; import java.util.concurrent.CompletableFuture; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; @@ -55,8 +59,15 @@ public CompletableFuture 
get(DataFetchingEnvironment enviro try { + if (!isAuthorized(resourceUrn, context)) { + log.debug("User {} is not authorized to view profile information for dataset {}", + context.getActorUrn(), + resourceUrn.toString()); + return null; + } + com.linkedin.usage.UsageQueryResult - usageQueryResult = usageClient.getUsageStats(resourceUrn.toString(), UsageTimeRange.MONTH, context.getAuthentication()); + usageQueryResult = usageClient.getUsageStats(resourceUrn.toString(), UsageTimeRange.MONTH); final DatasetStatsSummary result = new DatasetStatsSummary(); result.setQueryCountLast30Days(usageQueryResult.getAggregations().getTotalSqlQueries()); @@ -90,4 +101,10 @@ private CorpUser createPartialUser(final Urn userUrn) { result.setUrn(userUrn.toString()); return result; } + + private boolean isAuthorized(final Urn resourceUrn, final QueryContext context) { + return AuthorizationUtils.isAuthorized(context, + Optional.of(new ResourceSpec(resourceUrn.getEntityType(), resourceUrn.toString())), + PoliciesConfig.VIEW_DATASET_USAGE_PRIVILEGE); + } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetUsageStatsResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetUsageStatsResolver.java index 0476963b92e9a..20361830ad5a5 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetUsageStatsResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetUsageStatsResolver.java @@ -9,12 +9,10 @@ import com.linkedin.datahub.graphql.generated.UsageQueryResult; import com.linkedin.datahub.graphql.types.usage.UsageQueryResultMapper; import com.linkedin.metadata.authorization.PoliciesConfig; -import com.linkedin.r2.RemoteInvocationException; import com.linkedin.usage.UsageClient; import com.linkedin.usage.UsageTimeRange; import graphql.schema.DataFetcher; import graphql.schema.DataFetchingEnvironment; -import java.net.URISyntaxException; import java.util.Optional; import java.util.concurrent.CompletableFuture; import lombok.extern.slf4j.Slf4j; @@ -44,10 +42,10 @@ public CompletableFuture get(DataFetchingEnvironment environme } try { com.linkedin.usage.UsageQueryResult - usageQueryResult = usageClient.getUsageStats(resourceUrn.toString(), range, context.getAuthentication()); + usageQueryResult = usageClient.getUsageStats(resourceUrn.toString(), range); return UsageQueryResultMapper.map(usageQueryResult); - } catch (RemoteInvocationException | URISyntaxException e) { - throw new RuntimeException(String.format("Failed to load Usage Stats for resource %s", resourceUrn.toString()), e); + } catch (Exception e) { + throw new RuntimeException(String.format("Failed to load Usage Stats for resource %s", resourceUrn), e); } }); } diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/dashboard/DashboardStatsSummaryTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/dashboard/DashboardStatsSummaryTest.java index 163628c1bc590..6a9617ea41b44 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/dashboard/DashboardStatsSummaryTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/dashboard/DashboardStatsSummaryTest.java @@ -117,8 +117,7 @@ public void testGetException() throws Exception { UsageClient mockClient = Mockito.mock(UsageClient.class); Mockito.when(mockClient.getUsageStats( 
Mockito.eq(TEST_DASHBOARD_URN), - Mockito.eq(UsageTimeRange.MONTH), - Mockito.any(Authentication.class) + Mockito.eq(UsageTimeRange.MONTH) )).thenThrow(RuntimeException.class); // Execute resolver diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetStatsSummaryResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetStatsSummaryResolverTest.java index bd3edf65bf7ad..013e23b779c51 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetStatsSummaryResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetStatsSummaryResolverTest.java @@ -1,6 +1,8 @@ package com.linkedin.datahub.graphql.resolvers.dataset; import com.datahub.authentication.Authentication; +import com.datahub.authorization.AuthorizationResult; +import com.datahub.plugins.auth.authorization.Authorizer; import com.google.common.collect.ImmutableList; import com.linkedin.common.urn.UrnUtils; import com.linkedin.datahub.graphql.QueryContext; @@ -53,13 +55,18 @@ public void testGetSuccess() throws Exception { UsageClient mockClient = Mockito.mock(UsageClient.class); Mockito.when(mockClient.getUsageStats( Mockito.eq(TEST_DATASET_URN), - Mockito.eq(UsageTimeRange.MONTH), - Mockito.any(Authentication.class) + Mockito.eq(UsageTimeRange.MONTH) )).thenReturn(testResult); // Execute resolver DatasetStatsSummaryResolver resolver = new DatasetStatsSummaryResolver(mockClient); QueryContext mockContext = Mockito.mock(QueryContext.class); + Mockito.when(mockContext.getActorUrn()).thenReturn("urn:li:corpuser:test"); + Authorizer mockAuthorizer = Mockito.mock(Authorizer.class); + AuthorizationResult mockAuthorizerResult = Mockito.mock(AuthorizationResult.class); + Mockito.when(mockAuthorizerResult.getType()).thenReturn(AuthorizationResult.Type.ALLOW); + Mockito.when(mockAuthorizer.authorize(Mockito.any())).thenReturn(mockAuthorizerResult); + Mockito.when(mockContext.getAuthorizer()).thenReturn(mockAuthorizer); Mockito.when(mockContext.getAuthentication()).thenReturn(Mockito.mock(Authentication.class)); DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class); Mockito.when(mockEnv.getSource()).thenReturn(TEST_SOURCE); @@ -79,8 +86,7 @@ public void testGetSuccess() throws Exception { newResult.setAggregations(new UsageQueryResultAggregations()); Mockito.when(mockClient.getUsageStats( Mockito.eq(TEST_DATASET_URN), - Mockito.eq(UsageTimeRange.MONTH), - Mockito.any(Authentication.class) + Mockito.eq(UsageTimeRange.MONTH) )).thenReturn(newResult); // Then verify that the new result is _not_ returned (cache hit) @@ -116,8 +122,7 @@ public void testGetException() throws Exception { UsageClient mockClient = Mockito.mock(UsageClient.class); Mockito.when(mockClient.getUsageStats( Mockito.eq(TEST_DATASET_URN), - Mockito.eq(UsageTimeRange.MONTH), - Mockito.any(Authentication.class) + Mockito.eq(UsageTimeRange.MONTH) )).thenThrow(RuntimeException.class); // Execute resolver diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/common/steps/GMSDisableWriteModeStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/common/steps/GMSDisableWriteModeStep.java index e205fd2f5c20e..270aa11c7b070 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/common/steps/GMSDisableWriteModeStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/common/steps/GMSDisableWriteModeStep.java 
@@ -1,11 +1,10 @@ package com.linkedin.datahub.upgrade.common.steps; -import com.datahub.authentication.Authentication; import com.linkedin.datahub.upgrade.UpgradeContext; import com.linkedin.datahub.upgrade.UpgradeStep; import com.linkedin.datahub.upgrade.UpgradeStepResult; import com.linkedin.datahub.upgrade.impl.DefaultUpgradeStepResult; -import com.linkedin.entity.client.RestliEntityClient; +import com.linkedin.entity.client.SystemRestliEntityClient; import java.util.function.Function; import lombok.RequiredArgsConstructor; @@ -13,8 +12,7 @@ @RequiredArgsConstructor public class GMSDisableWriteModeStep implements UpgradeStep { - private final Authentication _systemAuthentication; - private final RestliEntityClient _entityClient; + private final SystemRestliEntityClient _entityClient; @Override public String id() { @@ -30,7 +28,7 @@ public int retryCount() { public Function executable() { return (context) -> { try { - _entityClient.setWritable(false, _systemAuthentication); + _entityClient.setWritable(false); } catch (Exception e) { e.printStackTrace(); context.report().addLine("Failed to turn write mode off in GMS"); diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/common/steps/GMSEnableWriteModeStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/common/steps/GMSEnableWriteModeStep.java index 270eff8df227c..8df02123983e8 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/common/steps/GMSEnableWriteModeStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/common/steps/GMSEnableWriteModeStep.java @@ -1,20 +1,17 @@ package com.linkedin.datahub.upgrade.common.steps; -import com.datahub.authentication.Authentication; import com.linkedin.datahub.upgrade.UpgradeContext; import com.linkedin.datahub.upgrade.UpgradeStep; import com.linkedin.datahub.upgrade.UpgradeStepResult; import com.linkedin.datahub.upgrade.impl.DefaultUpgradeStepResult; -import com.linkedin.entity.client.RestliEntityClient; +import com.linkedin.entity.client.SystemRestliEntityClient; import java.util.function.Function; import lombok.RequiredArgsConstructor; @RequiredArgsConstructor public class GMSEnableWriteModeStep implements UpgradeStep { - - private final Authentication _systemAuthentication; - private final RestliEntityClient _entityClient; + private final SystemRestliEntityClient _entityClient; @Override public String id() { @@ -30,7 +27,7 @@ public int retryCount() { public Function executable() { return (context) -> { try { - _entityClient.setWritable(true, _systemAuthentication); + _entityClient.setWritable(true); } catch (Exception e) { e.printStackTrace(); context.report().addLine("Failed to turn write mode back on in GMS"); diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeUpgradeConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeUpgradeConfig.java index 30175c6fa78c8..cd264e529e9a5 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeUpgradeConfig.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeUpgradeConfig.java @@ -1,8 +1,7 @@ package com.linkedin.datahub.upgrade.config; -import com.datahub.authentication.Authentication; import com.linkedin.datahub.upgrade.nocode.NoCodeUpgrade; -import com.linkedin.entity.client.RestliEntityClient; +import com.linkedin.entity.client.SystemRestliEntityClient; import com.linkedin.metadata.entity.EntityService; import 
com.linkedin.metadata.models.registry.EntityRegistry; import io.ebean.Database; @@ -21,15 +20,14 @@ public class NoCodeUpgradeConfig { ApplicationContext applicationContext; @Bean(name = "noCodeUpgrade") - @DependsOn({"ebeanServer", "entityService", "systemAuthentication", "restliEntityClient", "entityRegistry"}) + @DependsOn({"ebeanServer", "entityService", "systemRestliEntityClient", "entityRegistry"}) @Nonnull public NoCodeUpgrade createInstance() { final Database ebeanServer = applicationContext.getBean(Database.class); final EntityService entityService = applicationContext.getBean(EntityService.class); - final Authentication systemAuthentication = applicationContext.getBean(Authentication.class); - final RestliEntityClient entityClient = applicationContext.getBean(RestliEntityClient.class); + final SystemRestliEntityClient entityClient = applicationContext.getBean(SystemRestliEntityClient.class); final EntityRegistry entityRegistry = applicationContext.getBean(EntityRegistry.class); - return new NoCodeUpgrade(ebeanServer, entityService, entityRegistry, systemAuthentication, entityClient); + return new NoCodeUpgrade(ebeanServer, entityService, entityRegistry, entityClient); } } diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreBackupConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreBackupConfig.java index 9b0fcf279abf5..97a08800534de 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreBackupConfig.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreBackupConfig.java @@ -1,8 +1,7 @@ package com.linkedin.datahub.upgrade.config; -import com.datahub.authentication.Authentication; import com.linkedin.datahub.upgrade.restorebackup.RestoreBackup; -import com.linkedin.entity.client.RestliEntityClient; +import com.linkedin.entity.client.SystemRestliEntityClient; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.graph.GraphService; import com.linkedin.metadata.models.registry.EntityRegistry; @@ -22,19 +21,18 @@ public class RestoreBackupConfig { ApplicationContext applicationContext; @Bean(name = "restoreBackup") - @DependsOn({"ebeanServer", "entityService", "systemAuthentication", "restliEntityClient", "graphService", + @DependsOn({"ebeanServer", "entityService", "systemRestliEntityClient", "graphService", "searchService", "entityRegistry"}) @Nonnull public RestoreBackup createInstance() { final Database ebeanServer = applicationContext.getBean(Database.class); final EntityService entityService = applicationContext.getBean(EntityService.class); - final Authentication systemAuthentication = applicationContext.getBean(Authentication.class); - final RestliEntityClient entityClient = applicationContext.getBean(RestliEntityClient.class); + final SystemRestliEntityClient entityClient = applicationContext.getBean(SystemRestliEntityClient.class); final GraphService graphClient = applicationContext.getBean(GraphService.class); final EntitySearchService searchClient = applicationContext.getBean(EntitySearchService.class); final EntityRegistry entityRegistry = applicationContext.getBean(EntityRegistry.class); - return new RestoreBackup(ebeanServer, entityService, entityRegistry, systemAuthentication, entityClient, + return new RestoreBackup(ebeanServer, entityService, entityRegistry, entityClient, graphClient, searchClient); } } diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocode/NoCodeUpgrade.java 
b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocode/NoCodeUpgrade.java index ee4a3bc504e77..a299deb874721 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocode/NoCodeUpgrade.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocode/NoCodeUpgrade.java @@ -1,13 +1,12 @@ package com.linkedin.datahub.upgrade.nocode; -import com.datahub.authentication.Authentication; import com.google.common.collect.ImmutableMap; import com.linkedin.datahub.upgrade.Upgrade; import com.linkedin.datahub.upgrade.UpgradeCleanupStep; import com.linkedin.datahub.upgrade.UpgradeStep; import com.linkedin.datahub.upgrade.common.steps.GMSEnableWriteModeStep; import com.linkedin.datahub.upgrade.common.steps.GMSQualificationStep; -import com.linkedin.entity.client.RestliEntityClient; +import com.linkedin.entity.client.SystemRestliEntityClient; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.models.registry.EntityRegistry; import io.ebean.Database; @@ -30,12 +29,10 @@ public NoCodeUpgrade( final Database server, final EntityService entityService, final EntityRegistry entityRegistry, - final Authentication systemAuthentication, - final RestliEntityClient entityClient) { + final SystemRestliEntityClient entityClient) { _steps = buildUpgradeSteps( server, entityService, entityRegistry, - systemAuthentication, entityClient); _cleanupSteps = buildCleanupSteps(); } @@ -63,15 +60,14 @@ private List buildUpgradeSteps( final Database server, final EntityService entityService, final EntityRegistry entityRegistry, - final Authentication systemAuthentication, - final RestliEntityClient entityClient) { + final SystemRestliEntityClient entityClient) { final List steps = new ArrayList<>(); steps.add(new RemoveAspectV2TableStep(server)); steps.add(new GMSQualificationStep(ImmutableMap.of("noCode", "true"))); steps.add(new UpgradeQualificationStep(server)); steps.add(new CreateAspectTableStep(server)); steps.add(new DataMigrationStep(server, entityService, entityRegistry)); - steps.add(new GMSEnableWriteModeStep(systemAuthentication, entityClient)); + steps.add(new GMSEnableWriteModeStep(entityClient)); return steps; } } diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreBackup.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreBackup.java index 67718a6739beb..9175ad606e3c8 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreBackup.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreBackup.java @@ -1,6 +1,5 @@ package com.linkedin.datahub.upgrade.restorebackup; -import com.datahub.authentication.Authentication; import com.google.common.collect.ImmutableList; import com.linkedin.datahub.upgrade.Upgrade; import com.linkedin.datahub.upgrade.UpgradeCleanupStep; @@ -9,7 +8,7 @@ import com.linkedin.datahub.upgrade.common.steps.ClearSearchServiceStep; import com.linkedin.datahub.upgrade.common.steps.GMSDisableWriteModeStep; import com.linkedin.datahub.upgrade.common.steps.GMSEnableWriteModeStep; -import com.linkedin.entity.client.RestliEntityClient; +import com.linkedin.entity.client.SystemRestliEntityClient; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.graph.GraphService; import com.linkedin.metadata.models.registry.EntityRegistry; @@ -27,11 +26,10 @@ public RestoreBackup( final Database server, final EntityService entityService, final EntityRegistry entityRegistry, 
- final Authentication systemAuthentication, - final RestliEntityClient entityClient, + final SystemRestliEntityClient entityClient, final GraphService graphClient, final EntitySearchService searchClient) { - _steps = buildSteps(server, entityService, entityRegistry, systemAuthentication, entityClient, graphClient, searchClient); + _steps = buildSteps(server, entityService, entityRegistry, entityClient, graphClient, searchClient); } @Override @@ -48,17 +46,16 @@ private List buildSteps( final Database server, final EntityService entityService, final EntityRegistry entityRegistry, - final Authentication systemAuthentication, - final RestliEntityClient entityClient, + final SystemRestliEntityClient entityClient, final GraphService graphClient, final EntitySearchService searchClient) { final List steps = new ArrayList<>(); - steps.add(new GMSDisableWriteModeStep(systemAuthentication, entityClient)); + steps.add(new GMSDisableWriteModeStep(entityClient)); steps.add(new ClearSearchServiceStep(searchClient, true)); steps.add(new ClearGraphServiceStep(graphClient, true)); steps.add(new ClearAspectV2TableStep(server)); steps.add(new RestoreStorageStep(entityService, entityRegistry)); - steps.add(new GMSEnableWriteModeStep(systemAuthentication, entityClient)); + steps.add(new GMSEnableWriteModeStep(entityClient)); return steps; } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java new file mode 100644 index 0000000000000..6b5a3d5bfb06e --- /dev/null +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java @@ -0,0 +1,39 @@ +package com.linkedin.metadata.client; + +import com.datahub.authentication.Authentication; +import com.linkedin.entity.client.EntityClientCache; +import com.linkedin.metadata.config.cache.client.EntityClientCacheConfig; +import com.linkedin.entity.client.RestliEntityClient; +import com.linkedin.entity.client.SystemEntityClient; +import com.linkedin.metadata.entity.DeleteEntityService; +import com.linkedin.metadata.entity.EntityService; +import com.linkedin.metadata.event.EventProducer; +import com.linkedin.metadata.search.EntitySearchService; +import com.linkedin.metadata.search.LineageSearchService; +import com.linkedin.metadata.search.SearchService; +import com.linkedin.metadata.search.client.CachingEntitySearchService; +import com.linkedin.metadata.timeseries.TimeseriesAspectService; +import lombok.Getter; + + +/** + * Java backed SystemEntityClient + */ +@Getter +public class SystemJavaEntityClient extends JavaEntityClient implements SystemEntityClient { + + private final EntityClientCache entityClientCache; + private final Authentication systemAuthentication; + + public SystemJavaEntityClient(EntityService entityService, DeleteEntityService deleteEntityService, + EntitySearchService entitySearchService, CachingEntitySearchService cachingEntitySearchService, + SearchService searchService, LineageSearchService lineageSearchService, + TimeseriesAspectService timeseriesAspectService, EventProducer eventProducer, + RestliEntityClient restliEntityClient, Authentication systemAuthentication, + EntityClientCacheConfig cacheConfig) { + super(entityService, deleteEntityService, entitySearchService, cachingEntitySearchService, searchService, + lineageSearchService, timeseriesAspectService, eventProducer, restliEntityClient); + this.systemAuthentication = systemAuthentication; + this.entityClientCache = 
buildEntityClientCache(SystemJavaEntityClient.class, systemAuthentication, cacheConfig); + } +} diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeline/eventgenerator/DataProcessInstanceRunEventChangeEventGenerator.java b/metadata-io/src/main/java/com/linkedin/metadata/timeline/eventgenerator/DataProcessInstanceRunEventChangeEventGenerator.java index fee9cd9bca56e..a3e5a051a47e3 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/timeline/eventgenerator/DataProcessInstanceRunEventChangeEventGenerator.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeline/eventgenerator/DataProcessInstanceRunEventChangeEventGenerator.java @@ -1,6 +1,5 @@ package com.linkedin.metadata.timeline.eventgenerator; -import com.datahub.authentication.Authentication; import com.linkedin.common.AuditStamp; import com.linkedin.common.urn.Urn; import com.linkedin.dataprocess.DataProcessInstanceRelationships; @@ -8,7 +7,7 @@ import com.linkedin.dataprocess.DataProcessRunStatus; import com.linkedin.entity.EntityResponse; import com.linkedin.entity.EnvelopedAspectMap; -import com.linkedin.entity.client.EntityClient; +import com.linkedin.entity.client.SystemEntityClient; import com.linkedin.metadata.timeline.data.ChangeCategory; import com.linkedin.metadata.timeline.data.ChangeEvent; import com.linkedin.metadata.timeline.data.ChangeOperation; @@ -27,9 +26,8 @@ public class DataProcessInstanceRunEventChangeEventGenerator private static final String COMPLETED_STATUS = "COMPLETED"; private static final String STARTED_STATUS = "STARTED"; - public DataProcessInstanceRunEventChangeEventGenerator(@Nonnull final EntityClient entityClient, @Nonnull final - Authentication authentication) { - super(entityClient, authentication); + public DataProcessInstanceRunEventChangeEventGenerator(@Nonnull final SystemEntityClient entityClient) { + super(entityClient); } @Override @@ -108,8 +106,8 @@ private DataProcessInstanceRelationships getRelationships(@Nonnull final String EntityResponse entityResponse; try { entityUrn = Urn.createFromString(entityUrnString); - entityResponse = _entityClient.getV2(DATA_PROCESS_INSTANCE_ENTITY_NAME, entityUrn, - Collections.singleton(DATA_PROCESS_INSTANCE_RELATIONSHIPS_ASPECT_NAME), _authentication); + entityResponse = _entityClient.getV2(entityUrn, + Collections.singleton(DATA_PROCESS_INSTANCE_RELATIONSHIPS_ASPECT_NAME)); } catch (Exception e) { return null; } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeline/eventgenerator/EntityChangeEventGenerator.java b/metadata-io/src/main/java/com/linkedin/metadata/timeline/eventgenerator/EntityChangeEventGenerator.java index 7f6aa5e53268e..d5539ec3d3822 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/timeline/eventgenerator/EntityChangeEventGenerator.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeline/eventgenerator/EntityChangeEventGenerator.java @@ -5,7 +5,7 @@ import com.linkedin.common.AuditStamp; import com.linkedin.common.urn.Urn; import com.linkedin.data.template.RecordTemplate; -import com.linkedin.entity.client.EntityClient; +import com.linkedin.entity.client.SystemEntityClient; import com.linkedin.metadata.entity.EntityAspect; import com.linkedin.metadata.timeline.data.ChangeCategory; import com.linkedin.metadata.timeline.data.ChangeEvent; @@ -19,16 +19,14 @@ */ public abstract class EntityChangeEventGenerator { // TODO: Add a check for supported aspects - protected EntityClient _entityClient; + protected SystemEntityClient _entityClient; protected Authentication 
_authentication; public EntityChangeEventGenerator() { } - public EntityChangeEventGenerator(@Nonnull final EntityClient entityClient, - @Nonnull final Authentication authentication) { + public EntityChangeEventGenerator(@Nonnull final SystemEntityClient entityClient) { _entityClient = entityClient; - _authentication = authentication; } @Deprecated diff --git a/metadata-jobs/mae-consumer-job/src/test/java/com/linkedin/metadata/kafka/MaeConsumerApplicationTestConfiguration.java b/metadata-jobs/mae-consumer-job/src/test/java/com/linkedin/metadata/kafka/MaeConsumerApplicationTestConfiguration.java index 3b44ede0f1d43..a214117f4e1bc 100644 --- a/metadata-jobs/mae-consumer-job/src/test/java/com/linkedin/metadata/kafka/MaeConsumerApplicationTestConfiguration.java +++ b/metadata-jobs/mae-consumer-job/src/test/java/com/linkedin/metadata/kafka/MaeConsumerApplicationTestConfiguration.java @@ -1,6 +1,6 @@ package com.linkedin.metadata.kafka; -import com.linkedin.entity.client.RestliEntityClient; +import com.linkedin.entity.client.SystemRestliEntityClient; import com.linkedin.gms.factory.auth.SystemAuthenticationFactory; import com.linkedin.metadata.dao.producer.KafkaHealthChecker; import com.linkedin.metadata.entity.EntityServiceImpl; @@ -24,7 +24,7 @@ public class MaeConsumerApplicationTestConfiguration { private EntityServiceImpl _entityServiceImpl; @MockBean - private RestliEntityClient restliEntityClient; + private SystemRestliEntityClient restliEntityClient; @MockBean private Database ebeanServer; diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/config/EntityHydratorConfig.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/config/EntityHydratorConfig.java index 2d8c52566e2ae..a9e54e5354b42 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/config/EntityHydratorConfig.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/config/EntityHydratorConfig.java @@ -1,10 +1,10 @@ package com.linkedin.metadata.kafka.config; -import com.datahub.authentication.Authentication; -import com.linkedin.entity.client.RestliEntityClient; -import com.linkedin.gms.factory.auth.SystemAuthenticationFactory; +import com.google.common.collect.ImmutableSet; +import com.linkedin.entity.client.SystemRestliEntityClient; import com.linkedin.gms.factory.entity.RestliEntityClientFactory; import com.linkedin.metadata.kafka.hydrator.EntityHydrator; +import com.linkedin.metadata.models.registry.EntityRegistry; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.context.annotation.Bean; @@ -13,19 +13,25 @@ @Configuration -@Import({RestliEntityClientFactory.class, SystemAuthenticationFactory.class}) +@Import({RestliEntityClientFactory.class}) public class EntityHydratorConfig { @Autowired - @Qualifier("systemAuthentication") - private Authentication _systemAuthentication; + @Qualifier("systemRestliEntityClient") + private SystemRestliEntityClient _entityClient; @Autowired - @Qualifier("restliEntityClient") - private RestliEntityClient _entityClient; + private EntityRegistry _entityRegistry; + + public final static ImmutableSet EXCLUDED_ASPECTS = ImmutableSet.builder() + .add("datasetUpstreamLineage", "upstreamLineage") + .add("dataJobInputOutput") + .add("dataProcessInstanceRelationships", "dataProcessInstanceInput", "dataProcessInstanceOutput") + .add("inputFields") + .build(); @Bean public EntityHydrator 
getEntityHydrator() { - return new EntityHydrator(_systemAuthentication, _entityClient); + return new EntityHydrator(_entityRegistry, _entityClient); } } diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHook.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHook.java index 55077c46a1526..3b65ecccad336 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHook.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHook.java @@ -1,15 +1,12 @@ package com.linkedin.metadata.kafka.hook.event; -import com.datahub.authentication.Authentication; import com.google.common.collect.ImmutableSet; import com.linkedin.common.AuditStamp; import com.linkedin.common.urn.Urn; import com.linkedin.data.DataMap; import com.linkedin.data.template.RecordTemplate; import com.linkedin.data.template.SetMode; -import com.linkedin.entity.client.EntityClient; -import com.linkedin.entity.client.RestliEntityClient; -import com.linkedin.gms.factory.auth.SystemAuthenticationFactory; +import com.linkedin.entity.client.SystemRestliEntityClient; import com.linkedin.gms.factory.entity.RestliEntityClientFactory; import com.linkedin.gms.factory.entityregistry.EntityRegistryFactory; import com.linkedin.metadata.Constants; @@ -46,8 +43,7 @@ */ @Slf4j @Component -@Import({EntityChangeEventGeneratorRegistry.class, EntityRegistryFactory.class, RestliEntityClientFactory.class, - SystemAuthenticationFactory.class}) +@Import({EntityChangeEventGeneratorRegistry.class, EntityRegistryFactory.class, RestliEntityClientFactory.class}) public class EntityChangeEventGeneratorHook implements MetadataChangeLogHook { /** @@ -83,20 +79,18 @@ public class EntityChangeEventGeneratorHook implements MetadataChangeLogHook { */ private static final Set SUPPORTED_OPERATIONS = ImmutableSet.of("CREATE", "UPSERT", "DELETE"); private final EntityChangeEventGeneratorRegistry _entityChangeEventGeneratorRegistry; - private final EntityClient _entityClient; - private final Authentication _systemAuthentication; + private final SystemRestliEntityClient _entityClient; private final EntityRegistry _entityRegistry; private final Boolean _isEnabled; @Autowired public EntityChangeEventGeneratorHook( @Nonnull final EntityChangeEventGeneratorRegistry entityChangeEventGeneratorRegistry, - @Nonnull final RestliEntityClient entityClient, @Nonnull final Authentication systemAuthentication, + @Nonnull final SystemRestliEntityClient entityClient, @Nonnull final EntityRegistry entityRegistry, @Nonnull @Value("${entityChangeEvents.enabled:true}") Boolean isEnabled) { _entityChangeEventGeneratorRegistry = Objects.requireNonNull(entityChangeEventGeneratorRegistry); _entityClient = Objects.requireNonNull(entityClient); - _systemAuthentication = Objects.requireNonNull(systemAuthentication); _entityRegistry = Objects.requireNonNull(entityRegistry); _isEnabled = isEnabled; } @@ -189,8 +183,7 @@ private void emitPlatformEvent(@Nonnull final PlatformEvent event, @Nonnull fina _entityClient.producePlatformEvent( Constants.CHANGE_EVENT_PLATFORM_EVENT_NAME, partitioningKey, - event, - _systemAuthentication + event ); } diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/siblings/SiblingAssociationHook.java 
b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/siblings/SiblingAssociationHook.java index 06545ef3525dd..7cbe53dee9fe4 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/siblings/SiblingAssociationHook.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/siblings/SiblingAssociationHook.java @@ -1,6 +1,5 @@ package com.linkedin.metadata.kafka.hook.siblings; -import com.datahub.authentication.Authentication; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; @@ -13,9 +12,8 @@ import com.linkedin.dataset.UpstreamArray; import com.linkedin.dataset.UpstreamLineage; import com.linkedin.entity.EntityResponse; -import com.linkedin.entity.client.RestliEntityClient; +import com.linkedin.entity.client.SystemRestliEntityClient; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.gms.factory.auth.SystemAuthenticationFactory; import com.linkedin.gms.factory.entity.RestliEntityClientFactory; import com.linkedin.gms.factory.entityregistry.EntityRegistryFactory; import com.linkedin.gms.factory.search.EntitySearchServiceFactory; @@ -60,7 +58,7 @@ @Slf4j @Component @Singleton -@Import({EntityRegistryFactory.class, RestliEntityClientFactory.class, EntitySearchServiceFactory.class, SystemAuthenticationFactory.class}) +@Import({EntityRegistryFactory.class, RestliEntityClientFactory.class, EntitySearchServiceFactory.class}) public class SiblingAssociationHook implements MetadataChangeLogHook { public static final String SIBLING_ASSOCIATION_SYSTEM_ACTOR = "urn:li:corpuser:__datahub_system_sibling_hook"; @@ -73,23 +71,20 @@ public class SiblingAssociationHook implements MetadataChangeLogHook { public static final String SOURCE_SUBTYPE_V2 = "Source"; private final EntityRegistry _entityRegistry; - private final RestliEntityClient _entityClient; + private final SystemRestliEntityClient _entityClient; private final EntitySearchService _searchService; - private final Authentication _systemAuthentication; private final boolean _isEnabled; @Autowired public SiblingAssociationHook( @Nonnull final EntityRegistry entityRegistry, - @Nonnull final RestliEntityClient entityClient, + @Nonnull final SystemRestliEntityClient entityClient, @Nonnull final EntitySearchService searchService, - @Nonnull final Authentication systemAuthentication, @Nonnull @Value("${siblings.enabled:true}") Boolean isEnabled ) { _entityRegistry = entityRegistry; _entityClient = entityClient; _searchService = searchService; - _systemAuthentication = systemAuthentication; _isEnabled = isEnabled; } @@ -251,9 +246,9 @@ private void setSiblingsAndSoftDeleteSibling(Urn dbtUrn, Urn sourceUrn) { dbtSiblingProposal.setEntityUrn(dbtUrn); try { - _entityClient.ingestProposal(dbtSiblingProposal, _systemAuthentication); + _entityClient.ingestProposal(dbtSiblingProposal, true); } catch (RemoteInvocationException e) { - log.error("Error while associating {} with {}: {}", dbtUrn.toString(), sourceUrn.toString(), e.toString()); + log.error("Error while associating {} with {}: {}", dbtUrn, sourceUrn, e.toString()); throw new RuntimeException("Error ingesting sibling proposal. 
Skipping processing.", e); } @@ -274,9 +269,9 @@ private void setSiblingsAndSoftDeleteSibling(Urn dbtUrn, Urn sourceUrn) { List filteredNewSiblingsArray = newSiblingsUrnArray.stream().filter(urn -> { try { - return _entityClient.exists(urn, _systemAuthentication); + return _entityClient.exists(urn); } catch (RemoteInvocationException e) { - log.error("Error while checking existence of {}: {}", urn.toString(), e.toString()); + log.error("Error while checking existence of {}: {}", urn, e.toString()); throw new RuntimeException("Error checking existence. Skipping processing.", e); } }).collect(Collectors.toList()); @@ -294,9 +289,9 @@ private void setSiblingsAndSoftDeleteSibling(Urn dbtUrn, Urn sourceUrn) { sourceSiblingProposal.setEntityUrn(sourceUrn); try { - _entityClient.ingestProposal(sourceSiblingProposal, _systemAuthentication); + _entityClient.ingestProposal(sourceSiblingProposal, true); } catch (RemoteInvocationException e) { - log.error("Error while associating {} with {}: {}", dbtUrn.toString(), sourceUrn.toString(), e.toString()); + log.error("Error while associating {} with {}: {}", dbtUrn, sourceUrn, e.toString()); throw new RuntimeException("Error ingesting sibling proposal. Skipping processing.", e); } } @@ -406,11 +401,8 @@ private SubTypes getSubtypesFromEntityClient( ) { try { EntityResponse entityResponse = _entityClient.getV2( - DATASET_ENTITY_NAME, urn, - ImmutableSet.of(SUB_TYPES_ASPECT_NAME), - _systemAuthentication - ); + ImmutableSet.of(SUB_TYPES_ASPECT_NAME)); if (entityResponse != null && entityResponse.hasAspects() && entityResponse.getAspects().containsKey(Constants.SUB_TYPES_ASPECT_NAME)) { return new SubTypes(entityResponse.getAspects().get(Constants.SUB_TYPES_ASPECT_NAME).getValue().data()); @@ -427,10 +419,8 @@ private UpstreamLineage getUpstreamLineageFromEntityClient( ) { try { EntityResponse entityResponse = _entityClient.getV2( - DATASET_ENTITY_NAME, urn, - ImmutableSet.of(UPSTREAM_LINEAGE_ASPECT_NAME), - _systemAuthentication + ImmutableSet.of(UPSTREAM_LINEAGE_ASPECT_NAME) ); if (entityResponse != null && entityResponse.hasAspects() && entityResponse.getAspects().containsKey(Constants.UPSTREAM_LINEAGE_ASPECT_NAME)) { @@ -448,10 +438,8 @@ private Siblings getSiblingsFromEntityClient( ) { try { EntityResponse entityResponse = _entityClient.getV2( - DATASET_ENTITY_NAME, urn, - ImmutableSet.of(SIBLINGS_ASPECT_NAME), - _systemAuthentication + ImmutableSet.of(SIBLINGS_ASPECT_NAME) ); if (entityResponse != null && entityResponse.hasAspects() && entityResponse.getAspects().containsKey(Constants.SIBLINGS_ASPECT_NAME)) { diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hydrator/EntityHydrator.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hydrator/EntityHydrator.java index d768ada1765fa..0a3b38517eaad 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hydrator/EntityHydrator.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hydrator/EntityHydrator.java @@ -1,28 +1,32 @@ package com.linkedin.metadata.kafka.hydrator; -import com.datahub.authentication.Authentication; import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; import com.linkedin.common.urn.Urn; import com.linkedin.entity.EntityResponse; -import com.linkedin.entity.client.EntityClient; +import com.linkedin.entity.client.SystemRestliEntityClient; +import com.linkedin.metadata.models.AspectSpec; +import 
com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.r2.RemoteInvocationException; import java.net.URISyntaxException; import java.util.Collections; import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; + import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import static com.linkedin.metadata.Constants.*; +import static com.linkedin.metadata.kafka.config.EntityHydratorConfig.EXCLUDED_ASPECTS; @Slf4j @RequiredArgsConstructor public class EntityHydrator { - private final Authentication _systemAuthentication; - private final EntityClient _entityClient; - + private final EntityRegistry _entityRegistry; + private final SystemRestliEntityClient _entityClient; private final ChartHydrator _chartHydrator = new ChartHydrator(); private final CorpUserHydrator _corpUserHydrator = new CorpUserHydrator(); private final DashboardHydrator _dashboardHydrator = new DashboardHydrator(); @@ -43,8 +47,12 @@ public Optional getHydratedEntity(String entityTypeName, String urn) // Hydrate fields from snapshot EntityResponse entityResponse; try { - entityResponse = _entityClient.batchGetV2(entityTypeName, Collections.singleton(urnObj), null, - this._systemAuthentication).get(urnObj); + Set aspectNames = Optional.ofNullable(_entityRegistry.getEntitySpecs().get(urnObj.getEntityType())) + .map(spec -> spec.getAspectSpecs().stream().map(AspectSpec::getName) + .filter(aspectName -> !EXCLUDED_ASPECTS.contains(aspectName)) + .collect(Collectors.toSet())) + .orElse(Set.of()); + entityResponse = _entityClient.batchGetV2(Collections.singleton(urnObj), aspectNames).get(urnObj); } catch (RemoteInvocationException | URISyntaxException e) { log.error("Error while calling GMS to hydrate entity for urn {}", urn); return Optional.empty(); diff --git a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHookTest.java b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHookTest.java index d8759da0fe1dd..7d9619f3e2d1c 100644 --- a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHookTest.java +++ b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHookTest.java @@ -1,6 +1,5 @@ package com.linkedin.metadata.kafka.hook.event; -import com.datahub.authentication.Authentication; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.linkedin.assertion.AssertionResult; @@ -38,8 +37,7 @@ import com.linkedin.entity.EntityResponse; import com.linkedin.entity.EnvelopedAspect; import com.linkedin.entity.EnvelopedAspectMap; -import com.linkedin.entity.client.EntityClient; -import com.linkedin.entity.client.RestliEntityClient; +import com.linkedin.entity.client.SystemRestliEntityClient; import com.linkedin.events.metadata.ChangeType; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.key.DatasetKey; @@ -66,6 +64,7 @@ import com.linkedin.platform.event.v1.Parameters; import java.net.URISyntaxException; import java.util.Map; + import org.mockito.Mockito; import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; @@ -92,22 +91,19 @@ public class EntityChangeEventGeneratorHookTest { private static final String TEST_DATA_FLOW_URN = "urn:li:dataFlow:flow"; private static final String TEST_DATA_JOB_URN = "urn:li:dataJob:job"; private Urn actorUrn; - private 
Authentication _mockAuthentication; - private RestliEntityClient _mockClient; + private SystemRestliEntityClient _mockClient; private EntityService _mockEntityService; private EntityChangeEventGeneratorHook _entityChangeEventHook; @BeforeMethod public void setupTest() throws URISyntaxException { actorUrn = Urn.createFromString(TEST_ACTOR_URN); - _mockAuthentication = Mockito.mock(Authentication.class); - _mockClient = Mockito.mock(RestliEntityClient.class); + _mockClient = Mockito.mock(SystemRestliEntityClient.class); _mockEntityService = Mockito.mock(EntityService.class); EntityChangeEventGeneratorRegistry entityChangeEventGeneratorRegistry = createEntityChangeEventGeneratorRegistry(); _entityChangeEventHook = - new EntityChangeEventGeneratorHook(entityChangeEventGeneratorRegistry, _mockClient, _mockAuthentication, - createMockEntityRegistry(), true); + new EntityChangeEventGeneratorHook(entityChangeEventGeneratorRegistry, _mockClient, createMockEntityRegistry(), true); } @Test @@ -498,8 +494,7 @@ public void testInvokeDataProcessInstanceRunEventStart() throws Exception { final EntityResponse entityResponse = buildEntityResponse(ImmutableMap.of(DATA_PROCESS_INSTANCE_RELATIONSHIPS_ASPECT_NAME, relationships)); - Mockito.when(_mockClient.getV2(eq(DATA_PROCESS_INSTANCE_ENTITY_NAME), eq(dataProcessInstanceUrn), - any(), eq(_mockAuthentication))).thenReturn(entityResponse); + Mockito.when(_mockClient.getV2(eq(dataProcessInstanceUrn), any())).thenReturn(entityResponse); _entityChangeEventHook.invoke(event); @@ -540,8 +535,7 @@ public void testInvokeDataProcessInstanceRunEventComplete() throws Exception { final EntityResponse entityResponse = buildEntityResponse(ImmutableMap.of(DATA_PROCESS_INSTANCE_RELATIONSHIPS_ASPECT_NAME, relationships)); - Mockito.when(_mockClient.getV2(eq(DATA_PROCESS_INSTANCE_ENTITY_NAME), eq(dataProcessInstanceUrn), - any(), eq(_mockAuthentication))).thenReturn(entityResponse); + Mockito.when(_mockClient.getV2(eq(dataProcessInstanceUrn), any())).thenReturn(entityResponse); _entityChangeEventHook.invoke(event); @@ -618,7 +612,7 @@ private EntityChangeEventGeneratorRegistry createEntityChangeEventGeneratorRegis // Run change event generators registry.register(ASSERTION_RUN_EVENT_ASPECT_NAME, new AssertionRunEventChangeEventGenerator()); registry.register(DATA_PROCESS_INSTANCE_RUN_EVENT_ASPECT_NAME, - new DataProcessInstanceRunEventChangeEventGenerator(_mockClient, _mockAuthentication)); + new DataProcessInstanceRunEventChangeEventGenerator(_mockClient)); return registry; } @@ -668,14 +662,14 @@ private EntityRegistry createMockEntityRegistry() { return registry; } - private void verifyProducePlatformEvent(EntityClient mockClient, PlatformEvent platformEvent) throws Exception { + private void verifyProducePlatformEvent(SystemRestliEntityClient mockClient, PlatformEvent platformEvent) throws Exception { verifyProducePlatformEvent(mockClient, platformEvent, true); } - private void verifyProducePlatformEvent(EntityClient mockClient, PlatformEvent platformEvent, boolean noMoreInteractions) throws Exception { + private void verifyProducePlatformEvent(SystemRestliEntityClient mockClient, PlatformEvent platformEvent, boolean noMoreInteractions) throws Exception { // Verify event has been emitted. 
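The EntityHydrator hunk above now asks the entity registry for an entity's aspect names and drops anything listed in EXCLUDED_ASPECTS before calling batchGetV2. A small, self-contained sketch of that filtering idiom follows; the plain Map registry and the aspect names here are stand-ins for illustration, not the real EntityRegistry API.

```java
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

// Illustrative only: a Map stands in for the entity registry, and the excluded-aspect
// set mirrors the idea of the EXCLUDED_ASPECTS constant introduced in EntityHydratorConfig.
public class AspectFilterSketch {
  static final Set<String> EXCLUDED = Set.of("upstreamLineage", "dataJobInputOutput", "inputFields");

  static Set<String> aspectsToFetch(Map<String, List<String>> registry, String entityType) {
    // Unknown entity type resolves to an empty set, so the caller fetches nothing rather than failing.
    return registry.getOrDefault(entityType, List.of()).stream()
        .filter(aspect -> !EXCLUDED.contains(aspect))
        .collect(Collectors.toSet());
  }

  public static void main(String[] args) {
    Map<String, List<String>> registry = Map.of(
        "dataset", List.of("datasetKey", "datasetProperties", "upstreamLineage"));
    System.out.println(aspectsToFetch(registry, "dataset")); // [datasetKey, datasetProperties] (order may vary)
  }
}
```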
verify(mockClient, Mockito.times(1)).producePlatformEvent(eq(CHANGE_EVENT_PLATFORM_EVENT_NAME), Mockito.anyString(), - argThat(new PlatformEventMatcher(platformEvent)), Mockito.any(Authentication.class)); + argThat(new PlatformEventMatcher(platformEvent))); if (noMoreInteractions) { Mockito.verifyNoMoreInteractions(_mockClient); diff --git a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/siblings/SiblingAssociationHookTest.java b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/siblings/SiblingAssociationHookTest.java index 78d304d67bfc0..6a2a05aa4b8c0 100644 --- a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/siblings/SiblingAssociationHookTest.java +++ b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/siblings/SiblingAssociationHookTest.java @@ -1,6 +1,5 @@ package com.linkedin.metadata.kafka.hook.siblings; -import com.datahub.authentication.Authentication; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; import com.linkedin.common.FabricType; @@ -19,7 +18,7 @@ import com.linkedin.entity.EntityResponse; import com.linkedin.entity.EnvelopedAspect; import com.linkedin.entity.EnvelopedAspectMap; -import com.linkedin.entity.client.RestliEntityClient; +import com.linkedin.entity.client.SystemRestliEntityClient; import com.linkedin.events.metadata.ChangeType; import com.linkedin.metadata.key.DatasetKey; import com.linkedin.metadata.models.registry.ConfigEntityRegistry; @@ -44,19 +43,16 @@ public class SiblingAssociationHookTest { private SiblingAssociationHook _siblingAssociationHook; - RestliEntityClient _mockEntityClient; + SystemRestliEntityClient _mockEntityClient; EntitySearchService _mockSearchService; - Authentication _mockAuthentication; @BeforeMethod public void setupTest() { EntityRegistry registry = new ConfigEntityRegistry( SiblingAssociationHookTest.class.getClassLoader().getResourceAsStream("test-entity-registry-siblings.yml")); - _mockEntityClient = Mockito.mock(RestliEntityClient.class); + _mockEntityClient = Mockito.mock(SystemRestliEntityClient.class); _mockSearchService = Mockito.mock(EntitySearchService.class); - _mockAuthentication = Mockito.mock(Authentication.class); - _siblingAssociationHook = new SiblingAssociationHook(registry, _mockEntityClient, _mockSearchService, _mockAuthentication, - true); + _siblingAssociationHook = new SiblingAssociationHook(registry, _mockEntityClient, _mockSearchService, true); _siblingAssociationHook.setEnabled(true); } @@ -69,15 +65,13 @@ public void testInvokeWhenThereIsAPairWithDbtSourceNode() throws Exception { EntityResponse mockResponse = new EntityResponse(); mockResponse.setAspects(mockResponseMap); - Mockito.when(_mockEntityClient.exists(Mockito.any(), Mockito.any())).thenReturn(true); + Mockito.when(_mockEntityClient.exists(Mockito.any())).thenReturn(true); Mockito.when( _mockEntityClient.getV2( - DATASET_ENTITY_NAME, Urn.createFromString("urn:li:dataset:(urn:li:dataPlatform:dbt,my-proj.jaffle_shop.customers,PROD)"), - ImmutableSet.of(SUB_TYPES_ASPECT_NAME), - _mockAuthentication + ImmutableSet.of(SUB_TYPES_ASPECT_NAME) )).thenReturn(mockResponse); @@ -105,10 +99,7 @@ public void testInvokeWhenThereIsAPairWithDbtSourceNode() throws Exception { proposal.setAspect(GenericRecordUtils.serializeAspect(dbtSiblingsAspect)); proposal.setChangeType(ChangeType.UPSERT); - Mockito.verify(_mockEntityClient, Mockito.times(1)).ingestProposal( - Mockito.eq(proposal), - 
Mockito.eq(_mockAuthentication) - ); + Mockito.verify(_mockEntityClient, Mockito.times(1)).ingestProposal(Mockito.eq(proposal), eq(true)); final Siblings sourceSiblingsAspect = new Siblings() .setSiblings(new UrnArray(ImmutableList.of(Urn.createFromString("urn:li:dataset:(urn:li:dataPlatform:dbt,my-proj.jaffle_shop.customers,PROD)")))) @@ -121,10 +112,7 @@ public void testInvokeWhenThereIsAPairWithDbtSourceNode() throws Exception { proposal2.setAspect(GenericRecordUtils.serializeAspect(sourceSiblingsAspect)); proposal2.setChangeType(ChangeType.UPSERT); - Mockito.verify(_mockEntityClient, Mockito.times(1)).ingestProposal( - Mockito.eq(proposal2), - Mockito.eq(_mockAuthentication) - ); + Mockito.verify(_mockEntityClient, Mockito.times(1)).ingestProposal(Mockito.eq(proposal2), eq(true)); } @Test @@ -132,23 +120,20 @@ public void testInvokeWhenThereIsNoPairWithDbtModel() throws Exception { SubTypes mockSourceSubtypesAspect = new SubTypes(); mockSourceSubtypesAspect.setTypeNames(new StringArray(ImmutableList.of("model"))); - Mockito.when(_mockEntityClient.exists(Mockito.any(), Mockito.any())).thenReturn(true); + Mockito.when(_mockEntityClient.exists(Mockito.any())).thenReturn(true); EnvelopedAspectMap mockResponseMap = new EnvelopedAspectMap(); mockResponseMap.put(SUB_TYPES_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(mockSourceSubtypesAspect.data()))); EntityResponse mockResponse = new EntityResponse(); mockResponse.setAspects(mockResponseMap); - Mockito.when(_mockEntityClient.exists(Mockito.any(), Mockito.any())).thenReturn(true); + Mockito.when(_mockEntityClient.exists(Mockito.any())).thenReturn(true); Mockito.when( _mockEntityClient.getV2( - DATASET_ENTITY_NAME, Urn.createFromString("urn:li:dataset:(urn:li:dataPlatform:dbt,my-proj.jaffle_shop.customers,PROD)"), - ImmutableSet.of(SUB_TYPES_ASPECT_NAME), - _mockAuthentication - )).thenReturn(mockResponse); + ImmutableSet.of(SUB_TYPES_ASPECT_NAME))).thenReturn(mockResponse); MetadataChangeLog event = createEvent(DATASET_ENTITY_NAME, UPSTREAM_LINEAGE_ASPECT_NAME, ChangeType.UPSERT); Upstream upstream = createUpstream("urn:li:dataset:(urn:li:dataPlatform:bigquery,my-proj.jaffle_shop.customers,PROD)", DatasetLineageType.TRANSFORMED); @@ -174,15 +159,12 @@ public void testInvokeWhenThereIsNoPairWithDbtModel() throws Exception { proposal.setAspect(GenericRecordUtils.serializeAspect(dbtSiblingsAspect)); proposal.setChangeType(ChangeType.UPSERT); - Mockito.verify(_mockEntityClient, Mockito.times(0)).ingestProposal( - Mockito.eq(proposal), - Mockito.eq(_mockAuthentication) - ); + Mockito.verify(_mockEntityClient, Mockito.times(0)).ingestProposal(Mockito.eq(proposal), eq(true)); } @Test public void testInvokeWhenThereIsAPairWithBigqueryDownstreamNode() throws Exception { - Mockito.when(_mockEntityClient.exists(Mockito.any(), Mockito.any())).thenReturn(true); + Mockito.when(_mockEntityClient.exists(Mockito.any())).thenReturn(true); MetadataChangeLog event = createEvent(DATASET_ENTITY_NAME, UPSTREAM_LINEAGE_ASPECT_NAME, ChangeType.UPSERT); @@ -208,10 +190,7 @@ public void testInvokeWhenThereIsAPairWithBigqueryDownstreamNode() throws Except proposal.setAspect(GenericRecordUtils.serializeAspect(dbtSiblingsAspect)); proposal.setChangeType(ChangeType.UPSERT); - Mockito.verify(_mockEntityClient, Mockito.times(1)).ingestProposal( - Mockito.eq(proposal), - Mockito.eq(_mockAuthentication) - ); + Mockito.verify(_mockEntityClient, Mockito.times(1)).ingestProposal(Mockito.eq(proposal), eq(true)); final Siblings sourceSiblingsAspect = new Siblings() 
.setSiblings(new UrnArray(ImmutableList.of(Urn.createFromString("urn:li:dataset:(urn:li:dataPlatform:dbt,my-proj.jaffle_shop.customers,PROD)")))) @@ -224,15 +203,12 @@ public void testInvokeWhenThereIsAPairWithBigqueryDownstreamNode() throws Except proposal2.setAspect(GenericRecordUtils.serializeAspect(sourceSiblingsAspect)); proposal2.setChangeType(ChangeType.UPSERT); - Mockito.verify(_mockEntityClient, Mockito.times(1)).ingestProposal( - Mockito.eq(proposal2), - Mockito.eq(_mockAuthentication) - ); + Mockito.verify(_mockEntityClient, Mockito.times(1)).ingestProposal(Mockito.eq(proposal2), eq(true)); } @Test public void testInvokeWhenThereIsAKeyBeingReingested() throws Exception { - Mockito.when(_mockEntityClient.exists(Mockito.any(), Mockito.any())).thenReturn(true); + Mockito.when(_mockEntityClient.exists(Mockito.any())).thenReturn(true); SearchResult returnSearchResult = new SearchResult(); SearchEntityArray returnEntityArray = new SearchEntityArray(); @@ -271,10 +247,7 @@ public void testInvokeWhenThereIsAKeyBeingReingested() throws Exception { proposal.setAspect(GenericRecordUtils.serializeAspect(dbtSiblingsAspect)); proposal.setChangeType(ChangeType.UPSERT); - Mockito.verify(_mockEntityClient, Mockito.times(1)).ingestProposal( - Mockito.eq(proposal), - Mockito.eq(_mockAuthentication) - ); + Mockito.verify(_mockEntityClient, Mockito.times(1)).ingestProposal(Mockito.eq(proposal), eq(true)); final Siblings sourceSiblingsAspect = new Siblings() .setSiblings(new UrnArray(ImmutableList.of(Urn.createFromString("urn:li:dataset:(urn:li:dataPlatform:dbt,my-proj.jaffle_shop.customers,PROD)")))) @@ -287,10 +260,7 @@ public void testInvokeWhenThereIsAKeyBeingReingested() throws Exception { proposal2.setAspect(GenericRecordUtils.serializeAspect(sourceSiblingsAspect)); proposal2.setChangeType(ChangeType.UPSERT); - Mockito.verify(_mockEntityClient, Mockito.times(1)).ingestProposal( - Mockito.eq(proposal2), - Mockito.eq(_mockAuthentication) - ); + Mockito.verify(_mockEntityClient, Mockito.times(1)).ingestProposal(Mockito.eq(proposal2), eq(true)); } @Test public void testInvokeWhenSourceUrnHasTwoDbtUpstreams() throws Exception { @@ -309,10 +279,7 @@ public void testInvokeWhenSourceUrnHasTwoDbtUpstreams() throws Exception { _siblingAssociationHook.invoke(event); - Mockito.verify(_mockEntityClient, Mockito.times(0)).ingestProposal( - Mockito.any(), - Mockito.eq(_mockAuthentication) - ); + Mockito.verify(_mockEntityClient, Mockito.times(0)).ingestProposal(Mockito.any(), eq(true)); } @@ -335,12 +302,7 @@ public void testInvokeWhenSourceUrnHasTwoUpstreamsOneDbt() throws Exception { _siblingAssociationHook.invoke(event); - Mockito.verify(_mockEntityClient, Mockito.times(2)).ingestProposal( - Mockito.any(), - Mockito.eq(_mockAuthentication) - ); - - + Mockito.verify(_mockEntityClient, Mockito.times(2)).ingestProposal(Mockito.any(), eq(true)); } private MetadataChangeLog createEvent(String entityType, String aspectName, ChangeType changeType) { diff --git a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringTestConfiguration.java b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringTestConfiguration.java index ef80c49ec4520..dc5a6cd23295b 100644 --- a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringTestConfiguration.java +++ b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringTestConfiguration.java @@ -2,7 +2,7 @@ import 
com.datahub.authentication.Authentication; import com.datahub.metadata.ingestion.IngestionScheduler; -import com.linkedin.entity.client.RestliEntityClient; +import com.linkedin.entity.client.SystemRestliEntityClient; import com.linkedin.gms.factory.kafka.schemaregistry.SchemaRegistryConfig; import com.linkedin.metadata.boot.kafka.DataHubUpgradeKafkaListener; import com.linkedin.metadata.graph.elastic.ElasticSearchGraphService; @@ -44,8 +44,8 @@ public class MCLSpringTestConfiguration { @MockBean public IngestionScheduler ingestionScheduler; - @MockBean - public RestliEntityClient entityClient; + @MockBean(name = "systemRestliEntityClient") + public SystemRestliEntityClient entityClient; @MockBean public ElasticSearchService searchService; diff --git a/metadata-jobs/mce-consumer-job/src/main/java/com/linkedin/metadata/kafka/MceConsumerApplication.java b/metadata-jobs/mce-consumer-job/src/main/java/com/linkedin/metadata/kafka/MceConsumerApplication.java index 9b4fe15c11fc5..f0c59240a9ba4 100644 --- a/metadata-jobs/mce-consumer-job/src/main/java/com/linkedin/metadata/kafka/MceConsumerApplication.java +++ b/metadata-jobs/mce-consumer-job/src/main/java/com/linkedin/metadata/kafka/MceConsumerApplication.java @@ -1,8 +1,8 @@ package com.linkedin.metadata.kafka; import com.linkedin.gms.factory.entity.RestliEntityClientFactory; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; import com.linkedin.gms.factory.telemetry.ScheduledAnalyticsFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import org.springframework.boot.SpringApplication; import org.springframework.boot.actuate.autoconfigure.solr.SolrHealthContributorAutoConfiguration; import org.springframework.boot.autoconfigure.SpringBootApplication; diff --git a/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeEventsProcessor.java b/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeEventsProcessor.java index 74679d30b2945..c30dd6e6f96dc 100644 --- a/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeEventsProcessor.java +++ b/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeEventsProcessor.java @@ -4,8 +4,7 @@ import com.codahale.metrics.MetricRegistry; import com.datahub.authentication.Authentication; import com.linkedin.entity.Entity; -import com.linkedin.entity.client.RestliEntityClient; -import com.linkedin.gms.factory.auth.SystemAuthenticationFactory; +import com.linkedin.entity.client.SystemRestliEntityClient; import com.linkedin.gms.factory.entity.RestliEntityClientFactory; import com.linkedin.gms.factory.kafka.KafkaEventConsumerFactory; import com.linkedin.gms.factory.kafka.DataHubKafkaProducerFactory; @@ -40,15 +39,14 @@ @Slf4j @Component @Conditional(MetadataChangeProposalProcessorCondition.class) -@Import({RestliEntityClientFactory.class, SystemAuthenticationFactory.class, KafkaEventConsumerFactory.class, - DataHubKafkaProducerFactory.class}) +@Import({RestliEntityClientFactory.class, KafkaEventConsumerFactory.class, DataHubKafkaProducerFactory.class}) @EnableKafka @RequiredArgsConstructor public class MetadataChangeEventsProcessor { @NonNull private final Authentication systemAuthentication; - private final RestliEntityClient entityClient; + private final SystemRestliEntityClient entityClient; private final Producer kafkaProducer; private final Histogram kafkaLagStats = MetricUtils.get().histogram(MetricRegistry.name(this.getClass(), "kafkaLag")); diff --git 
a/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeProposalsProcessor.java b/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeProposalsProcessor.java index 289d70ef8c0e9..79f8c90af8ec7 100644 --- a/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeProposalsProcessor.java +++ b/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeProposalsProcessor.java @@ -2,9 +2,7 @@ import com.codahale.metrics.Histogram; import com.codahale.metrics.MetricRegistry; -import com.datahub.authentication.Authentication; -import com.linkedin.entity.client.RestliEntityClient; -import com.linkedin.gms.factory.auth.SystemAuthenticationFactory; +import com.linkedin.entity.client.SystemRestliEntityClient; import com.linkedin.gms.factory.entity.RestliEntityClientFactory; import com.linkedin.gms.factory.kafka.KafkaEventConsumerFactory; import com.linkedin.gms.factory.kafka.DataHubKafkaProducerFactory; @@ -35,15 +33,13 @@ @Slf4j @Component -@Import({RestliEntityClientFactory.class, SystemAuthenticationFactory.class, KafkaEventConsumerFactory.class, - DataHubKafkaProducerFactory.class}) +@Import({RestliEntityClientFactory.class, KafkaEventConsumerFactory.class, DataHubKafkaProducerFactory.class}) @Conditional(MetadataChangeProposalProcessorCondition.class) @EnableKafka @RequiredArgsConstructor public class MetadataChangeProposalsProcessor { - private final Authentication systemAuthentication; - private final RestliEntityClient entityClient; + private final SystemRestliEntityClient entityClient; private final Producer kafkaProducer; private final Histogram kafkaLagStats = MetricUtils.get().histogram(MetricRegistry.name(this.getClass(), "kafkaLag")); @@ -64,7 +60,7 @@ public void consume(final ConsumerRecord consumerRecord) event = EventUtils.avroToPegasusMCP(record); log.debug("MetadataChangeProposal {}", event); // TODO: Get this from the event itself. 
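The removed and added ingestProposal calls just below show the call-site effect of this refactor: consumers and hooks now hold a single system-bound client instead of a plain client plus a separate system Authentication bean. A minimal, hypothetical sketch of that wrapper pattern; the class and method names are illustrative and are not the actual SystemRestliEntityClient API.

```java
import java.util.Objects;
import java.util.Set;

// Hypothetical stand-ins: "Client" plays the role of the plain entity client that
// still needs explicit credentials; "SystemClient" binds the system credential once.
interface Client {
  Object getV2(String entityName, String urn, Set<String> aspects, String authToken) throws Exception;
}

final class SystemClient {
  private final Client delegate;
  private final String systemAuthToken;

  SystemClient(Client delegate, String systemAuthToken) {
    this.delegate = Objects.requireNonNull(delegate);
    this.systemAuthToken = Objects.requireNonNull(systemAuthToken);
  }

  // Call sites shrink to (urn, aspects): no entity name, no credential argument.
  Object getV2(String urn, Set<String> aspects) throws Exception {
    return delegate.getV2(entityTypeOf(urn), urn, aspects, systemAuthToken);
  }

  // Simplified URN parsing for illustration only, e.g. "urn:li:dataset:(...)" -> "dataset".
  private static String entityTypeOf(String urn) {
    String[] parts = urn.split(":", 4);
    return parts.length > 2 ? parts[2] : urn;
  }
}
```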
-      entityClient.ingestProposal(event, this.systemAuthentication, false);
+      entityClient.ingestProposal(event, false);
     } catch (Throwable throwable) {
       log.error("MCP Processor Error", throwable);
       log.error("Message: {}", record);
diff --git a/metadata-service/configuration/build.gradle b/metadata-service/configuration/build.gradle
index 30fa3079d29a4..bf79469633b0f 100644
--- a/metadata-service/configuration/build.gradle
+++ b/metadata-service/configuration/build.gradle
@@ -7,6 +7,7 @@ dependencies {
   implementation externalDependency.slf4jApi
   implementation externalDependency.springCore
+  implementation externalDependency.springBeans
   compileOnly externalDependency.lombok
diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/cache/CacheConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/cache/CacheConfiguration.java
index 38934cb9a3d2f..aff0e23e3b337 100644
--- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/cache/CacheConfiguration.java
+++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/cache/CacheConfiguration.java
@@ -1,5 +1,6 @@
 package com.linkedin.metadata.config.cache;
+import com.linkedin.metadata.config.cache.client.ClientCacheConfiguration;
 import lombok.Data;
@@ -8,4 +9,5 @@ public class CacheConfiguration {
   PrimaryCacheConfiguration primary;
   HomepageCacheConfiguration homepage;
   SearchCacheConfiguration search;
+  ClientCacheConfiguration client;
 }
diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/cache/client/ClientCacheConfig.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/cache/client/ClientCacheConfig.java
new file mode 100644
index 0000000000000..3cf7ef20797bb
--- /dev/null
+++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/cache/client/ClientCacheConfig.java
@@ -0,0 +1,10 @@
+package com.linkedin.metadata.config.cache.client;
+
+
+public interface ClientCacheConfig {
+  boolean isEnabled();
+  boolean isStatsEnabled();
+  int getStatsIntervalSeconds();
+  int getDefaultTTLSeconds();
+  int getMaxBytes();
+}
diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/cache/client/ClientCacheConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/cache/client/ClientCacheConfiguration.java
new file mode 100644
index 0000000000000..d940bbe135e55
--- /dev/null
+++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/cache/client/ClientCacheConfiguration.java
@@ -0,0 +1,9 @@
+package com.linkedin.metadata.config.cache.client;
+
+import lombok.Data;
+
+@Data
+public class ClientCacheConfiguration {
+  EntityClientCacheConfig entityClient;
+  UsageClientCacheConfig usageClient;
+}
diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/cache/client/EntityClientCacheConfig.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/cache/client/EntityClientCacheConfig.java
new file mode 100644
index 0000000000000..595b614f2f599
--- /dev/null
+++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/cache/client/EntityClientCacheConfig.java
@@ -0,0 +1,17 @@
+package com.linkedin.metadata.config.cache.client;
+
+import lombok.Data;
+
+import java.util.Map;
+
+@Data
+public class EntityClientCacheConfig implements ClientCacheConfig {
+  private boolean enabled;
+  private boolean statsEnabled;
+  private int statsIntervalSeconds;
+  private int defaultTTLSeconds;
+  private int maxBytes;
+
+  // entityName -> aspectName -> cache ttl override
+  private Map<String, Map<String, Integer>> entityAspectTTLSeconds;
+}
diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/cache/client/UsageClientCacheConfig.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/cache/client/UsageClientCacheConfig.java
new file mode 100644
index 0000000000000..3aebec9422ed8
--- /dev/null
+++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/cache/client/UsageClientCacheConfig.java
@@ -0,0 +1,12 @@
+package com.linkedin.metadata.config.cache.client;
+
+import lombok.Data;
+
+@Data
+public class UsageClientCacheConfig implements ClientCacheConfig {
+  private boolean enabled;
+  private boolean statsEnabled;
+  private int statsIntervalSeconds;
+  private int defaultTTLSeconds;
+  private int maxBytes;
+}
diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/spring/YamlPropertySourceFactory.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/spring/YamlPropertySourceFactory.java
similarity index 87%
rename from metadata-service/factories/src/main/java/com/linkedin/gms/factory/spring/YamlPropertySourceFactory.java
rename to metadata-service/configuration/src/main/java/com/linkedin/metadata/spring/YamlPropertySourceFactory.java
index 1542407697d1b..c10399c4f3e70 100644
--- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/spring/YamlPropertySourceFactory.java
+++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/spring/YamlPropertySourceFactory.java
@@ -1,14 +1,18 @@
-package com.linkedin.gms.factory.spring;
+package com.linkedin.metadata.spring;
-import java.io.IOException;
-import java.util.Properties;
 import org.springframework.beans.factory.config.YamlPropertiesFactoryBean;
 import org.springframework.core.env.PropertiesPropertySource;
 import org.springframework.core.env.PropertySource;
 import org.springframework.core.io.support.EncodedResource;
 import org.springframework.core.io.support.PropertySourceFactory;
+import java.io.IOException;
+import java.util.Properties;
+
+/**
+ * Required for Spring to parse the application.yml provided by this module
+ */
 public class YamlPropertySourceFactory implements PropertySourceFactory {
   @Override
diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml
index ea959bebf25ad..42749d8205d21 100644
--- a/metadata-service/configuration/src/main/resources/application.yml
+++ b/metadata-service/configuration/src/main/resources/application.yml
@@ -327,3 +327,27 @@ cache:
     lineage:
       ttlSeconds: ${CACHE_SEARCH_LINEAGE_TTL_SECONDS:86400} # 1 day
       lightningThreshold: ${CACHE_SEARCH_LINEAGE_LIGHTNING_THRESHOLD:300}
+  client:
+    usageClient:
+      enabled: ${CACHE_CLIENT_USAGE_CLIENT_ENABLED:true}
+      statsEnabled: ${CACHE_CLIENT_USAGE_CLIENT_STATS_ENABLED:true}
+      statsIntervalSeconds: ${CACHE_CLIENT_USAGE_CLIENT_STATS_INTERVAL_SECONDS:120}
+      defaultTTLSeconds: ${CACHE_CLIENT_USAGE_CLIENT_TTL_SECONDS:86400} # 1 day
+      maxBytes: ${CACHE_CLIENT_USAGE_CLIENT_MAX_BYTES:52428800} # 50MB
+    entityClient:
+      enabled: ${CACHE_CLIENT_ENTITY_CLIENT_ENABLED:true}
+      statsEnabled: ${CACHE_CLIENT_ENTITY_CLIENT_STATS_ENABLED:true}
+      statsIntervalSeconds: ${CACHE_CLIENT_ENTITY_CLIENT_STATS_INTERVAL_SECONDS:120}
+      defaultTTLSeconds: ${CACHE_CLIENT_ENTITY_CLIENT_TTL_SECONDS:0} # do not cache entity/aspects by default
+      maxBytes: ${CACHE_CLIENT_USAGE_ENTITY_MAX_BYTES:104857600} # 100MB
+      entityAspectTTLSeconds:
+        # cache user aspects for 20s
+        corpuser:
+          corpUserKey: 20
+          corpUserInfo: 20
+          corpUserEditableInfo: 20
+          corpUserStatus: 20
+          globalTags: 20
+          status: 20
+          corpUserCredentials: 20
+          corpUserSettings: 20
diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/AuthorizerChainFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/AuthorizerChainFactory.java
index ed072398178de..bf50a0c7b6473 100644
--- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/AuthorizerChainFactory.java
+++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/AuthorizerChainFactory.java
@@ -19,7 +19,7 @@
 import com.datahub.plugins.loader.PluginPermissionManagerImpl;
 import com.google.common.collect.ImmutableMap;
 import com.linkedin.gms.factory.config.ConfigurationProvider;
-import com.linkedin.gms.factory.spring.YamlPropertySourceFactory;
+import com.linkedin.metadata.spring.YamlPropertySourceFactory;
 import com.linkedin.metadata.client.JavaEntityClient;
 import java.nio.file.Path;
 import java.nio.file.Paths;
diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubAuthorizerFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubAuthorizerFactory.java
index 30e03d87a8b56..5b298a453547a 100644
--- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubAuthorizerFactory.java
+++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubAuthorizerFactory.java
@@ -4,7 +4,7 @@
 import com.datahub.authorization.DataHubAuthorizer;
 import com.linkedin.metadata.client.JavaEntityClient;
 import com.linkedin.gms.factory.entity.RestliEntityClientFactory;
-import com.linkedin.gms.factory.spring.YamlPropertySourceFactory;
+import com.linkedin.metadata.spring.YamlPropertySourceFactory;
 import javax.annotation.Nonnull;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Qualifier;
diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubTokenServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubTokenServiceFactory.java
index fc010a1aa2cae..6b2a61882be90 100644
--- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubTokenServiceFactory.java
+++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubTokenServiceFactory.java
@@ -1,9 +1,10 @@
 package com.linkedin.gms.factory.auth;
 import com.datahub.authentication.token.StatefulTokenService;
-import com.linkedin.gms.factory.spring.YamlPropertySourceFactory;
 import com.linkedin.metadata.entity.EntityService;
 import javax.annotation.Nonnull;
+
+import com.linkedin.metadata.spring.YamlPropertySourceFactory;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Qualifier;
 import org.springframework.beans.factory.annotation.Value;
diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/GroupServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/GroupServiceFactory.java
index 9d29b8e77d02d..57598abf8095d 100644
--- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/GroupServiceFactory.java
+++
b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/GroupServiceFactory.java @@ -4,7 +4,7 @@ import com.datahub.authentication.group.GroupService; import com.linkedin.metadata.client.JavaEntityClient; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.graph.GraphClient; import javax.annotation.Nonnull; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/InviteTokenServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/InviteTokenServiceFactory.java index 47f7ef0e0c1eb..105f4c677a9e4 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/InviteTokenServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/InviteTokenServiceFactory.java @@ -1,7 +1,7 @@ package com.linkedin.gms.factory.auth; import com.datahub.authentication.invite.InviteTokenService; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.client.JavaEntityClient; import com.linkedin.metadata.secret.SecretService; import javax.annotation.Nonnull; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/NativeUserServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/NativeUserServiceFactory.java index ca52420b440b2..3df499ea9392e 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/NativeUserServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/NativeUserServiceFactory.java @@ -4,7 +4,7 @@ import com.datahub.authentication.user.NativeUserService; import com.linkedin.metadata.client.JavaEntityClient; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.secret.SecretService; import javax.annotation.Nonnull; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/PostServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/PostServiceFactory.java index 8e5e5e5cfc667..cc6f5c8272f9d 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/PostServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/PostServiceFactory.java @@ -1,7 +1,7 @@ package com.linkedin.gms.factory.auth; import com.datahub.authentication.post.PostService; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.client.JavaEntityClient; import javax.annotation.Nonnull; import org.springframework.beans.factory.annotation.Autowired; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/RoleServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/RoleServiceFactory.java index 42f3e797c33bd..8a85f63cdd66d 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/RoleServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/RoleServiceFactory.java @@ -3,7 +3,7 @@ package com.linkedin.gms.factory.auth; 
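The cache.client block added to application.yml above configures a default TTL plus per-entity, per-aspect overrides under entityAspectTTLSeconds. Assuming the override-if-present-else-default resolution that the YAML comments suggest, here is a tiny sketch with plain maps standing in for the Spring-bound config classes; the method name and structure are illustrative only.

```java
import java.util.Map;

// Hypothetical TTL resolution for a client-side cache: per-entity/per-aspect override,
// falling back to the configured default. Mirrors the shape of entityAspectTTLSeconds.
public class CacheTtlSketch {
  static int effectiveTtlSeconds(Map<String, Map<String, Integer>> overrides,
      int defaultTtlSeconds, String entityName, String aspectName) {
    return overrides
        .getOrDefault(entityName, Map.of())
        .getOrDefault(aspectName, defaultTtlSeconds);
  }

  public static void main(String[] args) {
    Map<String, Map<String, Integer>> overrides =
        Map.of("corpuser", Map.of("corpUserInfo", 20, "corpUserKey", 20));
    System.out.println(effectiveTtlSeconds(overrides, 0, "corpuser", "corpUserInfo"));       // 20
    System.out.println(effectiveTtlSeconds(overrides, 0, "dataset", "datasetProperties"));   // 0 (default: no caching)
  }
}
```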
import com.datahub.authorization.role.RoleService; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.client.JavaEntityClient; import javax.annotation.Nonnull; import org.springframework.beans.factory.annotation.Autowired; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/SystemAuthenticationFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/SystemAuthenticationFactory.java index d6c171dc741e4..5bdd8cbf83c65 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/SystemAuthenticationFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/SystemAuthenticationFactory.java @@ -3,7 +3,7 @@ import com.datahub.authentication.Actor; import com.datahub.authentication.ActorType; import com.datahub.authentication.Authentication; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import javax.annotation.Nonnull; import lombok.Data; import org.springframework.beans.factory.annotation.Value; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchGraphServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchGraphServiceFactory.java index c1c5acbc1fddc..51c7db5e37366 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchGraphServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchGraphServiceFactory.java @@ -3,7 +3,7 @@ import com.linkedin.gms.factory.config.ConfigurationProvider; import com.linkedin.gms.factory.entityregistry.EntityRegistryFactory; import com.linkedin.gms.factory.search.BaseElasticSearchComponentsFactory; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.models.registry.LineageRegistry; import com.linkedin.metadata.graph.elastic.ESGraphQueryDAO; import com.linkedin.metadata.graph.elastic.ESGraphWriteDAO; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchSystemMetadataServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchSystemMetadataServiceFactory.java index 89f196b056ee0..504618ba9cc6a 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchSystemMetadataServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchSystemMetadataServiceFactory.java @@ -1,7 +1,7 @@ package com.linkedin.gms.factory.common; import com.linkedin.gms.factory.search.BaseElasticSearchComponentsFactory; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.systemmetadata.ESSystemMetadataDAO; import com.linkedin.metadata.systemmetadata.ElasticSearchSystemMetadataService; import javax.annotation.Nonnull; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticsearchSSLContextFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticsearchSSLContextFactory.java index d57da336429d9..0dce80b98964b 100644 --- 
a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticsearchSSLContextFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticsearchSSLContextFactory.java @@ -1,6 +1,6 @@ package com.linkedin.gms.factory.common; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import org.apache.http.ssl.SSLContextBuilder; import org.springframework.beans.factory.annotation.Value; import org.springframework.context.annotation.Bean; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/GraphServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/GraphServiceFactory.java index 02e31c7dc4f57..94593eb1fb84c 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/GraphServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/GraphServiceFactory.java @@ -1,6 +1,6 @@ package com.linkedin.gms.factory.common; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.graph.GraphService; import com.linkedin.metadata.graph.neo4j.Neo4jGraphService; import com.linkedin.metadata.graph.elastic.ElasticSearchGraphService; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/IndexConventionFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/IndexConventionFactory.java index a2816830f33ce..ada8466d302e6 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/IndexConventionFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/IndexConventionFactory.java @@ -1,6 +1,6 @@ package com.linkedin.gms.factory.common; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; import org.springframework.beans.factory.annotation.Value; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/LocalEbeanServerConfigFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/LocalEbeanServerConfigFactory.java index 5ab5b14160e27..6bf8ff123b221 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/LocalEbeanServerConfigFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/LocalEbeanServerConfigFactory.java @@ -1,6 +1,6 @@ package com.linkedin.gms.factory.common; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.utils.metrics.MetricUtils; import io.ebean.config.ServerConfig; import io.ebean.datasource.DataSourceConfig; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/Neo4jDriverFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/Neo4jDriverFactory.java index a364504d443f7..65b6115d6638e 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/Neo4jDriverFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/Neo4jDriverFactory.java @@ -1,6 +1,6 @@ 
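The factory classes above and below only swap their import to the relocated YamlPropertySourceFactory; the body of that class is not shown in this hunk. For reference, the conventional shape of such a Spring PropertySourceFactory looks like the sketch below; treat it as a sketch of the usual pattern, not a copy of the DataHub class.

```java
import org.springframework.beans.factory.config.YamlPropertiesFactoryBean;
import org.springframework.core.env.PropertiesPropertySource;
import org.springframework.core.env.PropertySource;
import org.springframework.core.io.support.EncodedResource;
import org.springframework.core.io.support.PropertySourceFactory;

import java.io.IOException;
import java.util.Properties;

// Sketch: converts a YAML resource into a Properties-backed PropertySource so that
// @PropertySource can point at application.yml instead of a .properties file.
public class YamlPropertySourceFactorySketch implements PropertySourceFactory {

  @Override
  public PropertySource<?> createPropertySource(String name, EncodedResource resource) throws IOException {
    YamlPropertiesFactoryBean factory = new YamlPropertiesFactoryBean();
    factory.setResources(resource.getResource());
    Properties properties = factory.getObject();
    // Fall back to the resource description when no explicit source name is given.
    String sourceName = (name != null) ? name : resource.getResource().getDescription();
    return new PropertiesPropertySource(sourceName, properties);
  }
}
```

Configuration classes then typically reference such a factory via `@PropertySource(value = "classpath:/application.yml", factory = YamlPropertySourceFactory.class)`.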
package com.linkedin.gms.factory.common; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import java.util.concurrent.TimeUnit; import org.neo4j.driver.AuthTokens; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/RestHighLevelClientFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/RestHighLevelClientFactory.java index 5f50b8f7f0508..3c40b30bfc7d1 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/RestHighLevelClientFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/RestHighLevelClientFactory.java @@ -1,7 +1,7 @@ package com.linkedin.gms.factory.common; import com.linkedin.gms.factory.auth.AwsRequestSigningApacheInterceptor; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import java.io.IOException; import javax.annotation.Nonnull; import javax.net.ssl.HostnameVerifier; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/config/ConfigurationProvider.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/config/ConfigurationProvider.java index e07630111a567..465480be344c7 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/config/ConfigurationProvider.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/config/ConfigurationProvider.java @@ -12,7 +12,7 @@ import com.linkedin.metadata.config.kafka.KafkaConfiguration; import com.linkedin.metadata.config.search.ElasticSearchConfiguration; import com.linkedin.datahub.graphql.featureflags.FeatureFlags; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.config.telemetry.TelemetryConfiguration; import lombok.Data; import org.springframework.boot.context.properties.ConfigurationProperties; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/dataproduct/DataProductServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/dataproduct/DataProductServiceFactory.java index c0f2c8e1f1223..6eab711603c52 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/dataproduct/DataProductServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/dataproduct/DataProductServiceFactory.java @@ -1,6 +1,6 @@ package com.linkedin.gms.factory.dataproduct; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.client.JavaEntityClient; import com.linkedin.metadata.graph.GraphClient; import com.linkedin.metadata.service.DataProductService; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entity/JavaEntityClientFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entity/JavaEntityClientFactory.java index c9c3953f4d998..e1c24b805437b 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entity/JavaEntityClientFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entity/JavaEntityClientFactory.java @@ -1,8 +1,11 @@ package com.linkedin.gms.factory.entity; +import com.datahub.authentication.Authentication; +import 
com.linkedin.gms.factory.config.ConfigurationProvider; import com.linkedin.metadata.client.JavaEntityClient; import com.linkedin.entity.client.RestliEntityClient; import com.linkedin.gms.factory.kafka.DataHubKafkaProducerFactory; +import com.linkedin.metadata.client.SystemJavaEntityClient; import com.linkedin.metadata.entity.DeleteEntityService; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.event.EventProducer; @@ -53,12 +56,8 @@ public class JavaEntityClientFactory { @Qualifier("kafkaEventProducer") private EventProducer _eventProducer; - @Autowired - @Qualifier("restliEntityClient") - private RestliEntityClient _restliEntityClient; - @Bean("javaEntityClient") - public JavaEntityClient getJavaEntityClient() { + public JavaEntityClient getJavaEntityClient(@Qualifier("restliEntityClient") final RestliEntityClient restliEntityClient) { return new JavaEntityClient( _entityService, _deleteEntityService, @@ -68,6 +67,24 @@ public JavaEntityClient getJavaEntityClient() { _lineageSearchService, _timeseriesAspectService, _eventProducer, - _restliEntityClient); + restliEntityClient); + } + + @Bean("systemJavaEntityClient") + public SystemJavaEntityClient systemJavaEntityClient(@Qualifier("configurationProvider") final ConfigurationProvider configurationProvider, + @Qualifier("systemAuthentication") final Authentication systemAuthentication, + @Qualifier("systemRestliEntityClient") final RestliEntityClient restliEntityClient) { + return new SystemJavaEntityClient( + _entityService, + _deleteEntityService, + _entitySearchService, + _cachingEntitySearchService, + _searchService, + _lineageSearchService, + _timeseriesAspectService, + _eventProducer, + restliEntityClient, + systemAuthentication, + configurationProvider.getCache().getClient().getEntityClient()); } } diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entity/RestliEntityClientFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entity/RestliEntityClientFactory.java index e149ecedfa6f6..dfc5e835392df 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entity/RestliEntityClientFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entity/RestliEntityClientFactory.java @@ -1,10 +1,14 @@ package com.linkedin.gms.factory.entity; +import com.datahub.authentication.Authentication; import com.linkedin.entity.client.RestliEntityClient; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.entity.client.SystemRestliEntityClient; +import com.linkedin.gms.factory.config.ConfigurationProvider; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.restli.DefaultRestliClientFactory; import com.linkedin.parseq.retry.backoff.ExponentialBackoff; import com.linkedin.restli.client.Client; +import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.beans.factory.annotation.Value; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; @@ -48,4 +52,17 @@ public RestliEntityClient getRestliEntityClient() { } return new RestliEntityClient(restClient, new ExponentialBackoff(retryInterval), numRetries); } + + @Bean("systemRestliEntityClient") + public SystemRestliEntityClient systemRestliEntityClient(@Qualifier("configurationProvider") final ConfigurationProvider configurationProvider, + @Qualifier("systemAuthentication") final Authentication 
systemAuthentication) { + final Client restClient; + if (gmsUri != null) { + restClient = DefaultRestliClientFactory.getRestLiClient(URI.create(gmsUri), gmsSslProtocol); + } else { + restClient = DefaultRestliClientFactory.getRestLiClient(gmsHost, gmsPort, gmsUseSSL, gmsSslProtocol); + } + return new SystemRestliEntityClient(restClient, new ExponentialBackoff(retryInterval), numRetries, + systemAuthentication, configurationProvider.getCache().getClient().getEntityClient()); + } } diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entity/RetentionServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entity/RetentionServiceFactory.java index b13bf5813d47e..ff56f19e4f8fd 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entity/RetentionServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entity/RetentionServiceFactory.java @@ -1,7 +1,7 @@ package com.linkedin.gms.factory.entity; import com.datastax.oss.driver.api.core.CqlSession; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.entity.RetentionService; import com.linkedin.metadata.entity.cassandra.CassandraRetentionService; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entityregistry/ConfigEntityRegistryFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entityregistry/ConfigEntityRegistryFactory.java index 471f079683d60..cda21f8907867 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entityregistry/ConfigEntityRegistryFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entityregistry/ConfigEntityRegistryFactory.java @@ -1,6 +1,6 @@ package com.linkedin.gms.factory.entityregistry; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.models.registry.ConfigEntityRegistry; import com.linkedin.metadata.models.registry.EntityRegistryException; import java.io.IOException; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entityregistry/PluginEntityRegistryFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entityregistry/PluginEntityRegistryFactory.java index 150e1e48f39af..6dbb07309c7cc 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entityregistry/PluginEntityRegistryFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entityregistry/PluginEntityRegistryFactory.java @@ -1,6 +1,6 @@ package com.linkedin.gms.factory.entityregistry; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.models.registry.PluginEntityRegistryLoader; import java.io.FileNotFoundException; import java.net.MalformedURLException; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/graphql/GraphQLEngineFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/graphql/GraphQLEngineFactory.java index d7aee59ca6dd1..c50b4c9088bc2 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/graphql/GraphQLEngineFactory.java +++ 
b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/graphql/GraphQLEngineFactory.java @@ -20,6 +20,7 @@ import com.linkedin.gms.factory.entityregistry.EntityRegistryFactory; import com.linkedin.gms.factory.entity.RestliEntityClientFactory; import com.linkedin.gms.factory.recommendation.RecommendationServiceFactory; +import com.linkedin.metadata.client.SystemJavaEntityClient; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.graph.GraphClient; import com.linkedin.metadata.graph.GraphService; @@ -65,6 +66,10 @@ public class GraphQLEngineFactory { @Qualifier("javaEntityClient") private JavaEntityClient _entityClient; + @Autowired + @Qualifier("systemJavaEntityClient") + private SystemJavaEntityClient _systemEntityClient; + @Autowired @Qualifier("graphClient") private GraphClient _graphClient; @@ -170,6 +175,7 @@ public class GraphQLEngineFactory { protected GraphQLEngine getInstance() { GmsGraphQLEngineArgs args = new GmsGraphQLEngineArgs(); args.setEntityClient(_entityClient); + args.setSystemEntityClient(_systemEntityClient); args.setGraphClient(_graphClient); args.setUsageClient(_usageClient); if (isAnalyticsEnabled) { diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/ingestion/IngestionSchedulerFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/ingestion/IngestionSchedulerFactory.java index b310ee25cbcbb..9beb617c4f6e8 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/ingestion/IngestionSchedulerFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/ingestion/IngestionSchedulerFactory.java @@ -6,7 +6,7 @@ import com.linkedin.gms.factory.auth.SystemAuthenticationFactory; import com.linkedin.gms.factory.config.ConfigurationProvider; import com.linkedin.gms.factory.entity.RestliEntityClientFactory; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import javax.annotation.Nonnull; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/DataHubKafkaEventProducerFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/DataHubKafkaEventProducerFactory.java index 66f556066497f..675f015d9e378 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/DataHubKafkaEventProducerFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/DataHubKafkaEventProducerFactory.java @@ -1,7 +1,7 @@ package com.linkedin.gms.factory.kafka; import com.linkedin.gms.factory.common.TopicConventionFactory; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.dao.producer.KafkaEventProducer; import com.linkedin.metadata.dao.producer.KafkaHealthChecker; import com.linkedin.mxe.TopicConvention; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/DataHubKafkaProducerFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/DataHubKafkaProducerFactory.java index e58661b357e6a..c67a2e704681f 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/DataHubKafkaProducerFactory.java +++ 
b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/DataHubKafkaProducerFactory.java @@ -6,7 +6,7 @@ import com.linkedin.gms.factory.kafka.schemaregistry.InternalSchemaRegistryFactory; import com.linkedin.gms.factory.kafka.schemaregistry.KafkaSchemaRegistryFactory; import com.linkedin.gms.factory.kafka.schemaregistry.SchemaRegistryConfig; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import java.util.Arrays; import java.util.Map; import org.apache.avro.generic.IndexedRecord; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/schemaregistry/AwsGlueSchemaRegistryFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/schemaregistry/AwsGlueSchemaRegistryFactory.java index 59f08e3733704..ac1cbbc5cc5ff 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/schemaregistry/AwsGlueSchemaRegistryFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/schemaregistry/AwsGlueSchemaRegistryFactory.java @@ -5,7 +5,7 @@ import com.amazonaws.services.schemaregistry.utils.AWSSchemaRegistryConstants; import com.amazonaws.services.schemaregistry.utils.AvroRecordType; import com.linkedin.gms.factory.config.ConfigurationProvider; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import java.util.HashMap; import java.util.Map; import java.util.Optional; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/schemaregistry/KafkaSchemaRegistryFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/schemaregistry/KafkaSchemaRegistryFactory.java index d0e11baab9089..7b72ba3f3bb88 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/schemaregistry/KafkaSchemaRegistryFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/schemaregistry/KafkaSchemaRegistryFactory.java @@ -1,7 +1,7 @@ package com.linkedin.gms.factory.kafka.schemaregistry; import com.linkedin.gms.factory.config.ConfigurationProvider; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import io.confluent.kafka.schemaregistry.client.SchemaRegistryClientConfig; import io.confluent.kafka.serializers.AbstractKafkaSchemaSerDeConfig; import io.confluent.kafka.serializers.KafkaAvroDeserializer; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/lineage/LineageServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/lineage/LineageServiceFactory.java index f76549c90af68..8596a14b7fc24 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/lineage/LineageServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/lineage/LineageServiceFactory.java @@ -1,6 +1,6 @@ package com.linkedin.gms.factory.lineage; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.client.JavaEntityClient; import javax.annotation.Nonnull; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/ownership/OwnershipTypeServiceFactory.java 
b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/ownership/OwnershipTypeServiceFactory.java index 512a0a1fa40ab..3a1f18692fdc6 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/ownership/OwnershipTypeServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/ownership/OwnershipTypeServiceFactory.java @@ -1,7 +1,7 @@ package com.linkedin.gms.factory.ownership; import com.datahub.authentication.Authentication; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.client.JavaEntityClient; import com.linkedin.metadata.service.OwnershipTypeService; import javax.annotation.Nonnull; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/query/QueryServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/query/QueryServiceFactory.java index f2bdce908319e..f98c5bd50467d 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/query/QueryServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/query/QueryServiceFactory.java @@ -1,7 +1,7 @@ package com.linkedin.gms.factory.query; import com.datahub.authentication.Authentication; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.client.JavaEntityClient; import com.linkedin.metadata.service.QueryService; import javax.annotation.Nonnull; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/BaseElasticSearchComponentsFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/BaseElasticSearchComponentsFactory.java index 620af803723e7..c99d429e986b6 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/BaseElasticSearchComponentsFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/BaseElasticSearchComponentsFactory.java @@ -2,7 +2,7 @@ import com.linkedin.gms.factory.common.IndexConventionFactory; import com.linkedin.gms.factory.common.RestHighLevelClientFactory; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/CachingEntitySearchServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/CachingEntitySearchServiceFactory.java index 7b20e798b79f2..845c63c32e0fd 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/CachingEntitySearchServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/CachingEntitySearchServiceFactory.java @@ -1,6 +1,6 @@ package com.linkedin.gms.factory.search; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.search.EntitySearchService; import com.linkedin.metadata.search.client.CachingEntitySearchService; import javax.annotation.Nonnull; diff --git 
a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchBulkProcessorFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchBulkProcessorFactory.java index fc6f92b2678f3..5deffdb01d247 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchBulkProcessorFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchBulkProcessorFactory.java @@ -1,7 +1,7 @@ package com.linkedin.gms.factory.search; import com.linkedin.gms.factory.common.RestHighLevelClientFactory; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import javax.annotation.Nonnull; import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactory.java index 495d77ccbb29f..b619ee9516dce 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactory.java @@ -6,7 +6,7 @@ import com.linkedin.gms.factory.common.IndexConventionFactory; import com.linkedin.gms.factory.common.RestHighLevelClientFactory; import com.linkedin.gms.factory.config.ConfigurationProvider; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; import com.linkedin.metadata.version.GitVersion; import javax.annotation.Nonnull; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchServiceFactory.java index 03dd2d072b4a0..a2a0dbaf89c79 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchServiceFactory.java @@ -7,7 +7,7 @@ import com.fasterxml.jackson.dataformat.yaml.YAMLMapper; import com.linkedin.gms.factory.config.ConfigurationProvider; import com.linkedin.gms.factory.entityregistry.EntityRegistryFactory; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.search.elasticsearch.ElasticSearchService; import com.linkedin.metadata.search.elasticsearch.indexbuilder.EntityIndexBuilders; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/LineageSearchServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/LineageSearchServiceFactory.java index 94b3f40849a13..e2eef83bc6e3f 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/LineageSearchServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/LineageSearchServiceFactory.java @@ -2,7 +2,7 @@ import com.linkedin.gms.factory.common.GraphServiceFactory; import 
com.linkedin.gms.factory.config.ConfigurationProvider; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.graph.GraphService; import com.linkedin.metadata.search.LineageSearchService; import com.linkedin.metadata.search.SearchService; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/SearchDocumentTransformerFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/SearchDocumentTransformerFactory.java index e1fe0399cb115..a186d2de770f3 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/SearchDocumentTransformerFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/SearchDocumentTransformerFactory.java @@ -1,6 +1,6 @@ package com.linkedin.gms.factory.search; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.search.transformer.SearchDocumentTransformer; import org.springframework.beans.factory.annotation.Value; import org.springframework.context.annotation.Bean; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/SearchServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/SearchServiceFactory.java index 70307e51f3256..64bb0218a0d71 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/SearchServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/SearchServiceFactory.java @@ -1,7 +1,7 @@ package com.linkedin.gms.factory.search; import com.linkedin.gms.factory.config.ConfigurationProvider; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.search.EntitySearchService; import com.linkedin.metadata.search.SearchService; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/SettingsBuilderFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/SettingsBuilderFactory.java index b6bfef6ed8c78..840a370957706 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/SettingsBuilderFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/SettingsBuilderFactory.java @@ -1,7 +1,7 @@ package com.linkedin.gms.factory.search; import com.linkedin.gms.factory.entityregistry.EntityRegistryFactory; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder; import org.springframework.beans.factory.annotation.Autowired; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/views/ViewServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/views/ViewServiceFactory.java index 006b992191cfa..60bcd9ea22be6 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/views/ViewServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/views/ViewServiceFactory.java @@ -1,7 +1,7 @@ package 
com.linkedin.gms.factory.search.views; import com.datahub.authentication.Authentication; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.client.JavaEntityClient; import com.linkedin.metadata.service.ViewService; import javax.annotation.Nonnull; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/settings/SettingsServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/settings/SettingsServiceFactory.java index 73ec79fa7ed08..2e22d43913493 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/settings/SettingsServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/settings/SettingsServiceFactory.java @@ -1,7 +1,7 @@ package com.linkedin.gms.factory.settings; import com.datahub.authentication.Authentication; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.client.JavaEntityClient; import com.linkedin.metadata.service.SettingsService; import javax.annotation.Nonnull; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/telemetry/MixpanelApiFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/telemetry/MixpanelApiFactory.java index b2982d1f8ed9d..8178ce1399aa3 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/telemetry/MixpanelApiFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/telemetry/MixpanelApiFactory.java @@ -1,6 +1,6 @@ package com.linkedin.gms.factory.telemetry; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.mixpanel.mixpanelapi.MixpanelAPI; import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; import org.springframework.context.annotation.Bean; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/telemetry/MixpanelMessageBuilderFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/telemetry/MixpanelMessageBuilderFactory.java index aa8596786ce11..5385c5e81f804 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/telemetry/MixpanelMessageBuilderFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/telemetry/MixpanelMessageBuilderFactory.java @@ -1,6 +1,6 @@ package com.linkedin.gms.factory.telemetry; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.mixpanel.mixpanelapi.MessageBuilder; import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; import org.springframework.context.annotation.Bean; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/telemetry/TrackingServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/telemetry/TrackingServiceFactory.java index 3b53a6fe92810..bb166af5501b3 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/telemetry/TrackingServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/telemetry/TrackingServiceFactory.java @@ -1,7 +1,7 @@ package com.linkedin.gms.factory.telemetry; import com.datahub.telemetry.TrackingService; -import 
com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.secret.SecretService; import com.linkedin.metadata.version.GitVersion; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeline/EntityChangeEventGeneratorRegistryFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeline/EntityChangeEventGeneratorRegistryFactory.java index e9b9850c01a2b..89a7e7dd8d71a 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeline/EntityChangeEventGeneratorRegistryFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeline/EntityChangeEventGeneratorRegistryFactory.java @@ -1,7 +1,7 @@ package com.linkedin.gms.factory.timeline; import com.datahub.authentication.Authentication; -import com.linkedin.entity.client.RestliEntityClient; +import com.linkedin.entity.client.SystemRestliEntityClient; import com.linkedin.metadata.timeline.eventgenerator.AssertionRunEventChangeEventGenerator; import com.linkedin.metadata.timeline.eventgenerator.DataProcessInstanceRunEventChangeEventGenerator; import com.linkedin.metadata.timeline.eventgenerator.DatasetPropertiesChangeEventGenerator; @@ -38,7 +38,7 @@ public class EntityChangeEventGeneratorRegistryFactory { @Singleton @Nonnull protected com.linkedin.metadata.timeline.eventgenerator.EntityChangeEventGeneratorRegistry entityChangeEventGeneratorRegistry() { - final RestliEntityClient entityClient = applicationContext.getBean(RestliEntityClient.class); + final SystemRestliEntityClient entityClient = applicationContext.getBean(SystemRestliEntityClient.class); final Authentication systemAuthentication = applicationContext.getBean(Authentication.class); final com.linkedin.metadata.timeline.eventgenerator.EntityChangeEventGeneratorRegistry registry = @@ -74,7 +74,7 @@ protected com.linkedin.metadata.timeline.eventgenerator.EntityChangeEventGenerat // Data Process Instance differs registry.register(DATA_PROCESS_INSTANCE_RUN_EVENT_ASPECT_NAME, - new DataProcessInstanceRunEventChangeEventGenerator(entityClient, systemAuthentication)); + new DataProcessInstanceRunEventChangeEventGenerator(entityClient)); // TODO: Add ML models. 
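The hunk above swaps the plain RestliEntityClient for the new SystemRestliEntityClient in the change-event generator registry, so generators such as DataProcessInstanceRunEventChangeEventGenerator no longer receive a separate system Authentication; the system client binds system authentication and an entity/aspect cache at construction. A rough sketch of what that buys a caller, using a hypothetical helper class and a placeholder aspect name:

import com.linkedin.common.urn.Urn;
import com.linkedin.entity.EntityResponse;
import com.linkedin.entity.client.SystemRestliEntityClient;
import com.linkedin.r2.RemoteInvocationException;
import java.net.URISyntaxException;
import java.util.Set;

public class ExampleGeneratorHelper { // hypothetical helper, for illustration only

  private final SystemRestliEntityClient entityClient;

  public ExampleGeneratorHelper(SystemRestliEntityClient entityClient) {
    this.entityClient = entityClient;
  }

  EntityResponse fetchProperties(Urn datasetUrn) throws RemoteInvocationException, URISyntaxException {
    // Before this patch the read needed an explicit Authentication argument, e.g.
    //   entityClient.getV2(entityName, datasetUrn, aspects, systemAuthentication);
    // The system client now resolves system authentication itself and serves the read
    // through its built-in entity/aspect cache.
    return entityClient.getV2(datasetUrn, Set.of("datasetProperties"));
  }
}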
diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeline/TimelineServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeline/TimelineServiceFactory.java index df9d80eb63a02..baa22d401387f 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeline/TimelineServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeline/TimelineServiceFactory.java @@ -1,6 +1,6 @@ package com.linkedin.gms.factory.timeline; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.entity.AspectDao; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.timeline.TimelineService; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeseries/ElasticSearchTimeseriesAspectServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeseries/ElasticSearchTimeseriesAspectServiceFactory.java index 717adf7d559b7..e3cc772f21c40 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeseries/ElasticSearchTimeseriesAspectServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeseries/ElasticSearchTimeseriesAspectServiceFactory.java @@ -2,7 +2,7 @@ import com.linkedin.gms.factory.entityregistry.EntityRegistryFactory; import com.linkedin.gms.factory.search.BaseElasticSearchComponentsFactory; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.timeseries.elastic.ElasticSearchTimeseriesAspectService; import com.linkedin.metadata.timeseries.elastic.indexbuilder.TimeseriesAspectIndexBuilders; diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/usage/UsageClientFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/usage/UsageClientFactory.java index e4cbb92cebbba..e83cbc82d8067 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/usage/UsageClientFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/usage/UsageClientFactory.java @@ -1,10 +1,14 @@ package com.linkedin.gms.factory.usage; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.datahub.authentication.Authentication; +import com.linkedin.gms.factory.config.ConfigurationProvider; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.restli.DefaultRestliClientFactory; import com.linkedin.parseq.retry.backoff.ExponentialBackoff; import com.linkedin.restli.client.Client; import com.linkedin.usage.UsageClient; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.beans.factory.annotation.Value; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; @@ -33,10 +37,15 @@ public class UsageClientFactory { @Value("${usageClient.numRetries:3}") private int numRetries; + @Autowired + @Qualifier("configurationProvider") + private ConfigurationProvider configurationProvider; + @Bean("usageClient") - public UsageClient getUsageClient() { + public UsageClient 
getUsageClient(@Qualifier("systemAuthentication") final Authentication systemAuthentication) { Client restClient = DefaultRestliClientFactory.getRestLiClient(gmsHost, gmsPort, gmsUseSSL, gmsSslProtocol); - return new UsageClient(restClient, new ExponentialBackoff(retryInterval), numRetries); + return new UsageClient(restClient, new ExponentialBackoff(retryInterval), numRetries, systemAuthentication, + configurationProvider.getCache().getClient().getUsageClient()); } } diff --git a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/factories/IngestRetentionPoliciesStepFactory.java b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/factories/IngestRetentionPoliciesStepFactory.java index 91fc58d074ed6..e038cb230c458 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/factories/IngestRetentionPoliciesStepFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/factories/IngestRetentionPoliciesStepFactory.java @@ -1,7 +1,7 @@ package com.linkedin.metadata.boot.factories; import com.linkedin.gms.factory.entity.RetentionServiceFactory; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.boot.steps.IngestRetentionPoliciesStep; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.entity.RetentionService; diff --git a/metadata-service/openapi-entity-servlet/src/test/java/io/datahubproject/openapi/util/OpenApiEntitiesUtilTest.java b/metadata-service/openapi-entity-servlet/src/test/java/io/datahubproject/openapi/util/OpenApiEntitiesUtilTest.java index e0fec07452302..b4e87eedea542 100644 --- a/metadata-service/openapi-entity-servlet/src/test/java/io/datahubproject/openapi/util/OpenApiEntitiesUtilTest.java +++ b/metadata-service/openapi-entity-servlet/src/test/java/io/datahubproject/openapi/util/OpenApiEntitiesUtilTest.java @@ -1,7 +1,7 @@ package io.datahubproject.openapi.util; import com.linkedin.data.schema.annotation.PathSpecBasedSchemaAnnotationVisitor; -import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.models.registry.EntityRegistry; import io.datahubproject.openapi.config.OpenAPIEntityTestConfiguration; import io.datahubproject.openapi.dto.UpsertAspectRequest; diff --git a/metadata-service/restli-client/build.gradle b/metadata-service/restli-client/build.gradle index 45cf008d3ca7d..b1b778b45c0b5 100644 --- a/metadata-service/restli-client/build.gradle +++ b/metadata-service/restli-client/build.gradle @@ -7,6 +7,7 @@ dependencies { api project(path: ':metadata-service:restli-api', configuration: 'restClient') api project(':metadata-events:mxe-schemas') api project(':metadata-utils') + implementation project(':metadata-service:configuration') implementation externalDependency.slf4jApi compileOnly externalDependency.lombok diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/common/client/ClientCache.java b/metadata-service/restli-client/src/main/java/com/linkedin/common/client/ClientCache.java new file mode 100644 index 0000000000000..8aa0984be57b9 --- /dev/null +++ b/metadata-service/restli-client/src/main/java/com/linkedin/common/client/ClientCache.java @@ -0,0 +1,134 @@ +package com.linkedin.common.client; + +import com.codahale.metrics.Gauge; +import com.github.benmanes.caffeine.cache.CacheLoader; +import 
com.github.benmanes.caffeine.cache.Caffeine; +import com.github.benmanes.caffeine.cache.Expiry; +import com.github.benmanes.caffeine.cache.LoadingCache; +import com.github.benmanes.caffeine.cache.Weigher; +import com.github.benmanes.caffeine.cache.stats.CacheStats; +import com.linkedin.metadata.config.cache.client.ClientCacheConfig; +import com.linkedin.metadata.utils.metrics.MetricUtils; +import lombok.Builder; +import lombok.NonNull; +import lombok.extern.slf4j.Slf4j; +import org.checkerframework.checker.nullness.qual.Nullable; + +import java.util.List; +import java.util.Map; +import java.util.concurrent.ScheduledThreadPoolExecutor; +import java.util.concurrent.TimeUnit; +import java.util.function.BiFunction; +import java.util.function.Function; + +/** + * Generic cache with common configuration for limited weight, per item expiry, and batch loading + * @param key + * @param value + */ +@Slf4j +@Builder +public class ClientCache { + @NonNull + protected final C config; + @NonNull + protected final LoadingCache cache; + @NonNull + private final Function, Map> loadFunction; + @NonNull + private final Weigher weigher; + @NonNull + private final BiFunction ttlSecondsFunction; + + public @Nullable V get(@NonNull K key) { + return cache.get(key); + } + + public @NonNull Map<@NonNull K, @NonNull V> getAll(@NonNull Iterable keys) { + return cache.getAll(keys); + } + + public void refresh(@NonNull K key) { + cache.refresh(key); + } + + public static class ClientCacheBuilder { + + private ClientCacheBuilder cache(LoadingCache cache) { + return null; + } + private ClientCache build() { + return null; + } + + public ClientCache build(Class metricClazz) { + // loads data from entity client + CacheLoader loader = new CacheLoader<>() { + @Override + public V load(@NonNull K key) { + return loadAll(List.of(key)).get(key); + } + + @Override + @NonNull + public Map loadAll(@NonNull Iterable keys) { + return loadFunction.apply(keys); + } + }; + + // build cache + Caffeine caffeine = Caffeine.newBuilder() + .maximumWeight(config.getMaxBytes()) + // limit total size + .weigher(weigher) + .softValues() + // define per entity/aspect ttls + .expireAfter(new Expiry() { + public long expireAfterCreate(@NonNull K key, @NonNull V aspect, long currentTime) { + int ttlSeconds = ttlSecondsFunction.apply(config, key); + if (ttlSeconds < 0) { + ttlSeconds = Integer.MAX_VALUE; + } + return TimeUnit.SECONDS.toNanos(ttlSeconds); + } + public long expireAfterUpdate(@NonNull K key, @NonNull V aspect, + long currentTime, long currentDuration) { + return currentDuration; + } + public long expireAfterRead(@NonNull K key, @NonNull V aspect, + long currentTime, long currentDuration) { + return currentDuration; + } + }); + + if (config.isStatsEnabled()) { + caffeine.recordStats(); + } + + LoadingCache cache = caffeine.build(loader); + + if (config.isStatsEnabled()) { + ScheduledThreadPoolExecutor executor = new ScheduledThreadPoolExecutor(1); + executor.scheduleAtFixedRate(() -> { + CacheStats cacheStats = cache.stats(); + + MetricUtils.gauge(metricClazz, "hitRate", () -> (Gauge) cacheStats::hitRate); + MetricUtils.gauge(metricClazz, "loadFailureRate", () -> + (Gauge) cacheStats::loadFailureRate); + MetricUtils.gauge(metricClazz, "evictionCount", () -> + (Gauge) cacheStats::evictionCount); + MetricUtils.gauge(metricClazz, "loadFailureCount", () -> + (Gauge) cacheStats::loadFailureCount); + MetricUtils.gauge(metricClazz, "averageLoadPenalty", () -> + (Gauge) cacheStats::averageLoadPenalty); + MetricUtils.gauge(metricClazz, 
"evictionWeight", () -> + (Gauge) cacheStats::evictionWeight); + + log.debug(metricClazz.getSimpleName() + ": " + cacheStats); + }, 0, config.getStatsIntervalSeconds(), TimeUnit.SECONDS); + } + + return new ClientCache<>(config, cache, loadFunction, weigher, ttlSecondsFunction); + } + } +} diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClientCache.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClientCache.java new file mode 100644 index 0000000000000..3b35dc528915a --- /dev/null +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClientCache.java @@ -0,0 +1,141 @@ +package com.linkedin.entity.client; + +import com.github.benmanes.caffeine.cache.LoadingCache; +import com.github.benmanes.caffeine.cache.Weigher; +import com.linkedin.common.client.ClientCache; +import com.linkedin.common.urn.Urn; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.EnvelopedAspect; +import com.linkedin.entity.EnvelopedAspectMap; +import com.linkedin.metadata.config.cache.client.EntityClientCacheConfig; +import com.linkedin.util.Pair; +import lombok.Builder; +import lombok.Data; +import lombok.NonNull; + +import javax.annotation.Nonnull; +import java.util.Collection; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +import static com.linkedin.metadata.utils.PegasusUtils.urnToEntityName; + +@Builder +public class EntityClientCache { + @NonNull + private EntityClientCacheConfig config; + @NonNull + private final ClientCache cache; + @NonNull + private BiFunction, Set, Map> loadFunction; + + public EntityResponse getV2(@Nonnull final Urn urn, @Nonnull final Set aspectNames) { + return batchGetV2(Set.of(urn), aspectNames).get(urn); + } + + public Map batchGetV2(@Nonnull final Set urns, @Nonnull final Set aspectNames) { + final Map response; + + if (config.isEnabled()) { + Set keys = urns.stream() + .flatMap(urn -> aspectNames.stream() + .map(a -> Key.builder().urn(urn).aspectName(a).build())) + .collect(Collectors.toSet()); + Map envelopedAspects = cache.getAll(keys); + + Set responses = envelopedAspects.entrySet().stream() + .map(entry -> Pair.of(entry.getKey().getUrn(), entry.getValue())) + .collect(Collectors.groupingBy(Pair::getKey, Collectors.mapping(Pair::getValue, Collectors.toSet()))) + .entrySet().stream().map(e -> toEntityResponse(e.getKey(), e.getValue())) + .collect(Collectors.toSet()); + + response = responses.stream().collect(Collectors.toMap(EntityResponse::getUrn, Function.identity())); + } else { + response = loadFunction.apply(urns, aspectNames); + } + + return response; + } + + private static EntityResponse toEntityResponse(Urn urn, Collection envelopedAspects) { + final EntityResponse response = new EntityResponse(); + response.setUrn(urn); + response.setEntityName(urnToEntityName(urn)); + response.setAspects(new EnvelopedAspectMap( + envelopedAspects.stream() + .collect(Collectors.toMap(EnvelopedAspect::getName, aspect -> aspect)) + )); + return response; + } + + public static class EntityClientCacheBuilder { + + private EntityClientCacheBuilder cache(LoadingCache cache) { + return this; + } + + public EntityClientCache build(Class metricClazz) { + // estimate size + Weigher weighByEstimatedSize = (key, value) -> + 
value.getValue().data().values().parallelStream() + .mapToInt(o -> o.toString().getBytes().length) + .sum(); + + // batch loads data from entity client (restli or java) + Function, Map> loader = (Iterable keys) -> { + Map> keysByEntity = StreamSupport.stream(keys.spliterator(), true) + .collect(Collectors.groupingBy(Key::getEntityName, Collectors.toSet())); + + Stream> results = keysByEntity.entrySet().parallelStream() + .flatMap(entry -> { + Set urns = entry.getValue().stream() + .map(Key::getUrn) + .collect(Collectors.toSet()); + Set aspects = entry.getValue().stream() + .map(Key::getEntityName) + .collect(Collectors.toSet()); + return loadFunction.apply(urns, aspects).entrySet().stream(); + }) + .flatMap(resp -> resp.getValue().getAspects().values().stream() + .map(envAspect -> { + Key key = Key.builder().urn(resp.getKey()).aspectName(envAspect.getName()).build(); + return Map.entry(key, envAspect); + })); + + return results.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + }; + + // ideally the cache time comes from caching headers from service, but configuration driven for now + BiFunction ttlSeconds = (config, key) -> + Optional.ofNullable(config.getEntityAspectTTLSeconds()).orElse(Map.of()) + .getOrDefault(key.getEntityName(), Map.of()) + .getOrDefault(key.getAspectName(), config.getDefaultTTLSeconds()); + + cache = ClientCache.builder() + .weigher(weighByEstimatedSize) + .config(config) + .loadFunction(loader) + .ttlSecondsFunction(ttlSeconds) + .build(metricClazz); + + return new EntityClientCache(config, cache, loadFunction); + } + } + + @Data + @Builder + protected static class Key { + private final Urn urn; + private final String aspectName; + + public String getEntityName() { + return urn.getEntityType(); + } + } +} diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemEntityClient.java new file mode 100644 index 0000000000000..94067abd0cf65 --- /dev/null +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemEntityClient.java @@ -0,0 +1,91 @@ +package com.linkedin.entity.client; + +import com.datahub.authentication.Authentication; +import com.linkedin.common.urn.Urn; +import com.linkedin.entity.EntityResponse; +import com.linkedin.metadata.config.cache.client.EntityClientCacheConfig; +import com.linkedin.mxe.MetadataChangeProposal; +import com.linkedin.mxe.PlatformEvent; +import com.linkedin.r2.RemoteInvocationException; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.net.URISyntaxException; +import java.util.Map; +import java.util.Set; + +/** + * Adds entity/aspect cache and assumes system authentication + */ +public interface SystemEntityClient extends EntityClient { + + EntityClientCache getEntityClientCache(); + Authentication getSystemAuthentication(); + + /** + * Builds the cache + * @param systemAuthentication system authentication + * @param cacheConfig cache configuration + * @return the cache + */ + default EntityClientCache buildEntityClientCache(Class metricClazz, Authentication systemAuthentication, EntityClientCacheConfig cacheConfig) { + return EntityClientCache.builder() + .config(cacheConfig) + .loadFunction((Set urns, Set aspectNames) -> { + try { + String entityName = urns.stream().findFirst().map(Urn::getEntityType).get(); + + if (urns.stream().anyMatch(urn -> !urn.getEntityType().equals(entityName))) { + throw new 
IllegalArgumentException("Urns must be of the same entity type. RestliEntityClient API limitation."); + } + + return batchGetV2(entityName, urns, aspectNames, systemAuthentication); + } catch (RemoteInvocationException | URISyntaxException e) { + throw new RuntimeException(e); + } + }).build(metricClazz); + } + + /** + * Get an entity by urn with the given aspects + * @param urn the id of the entity + * @param aspectNames aspects of the entity + * @return response object + * @throws RemoteInvocationException + * @throws URISyntaxException + */ + @Nullable + default EntityResponse getV2(@Nonnull Urn urn, @Nonnull Set aspectNames) + throws RemoteInvocationException, URISyntaxException { + return getEntityClientCache().getV2(urn, aspectNames); + } + + /** + * Batch get a set of aspects for a single entity type, multiple ids with the given aspects. + * + * @param urns the urns of the entities to batch get + * @param aspectNames the aspect names to batch get + * @throws RemoteInvocationException + */ + @Nonnull + default Map batchGetV2(@Nonnull Set urns, @Nonnull Set aspectNames) + throws RemoteInvocationException, URISyntaxException { + return getEntityClientCache().batchGetV2(urns, aspectNames); + } + + default void producePlatformEvent(@Nonnull String name, @Nullable String key, @Nonnull PlatformEvent event) throws Exception { + producePlatformEvent(name, key, event, getSystemAuthentication()); + } + + default boolean exists(@Nonnull Urn urn) throws RemoteInvocationException { + return exists(urn, getSystemAuthentication()); + } + + default String ingestProposal(@Nonnull final MetadataChangeProposal metadataChangeProposal, final boolean async) throws RemoteInvocationException { + return ingestProposal(metadataChangeProposal, getSystemAuthentication(), async); + } + + default void setWritable(boolean canWrite) throws RemoteInvocationException { + setWritable(canWrite, getSystemAuthentication()); + } +} diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java new file mode 100644 index 0000000000000..f3c343534209c --- /dev/null +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java @@ -0,0 +1,25 @@ +package com.linkedin.entity.client; + +import com.datahub.authentication.Authentication; +import com.linkedin.metadata.config.cache.client.EntityClientCacheConfig; +import com.linkedin.parseq.retry.backoff.BackoffPolicy; +import com.linkedin.restli.client.Client; +import lombok.Getter; + +import javax.annotation.Nonnull; + +/** + * Restli backed SystemEntityClient + */ +@Getter +public class SystemRestliEntityClient extends RestliEntityClient implements SystemEntityClient { + private final EntityClientCache entityClientCache; + private final Authentication systemAuthentication; + + public SystemRestliEntityClient(@Nonnull final Client restliClient, @Nonnull final BackoffPolicy backoffPolicy, int retryCount, + Authentication systemAuthentication, EntityClientCacheConfig cacheConfig) { + super(restliClient, backoffPolicy, retryCount); + this.systemAuthentication = systemAuthentication; + this.entityClientCache = buildEntityClientCache(SystemRestliEntityClient.class, systemAuthentication, cacheConfig); + } +} diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/usage/UsageClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/usage/UsageClient.java index 
47a15ccdd3ffc..d2b8499615e8d 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/usage/UsageClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/usage/UsageClient.java @@ -5,6 +5,7 @@ import com.linkedin.common.WindowDuration; import com.linkedin.common.client.BaseClient; +import com.linkedin.metadata.config.cache.client.UsageClientCacheConfig; import com.linkedin.parseq.retry.backoff.BackoffPolicy; import com.linkedin.r2.RemoteInvocationException; import com.linkedin.restli.client.Client; @@ -17,19 +18,39 @@ public class UsageClient extends BaseClient { private static final UsageStatsRequestBuilders USAGE_STATS_REQUEST_BUILDERS = new UsageStatsRequestBuilders(); - public UsageClient(@Nonnull final Client restliClient, @Nonnull final BackoffPolicy backoffPolicy, int retryCount) { + private final UsageClientCache usageClientCache; + + public UsageClient(@Nonnull final Client restliClient, @Nonnull final BackoffPolicy backoffPolicy, int retryCount, + Authentication systemAuthentication, UsageClientCacheConfig cacheConfig) { super(restliClient, backoffPolicy, retryCount); + this.usageClientCache = UsageClientCache.builder() + .config(cacheConfig) + .loadFunction((String resource, UsageTimeRange range) -> { + try { + return getUsageStats(resource, range, systemAuthentication); + } catch (RemoteInvocationException | URISyntaxException e) { + throw new RuntimeException(e); + } + }).build(); + } + + /** + * Gets a specific version of downstream {@link EntityRelationships} for the given dataset. + * Using cache and system authentication. + * Validate permissions before use! + */ + @Nonnull + public UsageQueryResult getUsageStats(@Nonnull String resource, @Nonnull UsageTimeRange range) { + return usageClientCache.getUsageStats(resource, range); } /** * Gets a specific version of downstream {@link EntityRelationships} for the given dataset. 
*/ @Nonnull - public UsageQueryResult getUsageStats( - @Nonnull String resource, - @Nonnull UsageTimeRange range, - @Nonnull Authentication authentication - ) throws RemoteInvocationException, URISyntaxException { + private UsageQueryResult getUsageStats(@Nonnull String resource, @Nonnull UsageTimeRange range, + @Nonnull Authentication authentication) + throws RemoteInvocationException, URISyntaxException { final UsageStatsDoQueryRangeRequestBuilder requestBuilder = USAGE_STATS_REQUEST_BUILDERS.actionQueryRange() .resourceParam(resource) .durationParam(WindowDuration.DAY) diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/usage/UsageClientCache.java b/metadata-service/restli-client/src/main/java/com/linkedin/usage/UsageClientCache.java new file mode 100644 index 0000000000000..a04c1e90fb4a3 --- /dev/null +++ b/metadata-service/restli-client/src/main/java/com/linkedin/usage/UsageClientCache.java @@ -0,0 +1,75 @@ +package com.linkedin.usage; + +import com.github.benmanes.caffeine.cache.LoadingCache; +import com.github.benmanes.caffeine.cache.Weigher; +import com.linkedin.common.client.ClientCache; +import com.linkedin.metadata.config.cache.client.UsageClientCacheConfig; +import lombok.Builder; +import lombok.Data; +import lombok.NonNull; + +import javax.annotation.Nonnull; +import java.util.Map; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; + + +@Builder +public class UsageClientCache { + @NonNull + private UsageClientCacheConfig config; + @NonNull + private final ClientCache cache; + @NonNull + private BiFunction loadFunction; + + public UsageQueryResult getUsageStats(@Nonnull String resource, @Nonnull UsageTimeRange range) { + if (config.isEnabled()) { + return cache.get(Key.builder().resource(resource).range(range).build()); + } else { + return loadFunction.apply(resource, range); + } + } + + public static class UsageClientCacheBuilder { + + private UsageClientCacheBuilder cache(LoadingCache cache) { + return this; + } + + public UsageClientCache build() { + // estimate size + Weigher weighByEstimatedSize = (key, value) -> + value.data().values().parallelStream() + .mapToInt(o -> o.toString().getBytes().length) + .sum(); + + // batch loads data from usage client + Function, Map> loader = (Iterable keys) -> + StreamSupport.stream(keys.spliterator(), true) + .map(k -> Map.entry(k, loadFunction.apply(k.getResource(), k.getRange()))) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + + // default ttl only + BiFunction ttlSeconds = (config, key) -> config.getDefaultTTLSeconds(); + + cache = ClientCache.builder() + .weigher(weighByEstimatedSize) + .config(config) + .loadFunction(loader) + .ttlSecondsFunction(ttlSeconds) + .build(UsageClientCache.class); + + return new UsageClientCache(config, cache, loadFunction); + } + } + + @Data + @Builder + protected static class Key { + private final String resource; + private final UsageTimeRange range; + } +} diff --git a/metadata-utils/src/main/java/com/linkedin/metadata/utils/metrics/MetricUtils.java b/metadata-utils/src/main/java/com/linkedin/metadata/utils/metrics/MetricUtils.java index 3d90cba85b0fb..9a8848e090fb8 100644 --- a/metadata-utils/src/main/java/com/linkedin/metadata/utils/metrics/MetricUtils.java +++ b/metadata-utils/src/main/java/com/linkedin/metadata/utils/metrics/MetricUtils.java @@ -1,6 +1,7 @@ package com.linkedin.metadata.utils.metrics; import com.codahale.metrics.Counter; +import 
com.codahale.metrics.Gauge; import com.codahale.metrics.MetricRegistry; import com.codahale.metrics.SharedMetricRegistries; import com.codahale.metrics.Timer; @@ -48,4 +49,8 @@ public static Timer timer(Class klass, String metricName) { public static Timer timer(String metricName) { return REGISTRY.timer(MetricRegistry.name(metricName)); } + + public static > T gauge(Class clazz, String metricName, MetricRegistry.MetricSupplier supplier) { + return REGISTRY.gauge(MetricRegistry.name(clazz, metricName), supplier); + } } From aff1e7a620352071f3b2e12c5598ec689652cc9d Mon Sep 17 00:00:00 2001 From: Jeff Merrick Date: Fri, 22 Sep 2023 00:53:42 -0500 Subject: [PATCH 23/37] docs: add homepage ctas (#8866) Co-authored-by: Harshal Sheth --- .../_components/CardCTAs/cardCTAs.module.scss | 24 +++++++++ .../src/pages/_components/CardCTAs/index.js | 52 +++++++++++++++++++ .../src/pages/_components/Hero/index.js | 9 +++- 3 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 docs-website/src/pages/_components/CardCTAs/cardCTAs.module.scss create mode 100644 docs-website/src/pages/_components/CardCTAs/index.js diff --git a/docs-website/src/pages/_components/CardCTAs/cardCTAs.module.scss b/docs-website/src/pages/_components/CardCTAs/cardCTAs.module.scss new file mode 100644 index 0000000000000..fcd3666d03ddc --- /dev/null +++ b/docs-website/src/pages/_components/CardCTAs/cardCTAs.module.scss @@ -0,0 +1,24 @@ +.flexCol { + display: flex; +} + +.ctaCard { + flex-direction: row; + align-items: flex-start; + justify-content: space-between; + row-gap: 1rem; + padding: 1rem; + &:hover { + text-decoration: none; + border: 1px solid var(--ifm-color-primary); + background-color: var(--ifm-background-surface-color); + } + margin-bottom: 1rem; + flex: 1; +} + +.ctaHeading { + margin-bottom: 0; + display: flex; + align-items: center; +} diff --git a/docs-website/src/pages/_components/CardCTAs/index.js b/docs-website/src/pages/_components/CardCTAs/index.js new file mode 100644 index 0000000000000..d87c803b42818 --- /dev/null +++ b/docs-website/src/pages/_components/CardCTAs/index.js @@ -0,0 +1,52 @@ +import React from "react"; +import clsx from "clsx"; +import styles from "./cardCTAs.module.scss"; +import useBaseUrl from "@docusaurus/useBaseUrl"; +import { ArrowRightOutlined } from "@ant-design/icons"; + +const cardsContent = [ + { + label: "Data Mesh", + title: "Data Products, Delivered", + url: "https://www.acryldata.io/blog/data-products-in-datahub-everything-you-need-to-know", + }, + { + label: "Data Contracts", + title: "End-to-end Reliability in Data", + url: "https://www.acryldata.io/blog/data-contracts-in-datahub-combining-verifiability-with-holistic-data-management", + }, + { + label: "Shift Left", + title: "Developer-friendly Data Governance", + url: "https://www.acryldata.io/blog/the-3-must-haves-of-metadata-management-part-2", + }, +]; + +const Card = ({ label, title, url }) => { + return ( + + ); +}; + +const CardCTAs = () => + cardsContent?.length > 0 ? ( +
+
+
+          {cardsContent.map((props, idx) => (
+            <Card {...props} key={idx} />
+          ))}
+
+
+ ) : null; + +export default CardCTAs; diff --git a/docs-website/src/pages/_components/Hero/index.js b/docs-website/src/pages/_components/Hero/index.js index b5fa04c80faee..22b406dce037e 100644 --- a/docs-website/src/pages/_components/Hero/index.js +++ b/docs-website/src/pages/_components/Hero/index.js @@ -7,6 +7,7 @@ import { useColorMode } from "@docusaurus/theme-common"; import { QuestionCircleOutlined } from "@ant-design/icons"; import styles from "./hero.module.scss"; import CodeBlock from "@theme/CodeBlock"; +import CardCTAs from "../CardCTAs"; const HeroAnnouncement = ({ message, linkUrl, linkText }) => (
@@ -33,7 +34,12 @@ const Hero = ({}) => { complexity of your data ecosystem.

-Built with ❤️ by Acryl Data and LinkedIn. + Built with ❤️ by{" "} + {" "} + + Acryl Data + {" "} + and LinkedIn.

Get Started → @@ -43,6 +49,7 @@ Built with ❤️ by DataHub Flow Diagram

Get Started Now

From c946c01199e88e724ee0e6e9e7c9ee58c212803b Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 22 Sep 2023 13:01:38 -0700 Subject: [PATCH 24/37] fix(ingest/bigquery): show report in output (#8867) Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Co-authored-by: Andrew Sikowitz --- .../source/bigquery_v2/bigquery_report.py | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py index 2d6882caa38ef..661589a0c58e5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py @@ -6,6 +6,7 @@ import pydantic +from datahub.ingestion.api.report import Report from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport from datahub.ingestion.source_report.time_window import BaseTimeWindowReport @@ -16,18 +17,20 @@ logger: logging.Logger = logging.getLogger(__name__) -class BigQuerySchemaApiPerfReport: - list_projects = PerfTimer() - list_datasets = PerfTimer() - get_columns_for_dataset = PerfTimer() - get_tables_for_dataset = PerfTimer() - list_tables = PerfTimer() - get_views_for_dataset = PerfTimer() +@dataclass +class BigQuerySchemaApiPerfReport(Report): + list_projects: PerfTimer = field(default_factory=PerfTimer) + list_datasets: PerfTimer = field(default_factory=PerfTimer) + get_columns_for_dataset: PerfTimer = field(default_factory=PerfTimer) + get_tables_for_dataset: PerfTimer = field(default_factory=PerfTimer) + list_tables: PerfTimer = field(default_factory=PerfTimer) + get_views_for_dataset: PerfTimer = field(default_factory=PerfTimer) -class BigQueryAuditLogApiPerfReport: - get_exported_log_entries = PerfTimer() - list_log_entries = PerfTimer() +@dataclass +class BigQueryAuditLogApiPerfReport(Report): + get_exported_log_entries: PerfTimer = field(default_factory=PerfTimer) + list_log_entries: PerfTimer = field(default_factory=PerfTimer) @dataclass From 146cb896c291f5062f2d8ae90c6a13dad4f01eab Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 22 Sep 2023 16:37:13 -0700 Subject: [PATCH 25/37] fix(docker): support alternate postgres db in postgres-setup (#8800) --- docker/postgres-setup/init.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/docker/postgres-setup/init.sh b/docker/postgres-setup/init.sh index 6c0adc8c69bdd..afc9bdfe4c668 100755 --- a/docker/postgres-setup/init.sh +++ b/docker/postgres-setup/init.sh @@ -1,8 +1,13 @@ #!/bin/sh export PGPASSWORD=$POSTGRES_PASSWORD +POSTGRES_CREATE_DB=${POSTGRES_CREATE_DB:-true} +POSTGRES_CREATE_DB_CONNECTION_DB=${POSTGRES_CREATE_DB_CONNECTION_DB:-postgres} + # workaround create database if not exists, check https://stackoverflow.com/a/36591842 -psql -U $POSTGRES_USERNAME -h $POSTGRES_HOST -p $POSTGRES_PORT -tc "SELECT 1 FROM pg_database WHERE datname = '${DATAHUB_DB_NAME}'" | grep -q 1 || psql -U $POSTGRES_USERNAME -h $POSTGRES_HOST -p $POSTGRES_PORT -c "CREATE DATABASE ${DATAHUB_DB_NAME}" +if [ "$POSTGRES_CREATE_DB" = true ]; then + psql -d "$POSTGRES_CREATE_DB_CONNECTION_DB" -U "$POSTGRES_USERNAME" -h "$POSTGRES_HOST" -p "$POSTGRES_PORT" -tc "SELECT 1 FROM pg_database WHERE datname = '${DATAHUB_DB_NAME}'" | grep -q 1 || psql -d "$POSTGRES_CREATE_DB_CONNECTION_DB" -U 
"$POSTGRES_USERNAME" -h "$POSTGRES_HOST" -p "$POSTGRES_PORT" -c "CREATE DATABASE ${DATAHUB_DB_NAME}" +fi sed -e "s/DATAHUB_DB_NAME/${DATAHUB_DB_NAME}/g" /init.sql | tee -a /tmp/init-final.sql -psql -d $DATAHUB_DB_NAME -U $POSTGRES_USERNAME -h $POSTGRES_HOST -p $POSTGRES_PORT < /tmp/init-final.sql +psql -d "$DATAHUB_DB_NAME" -U "$POSTGRES_USERNAME" -h "$POSTGRES_HOST" -p "$POSTGRES_PORT" < /tmp/init-final.sql From 791e2e7bf588d96bad94ccfdcf1beddde02dadc3 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 22 Sep 2023 16:43:58 -0700 Subject: [PATCH 26/37] feat(python): support custom models without forking (#8774) --- docs/datahub_lite.md | 1 - docs/modeling/extending-the-metadata-model.md | 105 ++++++++++------ metadata-ingestion/.gitignore | 1 + metadata-ingestion/README.md | 2 +- metadata-ingestion/build.gradle | 8 ++ metadata-ingestion/scripts/avro_codegen.py | 49 +++++++- .../scripts/custom_package_codegen.py | 119 ++++++++++++++++++ .../scripts/custom_package_codegen.sh | 16 +++ metadata-ingestion/setup.py | 8 +- .../src/datahub/cli/check_cli.py | 2 +- .../src/datahub/cli/ingest_cli.py | 15 +-- .../src/datahub/ingestion/api/registry.py | 23 ++-- .../utilities/_custom_package_loader.py | 43 +++++++ 13 files changed, 316 insertions(+), 76 deletions(-) create mode 100644 metadata-ingestion/scripts/custom_package_codegen.py create mode 100755 metadata-ingestion/scripts/custom_package_codegen.sh create mode 100644 metadata-ingestion/src/datahub/utilities/_custom_package_loader.py diff --git a/docs/datahub_lite.md b/docs/datahub_lite.md index 3918b8cee7830..de0a20eed1d01 100644 --- a/docs/datahub_lite.md +++ b/docs/datahub_lite.md @@ -7,7 +7,6 @@ import TabItem from '@theme/TabItem'; DataHub Lite is a lightweight embeddable version of DataHub with no external dependencies. It is intended to enable local developer tooling use-cases such as simple access to metadata for scripts and other tools. DataHub Lite is compatible with the DataHub metadata format and all the ingestion connectors that DataHub supports. -It was built as a reaction to [recap](https://github.com/recap-cloud/recap) to prove that a similar lightweight system could be built within DataHub quite easily. Currently DataHub Lite uses DuckDB under the covers as its default storage layer, but that might change in the future. ## Features diff --git a/docs/modeling/extending-the-metadata-model.md b/docs/modeling/extending-the-metadata-model.md index be2d7d795de70..ba101be16b98e 100644 --- a/docs/modeling/extending-the-metadata-model.md +++ b/docs/modeling/extending-the-metadata-model.md @@ -16,7 +16,6 @@ An important question that will arise once you've decided to extend the metadata

- The green lines represent pathways that will lead to lesser friction for you to maintain your code long term. The red lines represent higher risk of conflicts in the future. We are working hard to move the majority of model extension use-cases to no-code / low-code pathways to ensure that you can extend the core metadata model without having to maintain a custom fork of DataHub. We will refer to the two options as the **open-source fork** and **custom repository** approaches in the rest of the document below. @@ -92,10 +91,11 @@ the annotation model. Define the entity within an `entity-registry.yml` file. Depending on your approach, the location of this file may vary. More on that in steps [4](#step-4-choose-a-place-to-store-your-model-extension) and [5](#step-5-attaching-your-non-key-aspects-to-the-entity). Example: + ```yaml - - name: dashboard - doc: A container of related data assets. - keyAspect: dashboardKey +- name: dashboard + doc: A container of related data assets. + keyAspect: dashboardKey ``` - name: The entity name/type, this will be present as a part of the Urn. @@ -196,8 +196,8 @@ The Aspect has four key components: its properties, the @Aspect annotation, the can be defined as PDL primitives, enums, records, or collections ( see [pdl schema documentation](https://linkedin.github.io/rest.li/pdl_schema)) references to other entities, of type Urn or optionally `Urn` -- **@Aspect annotation**: Declares record is an Aspect and includes it when serializing an entity. Unlike the following - two annotations, @Aspect is applied to the entire record, rather than a specific field. Note, you can mark an aspect +- **@Aspect annotation**: Declares record is an Aspect and includes it when serializing an entity. Unlike the following + two annotations, @Aspect is applied to the entire record, rather than a specific field. Note, you can mark an aspect as a timeseries aspect. Check out this [doc](metadata-model.md#timeseries-aspects) for details. - **@Searchable annotation**: This annotation can be applied to any primitive field or a map field to indicate that it should be indexed in Elasticsearch and can be searched on. For a complete guide on using the search annotation, see @@ -205,7 +205,7 @@ The Aspect has four key components: its properties, the @Aspect annotation, the - **@Relationship annotation**: These annotations create edges between the Entity’s Urn and the destination of the annotated field when the entities are ingested. @Relationship annotations must be applied to fields of type Urn. In the case of DashboardInfo, the `charts` field is an Array of Urns. The @Relationship annotation cannot be applied - directly to an array of Urns. That’s why you see the use of an Annotation override (`”/*”:) to apply the @Relationship + directly to an array of Urns. That’s why you see the use of an Annotation override (`"/*":`) to apply the @Relationship annotation to the Urn directly. Read more about overrides in the annotation docs further down on this page. After you create your Aspect, you need to attach to all the entities that it applies to. @@ -231,7 +231,7 @@ entities: - keyAspect: dashBoardKey aspects: # the name of the aspect must be the same as that on the @Aspect annotation on the class - - dashboardInfo + - dashboardInfo ``` Previously, you were required to add all aspects for the entity into an Aspect union. You will see examples of this pattern throughout the code-base (e.g. `DatasetAspect`, `DashboardAspect` etc.). This is no longer required. 
@@ -251,14 +251,39 @@ Then, run `./gradlew build` from the repository root to rebuild Datahub with acc Then, re-deploy metadata-service (gms), and mae-consumer and mce-consumer (optionally if you are running them unbundled). See [docker development](../../docker/README.md) for details on how to deploy during development. This will allow Datahub to read and write your new entity or extensions to existing entities, along with serving search and graph queries for that entity type. -To emit proposals to ingest from the Datahub CLI tool, first install datahub cli -locally [following the instructions here](../../metadata-ingestion/developing.md). `./gradlew build` generated the avro -schemas your local ingestion cli tool uses earlier. After following the developing guide, you should be able to emit -your new event using the local datahub cli. +### (Optional) Step 7: Use custom models with the Python SDK + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + + + + +If you're purely using the custom models locally, you can use a local development-mode install of the DataHub CLI. + +Install the DataHub CLI locally by following the [developer instructions](../../metadata-ingestion/developing.md). +The `./gradlew build` command already generated the avro schemas for your local ingestion cli tool to use. +After following the developing guide, you should be able to emit your new event using the local DataHub CLI. + + + -Now you are ready to start ingesting metadata for your new entity! +If you want to use your custom models beyond your local machine without forking DataHub, then you can generate a custom model package that can be installed from other places. -### (Optional) Step 7: Extend the DataHub frontend to view your entity in GraphQL & React +This package should be installed alongside the base `acryl-datahub` package, and its metadata models will take precedence over the default ones. + +```bash +cd metadata-ingestion +../gradlew customPackageGenerate -Ppackage_name=my-company-datahub-models -Ppackage_version="0.0.1" +``` + +This will generate some Python build artifacts, which you can distribute within your team or publish to PyPI. +The command output will contain additional details and exact CLI commands you can use. + + + + +### (Optional) Step 8: Extend the DataHub frontend to view your entity in GraphQL & React If you are extending an entity with additional aspects, and you can use the auto-render specifications to automatically render these aspects to your satisfaction, you do not need to write any custom code. @@ -301,9 +326,9 @@ It takes the following parameters: - **autoRender**: boolean (optional) - defaults to false. When set to true, the aspect will automatically be displayed on entity pages in a tab using a default renderer. **_This is currently only supported for Charts, Dashboards, DataFlows, DataJobs, Datasets, Domains, and GlossaryTerms_**. - **renderSpec**: RenderSpec (optional) - config for autoRender aspects that controls how they are displayed. **_This is currently only supported for Charts, Dashboards, DataFlows, DataJobs, Datasets, Domains, and GlossaryTerms_**. Contains three fields: - - **displayType**: One of `tabular`, `properties`. Tabular should be used for a list of data elements, properties for a single data bag. - - **displayName**: How the aspect should be referred to in the UI. Determines the name of the tab on the entity page. - - **key**: For `tabular` aspects only. Specifies the key in which the array to render may be found. 
+ - **displayType**: One of `tabular`, `properties`. Tabular should be used for a list of data elements, properties for a single data bag. + - **displayName**: How the aspect should be referred to in the UI. Determines the name of the tab on the entity page. + - **key**: For `tabular` aspects only. Specifies the key in which the array to render may be found. ##### Example @@ -329,34 +354,34 @@ It takes the following parameters: Thus far, we have implemented 11 fieldTypes: - 1. *KEYWORD* - Short text fields that only support exact matches, often used only for filtering + 1. _KEYWORD_ - Short text fields that only support exact matches, often used only for filtering + + 2. _TEXT_ - Text fields delimited by spaces/slashes/periods. Default field type for string variables. - 2. *TEXT* - Text fields delimited by spaces/slashes/periods. Default field type for string variables. + 3. _TEXT_PARTIAL_ - Text fields delimited by spaces/slashes/periods with partial matching support. Note, partial + matching is expensive, so this field type should not be applied to fields with long values (like description) - 3. *TEXT_PARTIAL* - Text fields delimited by spaces/slashes/periods with partial matching support. Note, partial - matching is expensive, so this field type should not be applied to fields with long values (like description) + 4. _WORD_GRAM_ - Text fields delimited by spaces, slashes, periods, dashes, or underscores with partial matching AND + word gram support. That is, the text will be split by the delimiters and can be matched with delimited queries + matching two, three, or four length tokens in addition to single tokens. As with partial match, this type is + expensive, so should not be applied to fields with long values such as description. - 4. *WORD_GRAM* - Text fields delimited by spaces, slashes, periods, dashes, or underscores with partial matching AND - word gram support. That is, the text will be split by the delimiters and can be matched with delimited queries - matching two, three, or four length tokens in addition to single tokens. As with partial match, this type is - expensive, so should not be applied to fields with long values such as description. + 5. _BROWSE_PATH_ - Field type for browse paths. Applies specific mappings for slash delimited paths. - 5. *BROWSE_PATH* - Field type for browse paths. Applies specific mappings for slash delimited paths. + 6. _URN_ - Urn fields where each sub-component inside the urn is indexed. For instance, for a data platform urn like + "urn:li:dataplatform:kafka", it will index the platform name "kafka" and ignore the common components - 6. *URN* - Urn fields where each sub-component inside the urn is indexed. For instance, for a data platform urn like - "urn:li:dataplatform:kafka", it will index the platform name "kafka" and ignore the common components + 7. _URN_PARTIAL_ - Urn fields where each sub-component inside the urn is indexed with partial matching support. - 7. *URN_PARTIAL* - Urn fields where each sub-component inside the urn is indexed with partial matching support. + 8. _BOOLEAN_ - Boolean fields used for filtering. - 8. *BOOLEAN* - Boolean fields used for filtering. + 9. _COUNT_ - Count fields used for filtering. - 9. *COUNT* - Count fields used for filtering. - - 10. *DATETIME* - Datetime fields used to represent timestamps. + 10. _DATETIME_ - Datetime fields used to represent timestamps. - 11. *OBJECT* - Each property in an object will become an extra column in Elasticsearch and can be referenced as - `field.property` in queries. 
You should be careful to not use it on objects with many properties as it can cause a - mapping explosion in Elasticsearch. + 11. _OBJECT_ - Each property in an object will become an extra column in Elasticsearch and can be referenced as + `field.property` in queries. You should be careful to not use it on objects with many properties as it can cause a + mapping explosion in Elasticsearch. - **fieldName**: string (optional) - The name of the field in search index document. Defaults to the field name where the annotation resides. @@ -401,13 +426,13 @@ Now, when Datahub ingests Dashboards, it will index the Dashboard’s title in E Dashboards, that query will be used to search on the title index and matching Dashboards will be returned. Note, when @Searchable annotation is applied to a map, it will convert it into a list with "key.toString() -=value.toString()" as elements. This allows us to index map fields, while not increasing the number of columns indexed. +=value.toString()" as elements. This allows us to index map fields, while not increasing the number of columns indexed. This way, the keys can be queried by `aMapField:key1=value1`. -You can change this behavior by specifying the fieldType as OBJECT in the @Searchable annotation. It will put each key -into a column in Elasticsearch instead of an array of serialized kay-value pairs. This way the query would look more +You can change this behavior by specifying the fieldType as OBJECT in the @Searchable annotation. It will put each key +into a column in Elasticsearch instead of an array of serialized kay-value pairs. This way the query would look more like `aMapField.key1:value1`. As this method will increase the number of columns with each unique key - large maps can -cause a mapping explosion in Elasticsearch. You should *not* use the object fieldType if you expect your maps to get +cause a mapping explosion in Elasticsearch. You should _not_ use the object fieldType if you expect your maps to get large. #### @Relationship diff --git a/metadata-ingestion/.gitignore b/metadata-ingestion/.gitignore index 673c8e0995872..acc15c4598869 100644 --- a/metadata-ingestion/.gitignore +++ b/metadata-ingestion/.gitignore @@ -8,6 +8,7 @@ bq_credentials.json junit.*.xml /tmp *.bak +custom-package/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/metadata-ingestion/README.md b/metadata-ingestion/README.md index 3b1aae0b24f88..a0fef614528cb 100644 --- a/metadata-ingestion/README.md +++ b/metadata-ingestion/README.md @@ -176,7 +176,7 @@ The `deploy` subcommand of the `ingest` command tree allows users to upload thei datahub ingest deploy -n -c recipe.yaml ``` -By default, no schedule is done unless explicitly configured with the `--schedule` parameter. Timezones are inferred from the system time, can be overriden with `--time-zone` flag. +By default, no schedule is done unless explicitly configured with the `--schedule` parameter. Schedule timezones are UTC by default and can be overriden with `--time-zone` flag. 
```shell datahub ingest deploy -n test --schedule "0 * * * *" --time-zone "Europe/London" -c recipe.yaml ``` diff --git a/metadata-ingestion/build.gradle b/metadata-ingestion/build.gradle index c20d98cbcbb58..ea7990ab9c660 100644 --- a/metadata-ingestion/build.gradle +++ b/metadata-ingestion/build.gradle @@ -62,6 +62,14 @@ task codegen(type: Exec, dependsOn: [environmentSetup, installPackage, ':metadat commandLine 'bash', '-c', "source ${venv_name}/bin/activate && ./scripts/codegen.sh" } +task customPackageGenerate(type: Exec, dependsOn: [environmentSetup, installPackage, ':metadata-events:mxe-schemas:build']) { + def package_name = project.findProperty('package_name') + def package_version = project.findProperty('package_version') + commandLine 'bash', '-c', + "source ${venv_name}/bin/activate && " + + "./scripts/custom_package_codegen.sh '${package_name}' '${package_version}'" +} + task install(dependsOn: [installPackage, codegen]) task installDev(type: Exec, dependsOn: [install]) { diff --git a/metadata-ingestion/scripts/avro_codegen.py b/metadata-ingestion/scripts/avro_codegen.py index 29ffa571c0ac8..a9b9b4b20f5ac 100644 --- a/metadata-ingestion/scripts/avro_codegen.py +++ b/metadata-ingestion/scripts/avro_codegen.py @@ -343,8 +343,15 @@ class AspectBag(TypedDict, total=False): "schemas_path", type=click.Path(exists=True, file_okay=False), required=True ) @click.argument("outdir", type=click.Path(), required=True) +@click.option("--check-unused-aspects", is_flag=True, default=False) +@click.option("--enable-custom-loader", is_flag=True, default=True) def generate( - entity_registry: str, pdl_path: str, schemas_path: str, outdir: str + entity_registry: str, + pdl_path: str, + schemas_path: str, + outdir: str, + check_unused_aspects: bool, + enable_custom_loader: bool, ) -> None: entities = load_entity_registry(Path(entity_registry)) schemas = load_schemas(schemas_path) @@ -388,10 +395,13 @@ def generate( aspect["Aspect"]["entityDoc"] = entity.doc # Check for unused aspects. We currently have quite a few. - # unused_aspects = set(aspects.keys()) - set().union( - # {entity.keyAspect for entity in entities}, - # *(set(entity.aspects) for entity in entities), - # ) + if check_unused_aspects: + unused_aspects = set(aspects.keys()) - set().union( + {entity.keyAspect for entity in entities}, + *(set(entity.aspects) for entity in entities), + ) + if unused_aspects: + raise ValueError(f"Unused aspects: {unused_aspects}") merged_schema = merge_schemas(list(schemas.values())) write_schema_files(merged_schema, outdir) @@ -404,6 +414,35 @@ def generate( Path(outdir) / "schema_classes.py", ) + if enable_custom_loader: + # Move schema_classes.py -> _schema_classes.py + # and add a custom loader. + (Path(outdir) / "_schema_classes.py").write_text( + (Path(outdir) / "schema_classes.py").read_text() + ) + (Path(outdir) / "schema_classes.py").write_text( + """ +# This is a specialized shim layer that allows us to dynamically load custom models from elsewhere. + +import importlib +from typing import TYPE_CHECKING + +from datahub.utilities._custom_package_loader import get_custom_models_package + +_custom_package_path = get_custom_models_package() + +if TYPE_CHECKING or not _custom_package_path: + from ._schema_classes import * + + # Required explicitly because __all__ doesn't include _ prefixed names. 
+ from ._schema_classes import _Aspect, __SCHEMA_TYPES +else: + _custom_package = importlib.import_module(_custom_package_path) + globals().update(_custom_package.__dict__) + +""" + ) + # Keep a copy of a few raw avsc files. required_avsc_schemas = {"MetadataChangeEvent", "MetadataChangeProposal"} schema_save_dir = Path(outdir) / "schemas" diff --git a/metadata-ingestion/scripts/custom_package_codegen.py b/metadata-ingestion/scripts/custom_package_codegen.py new file mode 100644 index 0000000000000..4a674550d49df --- /dev/null +++ b/metadata-ingestion/scripts/custom_package_codegen.py @@ -0,0 +1,119 @@ +import re +import subprocess +import sys +from pathlib import Path + +import avro_codegen +import click + +if sys.version_info < (3, 10): + from importlib_metadata import version +else: + from importlib.metadata import version + +_avrogen_version = version("avro-gen3") + +autogen_header = """# Autogenerated by datahub's custom_package_codegen.py +# DO NOT EDIT THIS FILE DIRECTLY +""" + + +def python_package_name_normalize(name): + return re.sub(r"[-_.]+", "_", name).lower() + + +@click.command() +@click.argument( + "entity_registry", type=click.Path(exists=True, dir_okay=False), required=True +) +@click.argument( + "pdl_path", type=click.Path(exists=True, file_okay=False), required=True +) +@click.argument( + "schemas_path", type=click.Path(exists=True, file_okay=False), required=True +) +@click.argument("outdir", type=click.Path(), required=True) +@click.argument("package_name", type=str, required=True) +@click.argument("package_version", type=str, required=True) +@click.pass_context +def generate( + ctx: click.Context, + entity_registry: str, + pdl_path: str, + schemas_path: str, + outdir: str, + package_name: str, + package_version: str, +) -> None: + package_path = Path(outdir) / package_name + if package_path.is_absolute(): + raise click.UsageError("outdir must be a relative path") + + python_package_name = python_package_name_normalize(package_name) + click.echo( + f"Generating distribution {package_name} (package name {python_package_name}) at {package_path}" + ) + + src_path = package_path / "src" / python_package_name + src_path.mkdir(parents=True) + + ctx.invoke( + avro_codegen.generate, + entity_registry=entity_registry, + pdl_path=pdl_path, + schemas_path=schemas_path, + outdir=str(src_path / "models"), + enable_custom_loader=False, + ) + + (src_path / "__init__.py").write_text( + f"""{autogen_header} +__package_name__ = "{package_name}" +__version__ = "{package_version}" +""" + ) + + (package_path / "setup.py").write_text( + f"""{autogen_header} +from setuptools import setup + +_package_name = "{package_name}" +_package_version = "{package_version}" + +setup( + name=_package_name, + version=_package_version, + install_requires=[ + "avro-gen3=={_avrogen_version}", + "acryl-datahub", + ], + entry_points={{ + "datahub.custom_packages": [ + "models={python_package_name}.models.schema_classes", + ], + }}, +) +""" + ) + + # TODO add a README.md? + click.echo("Building package...") + subprocess.run(["python", "-m", "build", str(package_path)]) + + click.echo() + click.secho(f"Generated package at {package_path}", fg="green") + click.echo( + "This package should be installed alongside the main acryl-datahub package." 
+ ) + click.echo() + click.echo(f"Install the custom package locally with `pip install {package_path}`") + click.echo( + f"To enable others to use it, share the file at {package_path}/dist/*.whl and have them install it with `pip install .whl`" + ) + click.echo( + f"Alternatively, publish it to PyPI with `twine upload {package_path}/dist/*`" + ) + + +if __name__ == "__main__": + generate() diff --git a/metadata-ingestion/scripts/custom_package_codegen.sh b/metadata-ingestion/scripts/custom_package_codegen.sh new file mode 100755 index 0000000000000..aec6293a4ef45 --- /dev/null +++ b/metadata-ingestion/scripts/custom_package_codegen.sh @@ -0,0 +1,16 @@ +#!/bin/bash +set -euo pipefail + +OUTDIR=./custom-package +PACKAGE_NAME="${1:?package name is required}" +PACKAGE_VERSION="${2:?package version is required}" + +# Note: this assumes that datahub has already been built with `./gradlew build`. +DATAHUB_ROOT=.. + +SCHEMAS_PDL="$DATAHUB_ROOT/metadata-models/src/main/pegasus/com/linkedin" +SCHEMAS_AVSC="$DATAHUB_ROOT/metadata-events/mxe-schemas/src/renamed/avro/com/linkedin" +ENTITY_REGISTRY="$DATAHUB_ROOT/metadata-models/src/main/resources/entity-registry.yml" + +rm -r $OUTDIR 2>/dev/null || true +python scripts/custom_package_codegen.py $ENTITY_REGISTRY $SCHEMAS_PDL $SCHEMAS_AVSC $OUTDIR "$PACKAGE_NAME" "$PACKAGE_VERSION" diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index e748461b156ae..10e6ff554d9f8 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -36,10 +36,11 @@ def get_long_description(): "click-default-group", "PyYAML", "toml>=0.10.0", - "entrypoints", + # In Python 3.10+, importlib_metadata is included in the standard library. + "importlib_metadata>=4.0.0; python_version < '3.10'", "docker", "expandvars>=0.6.5", - "avro-gen3==0.7.10", + "avro-gen3==0.7.11", # "avro-gen3 @ git+https://github.com/acryldata/avro_gen@master#egg=avro-gen3", "avro>=1.10.2,<1.11", "python-dateutil>=2.8.0", @@ -425,7 +426,6 @@ def get_long_description(): "types-termcolor>=1.0.0", "types-Deprecated", "types-protobuf>=4.21.0.1", - "types-tzlocal", "sqlalchemy2-stubs", } @@ -643,6 +643,7 @@ def get_long_description(): "datahub = datahub.ingestion.reporting.datahub_ingestion_run_summary_provider:DatahubIngestionRunSummaryProvider", "file = datahub.ingestion.reporting.file_reporter:FileReporter", ], + "datahub.custom_packages": [], } @@ -709,6 +710,7 @@ def get_long_description(): ] ) ), + "cloud": ["acryl-datahub-cloud"], "dev": list(dev_requirements), "testing-utils": list(test_api_requirements), # To import `datahub.testing` "integration-tests": list(full_test_dev_requirements), diff --git a/metadata-ingestion/src/datahub/cli/check_cli.py b/metadata-ingestion/src/datahub/cli/check_cli.py index f20272ecd9dbf..f7996900f7a7a 100644 --- a/metadata-ingestion/src/datahub/cli/check_cli.py +++ b/metadata-ingestion/src/datahub/cli/check_cli.py @@ -131,7 +131,7 @@ def plugins(verbose: bool) -> None: """List the enabled ingestion plugins.""" click.secho("Sources:", bold=True) - click.echo(source_registry.summary(verbose=verbose)) + click.echo(source_registry.summary(verbose=verbose, col_width=25)) click.echo() click.secho("Sinks:", bold=True) click.echo(sink_registry.summary(verbose=verbose)) diff --git a/metadata-ingestion/src/datahub/cli/ingest_cli.py b/metadata-ingestion/src/datahub/cli/ingest_cli.py index 5931bf89b010b..9b5716408f3e4 100644 --- a/metadata-ingestion/src/datahub/cli/ingest_cli.py +++ b/metadata-ingestion/src/datahub/cli/ingest_cli.py @@ -10,7 +10,6 @@ 
import click import click_spinner -import tzlocal from click_default_group import DefaultGroup from tabulate import tabulate @@ -248,17 +247,17 @@ async def run_ingestion_and_check_upgrade() -> int: @click.option( "--time-zone", type=str, - help=f"Timezone for the schedule. By default uses the timezone of the current system: {tzlocal.get_localzone_name()}.", + help="Timezone for the schedule in 'America/New_York' format. Uses UTC by default.", required=False, - default=tzlocal.get_localzone_name(), + default="UTC", ) def deploy( name: str, config: str, - urn: str, + urn: Optional[str], executor_id: str, - cli_version: str, - schedule: str, + cli_version: Optional[str], + schedule: Optional[str], time_zone: str, ) -> None: """ @@ -276,8 +275,6 @@ def deploy( resolve_env_vars=False, ) - graphql_query: str - variables: dict = { "urn": urn, "name": name, @@ -296,7 +293,7 @@ def deploy( exit() logger.info("Found recipe URN, will update recipe.") - graphql_query = textwrap.dedent( + graphql_query: str = textwrap.dedent( """ mutation updateIngestionSource( $urn: String!, diff --git a/metadata-ingestion/src/datahub/ingestion/api/registry.py b/metadata-ingestion/src/datahub/ingestion/api/registry.py index 56ea716948199..7d8192aff83d5 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/registry.py +++ b/metadata-ingestion/src/datahub/ingestion/api/registry.py @@ -15,18 +15,17 @@ Union, ) -import entrypoints import typing_inspect from datahub import __package_name__ from datahub.configuration.common import ConfigurationError -T = TypeVar("T") +if sys.version_info < (3, 10): + from importlib_metadata import entry_points +else: + from importlib.metadata import entry_points -# TODO: The `entrypoints` library is in maintenance mode and is not actively developed. -# We should switch to importlib.metadata once we drop support for Python 3.7. -# See https://entrypoints.readthedocs.io/en/latest/ and -# https://docs.python.org/3/library/importlib.metadata.html. 
+T = TypeVar("T") def _is_importable(path: str) -> bool: @@ -141,16 +140,8 @@ def register_from_entrypoint(self, entry_point_key: str) -> None: self._entrypoints.append(entry_point_key) def _load_entrypoint(self, entry_point_key: str) -> None: - entry_point: entrypoints.EntryPoint - for entry_point in entrypoints.get_group_all(entry_point_key): - name = entry_point.name - - if entry_point.object_name is None: - path = entry_point.module_name - else: - path = f"{entry_point.module_name}:{entry_point.object_name}" - - self.register_lazy(name, path) + for entry_point in entry_points(group=entry_point_key): + self.register_lazy(entry_point.name, entry_point.value) def _materialize_entrypoints(self) -> None: for entry_point_key in self._entrypoints: diff --git a/metadata-ingestion/src/datahub/utilities/_custom_package_loader.py b/metadata-ingestion/src/datahub/utilities/_custom_package_loader.py new file mode 100644 index 0000000000000..1b66258557406 --- /dev/null +++ b/metadata-ingestion/src/datahub/utilities/_custom_package_loader.py @@ -0,0 +1,43 @@ +import sys +from typing import List, Optional + +if sys.version_info < (3, 10): + from importlib_metadata import EntryPoint, entry_points +else: + from importlib.metadata import EntryPoint, entry_points + + +_CUSTOM_PACKAGE_GROUP_KEY = "datahub.custom_packages" + +_MODELS_KEY = "models" + + +class CustomPackageException(Exception): + pass + + +def _get_all_registered_custom_packages() -> List[EntryPoint]: + return list(entry_points(group=_CUSTOM_PACKAGE_GROUP_KEY)) + + +def _get_custom_package_for_name(name: str) -> Optional[str]: + entrypoints = [ + ep for ep in _get_all_registered_custom_packages() if ep.name == name + ] + + if not entrypoints: + return None + + if len(entrypoints) > 1: + all_package_options = [ + entrypoint.dist.name for entrypoint in entrypoints if entrypoint.dist + ] + raise CustomPackageException( + f"Multiple custom packages registered for {name}: cannot pick between {all_package_options}" + ) + + return entrypoints[0].value + + +def get_custom_models_package() -> Optional[str]: + return _get_custom_package_for_name(_MODELS_KEY) From 930ad2c29b3d2b7aa6e80598b16420cb409da262 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Sat, 23 Sep 2023 01:52:07 +0200 Subject: [PATCH 27/37] fix(docs): fixes link to developers guides (#8809) --- docs-website/src/pages/docs/index.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs-website/src/pages/docs/index.js b/docs-website/src/pages/docs/index.js index a0462091a046d..0e8bfdcf3b9d7 100644 --- a/docs-website/src/pages/docs/index.js +++ b/docs-website/src/pages/docs/index.js @@ -180,8 +180,8 @@ const quickLinkContent = [ { title: "Developer Guides", icon: , - description: "Interact with DataHub programmatically ", - to: "/docs/cli", + description: "Interact with DataHub programmatically", + to: "/docs/api/datahub-apis", }, { title: "Feature Guides", From 7f4395945e593c430b32e816e0848ba4ed8726ab Mon Sep 17 00:00:00 2001 From: siladitya <68184387+siladitya2@users.noreply.github.com> Date: Sat, 23 Sep 2023 01:53:12 +0200 Subject: [PATCH 28/37] docs(authorization): correct policies example (#8833) Co-authored-by: si-chakraborty Co-authored-by: John Joyce Co-authored-by: Aseem Bansal --- docs/authorization/policies.md | 41 ++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/docs/authorization/policies.md b/docs/authorization/policies.md index 27d8b15e5a73a..e3606f2a3e48d 100644 --- 
a/docs/authorization/policies.md +++ b/docs/authorization/policies.md @@ -145,28 +145,31 @@ For example, the following resource filter will apply the policy to datasets, ch ```json { - "resource": { - "criteria": [ - { - "field": "resource_type", - "values": [ - "dataset", - "chart", - "dashboard" - ], - "condition": "EQUALS" - }, - { - "field": "domain", - "values": [ - "urn:li:domain:domain1" - ], - "condition": "EQUALS" + "resources": { + "filter": { + "criteria": [ + { + "field": "RESOURCE_TYPE", + "condition": "EQUALS", + "values": [ + "dataset", + "chart", + "dashboard" + ] + }, + { + "field": "DOMAIN", + "values": [ + "urn:li:domain:domain1" + ], + "condition": "EQUALS" + } + ] } - ] - } + } } ``` +Where `resources` is inside the `info` aspect of a Policy. Supported fields are as follows From e254a50b5076e13f242ab79020223e2d69cd1fd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Sat, 23 Sep 2023 01:54:34 +0200 Subject: [PATCH 29/37] fix(report): too long report causes MSG_SIZE_TOO_LARGE in kafka (#8857) --- .../ingestion/source/sql/sql_common.py | 42 +++++++------------ 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index b5458a42192fc..112defe76d957 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -367,12 +367,12 @@ def __init__(self, config: SQLCommonConfig, ctx: PipelineContext, platform: str) ) def warn(self, log: logging.Logger, key: str, reason: str) -> None: - self.report.report_warning(key, reason) + self.report.report_warning(key, reason[:100]) log.warning(f"{key} => {reason}") def error(self, log: logging.Logger, key: str, reason: str) -> None: - self.report.report_failure(key, reason) - log.error(f"{key} => {reason}") + self.report.report_failure(key, reason[:100]) + log.error(f"{key} => {reason}\n{traceback.format_exc()}") def get_inspectors(self) -> Iterable[Inspector]: # This method can be overridden in the case that you want to dynamically @@ -528,10 +528,8 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit try: self.add_profile_metadata(inspector) except Exception as e: - logger.warning( - "Failed to get enrichment data for profiler", exc_info=True - ) - self.report.report_warning( + self.warn( + logger, "profile_metadata", f"Failed to get enrichment data for profile {e}", ) @@ -638,14 +636,9 @@ def loop_tables( # noqa: C901 dataset_name, inspector, schema, table, sql_config ) except Exception as e: - logger.warning( - f"Unable to ingest {schema}.{table} due to an exception.\n {traceback.format_exc()}" - ) - self.report.report_warning( - f"{schema}.{table}", f"Ingestion error: {e}" - ) + self.warn(logger, f"{schema}.{table}", f"Ingestion error: {e}") except Exception as e: - self.report.report_failure(f"{schema}", f"Tables error: {e}") + self.error(logger, f"{schema}", f"Tables error: {e}") def add_information_for_schema(self, inspector: Inspector, schema: str) -> None: pass @@ -806,9 +799,10 @@ def _get_columns( try: columns = inspector.get_columns(table, schema) if len(columns) == 0: - self.report.report_warning(MISSING_COLUMN_INFO, dataset_name) + self.warn(logger, MISSING_COLUMN_INFO, dataset_name) except Exception as e: - self.report.report_warning( + self.warn( + logger, dataset_name, f"unable to get column information due to an error -> {e}", ) 
@@ -903,14 +897,9 @@ def loop_views( sql_config=sql_config, ) except Exception as e: - logger.warning( - f"Unable to ingest view {schema}.{view} due to an exception.\n {traceback.format_exc()}" - ) - self.report.report_warning( - f"{schema}.{view}", f"Ingestion error: {e}" - ) + self.warn(logger, f"{schema}.{view}", f"Ingestion error: {e}") except Exception as e: - self.report.report_failure(f"{schema}", f"Views error: {e}") + self.error(logger, f"{schema}", f"Views error: {e}") def _process_view( self, @@ -924,9 +913,7 @@ def _process_view( columns = inspector.get_columns(view, schema) except KeyError: # For certain types of views, we are unable to fetch the list of columns. - self.report.report_warning( - dataset_name, "unable to get schema for this view" - ) + self.warn(logger, dataset_name, "unable to get schema for this view") schema_metadata = None else: schema_fields = self.get_schema_fields(dataset_name, columns) @@ -1112,7 +1099,8 @@ def loop_profiler_requests( if partition is None and self.is_table_partitioned( database=None, schema=schema, table=table ): - self.report.report_warning( + self.warn( + logger, "profile skipped as partitioned table is empty or partition id was invalid", dataset_name, ) From 5bb9f30895ff90f9459e56c168cf6920f64fd9b8 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 22 Sep 2023 16:55:15 -0700 Subject: [PATCH 30/37] docs(ingest/lookml): add guide on debugging lkml parse errors (#8890) --- .../docs/sources/looker/lookml_post.md | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion/docs/sources/looker/lookml_post.md b/metadata-ingestion/docs/sources/looker/lookml_post.md index 818cb681c4e90..8ebbab4b9ed48 100644 --- a/metadata-ingestion/docs/sources/looker/lookml_post.md +++ b/metadata-ingestion/docs/sources/looker/lookml_post.md @@ -2,11 +2,11 @@ :::note -The integration can use an SQL parser to try to parse the tables the views depends on. +The integration can use an SQL parser to try to parse the tables the views depends on. ::: -This parsing is disabled by default, but can be enabled by setting `parse_table_names_from_sql: True`. The default parser is based on the [`sqllineage`](https://pypi.org/project/sqllineage/) package. +This parsing is disabled by default, but can be enabled by setting `parse_table_names_from_sql: True`. The default parser is based on the [`sqllineage`](https://pypi.org/project/sqllineage/) package. As this package doesn't officially support all the SQL dialects that Looker supports, the result might not be correct. You can, however, implement a custom parser and take it into use by setting the `sql_parser` configuration value. A custom SQL parser must inherit from `datahub.utilities.sql_parser.SQLParser` and must be made available to Datahub by ,for example, installing it. The configuration then needs to be set to `module_name.ClassName` of the parser. @@ -15,12 +15,14 @@ and must be made available to Datahub by ,for example, installing it. The config Looker projects support organization as multiple git repos, with [remote includes that can refer to projects that are stored in a different repo](https://cloud.google.com/looker/docs/importing-projects#include_files_from_an_imported_project). If your Looker implementation uses multi-project setup, you can configure the LookML source to pull in metadata from your remote projects as well. 
If you are using local or remote dependencies, you will see include directives in your lookml files that look like this: + ``` include: "//e_flights/views/users.view.lkml" include: "//e_commerce/public/orders.view.lkml" ``` Also, you will see projects that are being referred to listed in your `manifest.lkml` file. Something like this: + ``` project_name: this_project @@ -34,9 +36,9 @@ remote_dependency: ga_360_block { } ``` - To ingest Looker repositories that are including files defined in other projects, you will need to use the `project_dependencies` directive within the configuration section. Consider the following scenario: + - Your primary project refers to a remote project called `my_remote_project` - The remote project is homed in the GitHub repo `my_org/my_remote_project` - You have provisioned a GitHub deploy key and stored the credential in the environment variable (or UI secret), `${MY_REMOTE_PROJECT_DEPLOY_KEY}` @@ -71,6 +73,23 @@ source: :::note -This is not the same as ingesting the remote project as a primary Looker project because DataHub will not be processing the model files that might live in the remote project. If you want to additionally include the views accessible via the models in the remote project, create a second recipe where your remote project is the primary project. +This is not the same as ingesting the remote project as a primary Looker project because DataHub will not be processing the model files that might live in the remote project. If you want to additionally include the views accessible via the models in the remote project, create a second recipe where your remote project is the primary project. ::: + +### Debugging LookML Parsing Errors + +If you see messages like `my_file.view.lkml': "failed to load view file: Unable to find a matching expression for '' on line 5"` in the failure logs, it indicates a parsing error for the LookML file. + +The first thing to check is that the Looker IDE can validate the file without issues. You can check this by clicking this "Validate LookML" button in the IDE when in development mode. + +If that's not the issue, it might be because DataHub's parser, which is based on the [joshtemple/lkml](https://github.com/joshtemple/lkml) library, is slightly more strict than the official Looker parser. +Note that there's currently only one known discrepancy between the two parsers, and it's related to using [multiple colon characters](https://github.com/joshtemple/lkml/issues/82) when defining parameters. + +To check if DataHub can parse your LookML file syntax, you can use the `lkml` CLI tool. If this raises an exception, DataHub will fail to parse the file. 
+ +```sh +pip install lkml + +lkml path/to/my_file.view.lkml +``` From 5c40390a923a2a44f23290e4f1a2168820993fca Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Sat, 23 Sep 2023 05:41:42 +0530 Subject: [PATCH 31/37] feat(ingest/kafka): support metadata mapping from kafka avro schemas (#8825) Co-authored-by: Daniel Messias Co-authored-by: Deepankarkr Co-authored-by: Harshal Sheth --- .../docs/sources/kafka/kafka.md | 83 +++++++++ .../ingestion/extractor/schema_util.py | 92 ++++++++-- .../source/confluent_schema_registry.py | 22 ++- .../src/datahub/ingestion/source/kafka.py | 78 ++++++++- .../datahub/utilities/hive_schema_to_avro.py | 2 +- .../src/datahub/utilities/mapping.py | 15 +- .../integration/kafka/kafka_mces_golden.json | 164 +++++++++++++++--- .../tests/integration/kafka/value_schema.avsc | 10 +- .../tests/unit/test_kafka_source.py | 155 +++++++++++++++++ metadata-ingestion/tests/unit/test_mapping.py | 48 +++++ .../tests/unit/test_schema_util.py | 109 ++++++++++++ 11 files changed, 730 insertions(+), 48 deletions(-) diff --git a/metadata-ingestion/docs/sources/kafka/kafka.md b/metadata-ingestion/docs/sources/kafka/kafka.md index 2e8baa9516d17..9fdfc3a3af1d0 100644 --- a/metadata-ingestion/docs/sources/kafka/kafka.md +++ b/metadata-ingestion/docs/sources/kafka/kafka.md @@ -130,3 +130,86 @@ message MessageWithMap { repeated Map1Entry map_1 = 1; } ``` + +### Enriching DataHub metadata with automated meta mapping + +:::note +Meta mapping is currently only available for Avro schemas +::: + +Avro schemas are permitted to have additional attributes not defined by the specification as arbitrary metadata. A common pattern is to utilize this for business metadata. The Kafka source has the ability to transform this directly into DataHub Owners, Tags and Terms. + +#### Simple tags + +If you simply have a list of tags embedded into an Avro schema (either at the top-level or for an individual field), you can use the `schema_tags_field` config. + +Example Avro schema: + +```json +{ + "name": "sampleRecord", + "type": "record", + "tags": ["tag1", "tag2"], + "fields": [{ + "name": "field_1", + "type": "string", + "tags": ["tag3", "tag4"] + }] +} +``` + +The name of the field containing a list of tags can be configured with the `schema_tags_field` property: + +```yaml +config: + schema_tags_field: tags +``` + +#### meta mapping + +You can also map specific Avro fields into Owners, Tags and Terms using meta +mapping. + +Example Avro schema: + +```json +{ + "name": "sampleRecord", + "type": "record", + "owning_team": "@Data-Science", + "data_tier": "Bronze", + "fields": [{ + "name": "field_1", + "type": "string", + "gdpr": { + "pii": true + } + }] +} +``` + +This can be mapped to DataHub metadata with `meta_mapping` config: + +```yaml +config: + meta_mapping: + owning_team: + match: "^@(.*)" + operation: "add_owner" + config: + owner_type: group + data_tier: + match: "Bronze|Silver|Gold" + operation: "add_term" + config: + term: "{{ $match }}" + field_meta_mapping: + gdpr.pii: + match: true + operation: "add_tag" + config: + tag: "pii" +``` + +The underlying implementation is similar to [dbt meta mapping](https://datahubproject.io/docs/generated/ingestion/sources/dbt#dbt-meta-automated-mappings), which has more detailed examples that can be used for reference. 
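For illustration (this snippet is not part of the patch): the recipe options documented above feed the new hooks that this commit adds to `avro_schema_to_mce_fields` and `OperationProcessor`, visible in the `schema_util.py` and `confluent_schema_registry.py` diffs that follow. The sketch below drives those hooks directly; the argument order mirrors the usage shown in the diffs, while the sample schema, the mapping rule, and the printed attributes are assumptions made up for the example.

```python
# Illustrative sketch only: not code from this patch. It mirrors the
# OperationProcessor construction and the new avro_schema_to_mce_fields
# keyword arguments shown in the diffs below; the sample schema and the
# mapping rule are invented for the example.
from datahub.ingestion.extractor.schema_util import avro_schema_to_mce_fields
from datahub.metadata.schema_classes import OwnershipSourceTypeClass
from datahub.utilities.mapping import OperationProcessor

# A small Avro schema carrying the extra attributes the mappings act on.
sample_schema = """
{
  "type": "record",
  "name": "sampleRecord",
  "tags": ["tag1"],
  "data_tier": "Bronze",
  "fields": [{"name": "field_1", "type": "string"}]
}
"""

# Same shape as the meta_mapping block in the recipe above, as a Python dict.
meta_mapping = {
    "data_tier": {
        "match": "Bronze|Silver|Gold",
        "operation": "add_term",
        "config": {"term": "{{ $match }}"},
    }
}

processor = OperationProcessor(
    meta_mapping,
    "",                                # tag_prefix
    OwnershipSourceTypeClass.SERVICE,  # owner source type
    False,                             # strip_user_ids_from_email
    match_nested_props=True,
)

fields = avro_schema_to_mce_fields(
    sample_schema,
    meta_mapping_processor=processor,
    schema_tags_field="tags",
    tag_prefix="",
)
for field in fields:
    # Record-level tags and terms surface on the generated SchemaField objects.
    print(field.fieldPath, field.globalTags, field.glossaryTerms)
```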
+ diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py index 75de18e9037ee..4acf99a50e50e 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py @@ -4,6 +4,7 @@ import avro.schema +from datahub.emitter import mce_builder from datahub.metadata.com.linkedin.pegasus2avro.schema import ( ArrayTypeClass, BooleanTypeClass, @@ -21,7 +22,7 @@ TimeTypeClass, UnionTypeClass, ) -from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass +from datahub.utilities.mapping import Constants, OperationProcessor """A helper file for Avro schema -> MCE schema transformations""" @@ -98,7 +99,14 @@ class AvroToMceSchemaConverter: "uuid": StringTypeClass, } - def __init__(self, is_key_schema: bool, default_nullable: bool = False) -> None: + def __init__( + self, + is_key_schema: bool, + default_nullable: bool = False, + meta_mapping_processor: Optional[OperationProcessor] = None, + schema_tags_field: Optional[str] = None, + tag_prefix: Optional[str] = None, + ) -> None: # Tracks the prefix name stack for nested name generation. self._prefix_name_stack: PrefixNameStack = [self.version_string] # Tracks the fields on the current path. @@ -112,6 +120,10 @@ def __init__(self, is_key_schema: bool, default_nullable: bool = False) -> None: if is_key_schema: # Helps maintain backwards-compatibility. Annotation for any field that is part of key-schema. self._prefix_name_stack.append("[key=True]") + # Meta mapping + self._meta_mapping_processor = meta_mapping_processor + self._schema_tags_field = schema_tags_field + self._tag_prefix = tag_prefix # Map of avro schema type to the conversion handler self._avro_type_to_mce_converter_map: Dict[ avro.schema.Schema, @@ -317,7 +329,25 @@ def emit(self) -> Generator[SchemaField, None, None]: merged_props.update(self._schema.other_props) merged_props.update(schema.other_props) - tags = None + # Parse meta_mapping + meta_aspects: Dict[str, Any] = {} + if self._converter._meta_mapping_processor: + meta_aspects = self._converter._meta_mapping_processor.process( + merged_props + ) + + tags: List[str] = [] + if self._converter._schema_tags_field: + for tag in merged_props.get(self._converter._schema_tags_field, []): + tags.append(self._converter._tag_prefix + tag) + + meta_tags_aspect = meta_aspects.get(Constants.ADD_TAG_OPERATION) + if meta_tags_aspect: + tags += [ + tag_association.tag[len("urn:li:tag:") :] + for tag_association in meta_tags_aspect.tags + ] + if "deprecated" in merged_props: description = ( f"DEPRECATED: {merged_props['deprecated']}\n" @@ -325,9 +355,13 @@ def emit(self) -> Generator[SchemaField, None, None]: if description else "" ) - tags = GlobalTagsClass( - tags=[TagAssociationClass(tag="urn:li:tag:Deprecated")] - ) + tags.append("Deprecated") + + tags_aspect = None + if tags: + tags_aspect = mce_builder.make_global_tag_aspect_with_tag_list(tags) + + meta_terms_aspect = meta_aspects.get(Constants.ADD_TERM_OPERATION) logical_type_name: Optional[str] = ( # logicalType nested inside type @@ -349,7 +383,8 @@ def emit(self) -> Generator[SchemaField, None, None]: recursive=False, nullable=self._converter._is_nullable(schema), isPartOfKey=self._converter._is_key_schema, - globalTags=tags, + globalTags=tags_aspect, + glossaryTerms=meta_terms_aspect, jsonProps=json.dumps(merged_props) if merged_props else None, ) yield field @@ -447,7 +482,9 @@ def 
_gen_from_non_field_nested_schemas( actual_schema = self._get_underlying_type_if_option_as_union(schema, schema) with AvroToMceSchemaConverter.SchemaFieldEmissionContextManager( - schema, actual_schema, self + schema, + actual_schema, + self, ) as fe_schema: if isinstance( actual_schema, @@ -478,7 +515,9 @@ def _gen_non_nested_to_mce_fields( ) -> Generator[SchemaField, None, None]: """Handles generation of MCE SchemaFields for non-nested AVRO types.""" with AvroToMceSchemaConverter.SchemaFieldEmissionContextManager( - schema, schema, self + schema, + schema, + self, ) as non_nested_emitter: yield from non_nested_emitter.emit() @@ -496,9 +535,12 @@ def _to_mce_fields( @classmethod def to_mce_fields( cls, - avro_schema_string: str, + avro_schema: avro.schema.Schema, is_key_schema: bool, default_nullable: bool = False, + meta_mapping_processor: Optional[OperationProcessor] = None, + schema_tags_field: Optional[str] = None, + tag_prefix: Optional[str] = None, ) -> Generator[SchemaField, None, None]: """ Converts a key or value type AVRO schema string to appropriate MCE SchemaFields. @@ -506,8 +548,14 @@ def to_mce_fields( :param is_key_schema: True if it is a key-schema. :return: An MCE SchemaField generator. """ - avro_schema = avro.schema.parse(avro_schema_string) - converter = cls(is_key_schema, default_nullable) + # avro_schema = avro.schema.parse(avro_schema) + converter = cls( + is_key_schema, + default_nullable, + meta_mapping_processor, + schema_tags_field, + tag_prefix, + ) yield from converter._to_mce_fields(avro_schema) @@ -516,28 +564,40 @@ def to_mce_fields( def avro_schema_to_mce_fields( - avro_schema_string: str, + avro_schema: Union[avro.schema.Schema, str], is_key_schema: bool = False, default_nullable: bool = False, + meta_mapping_processor: Optional[OperationProcessor] = None, + schema_tags_field: Optional[str] = None, + tag_prefix: Optional[str] = None, swallow_exceptions: bool = True, ) -> List[SchemaField]: """ Converts an avro schema into schema fields compatible with MCE. - :param avro_schema_string: String representation of the AVRO schema. + :param avro_schema: AVRO schema, either as a string or as an avro.schema.Schema object. :param is_key_schema: True if it is a key-schema. Default is False (value-schema). :param swallow_exceptions: True if the caller wants exceptions to be suppressed + :param action_processor: Optional OperationProcessor to be used for meta mappings :return: The list of MCE compatible SchemaFields. 
""" try: + if isinstance(avro_schema, str): + avro_schema = avro.schema.parse(avro_schema) + return list( AvroToMceSchemaConverter.to_mce_fields( - avro_schema_string, is_key_schema, default_nullable + avro_schema, + is_key_schema, + default_nullable, + meta_mapping_processor, + schema_tags_field, + tag_prefix, ) ) except Exception: if swallow_exceptions: - logger.exception(f"Failed to parse {avro_schema_string} into mce fields.") + logger.exception(f"Failed to parse {avro_schema} into mce fields.") return [] else: raise diff --git a/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py b/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py index 0bdcb115b377c..54475cb509621 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py +++ b/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py @@ -4,6 +4,7 @@ from hashlib import md5 from typing import Any, List, Optional, Set, Tuple +import avro.schema import jsonref from confluent_kafka.schema_registry.schema_registry_client import ( RegisteredSchema, @@ -22,6 +23,8 @@ SchemaField, SchemaMetadata, ) +from datahub.metadata.schema_classes import OwnershipSourceTypeClass +from datahub.utilities.mapping import OperationProcessor logger = logging.getLogger(__name__) @@ -59,6 +62,14 @@ def __init__( except Exception as e: logger.warning(f"Failed to get subjects from schema registry: {e}") + self.field_meta_processor = OperationProcessor( + self.source_config.field_meta_mapping, + self.source_config.tag_prefix, + OwnershipSourceTypeClass.SERVICE, + self.source_config.strip_user_ids_from_email, + match_nested_props=True, + ) + @classmethod def create( cls, source_config: KafkaSourceConfig, report: KafkaSourceReport @@ -290,10 +301,19 @@ def _get_schema_fields( fields: List[SchemaField] = [] if schema.schema_type == "AVRO": cleaned_str: str = self.get_schema_str_replace_confluent_ref_avro(schema) + avro_schema = avro.schema.parse(cleaned_str) + # "value.id" or "value.[type=string]id" fields = schema_util.avro_schema_to_mce_fields( - cleaned_str, is_key_schema=is_key_schema + avro_schema, + is_key_schema=is_key_schema, + meta_mapping_processor=self.field_meta_processor + if self.source_config.enable_meta_mapping + else None, + schema_tags_field=self.source_config.schema_tags_field, + tag_prefix=self.source_config.tag_prefix, ) + elif schema.schema_type == "PROTOBUF": imported_schemas: List[ ProtobufSchema diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka.py b/metadata-ingestion/src/datahub/ingestion/source/kafka.py index 61f6103347eb3..566304e1999b7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka.py @@ -5,6 +5,7 @@ from enum import Enum from typing import Any, Dict, Iterable, List, Optional, Type +import avro.schema import confluent_kafka import confluent_kafka.admin import pydantic @@ -18,6 +19,7 @@ from datahub.configuration.common import AllowDenyPattern from datahub.configuration.kafka import KafkaConsumerConnectionConfig from datahub.configuration.source_common import DatasetSourceConfigMixin +from datahub.emitter import mce_builder from datahub.emitter.mce_builder import ( make_data_platform_urn, make_dataplatform_instance_urn, @@ -56,8 +58,10 @@ DataPlatformInstanceClass, DatasetPropertiesClass, KafkaSchemaClass, + OwnershipSourceTypeClass, SubTypesClass, ) +from datahub.utilities.mapping import Constants, OperationProcessor from 
datahub.utilities.registries.domain_registry import DomainRegistry logger = logging.getLogger(__name__) @@ -89,6 +93,29 @@ class KafkaSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin): default="datahub.ingestion.source.confluent_schema_registry.ConfluentSchemaRegistry", description="The fully qualified implementation class(custom) that implements the KafkaSchemaRegistryBase interface.", ) + schema_tags_field = pydantic.Field( + default="tags", + description="The field name in the schema metadata that contains the tags to be added to the dataset.", + ) + enable_meta_mapping = pydantic.Field( + default=True, + description="When enabled, applies the mappings that are defined through the meta_mapping directives.", + ) + meta_mapping: Dict = pydantic.Field( + default={}, + description="mapping rules that will be executed against top-level schema properties. Refer to the section below on meta automated mappings.", + ) + field_meta_mapping: Dict = pydantic.Field( + default={}, + description="mapping rules that will be executed against field-level schema properties. Refer to the section below on meta automated mappings.", + ) + strip_user_ids_from_email: bool = pydantic.Field( + default=False, + description="Whether or not to strip email id while adding owners using meta mappings.", + ) + tag_prefix: str = pydantic.Field( + default="", description="Prefix added to tags during ingestion." + ) ignore_warnings_on_schema_type: bool = pydantic.Field( default=False, description="Disables warnings reported for non-AVRO/Protobuf value or key schemas if set.", @@ -167,6 +194,14 @@ def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext): graph=self.ctx.graph, ) + self.meta_processor = OperationProcessor( + self.source_config.meta_mapping, + self.source_config.tag_prefix, + OwnershipSourceTypeClass.SERVICE, + self.source_config.strip_user_ids_from_email, + match_nested_props=True, + ) + def init_kafka_admin_client(self) -> None: try: # TODO: Do we require separate config than existing consumer_config ? @@ -227,7 +262,6 @@ def _extract_record( logger.debug(f"topic = {topic}") AVRO = "AVRO" - DOC_KEY = "doc" # 1. Create the default dataset snapshot for the topic. dataset_name = topic @@ -261,8 +295,8 @@ def _extract_record( topic, topic_detail, extra_topic_config ) - # 4. Set dataset's description as top level doc, if topic schema type is avro - description = None + # 4. Set dataset's description, tags, ownership, etc, if topic schema type is avro + description: Optional[str] = None if ( schema_metadata is not None and isinstance(schema_metadata.platformSchema, KafkaSchemaClass) @@ -271,9 +305,41 @@ def _extract_record( # Point to note: # In Kafka documentSchema and keySchema both contains "doc" field. # DataHub Dataset "description" field is mapped to documentSchema's "doc" field. 
- schema = json.loads(schema_metadata.platformSchema.documentSchema) - if isinstance(schema, dict): - description = schema.get(DOC_KEY) + + avro_schema = avro.schema.parse( + schema_metadata.platformSchema.documentSchema + ) + description = avro_schema.doc + # set the tags + all_tags: List[str] = [] + for tag in avro_schema.other_props.get( + self.source_config.schema_tags_field, [] + ): + all_tags.append(self.source_config.tag_prefix + tag) + + if self.source_config.enable_meta_mapping: + meta_aspects = self.meta_processor.process(avro_schema.other_props) + + meta_owners_aspects = meta_aspects.get(Constants.ADD_OWNER_OPERATION) + if meta_owners_aspects: + dataset_snapshot.aspects.append(meta_owners_aspects) + + meta_terms_aspect = meta_aspects.get(Constants.ADD_TERM_OPERATION) + if meta_terms_aspect: + dataset_snapshot.aspects.append(meta_terms_aspect) + + # Create the tags aspect + meta_tags_aspect = meta_aspects.get(Constants.ADD_TAG_OPERATION) + if meta_tags_aspect: + all_tags += [ + tag_association.tag[len("urn:li:tag:") :] + for tag_association in meta_tags_aspect.tags + ] + + if all_tags: + dataset_snapshot.aspects.append( + mce_builder.make_global_tag_aspect_with_tag_list(all_tags) + ) dataset_properties = DatasetPropertiesClass( name=topic, customProperties=custom_props, description=description diff --git a/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py b/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py index 8865254e88579..4fcef990ae4f4 100644 --- a/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py +++ b/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py @@ -269,7 +269,7 @@ def get_schema_fields_for_hive_column( hive_column_name=hive_column_name, hive_column_type=hive_column_type ) schema_fields = avro_schema_to_mce_fields( - avro_schema_string=json.dumps(avro_schema_json), + avro_schema=json.dumps(avro_schema_json), default_nullable=default_nullable, swallow_exceptions=False, ) diff --git a/metadata-ingestion/src/datahub/utilities/mapping.py b/metadata-ingestion/src/datahub/utilities/mapping.py index 32666ceecdf85..793eccfb22c7e 100644 --- a/metadata-ingestion/src/datahub/utilities/mapping.py +++ b/metadata-ingestion/src/datahub/utilities/mapping.py @@ -1,6 +1,8 @@ import contextlib import logging +import operator import re +from functools import reduce from typing import Any, Dict, List, Match, Optional, Union from datahub.emitter import mce_builder @@ -94,11 +96,13 @@ def __init__( tag_prefix: str = "", owner_source_type: Optional[str] = None, strip_owner_email_id: bool = False, + match_nested_props: bool = False, ): self.operation_defs = operation_defs self.tag_prefix = tag_prefix self.strip_owner_email_id = strip_owner_email_id self.owner_source_type = owner_source_type + self.match_nested_props = match_nested_props def process(self, raw_props: Dict[str, Any]) -> Dict[str, Any]: # Defining the following local variables - @@ -121,9 +125,18 @@ def process(self, raw_props: Dict[str, Any]) -> Dict[str, Any]: ) if not operation_type or not operation_config: continue + raw_props_value = raw_props.get(operation_key) + if not raw_props_value and self.match_nested_props: + try: + raw_props_value = reduce( + operator.getitem, operation_key.split("."), raw_props + ) + except KeyError: + pass + maybe_match = self.get_match( self.operation_defs[operation_key][Constants.MATCH], - raw_props.get(operation_key), + raw_props_value, ) if maybe_match is not None: operation = self.get_operation_value( diff --git 
a/metadata-ingestion/tests/integration/kafka/kafka_mces_golden.json b/metadata-ingestion/tests/integration/kafka/kafka_mces_golden.json index e51eaa10b8b10..7dd328168e84c 100644 --- a/metadata-ingestion/tests/integration/kafka/kafka_mces_golden.json +++ b/metadata-ingestion/tests/integration/kafka/kafka_mces_golden.json @@ -86,7 +86,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "kafka-test" + "runId": "kafka-test", + "lastRunId": "no-run-id-provided" } }, { @@ -103,7 +104,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "kafka-test" + "runId": "kafka-test", + "lastRunId": "no-run-id-provided" } }, { @@ -118,7 +120,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "kafka-test" + "runId": "kafka-test", + "lastRunId": "no-run-id-provided" } }, { @@ -144,10 +147,10 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "hash": "cc452cf58242cdb9d09cf33d657497d8", + "hash": "a79a2fe3adab60b21d272a9cc3e93595", "platformSchema": { "com.linkedin.pegasus2avro.schema.KafkaSchema": { - "documentSchema": "{\"type\":\"record\",\"name\":\"CreateUserRequest\",\"namespace\":\"io.codebrews.createuserrequest\",\"doc\":\"Value schema for kafka topic\",\"fields\":[{\"name\":\"email\",\"type\":\"string\"},{\"name\":\"firstName\",\"type\":\"string\"},{\"name\":\"lastName\",\"type\":\"string\"}]}", + "documentSchema": "{\"type\":\"record\",\"name\":\"CreateUserRequest\",\"namespace\":\"io.codebrews.createuserrequest\",\"doc\":\"Value schema for kafka topic\",\"fields\":[{\"name\":\"email\",\"type\":\"string\",\"tags\":[\"Email\"]},{\"name\":\"firstName\",\"type\":\"string\",\"tags\":[\"Name\"]},{\"name\":\"lastName\",\"type\":\"string\",\"tags\":[\"Name\"]}],\"tags\":[\"PII\"]}", "documentSchemaType": "AVRO", "keySchema": "{\"type\":\"record\",\"name\":\"UserKey\",\"namespace\":\"io.codebrews.createuserrequest\",\"doc\":\"Key schema for kafka topic\",\"fields\":[{\"name\":\"id\",\"type\":\"long\"},{\"name\":\"namespace\",\"type\":\"string\"}]}", "keySchemaType": "AVRO" @@ -188,7 +191,15 @@ }, "nativeDataType": "email", "recursive": false, - "isPartOfKey": false + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Email" + } + ] + }, + "isPartOfKey": false, + "jsonProps": "{\"tags\": [\"Email\"]}" }, { "fieldPath": "[version=2.0].[type=CreateUserRequest].[type=string].firstName", @@ -200,7 +211,15 @@ }, "nativeDataType": "firstName", "recursive": false, - "isPartOfKey": false + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Name" + } + ] + }, + "isPartOfKey": false, + "jsonProps": "{\"tags\": [\"Name\"]}" }, { "fieldPath": "[version=2.0].[type=CreateUserRequest].[type=string].lastName", @@ -212,7 +231,15 @@ }, "nativeDataType": "lastName", "recursive": false, - "isPartOfKey": false + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Name" + } + ] + }, + "isPartOfKey": false, + "jsonProps": "{\"tags\": [\"Name\"]}" } ] } @@ -224,6 +251,15 @@ ] } }, + { + "com.linkedin.pegasus2avro.common.GlobalTags": { + "tags": [ + { + "tag": "urn:li:tag:PII" + } + ] + } + }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { @@ -246,7 +282,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "kafka-test" + "runId": "kafka-test", + "lastRunId": "no-run-id-provided" } }, { @@ -263,7 +300,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "kafka-test" + "runId": "kafka-test", + "lastRunId": "no-run-id-provided" } }, { @@ -280,7 +318,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - 
"runId": "kafka-test" + "runId": "kafka-test", + "lastRunId": "no-run-id-provided" } }, { @@ -295,7 +334,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "kafka-test" + "runId": "kafka-test", + "lastRunId": "no-run-id-provided" } }, { @@ -321,10 +361,10 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "hash": "dc1cf32c2688cc3d2d27fe6e856f06d2", + "hash": "62c7c400ec5760797a59c45e59c2f2dc", "platformSchema": { "com.linkedin.pegasus2avro.schema.KafkaSchema": { - "documentSchema": "{\"type\":\"record\",\"name\":\"CreateUserRequest\",\"namespace\":\"io.codebrews.createuserrequest\",\"doc\":\"Value schema for kafka topic\",\"fields\":[{\"name\":\"email\",\"type\":\"string\"},{\"name\":\"firstName\",\"type\":\"string\"},{\"name\":\"lastName\",\"type\":\"string\"}]}", + "documentSchema": "{\"type\":\"record\",\"name\":\"CreateUserRequest\",\"namespace\":\"io.codebrews.createuserrequest\",\"doc\":\"Value schema for kafka topic\",\"fields\":[{\"name\":\"email\",\"type\":\"string\",\"tags\":[\"Email\"]},{\"name\":\"firstName\",\"type\":\"string\",\"tags\":[\"Name\"]},{\"name\":\"lastName\",\"type\":\"string\",\"tags\":[\"Name\"]}],\"tags\":[\"PII\"]}", "documentSchemaType": "AVRO", "keySchema": "\"string\"", "keySchemaType": "AVRO" @@ -353,7 +393,15 @@ }, "nativeDataType": "email", "recursive": false, - "isPartOfKey": false + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Email" + } + ] + }, + "isPartOfKey": false, + "jsonProps": "{\"tags\": [\"Email\"]}" }, { "fieldPath": "[version=2.0].[type=CreateUserRequest].[type=string].firstName", @@ -365,7 +413,15 @@ }, "nativeDataType": "firstName", "recursive": false, - "isPartOfKey": false + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Name" + } + ] + }, + "isPartOfKey": false, + "jsonProps": "{\"tags\": [\"Name\"]}" }, { "fieldPath": "[version=2.0].[type=CreateUserRequest].[type=string].lastName", @@ -377,7 +433,15 @@ }, "nativeDataType": "lastName", "recursive": false, - "isPartOfKey": false + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Name" + } + ] + }, + "isPartOfKey": false, + "jsonProps": "{\"tags\": [\"Name\"]}" } ] } @@ -389,6 +453,15 @@ ] } }, + { + "com.linkedin.pegasus2avro.common.GlobalTags": { + "tags": [ + { + "tag": "urn:li:tag:PII" + } + ] + } + }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { @@ -411,7 +484,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "kafka-test" + "runId": "kafka-test", + "lastRunId": "no-run-id-provided" } }, { @@ -428,7 +502,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "kafka-test" + "runId": "kafka-test", + "lastRunId": "no-run-id-provided" } }, { @@ -443,7 +518,56 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "kafka-test" + "runId": "kafka-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Email", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Email" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "kafka-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Name", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Name" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "kafka-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:PII", + "changeType": "UPSERT", + "aspectName": 
"tagKey", + "aspect": { + "json": { + "name": "PII" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "kafka-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/kafka/value_schema.avsc b/metadata-ingestion/tests/integration/kafka/value_schema.avsc index 788cb94c47a72..8cb6c42cb03f4 100644 --- a/metadata-ingestion/tests/integration/kafka/value_schema.avsc +++ b/metadata-ingestion/tests/integration/kafka/value_schema.avsc @@ -3,18 +3,22 @@ "type": "record", "name": "CreateUserRequest", "doc": "Value schema for kafka topic", + "tags": ["PII"], "fields": [ { "name": "email", - "type": "string" + "type": "string", + "tags": ["Email"] }, { "name": "firstName", - "type": "string" + "type": "string", + "tags": ["Name"] }, { "name": "lastName", - "type": "string" + "type": "string", + "tags": ["Name"] } ] } diff --git a/metadata-ingestion/tests/unit/test_kafka_source.py b/metadata-ingestion/tests/unit/test_kafka_source.py index b48ebf12ee37a..603068780d0a7 100644 --- a/metadata-ingestion/tests/unit/test_kafka_source.py +++ b/metadata-ingestion/tests/unit/test_kafka_source.py @@ -1,3 +1,4 @@ +import json from itertools import chain from typing import Dict, Optional, Tuple from unittest.mock import MagicMock, patch @@ -7,11 +8,17 @@ RegisteredSchema, Schema, ) +from freezegun import freeze_time from datahub.emitter.mce_builder import ( + OwnerType, make_dataplatform_instance_urn, make_dataset_urn, make_dataset_urn_with_platform_instance, + make_global_tag_aspect_with_tag_list, + make_glossary_terms_aspect_from_urn_list, + make_owner_urn, + make_ownership_aspect_from_urn_list, ) from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.workunit import MetadataWorkUnit @@ -20,7 +27,10 @@ from datahub.metadata.schema_classes import ( BrowsePathsClass, DataPlatformInstanceClass, + GlobalTagsClass, + GlossaryTermsClass, KafkaSchemaClass, + OwnershipClass, SchemaMetadataClass, ) @@ -521,3 +531,148 @@ def test_kafka_source_succeeds_with_describe_configs_error( mock_admin_client_instance.describe_configs.assert_called_once() assert len(workunits) == 2 + + +@freeze_time("2023-09-20 10:00:00") +@patch( + "datahub.ingestion.source.confluent_schema_registry.SchemaRegistryClient", + autospec=True, +) +@patch("datahub.ingestion.source.kafka.confluent_kafka.Consumer", autospec=True) +def test_kafka_source_topic_meta_mappings( + mock_kafka_consumer, mock_schema_registry_client, mock_admin_client +): + # Setup the topic to key/value schema mappings for all types of schema registry subject name strategies. 
+ # ,) + topic_subject_schema_map: Dict[str, Tuple[RegisteredSchema, RegisteredSchema]] = { + "topic1": ( + RegisteredSchema( + schema_id="schema_id_2", + schema=Schema( + schema_str='{"type":"record", "name":"Topic1Key", "namespace": "test.acryl", "fields": [{"name":"t1key", "type": "string"}]}', + schema_type="AVRO", + ), + subject="topic1-key", + version=1, + ), + RegisteredSchema( + schema_id="schema_id_1", + schema=Schema( + schema_str=json.dumps( + { + "type": "record", + "name": "Topic1Value", + "namespace": "test.acryl", + "fields": [{"name": "t1value", "type": "string"}], + "owner": "@charles", + "business_owner": "jdoe.last@gmail.com", + "data_governance.team_owner": "Finance", + "has_pii": True, + "int_property": 1, + "double_property": 2.5, + } + ), + schema_type="AVRO", + ), + subject="topic1-value", + version=1, + ), + ) + } + + # Mock the kafka consumer + mock_kafka_instance = mock_kafka_consumer.return_value + mock_cluster_metadata = MagicMock() + mock_cluster_metadata.topics = {k: None for k in topic_subject_schema_map.keys()} + mock_kafka_instance.list_topics.return_value = mock_cluster_metadata + + # Mock the schema registry client + # - mock get_subjects: all subjects in topic_subject_schema_map + mock_schema_registry_client.return_value.get_subjects.return_value = [ + v.subject for v in chain(*topic_subject_schema_map.values()) + ] + + # - mock get_latest_version + def mock_get_latest_version(subject_name: str) -> Optional[RegisteredSchema]: + for registered_schema in chain(*topic_subject_schema_map.values()): + if registered_schema.subject == subject_name: + return registered_schema + return None + + mock_schema_registry_client.return_value.get_latest_version = ( + mock_get_latest_version + ) + + ctx = PipelineContext(run_id="test1") + kafka_source = KafkaSource.create( + { + "connection": {"bootstrap": "localhost:9092"}, + "meta_mapping": { + "owner": { + "match": "^@(.*)", + "operation": "add_owner", + "config": {"owner_type": "user"}, + }, + "business_owner": { + "match": ".*", + "operation": "add_owner", + "config": {"owner_type": "user"}, + }, + "has_pii": { + "match": True, + "operation": "add_tag", + "config": {"tag": "has_pii_test"}, + }, + "int_property": { + "match": 1, + "operation": "add_tag", + "config": {"tag": "int_meta_property"}, + }, + "double_property": { + "match": 2.5, + "operation": "add_term", + "config": {"term": "double_meta_property"}, + }, + "data_governance.team_owner": { + "match": "Finance", + "operation": "add_term", + "config": {"term": "Finance_test"}, + }, + }, + }, + ctx, + ) + workunits = [w for w in kafka_source.get_workunits()] + assert len(workunits) == 4 + mce = workunits[0].metadata + assert isinstance(mce, MetadataChangeEvent) + + ownership_aspect = [ + asp for asp in mce.proposedSnapshot.aspects if isinstance(asp, OwnershipClass) + ][0] + assert ownership_aspect == make_ownership_aspect_from_urn_list( + [ + make_owner_urn("charles", OwnerType.USER), + make_owner_urn("jdoe.last@gmail.com", OwnerType.USER), + ], + "SERVICE", + ) + + tags_aspect = [ + asp for asp in mce.proposedSnapshot.aspects if isinstance(asp, GlobalTagsClass) + ][0] + assert tags_aspect == make_global_tag_aspect_with_tag_list( + ["has_pii_test", "int_meta_property"] + ) + + terms_aspect = [ + asp + for asp in mce.proposedSnapshot.aspects + if isinstance(asp, GlossaryTermsClass) + ][0] + assert terms_aspect == make_glossary_terms_aspect_from_urn_list( + [ + "urn:li:glossaryTerm:Finance_test", + "urn:li:glossaryTerm:double_meta_property", + ] + ) diff --git 
a/metadata-ingestion/tests/unit/test_mapping.py b/metadata-ingestion/tests/unit/test_mapping.py index aea1d8ddd9a54..d69dd4a8a96b0 100644 --- a/metadata-ingestion/tests/unit/test_mapping.py +++ b/metadata-ingestion/tests/unit/test_mapping.py @@ -231,3 +231,51 @@ def test_operation_processor_advanced_matching_tags(): tag_aspect: GlobalTagsClass = aspect_map["add_tag"] assert len(tag_aspect.tags) == 1 assert tag_aspect.tags[0].tag == "urn:li:tag:case_4567" + + +def test_operation_processor_matching_nested_props(): + raw_props = { + "gdpr": { + "pii": True, + }, + } + processor = OperationProcessor( + operation_defs={ + "gdpr.pii": { + "match": True, + "operation": "add_tag", + "config": {"tag": "pii"}, + }, + }, + owner_source_type="SOURCE_CONTROL", + match_nested_props=True, + ) + aspect_map = processor.process(raw_props) + assert "add_tag" in aspect_map + + tag_aspect: GlobalTagsClass = aspect_map["add_tag"] + assert len(tag_aspect.tags) == 1 + assert tag_aspect.tags[0].tag == "urn:li:tag:pii" + + +def test_operation_processor_matching_dot_props(): + raw_props = { + "gdpr.pii": True, + } + processor = OperationProcessor( + operation_defs={ + "gdpr.pii": { + "match": True, + "operation": "add_tag", + "config": {"tag": "pii"}, + }, + }, + owner_source_type="SOURCE_CONTROL", + match_nested_props=True, + ) + aspect_map = processor.process(raw_props) + assert "add_tag" in aspect_map + + tag_aspect: GlobalTagsClass = aspect_map["add_tag"] + assert len(tag_aspect.tags) == 1 + assert tag_aspect.tags[0].tag == "urn:li:tag:pii" diff --git a/metadata-ingestion/tests/unit/test_schema_util.py b/metadata-ingestion/tests/unit/test_schema_util.py index e81c335e178a2..0a111d700cf8c 100644 --- a/metadata-ingestion/tests/unit/test_schema_util.py +++ b/metadata-ingestion/tests/unit/test_schema_util.py @@ -6,7 +6,12 @@ from typing import Dict, List, Type import pytest +from freezegun import freeze_time +from datahub.emitter.mce_builder import ( + make_global_tag_aspect_with_tag_list, + make_glossary_terms_aspect_from_urn_list, +) from datahub.ingestion.extractor.schema_util import avro_schema_to_mce_fields from datahub.metadata.com.linkedin.pegasus2avro.schema import ( DateTypeClass, @@ -15,6 +20,7 @@ StringTypeClass, TimeTypeClass, ) +from datahub.utilities.mapping import OperationProcessor logger = logging.getLogger(__name__) @@ -771,3 +777,106 @@ def test_ignore_exceptions(): """ fields: List[SchemaField] = avro_schema_to_mce_fields(malformed_schema) assert not fields + + +@freeze_time("2023-09-12") +def test_avro_schema_to_mce_fields_with_field_meta_mapping(): + schema = """ +{ + "type": "record", + "name": "Payment", + "namespace": "some.event.namespace", + "fields": [ + {"name": "id", "type": "string"}, + {"name": "amount", "type": "double", "doc": "amountDoc","has_pii": "False"}, + {"name": "name","type": "string","default": "","has_pii": "True"}, + {"name": "phoneNumber", + "type": [{ + "type": "record", + "name": "PhoneNumber", + "doc": "testDoc", + "fields": [{ + "name": "areaCode", + "type": "string", + "doc": "areaCodeDoc", + "default": "" + }, { + "name": "countryCode", + "type": "string", + "default": "" + }, { + "name": "prefix", + "type": "string", + "default": "" + }, { + "name": "number", + "type": "string", + "default": "" + }] + }, + "null" + ], + "default": "null", + "has_pii": "True", + "glossary_field": "TERM_PhoneNumber" + }, + {"name": "address", + "type": [{ + "type": "record", + "name": "Address", + "fields": [{ + "name": "street", + "type": "string", + "default": "" + }] + }, + 
"null" + ], + "doc": "addressDoc", + "default": "null", + "has_pii": "True", + "glossary_field": "TERM_Address" + } + ] +} +""" + processor = OperationProcessor( + operation_defs={ + "has_pii": { + "match": "True", + "operation": "add_tag", + "config": {"tag": "has_pii_test"}, + }, + "glossary_field": { + "match": "TERM_(.*)", + "operation": "add_term", + "config": {"term": "{{ $match }}"}, + }, + } + ) + fields = avro_schema_to_mce_fields(schema, meta_mapping_processor=processor) + expected_field_paths = [ + "[version=2.0].[type=Payment].[type=string].id", + "[version=2.0].[type=Payment].[type=double].amount", + "[version=2.0].[type=Payment].[type=string].name", + "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber", + "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber.[type=string].areaCode", + "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber.[type=string].countryCode", + "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber.[type=string].prefix", + "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber.[type=string].number", + "[version=2.0].[type=Payment].[type=Address].address", + "[version=2.0].[type=Payment].[type=Address].address.[type=string].street", + ] + assert_field_paths_match(fields, expected_field_paths) + + pii_tag_aspect = make_global_tag_aspect_with_tag_list(["has_pii_test"]) + assert fields[1].globalTags is None + assert fields[2].globalTags == pii_tag_aspect + assert fields[3].globalTags == pii_tag_aspect + assert fields[3].glossaryTerms == make_glossary_terms_aspect_from_urn_list( + ["urn:li:glossaryTerm:PhoneNumber"] + ) + assert fields[8].globalTags == pii_tag_aspect + assert fields[8].glossaryTerms == make_glossary_terms_aspect_from_urn_list( + ["urn:li:glossaryTerm:Address"] + ) From 501522d891a4c608784e0c92c32b99d67e80f4b0 Mon Sep 17 00:00:00 2001 From: Shubham Jagtap <132359390+shubhamjagtap639@users.noreply.github.com> Date: Sat, 23 Sep 2023 05:42:48 +0530 Subject: [PATCH 32/37] feat(ingest/kafka-connect): Lineage for Kafka Connect > Snowflake (#8811) Co-authored-by: Harshal Sheth --- .../docs/sources/kafka-connect/README.md | 2 +- metadata-ingestion/setup.py | 1 + .../datahub/ingestion/source/kafka_connect.py | 108 +++++++++++++ ...ka_connect_snowflake_sink_mces_golden.json | 152 ++++++++++++++++++ .../kafka-connect/test_kafka_connect.py | 100 ++++++++++++ 5 files changed, 362 insertions(+), 1 deletion(-) create mode 100644 metadata-ingestion/tests/integration/kafka-connect/kafka_connect_snowflake_sink_mces_golden.json diff --git a/metadata-ingestion/docs/sources/kafka-connect/README.md b/metadata-ingestion/docs/sources/kafka-connect/README.md index 5031bff5a3fac..e4f64c62914c5 100644 --- a/metadata-ingestion/docs/sources/kafka-connect/README.md +++ b/metadata-ingestion/docs/sources/kafka-connect/README.md @@ -21,4 +21,4 @@ This ingestion source maps the following Source System Concepts to DataHub Conce Works only for - Source connectors: JDBC, Debezium, Mongo and Generic connectors with user-defined lineage graph -- Sink connectors: BigQuery +- Sink connectors: BigQuery, Confluent S3, Snowflake diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 10e6ff554d9f8..a0d16aa92ad9b 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -505,6 +505,7 @@ def get_long_description(): "nifi", "vertica", "mode", + "kafka-connect", ] if plugin for dependency in plugins[plugin] diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py 
b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py index b3fa5e3401c07..f3344782917ab 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py @@ -901,6 +901,108 @@ def _extract_lineages(self): return +@dataclass +class SnowflakeSinkConnector: + connector_manifest: ConnectorManifest + report: KafkaConnectSourceReport + + def __init__( + self, connector_manifest: ConnectorManifest, report: KafkaConnectSourceReport + ) -> None: + self.connector_manifest = connector_manifest + self.report = report + self._extract_lineages() + + @dataclass + class SnowflakeParser: + database_name: str + schema_name: str + topics_to_tables: Dict[str, str] + + def report_warning(self, key: str, reason: str) -> None: + logger.warning(f"{key}: {reason}") + self.report.report_warning(key, reason) + + def get_table_name_from_topic_name(self, topic_name: str) -> str: + """ + This function converts the topic name to a valid Snowflake table name using some rules. + Refer below link for more info + https://docs.snowflake.com/en/user-guide/kafka-connector-overview#target-tables-for-kafka-topics + """ + table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name) + if re.match("^[^a-zA-Z_].*", table_name): + table_name = "_" + table_name + # Connector may append original topic's hash code as suffix for conflict resolution + # if generated table names for 2 topics are similar. This corner case is not handled here. + # Note that Snowflake recommends to choose topic names that follow the rules for + # Snowflake identifier names so this case is not recommended by snowflake. + return table_name + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> SnowflakeParser: + database_name = connector_manifest.config["snowflake.database.name"] + schema_name = connector_manifest.config["snowflake.schema.name"] + + # Fetch user provided topic to table map + provided_topics_to_tables: Dict[str, str] = {} + if connector_manifest.config.get("snowflake.topic2table.map"): + for each in connector_manifest.config["snowflake.topic2table.map"].split( + "," + ): + topic, table = each.split(":") + provided_topics_to_tables[topic.strip()] = table.strip() + + topics_to_tables: Dict[str, str] = {} + # Extract lineage for only those topics whose data ingestion started + for topic in connector_manifest.topic_names: + if topic in provided_topics_to_tables: + # If user provided which table to get mapped with this topic + topics_to_tables[topic] = provided_topics_to_tables[topic] + else: + # Else connector converts topic name to a valid Snowflake table name. 
+ topics_to_tables[topic] = self.get_table_name_from_topic_name(topic) + + return self.SnowflakeParser( + database_name=database_name, + schema_name=schema_name, + topics_to_tables=topics_to_tables, + ) + + def _extract_lineages(self): + self.connector_manifest.flow_property_bag = self.connector_manifest.config + + # For all snowflake sink connector properties, refer below link + # https://docs.snowflake.com/en/user-guide/kafka-connector-install#configuring-the-kafka-connector + # remove private keys, secrets from properties + secret_properties = [ + "snowflake.private.key", + "snowflake.private.key.passphrase", + "value.converter.basic.auth.user.info", + ] + for k in secret_properties: + if k in self.connector_manifest.flow_property_bag: + del self.connector_manifest.flow_property_bag[k] + + lineages: List[KafkaConnectLineage] = list() + parser = self.get_parser(self.connector_manifest) + + for topic, table in parser.topics_to_tables.items(): + target_dataset = f"{parser.database_name}.{parser.schema_name}.{table}" + lineages.append( + KafkaConnectLineage( + source_dataset=topic, + source_platform=KAFKA, + target_dataset=target_dataset, + target_platform="snowflake", + ) + ) + + self.connector_manifest.lineages = lineages + return + + @dataclass class ConfluentS3SinkConnector: connector_manifest: ConnectorManifest @@ -1130,6 +1232,12 @@ def get_connectors_manifest(self) -> List[ConnectorManifest]: connector_manifest = ConfluentS3SinkConnector( connector_manifest=connector_manifest, report=self.report ).connector_manifest + elif connector_manifest.config.get("connector.class").__eq__( + "com.snowflake.kafka.connector.SnowflakeSinkConnector" + ): + connector_manifest = SnowflakeSinkConnector( + connector_manifest=connector_manifest, report=self.report + ).connector_manifest else: self.report.report_dropped(connector_manifest.name) logger.warning( diff --git a/metadata-ingestion/tests/integration/kafka-connect/kafka_connect_snowflake_sink_mces_golden.json b/metadata-ingestion/tests/integration/kafka-connect/kafka_connect_snowflake_sink_mces_golden.json new file mode 100644 index 0000000000000..76d49cebe5ae3 --- /dev/null +++ b/metadata-ingestion/tests/integration/kafka-connect/kafka_connect_snowflake_sink_mces_golden.json @@ -0,0 +1,152 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(kafka-connect,connect-instance-1.snowflake_sink1,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "connector.class": "com.snowflake.kafka.connector.SnowflakeSinkConnector", + "snowflake.database.name": "kafka_db", + "snowflake.schema.name": "kafka_schema", + "snowflake.topic2table.map": "topic1:table1", + "tasks.max": "1", + "topics": "topic1,_topic+2", + "snowflake.user.name": "kafka_connector_user_1", + "name": "snowflake_sink1", + "snowflake.url.name": "bcaurux-lc62744.snowflakecomputing.com:443" + }, + "name": "snowflake_sink1", + "description": "Sink connector using `com.snowflake.kafka.connector.SnowflakeSinkConnector` plugin." 
+ } + }, + "systemMetadata": { + "lastObserved": 1635166800000, + "runId": "kafka-connect-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(kafka-connect,connect-instance-1.snowflake_sink1,PROD),topic1)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": {}, + "name": "snowflake_sink1:topic1", + "type": { + "string": "COMMAND" + } + } + }, + "systemMetadata": { + "lastObserved": 1635166800000, + "runId": "kafka-connect-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(kafka-connect,connect-instance-1.snowflake_sink1,PROD),topic1)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:kafka,topic1,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,kafka_db.kafka_schema.table1,PROD)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1635166800000, + "runId": "kafka-connect-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(kafka-connect,connect-instance-1.snowflake_sink1,PROD),_topic+2)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": {}, + "name": "snowflake_sink1:_topic+2", + "type": { + "string": "COMMAND" + } + } + }, + "systemMetadata": { + "lastObserved": 1635166800000, + "runId": "kafka-connect-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(kafka-connect,connect-instance-1.snowflake_sink1,PROD),_topic+2)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:kafka,_topic+2,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,kafka_db.kafka_schema._topic_2,PROD)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1635166800000, + "runId": "kafka-connect-test" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(kafka-connect,connect-instance-1.snowflake_sink1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1635166800000, + "runId": "kafka-connect-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(kafka-connect,connect-instance-1.snowflake_sink1,PROD),_topic+2)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1635166800000, + "runId": "kafka-connect-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(kafka-connect,connect-instance-1.snowflake_sink1,PROD),topic1)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1635166800000, + "runId": "kafka-connect-test" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/kafka-connect/test_kafka_connect.py b/metadata-ingestion/tests/integration/kafka-connect/test_kafka_connect.py index 5f907bb05443c..48063908e624f 100644 --- a/metadata-ingestion/tests/integration/kafka-connect/test_kafka_connect.py +++ b/metadata-ingestion/tests/integration/kafka-connect/test_kafka_connect.py @@ -534,3 +534,103 @@ def test_kafka_connect_ingest_stateful( 
"urn:li:dataJob:(urn:li:dataFlow:(kafka-connect,connect-instance-1.mysql_source2,PROD),librarydb.member)", ] assert sorted(deleted_job_urns) == sorted(difference_job_urns) + + +def register_mock_api(request_mock: Any, override_data: dict = {}) -> None: + api_vs_response = { + "http://localhost:28083": { + "method": "GET", + "status_code": 200, + "json": { + "version": "7.4.0-ccs", + "commit": "30969fa33c185e880b9e02044761dfaac013151d", + "kafka_cluster_id": "MDgRZlZhSZ-4fXhwRR79bw", + }, + }, + } + + api_vs_response.update(override_data) + + for url in api_vs_response.keys(): + request_mock.register_uri( + api_vs_response[url]["method"], + url, + json=api_vs_response[url]["json"], + status_code=api_vs_response[url]["status_code"], + ) + + +@freeze_time(FROZEN_TIME) +def test_kafka_connect_snowflake_sink_ingest( + pytestconfig, tmp_path, mock_time, requests_mock +): + test_resources_dir = pytestconfig.rootpath / "tests/integration/kafka-connect" + override_data = { + "http://localhost:28083/connectors": { + "method": "GET", + "status_code": 200, + "json": ["snowflake_sink1"], + }, + "http://localhost:28083/connectors/snowflake_sink1": { + "method": "GET", + "status_code": 200, + "json": { + "name": "snowflake_sink1", + "config": { + "connector.class": "com.snowflake.kafka.connector.SnowflakeSinkConnector", + "snowflake.database.name": "kafka_db", + "snowflake.schema.name": "kafka_schema", + "snowflake.topic2table.map": "topic1:table1", + "tasks.max": "1", + "topics": "topic1,_topic+2", + "snowflake.user.name": "kafka_connector_user_1", + "snowflake.private.key": "rrSnqU=", + "name": "snowflake_sink1", + "snowflake.url.name": "bcaurux-lc62744.snowflakecomputing.com:443", + }, + "tasks": [{"connector": "snowflake_sink1", "task": 0}], + "type": "sink", + }, + }, + "http://localhost:28083/connectors/snowflake_sink1/topics": { + "method": "GET", + "status_code": 200, + "json": {"snowflake_sink1": {"topics": ["topic1", "_topic+2"]}}, + }, + } + + register_mock_api(request_mock=requests_mock, override_data=override_data) + + pipeline = Pipeline.create( + { + "run_id": "kafka-connect-test", + "source": { + "type": "kafka-connect", + "config": { + "platform_instance": "connect-instance-1", + "connect_uri": KAFKA_CONNECT_SERVER, + "connector_patterns": { + "allow": [ + "snowflake_sink1", + ] + }, + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/kafka_connect_snowflake_sink_mces.json", + }, + }, + } + ) + + pipeline.run() + pipeline.raise_from_status() + golden_file = "kafka_connect_snowflake_sink_mces_golden.json" + + mce_helpers.check_golden_file( + pytestconfig, + output_path=tmp_path / "kafka_connect_snowflake_sink_mces.json", + golden_path=f"{test_resources_dir}/{golden_file}", + ) From b905f26d77891e5b1e7406f4b1700bdfb2e9332b Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Fri, 22 Sep 2023 20:43:57 -0500 Subject: [PATCH 33/37] fix(test): fix test execution (#8889) --- build.gradle | 5 ++++ .../auth/ListAccessTokensResolverTest.java | 24 ++++++++++++------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/build.gradle b/build.gradle index 07a0e6ad1f49f..0a94991b131aa 100644 --- a/build.gradle +++ b/build.gradle @@ -289,6 +289,11 @@ subprojects { } // https://docs.gradle.org/current/userguide/performance.html maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 + + if (project.configurations.getByName("testImplementation").getDependencies() + .any{ it.getName() == "testng" }) { + 
useTestNG() + } } afterEvaluate { diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/auth/ListAccessTokensResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/auth/ListAccessTokensResolverTest.java index 54b8d23bab301..52d06f73dcfab 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/auth/ListAccessTokensResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/auth/ListAccessTokensResolverTest.java @@ -1,5 +1,6 @@ package com.linkedin.datahub.graphql.resolvers.auth; +import com.datahub.authentication.Authentication; import com.google.common.collect.ImmutableList; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.TestUtils; @@ -8,6 +9,10 @@ import com.linkedin.datahub.graphql.generated.ListAccessTokenResult; import com.linkedin.entity.client.EntityClient; import com.linkedin.metadata.Constants; +import com.linkedin.metadata.query.SearchFlags; +import com.linkedin.metadata.query.filter.SortCriterion; +import com.linkedin.metadata.search.SearchEntityArray; +import com.linkedin.metadata.search.SearchResult; import graphql.schema.DataFetchingEnvironment; import java.util.Collections; import org.mockito.Mockito; @@ -36,14 +41,17 @@ public void testGetSuccess() throws Exception { Mockito.when(mockEnv.getArgument(Mockito.eq("input"))).thenReturn(input); final EntityClient mockClient = Mockito.mock(EntityClient.class); - Mockito.when(Mockito.eq(mockClient.filter( - Mockito.eq(Constants.ACCESS_TOKEN_ENTITY_NAME), - Mockito.eq(buildFilter(filters, Collections.emptyList())), - Mockito.notNull(), - Mockito.eq(input.getStart()), - Mockito.eq(input.getCount()), - Mockito.eq(getAuthentication(mockEnv))))) - .thenReturn(null); + final Authentication testAuth = getAuthentication(mockEnv); + Mockito.when(mockClient.search( + Mockito.eq(Constants.ACCESS_TOKEN_ENTITY_NAME), + Mockito.eq(""), + Mockito.eq(buildFilter(filters, Collections.emptyList())), + Mockito.any(SortCriterion.class), + Mockito.eq(input.getStart()), + Mockito.eq(input.getCount()), + Mockito.eq(testAuth), + Mockito.any(SearchFlags.class))) + .thenReturn(new SearchResult().setFrom(0).setNumEntities(0).setPageSize(0).setEntities(new SearchEntityArray())); final ListAccessTokensResolver resolver = new ListAccessTokensResolver(mockClient); final ListAccessTokenResult listAccessTokenResult = resolver.get(mockEnv).get(); From 874109f76e6fd70da4b65541d50908c4637df073 Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Mon, 25 Sep 2023 14:04:05 +0530 Subject: [PATCH 34/37] feat(ingest/snowflake): allow shares config without platform instance (#8803) --- .../source/snowflake/snowflake_config.py | 34 +++++---- .../source/snowflake/snowflake_shares.py | 6 +- .../tests/unit/test_snowflake_shares.py | 74 +++++++++++++++++++ 3 files changed, 100 insertions(+), 14 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 0bc8bb17934f7..95f6444384408 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -51,15 +51,17 @@ class DatabaseId: database: str = Field( description="Database created from share in consumer account." 
) - platform_instance: str = Field( - description="Platform instance of consumer snowflake account." + platform_instance: Optional[str] = Field( + default=None, + description="Platform instance of consumer snowflake account.", ) class SnowflakeShareConfig(ConfigModel): database: str = Field(description="Database from which share is created.") - platform_instance: str = Field( - description="Platform instance for snowflake account in which share is created." + platform_instance: Optional[str] = Field( + default=None, + description="Platform instance for snowflake account in which share is created.", ) consumers: Set[DatabaseId] = Field( @@ -247,10 +249,11 @@ def validate_shares( if shares: # Check: platform_instance should be present - assert current_platform_instance is not None, ( - "Did you forget to set `platform_instance` for current ingestion ? " - "It is required to use `platform_instance` when ingesting from multiple snowflake accounts." - ) + if current_platform_instance is None: + logger.info( + "It is advisable to use `platform_instance` when ingesting from multiple snowflake accounts, if they contain databases with same name. " + "Setting `platform_instance` allows distinguishing such databases without conflict and correctly ingest their metadata." + ) databases_included_in_share: List[DatabaseId] = [] databases_created_from_share: List[DatabaseId] = [] @@ -259,10 +262,11 @@ def validate_shares( shared_db = DatabaseId( share_details.database, share_details.platform_instance ) - assert all( - consumer.platform_instance != share_details.platform_instance - for consumer in share_details.consumers - ), "Share's platform_instance can not be same as consumer's platform instance. Self-sharing not supported in Snowflake." + if current_platform_instance: + assert all( + consumer.platform_instance != share_details.platform_instance + for consumer in share_details.consumers + ), "Share's platform_instance can not be same as consumer's platform instance. Self-sharing not supported in Snowflake." databases_included_in_share.append(shared_db) databases_created_from_share.extend(share_details.consumers) @@ -306,7 +310,11 @@ def inbounds(self) -> Dict[str, DatabaseId]: f"database {consumer.database} is created from inbound share {share_name}." ) inbounds[consumer.database] = share_details.source_database - break + if self.platform_instance: + break + # If not using platform_instance, any one of consumer databases + # can be the database from this instance. so we include all relevant + # databases in inbounds. else: logger.info( f"Skipping Share {share_name}, as it does not include current platform instance {self.platform_instance}", diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py index 6f7520bbf1988..dad0ce7b59ee1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py @@ -93,11 +93,15 @@ def report_missing_databases( db_names = [db.name for db in databases] missing_dbs = [db for db in inbounds + outbounds if db not in db_names] - if missing_dbs: + if missing_dbs and self.config.platform_instance: self.report_warning( "snowflake-shares", f"Databases {missing_dbs} were not ingested. 
Siblings/Lineage will not be set for these.", ) + elif missing_dbs: + logger.debug( + f"Databases {missing_dbs} were not ingested in this recipe.", + ) def gen_siblings( self, diff --git a/metadata-ingestion/tests/unit/test_snowflake_shares.py b/metadata-ingestion/tests/unit/test_snowflake_shares.py index 7de86139baf39..9e33ba6132e06 100644 --- a/metadata-ingestion/tests/unit/test_snowflake_shares.py +++ b/metadata-ingestion/tests/unit/test_snowflake_shares.py @@ -231,6 +231,7 @@ def test_snowflake_shares_workunit_inbound_share( else: siblings_aspect = wu.get_aspect_of_type(Siblings) assert siblings_aspect is not None + assert not siblings_aspect.primary assert len(siblings_aspect.siblings) == 1 assert siblings_aspect.siblings == [ wu.get_urn().replace("instance1.db1", "instance2.db1") @@ -275,6 +276,7 @@ def test_snowflake_shares_workunit_outbound_share( for wu in wus: siblings_aspect = wu.get_aspect_of_type(Siblings) assert siblings_aspect is not None + assert siblings_aspect.primary assert len(siblings_aspect.siblings) == 2 assert siblings_aspect.siblings == [ wu.get_urn().replace("instance1.db2", "instance2.db2_from_share"), @@ -336,13 +338,85 @@ def test_snowflake_shares_workunit_inbound_and_outbound_share( siblings_aspect = wu.get_aspect_of_type(Siblings) assert siblings_aspect is not None if "db1" in wu.get_urn(): + assert not siblings_aspect.primary assert len(siblings_aspect.siblings) == 1 assert siblings_aspect.siblings == [ wu.get_urn().replace("instance1.db1", "instance2.db1") ] else: + assert siblings_aspect.primary assert len(siblings_aspect.siblings) == 2 assert siblings_aspect.siblings == [ wu.get_urn().replace("instance1.db2", "instance2.db2_from_share"), wu.get_urn().replace("instance1.db2", "instance3.db2"), ] + + +def test_snowflake_shares_workunit_inbound_and_outbound_share_no_platform_instance( + snowflake_databases: List[SnowflakeDatabase], +) -> None: + config = SnowflakeV2Config( + account_id="abc12345", + shares={ + "share1": SnowflakeShareConfig( + database="db1", + consumers=[ + DatabaseId(database="db1_from_share"), + DatabaseId(database="db1_other"), + ], + ), + "share2": SnowflakeShareConfig( + database="db2_main", + consumers=[ + DatabaseId(database="db2"), + DatabaseId(database="db2_other"), + ], + ), + }, + ) + + report = SnowflakeV2Report() + shares_handler = SnowflakeSharesHandler( + config, report, lambda x: make_snowflake_urn(x) + ) + + assert sorted(config.outbounds().keys()) == ["db1", "db2_main"] + assert sorted(config.inbounds().keys()) == [ + "db1_from_share", + "db1_other", + "db2", + "db2_other", + ] + wus = list(shares_handler.get_shares_workunits(snowflake_databases)) + + # 6 Sibling aspects for db1 tables + # 6 Sibling aspects and and 6 upstreamLineage for db2 tables + assert len(wus) == 18 + + for wu in wus: + assert isinstance( + wu.metadata, (MetadataChangeProposal, MetadataChangeProposalWrapper) + ) + if wu.metadata.aspectName == "upstreamLineage": + upstream_aspect = wu.get_aspect_of_type(UpstreamLineage) + assert upstream_aspect is not None + assert len(upstream_aspect.upstreams) == 1 + assert upstream_aspect.upstreams[0].dataset == wu.get_urn().replace( + "db2.", "db2_main." 
+ ) + else: + siblings_aspect = wu.get_aspect_of_type(Siblings) + assert siblings_aspect is not None + if "db1" in wu.get_urn(): + assert siblings_aspect.primary + assert len(siblings_aspect.siblings) == 2 + assert siblings_aspect.siblings == [ + wu.get_urn().replace("db1.", "db1_from_share."), + wu.get_urn().replace("db1.", "db1_other."), + ] + else: + assert not siblings_aspect.primary + assert len(siblings_aspect.siblings) == 1 + assert siblings_aspect.siblings == [ + wu.get_urn().replace("db2.", "db2_main.") + ] From 53eaac5963b6b88b9f1804b10300a2be53c142fc Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 25 Sep 2023 16:24:19 -0400 Subject: [PATCH 35/37] fix(ingest): bound types-requests (#8895) --- metadata-ingestion/setup.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index a0d16aa92ad9b..2387e848e68a2 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -405,7 +405,12 @@ def get_long_description(): "types-pkg_resources", "types-six", "types-python-dateutil", - "types-requests>=2.28.11.6", + # We need to avoid 2.31.0.5 and 2.31.0.4 due to + # https://github.com/python/typeshed/issues/10764. Once that + # issue is resolved, we can remove the upper bound and change it + # to a != constraint. + # We have a PR up to fix the underlying issue: https://github.com/python/typeshed/pull/10776. + "types-requests>=2.28.11.6,<=2.31.0.3", "types-toml", "types-PyMySQL", "types-PyYAML", From ed1d35c79bb41d06ce646ef44bf7ad810fd229b6 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 26 Sep 2023 00:10:49 -0400 Subject: [PATCH 36/37] fix(build): run codegen when building datahub-ingestion image (#8869) --- .github/workflows/docker-unified.yml | 14 +++++++------- docker/datahub-ingestion-base/Dockerfile | 4 ++-- docker/datahub-ingestion/Dockerfile | 2 +- docker/datahub-ingestion/Dockerfile-slim-only | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 13c921e953c32..de3e0ca93e6b7 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -58,7 +58,7 @@ jobs: echo "full_tag=$(get_tag)-full" >> $GITHUB_OUTPUT echo "unique_tag=$(get_unique_tag)" >> $GITHUB_OUTPUT echo "unique_slim_tag=$(get_unique_tag)-slim" >> $GITHUB_OUTPUT - echo "unique_full_tag=$(get_unique_tag)-full" >> $GITHUB_OUTPUT + echo "unique_full_tag=$(get_unique_tag)" >> $GITHUB_OUTPUT echo "python_release_version=$(get_python_docker_release_v)" >> $GITHUB_OUTPUT - name: Check whether publishing enabled id: publish @@ -501,7 +501,7 @@ jobs: platforms: linux/amd64,linux/arm64/v8 - name: Compute DataHub Ingestion (Base-Slim) Tag id: tag - run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }}" >> $GITHUB_OUTPUT + run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }}" >> $GITHUB_OUTPUT datahub_ingestion_base_full_build: name: Build and Push DataHub Ingestion (Base-Full) Docker Image runs-on: ubuntu-latest @@ -567,13 +567,13 @@ jobs: datahub-ingestion: - 'docker/datahub-ingestion/**' - name: Build codegen - if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' }} + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' || 
needs.setup.outputs.publish }} run: ./gradlew :metadata-ingestion:codegen - name: Download Base Image uses: ishworkh/docker-image-artifact-download@v1 if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }} with: - image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }} + image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }} - name: Build and push Slim Image if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' || needs.setup.outputs.publish }} uses: ./.github/actions/docker-custom-build-and-push @@ -583,7 +583,7 @@ jobs: ${{ env.DATAHUB_INGESTION_IMAGE }} build-args: | BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }} - DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }} + DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }} RELEASE_VERSION=${{ needs.setup.outputs.python_release_version }} APP_ENV=slim tags: ${{ needs.setup.outputs.slim_tag }} @@ -595,7 +595,7 @@ jobs: platforms: linux/amd64,linux/arm64/v8 - name: Compute Tag id: tag - run: echo "tag=${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.unique_slim_tag || 'head' }}" >> $GITHUB_OUTPUT + run: echo "tag=${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.unique_slim_tag || 'head-slim' }}" >> $GITHUB_OUTPUT datahub_ingestion_slim_scan: permissions: contents: read # for actions/checkout to fetch code @@ -650,7 +650,7 @@ jobs: datahub-ingestion: - 'docker/datahub-ingestion/**' - name: Build codegen - if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' }} + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' || needs.setup.outputs.publish }} run: ./gradlew :metadata-ingestion:codegen - name: Download Base Image uses: ishworkh/docker-image-artifact-download@v1 diff --git a/docker/datahub-ingestion-base/Dockerfile b/docker/datahub-ingestion-base/Dockerfile index 3d47f79617370..564cc19cc9a5f 100644 --- a/docker/datahub-ingestion-base/Dockerfile +++ b/docker/datahub-ingestion-base/Dockerfile @@ -1,7 +1,7 @@ ARG APP_ENV=full ARG BASE_IMAGE=base -FROM golang:1-alpine3.17 AS binary +FROM golang:1-alpine3.17 AS dockerize-binary ENV DOCKERIZE_VERSION v0.6.1 WORKDIR /go/src/github.com/jwilder @@ -41,7 +41,7 @@ RUN apt-get update && apt-get install -y -qq \ && rm -rf /var/lib/apt/lists/* /var/cache/apk/* # compiled against newer golang for security fixes -COPY --from=binary /go/bin/dockerize /usr/local/bin +COPY --from=dockerize-binary /go/bin/dockerize /usr/local/bin COPY ./docker/datahub-ingestion-base/base-requirements.txt requirements.txt COPY ./docker/datahub-ingestion-base/entrypoint.sh /entrypoint.sh diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile index 8b726df5e8842..0132ceaa9b1a9 100644 --- a/docker/datahub-ingestion/Dockerfile +++ b/docker/datahub-ingestion/Dockerfile @@ -1,7 +1,7 @@ # Defining environment ARG APP_ENV=full ARG 
BASE_IMAGE=acryldata/datahub-ingestion-base -ARG DOCKER_VERSION=latest +ARG DOCKER_VERSION=head FROM $BASE_IMAGE:$DOCKER_VERSION as base USER 0 diff --git a/docker/datahub-ingestion/Dockerfile-slim-only b/docker/datahub-ingestion/Dockerfile-slim-only index 9ae116f839aa0..cb8c27ab463c4 100644 --- a/docker/datahub-ingestion/Dockerfile-slim-only +++ b/docker/datahub-ingestion/Dockerfile-slim-only @@ -1,6 +1,6 @@ # Defining environment ARG BASE_IMAGE=acryldata/datahub-ingestion-base -ARG DOCKER_VERSION=latest +ARG DOCKER_VERSION=head-slim FROM $BASE_IMAGE:$DOCKER_VERSION as base USER 0 From 0a869dd6f8784d50039da308313946d399b0c8ce Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Tue, 26 Sep 2023 10:28:03 +0200 Subject: [PATCH 37/37] fix(ingest/s3): Converting windows style path to posix one on local fs (#8757) --- .../src/datahub/ingestion/source/s3/source.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index ab5d3a4e007ac..ac4433b7eb1f0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -7,6 +7,7 @@ import time from collections import OrderedDict from datetime import datetime +from pathlib import PurePath from typing import Any, Dict, Iterable, List, Optional, Tuple from more_itertools import peekable @@ -819,7 +820,10 @@ def local_browser(self, path_spec: PathSpec) -> Iterable[Tuple[str, datetime, in dirs.sort(key=functools.cmp_to_key(partitioned_folder_comparator)) for file in sorted(files): - full_path = os.path.join(root, file) + # We need to make sure the path is in posix style which is not true on windows + full_path = PurePath( + os.path.normpath(os.path.join(root, file)) + ).as_posix() yield full_path, datetime.utcfromtimestamp( os.path.getmtime(full_path) ), os.path.getsize(full_path)
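
Editor's note (not part of the patch series above): the final commit, [PATCH 37/37], normalizes local filesystem paths to POSIX style before the S3 source emits them. As a hedged, standalone illustration of that join/normpath/as_posix sequence — using only the Python standard library and made-up paths, with `PureWindowsPath`/`ntpath` standing in for the runtime `PurePath`/`os.path` so the Windows behaviour is reproducible on any OS — a minimal sketch:

```python
# Minimal sketch of the path normalization applied in [PATCH 37/37].
# Assumptions: paths below are hypothetical; PureWindowsPath and ntpath are
# used only to make the Windows-style behaviour deterministic everywhere.
import ntpath
from pathlib import PureWindowsPath


def windows_to_posix(root: str, name: str) -> str:
    # ntpath mirrors os.path on Windows: join/normpath produce back-slashed paths.
    joined = ntpath.normpath(ntpath.join(root, name))
    # as_posix() rewrites the separators to forward slashes, matching the
    # POSIX-style paths the rest of the ingestion code expects.
    return PureWindowsPath(joined).as_posix()


print(windows_to_posix(r"C:\data\landing", "part-0001.csv"))
# -> C:/data/landing/part-0001.csv
```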