From e808d2e6995db155cf3ce601a4c19e86cdc804f7 Mon Sep 17 00:00:00 2001 From: Tejas Rajopadhye <71188245+TejasRGitHub@users.noreply.github.com> Date: Fri, 29 Mar 2024 23:22:51 +0530 Subject: [PATCH] Adding check to remove any spaces in confidentiality names (#1126) ### Feature or Bugfix - Bugfix ### Detail - Adding a regex check to remove spaces and any other non-alphanumeric characters from confidentiality names. A similar check is performed in the `dataset_indexer.py` file. - Adding a dataset upsert for the case where a dataset doesn't contain a folder or a table. ### Relates - https://github.com/data-dot-all/dataall/issues/1032 ### Security Please answer the questions below briefly where applicable, or write `N/A`. Based on [OWASP 10](https://owasp.org/Top10/en/). - Does this PR introduce or modify any input fields or queries - this includes fetching data from storage outside the application (e.g. a database, an S3 bucket)? N/A - Is the input sanitized? - What precautions are you taking before deserializing the data you consume? - Is injection prevented by parametrizing queries? - Have you ensured no `eval` or similar functions are used? - Does this PR introduce any functionality or component that requires authorization? N/A - How have you ensured it respects the existing AuthN/AuthZ mechanisms? - Are you logging failed auth attempts? - Are you using or adding any cryptographic features? N/A - Do you use standard, proven implementations? - Are the used keys controlled by the customer? Where are they stored? - Are you introducing any new policies/roles/users? N/A - Have you used the least-privilege principle? How? By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. 
--- .../modules/datasets/indexers/dataset_catalog_indexer.py | 2 ++ backend/dataall/modules/datasets/indexers/location_indexer.py | 3 ++- backend/dataall/modules/datasets/indexers/table_indexer.py | 3 ++- tests/modules/datasets/tasks/test_dataset_catalog_indexer.py | 1 + 4 files changed, 7 insertions(+), 2 deletions(-) diff --git a/backend/dataall/modules/datasets/indexers/dataset_catalog_indexer.py b/backend/dataall/modules/datasets/indexers/dataset_catalog_indexer.py index 0ad22752d..5d12cca1d 100644 --- a/backend/dataall/modules/datasets/indexers/dataset_catalog_indexer.py +++ b/backend/dataall/modules/datasets/indexers/dataset_catalog_indexer.py @@ -1,5 +1,6 @@ import logging +from dataall.modules.datasets.indexers.dataset_indexer import DatasetIndexer from dataall.modules.datasets.indexers.location_indexer import DatasetLocationIndexer from dataall.modules.datasets.indexers.table_indexer import DatasetTableIndexer from dataall.modules.datasets_base.db.dataset_repositories import DatasetRepository @@ -22,5 +23,6 @@ def index(self, session) -> int: for dataset in all_datasets: tables = DatasetTableIndexer.upsert_all(session, dataset.datasetUri) folders = DatasetLocationIndexer.upsert_all(session, dataset_uri=dataset.datasetUri) + DatasetIndexer.upsert(session=session, dataset_uri=dataset.datasetUri) indexed += len(tables) + len(folders) + 1 return indexed diff --git a/backend/dataall/modules/datasets/indexers/location_indexer.py b/backend/dataall/modules/datasets/indexers/location_indexer.py index 9b9f3c9cb..742722626 100644 --- a/backend/dataall/modules/datasets/indexers/location_indexer.py +++ b/backend/dataall/modules/datasets/indexers/location_indexer.py @@ -1,4 +1,5 @@ """Indexes DatasetStorageLocation in OpenSearch""" +import re from dataall.core.environment.services.environment_service import EnvironmentService from dataall.core.organizations.db.organization_repositories import OrganizationRepository @@ -29,7 +30,7 @@ def upsert(cls, session, folder_uri: 
str): 'resourceKind': 'folder', 'description': folder.description, 'source': dataset.S3BucketName, - 'classification': dataset.confidentiality, + 'classification': re.sub('[^A-Za-z0-9]+', '', dataset.confidentiality), 'tags': [f.replace('-', '') for f in folder.tags or []], 'topics': dataset.topics, 'region': folder.region.replace('-', ''), diff --git a/backend/dataall/modules/datasets/indexers/table_indexer.py b/backend/dataall/modules/datasets/indexers/table_indexer.py index b4d77724c..dc0353ad9 100644 --- a/backend/dataall/modules/datasets/indexers/table_indexer.py +++ b/backend/dataall/modules/datasets/indexers/table_indexer.py @@ -1,4 +1,5 @@ """Indexes DatasetTable in OpenSearch""" +import re from dataall.core.environment.services.environment_service import EnvironmentService from dataall.core.organizations.db.organization_repositories import OrganizationRepository @@ -31,7 +32,7 @@ def upsert(cls, session, table_uri: str): 'description': table.description, 'database': table.GlueDatabaseName, 'source': table.S3BucketName, - 'classification': dataset.confidentiality, + 'classification': re.sub('[^A-Za-z0-9]+', '', dataset.confidentiality), 'tags': [t.replace('-', '') for t in tags or []], 'topics': dataset.topics, 'region': dataset.region.replace('-', ''), diff --git a/tests/modules/datasets/tasks/test_dataset_catalog_indexer.py b/tests/modules/datasets/tasks/test_dataset_catalog_indexer.py index 0ec57913c..91e1aabfd 100644 --- a/tests/modules/datasets/tasks/test_dataset_catalog_indexer.py +++ b/tests/modules/datasets/tasks/test_dataset_catalog_indexer.py @@ -51,4 +51,5 @@ def test_catalog_indexer(db, org, env, sync_dataset, table, mocker): mocker.patch('dataall.modules.datasets.indexers.table_indexer.DatasetTableIndexer.upsert_all', return_value=[table]) mocker.patch('dataall.modules.datasets.indexers.dataset_indexer.DatasetIndexer.upsert', return_value=sync_dataset) indexed_objects_counter = index_objects(engine=db) + # Count should be One table + One 
Dataset = 2 assert indexed_objects_counter == 2