Skip to content

Commit

Permalink
Add Redshift datasets module (data-dot-all#1424)
Browse files Browse the repository at this point in the history
### Feature or Bugfix
- Feature

### Detail
The up-to-date design, as well as the next steps, is described in issue
data-dot-all#955

### Relates
- data-dot-all#955 

### Security
Please answer the questions below briefly where applicable, or write
`N/A`. Based on
[OWASP 10](https://owasp.org/Top10/en/).

- Does this PR introduce or modify any input fields or queries - this
includes
fetching data from storage outside the application (e.g. a database, an
S3 bucket)?
  - Is the input sanitized?
- What precautions are you taking before deserializing the data you
consume?
  - Is injection prevented by parametrizing queries?
  - Have you ensured no `eval` or similar functions are used?
- Does this PR introduce any functionality or component that requires
authorization?
- How have you ensured it respects the existing AuthN/AuthZ mechanisms?
  - Are you logging failed auth attempts?
- Are you using or adding any cryptographic features?
  - Do you use standard, proven implementations?
  - Are the used keys controlled by the customer? Where are they stored?
- Are you introducing any new policies/roles/users?
  - Have you used the least-privilege principle? How?


By submitting this pull request, I confirm that my contribution is made
under the terms of the Apache 2.0 license.
  • Loading branch information
dlpzx authored Aug 1, 2024
1 parent 8eb1bd0 commit 276dceb
Show file tree
Hide file tree
Showing 103 changed files with 8,025 additions and 170 deletions.
6 changes: 3 additions & 3 deletions backend/dataall/base/cdkproxy/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
aws-cdk-lib==2.99.0
boto3==1.28.23
boto3-stubs==1.28.23
botocore==1.31.23
boto3==1.34.119
boto3-stubs==1.34.119
botocore==1.34.119
cdk-nag==2.7.2
constructs==10.0.73
starlette==0.36.3
Expand Down
11 changes: 11 additions & 0 deletions backend/dataall/base/db/paginator.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,14 @@ def paginate(query, page, page_size):
# nosemgrep: python.sqlalchemy.performance.performance-improvements.len-all-count
total = len(query.order_by(None).all())
return Page(items, page, page_size, total)


def paginate_list(items, page, page_size):
    """Return a Page over an in-memory list.

    Mirrors `paginate` (which operates on SQLAlchemy queries) for plain
    Python lists: slices out the requested page and reports the total
    item count.

    :param items: full list of items to paginate
    :param page: 1-based page number
    :param page_size: number of items per page
    :raises AttributeError: if page or page_size is not >= 1
    """
    if page <= 0:
        raise AttributeError('page needs to be >= 1')
    if page_size <= 0:
        raise AttributeError('page_size needs to be >= 1')
    offset = (page - 1) * page_size
    window = items[offset : offset + page_size]
    return Page(window, page, page_size, len(items))
2 changes: 1 addition & 1 deletion backend/dataall/core/environment/cdk/pivot_role_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ def generate_policies(self) -> List[iam.ManagedPolicy]:
)

for service in services:
statements.extend(service.get_statements(self))
logger.info(f'Adding {service.__name__} statements to policy')
statements.extend(service.get_statements(self))
logger.info(f'statements: {str(service.get_statements(self))}')

statements_chunks = split_policy_statements_in_chunks(statements)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

class DatasetTypes(GraphQLEnumMapper):
    """Storage backends a dataset can be backed by."""

    S3 = 'S3'
    Redshift = 'Redshift'


class DatasetRole(GraphQLEnumMapper):
Expand Down
96 changes: 96 additions & 0 deletions backend/dataall/modules/redshift_datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
"""Contains the code related to datasets"""

import logging
from typing import List, Type, Set

from dataall.base.loader import ModuleInterface, ImportMode

log = logging.getLogger(__name__)


class RedshiftDatasetApiModuleInterface(ModuleInterface):
    """Implements ModuleInterface for dataset GraphQl lambda"""

    @staticmethod
    def is_supported(modes):
        # Only loaded for the GraphQL API import mode.
        return ImportMode.API in modes

    @staticmethod
    def depends_on() -> List[Type['ModuleInterface']]:
        # Redshift datasets build on the shared dataset base plus the
        # catalog, feed and vote modules, which must be loaded first.
        from dataall.modules.datasets_base import DatasetBaseApiModuleInterface
        from dataall.modules.catalog import CatalogApiModuleInterface
        from dataall.modules.feed import FeedApiModuleInterface
        from dataall.modules.vote import VoteApiModuleInterface

        return [
            DatasetBaseApiModuleInterface,
            CatalogApiModuleInterface,
            FeedApiModuleInterface,
            VoteApiModuleInterface,
        ]

    def __init__(self):
        # Imports are deferred to instantiation time so that this module's
        # dependencies are only pulled in when the module is enabled.
        from dataall.modules.vote.services.vote_service import add_vote_type
        from dataall.modules.feed.api.registry import FeedRegistry, FeedDefinition
        from dataall.modules.catalog.indexers.registry import GlossaryRegistry, GlossaryDefinition
        from dataall.core.environment.services.environment_resource_manager import EnvironmentResourceManager

        from dataall.modules.redshift_datasets.indexers.dataset_indexer import DatasetIndexer
        from dataall.modules.redshift_datasets.indexers.table_indexer import DatasetTableIndexer
        from dataall.modules.redshift_datasets.db.redshift_dataset_repositories import (
            RedshiftDatasetEnvironmentResource,
        )
        from dataall.modules.redshift_datasets.db.redshift_connection_repositories import (
            RedshiftConnectionEnvironmentResource,
        )
        from dataall.modules.redshift_datasets.db.redshift_models import RedshiftDataset, RedshiftTable
        from dataall.modules.redshift_datasets.services.redshift_constants import (
            GLOSSARY_REDSHIFT_DATASET_NAME,
            GLOSSARY_REDSHIFT_DATASET_TABLE_NAME,
            FEED_REDSHIFT_DATASET_NAME,
            FEED_REDSHIFT_DATASET_TABLE_NAME,
            VOTE_REDSHIFT_DATASET_NAME,
        )

        # Importing the api package defines the module's GraphQL schema
        # (connections and datasets sub-packages) as an import side effect.
        import dataall.modules.redshift_datasets.api

        # Enable feeds on Redshift dataset tables and datasets.
        FeedRegistry.register(FeedDefinition(FEED_REDSHIFT_DATASET_TABLE_NAME, RedshiftTable))
        FeedRegistry.register(FeedDefinition(FEED_REDSHIFT_DATASET_NAME, RedshiftDataset))

        # Allow glossary terms to be attached to Redshift datasets; the
        # reindexer keeps the catalog index in sync.
        GlossaryRegistry.register(
            GlossaryDefinition(
                target_type=GLOSSARY_REDSHIFT_DATASET_NAME,
                object_type='RedshiftDataset',
                model=RedshiftDataset,
                reindexer=DatasetIndexer,
            )
        )

        # Same for individual Redshift tables, reindexed by the table indexer.
        GlossaryRegistry.register(
            GlossaryDefinition(
                target_type=GLOSSARY_REDSHIFT_DATASET_TABLE_NAME,
                object_type='RedshiftDatasetTable',
                model=RedshiftTable,
                reindexer=DatasetTableIndexer,
            )
        )

        # Enable voting on Redshift datasets, reindexing on vote changes.
        add_vote_type(VOTE_REDSHIFT_DATASET_NAME, DatasetIndexer)

        # NOTE(review): registering environment resources presumably hooks
        # dataset/connection accounting and cleanup into environment
        # lifecycle events — confirm against EnvironmentResourceManager.
        EnvironmentResourceManager.register(RedshiftDatasetEnvironmentResource())
        EnvironmentResourceManager.register(RedshiftConnectionEnvironmentResource())

        log.info('API of Redshift datasets has been imported')


class RedshiftDatasetCdkModuleInterface(ModuleInterface):
    """Loads dataset cdk stacks"""

    @staticmethod
    def is_supported(modes: Set[ImportMode]):
        # Only loaded when running in the CDK (infrastructure) import mode.
        return ImportMode.CDK in modes

    def __init__(self):
        # The cdk package is imported purely for its side effects
        # (presumably stack registration — confirm in the cdk package).
        import dataall.modules.redshift_datasets.cdk

        log.info('Redshift Dataset CDK has been imported')
5 changes: 5 additions & 0 deletions backend/dataall/modules/redshift_datasets/api/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""The GraphQL schema of datasets and related functionality"""

from dataall.modules.redshift_datasets.api import connections, datasets

__all__ = ['connections', 'datasets']
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# The submodules are imported for their definition side effects: they build
# the GraphQL input types, queries, mutations, resolvers and object types
# for Redshift connections.
from . import (
    input_types,
    queries,
    mutations,
    resolvers,
    types,
)

__all__ = ['resolvers', 'types', 'input_types', 'queries', 'mutations']
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from dataall.base.api.constants import GraphQLEnumMapper


class RedshiftType(GraphQLEnumMapper):
    """Deployment flavors of an Amazon Redshift data warehouse."""

    Serverless = 'serverless'
    Cluster = 'cluster'
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from dataall.base.api import gql


# Input payload for the createRedshiftConnection mutation.
CreateRedshiftConnectionInput = gql.InputType(
    name='CreateRedshiftConnectionInput',
    arguments=[
        gql.Argument('connectionName', gql.NonNullableType(gql.String)),
        gql.Argument('environmentUri', gql.NonNullableType(gql.String)),
        gql.Argument('SamlGroupName', gql.NonNullableType(gql.String)),
        gql.Argument('redshiftType', gql.NonNullableType(gql.String)),
        # Provisioned clusters require clusterId; serverless requires
        # nameSpaceId + workgroup. Enforced by the resolver's validator,
        # not by the schema, so all four are nullable here.
        gql.Argument('clusterId', gql.String),
        gql.Argument('nameSpaceId', gql.String),
        gql.Argument('workgroup', gql.String),
        gql.Argument('database', gql.NonNullableType(gql.String)),
        # Either redshiftUser or secretArn must be supplied (validated in
        # the resolver).
        gql.Argument('redshiftUser', gql.String),
        gql.Argument('secretArn', gql.String),
    ],
)


# Search/paging filter for listEnvironmentRedshiftConnections.
ConnectionFilter = gql.InputType(
    name='ConnectionFilter',
    arguments=[
        gql.Argument('term', gql.String),
        gql.Argument('page', gql.Integer),
        gql.Argument('pageSize', gql.Integer),
        gql.Argument('environmentUri', gql.String),
        gql.Argument('groupUri', gql.String),
    ],
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from dataall.base.api import gql
from dataall.modules.redshift_datasets.api.connections.resolvers import (
create_redshift_connection,
delete_redshift_connection,
)
from dataall.modules.redshift_datasets.api.connections.types import (
RedshiftConnection,
)

# Creates a Redshift connection (cluster or serverless) in an environment.
createRedshiftConnection = gql.MutationField(
    name='createRedshiftConnection',
    args=[gql.Argument('input', gql.Ref('CreateRedshiftConnectionInput'))],
    type=RedshiftConnection,
    resolver=create_redshift_connection,
)

# Deletes the connection identified by connectionUri; Boolean result.
deleteRedshiftConnection = gql.MutationField(
    name='deleteRedshiftConnection',
    args=[gql.Argument('connectionUri', gql.NonNullableType(gql.String))],
    type=gql.Boolean,
    resolver=delete_redshift_connection,
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from dataall.base.api import gql
from dataall.modules.redshift_datasets.api.connections.resolvers import (
list_environment_redshift_connections,
list_redshift_connection_schemas,
list_redshift_schema_tables,
)

# Paginated listing of an environment's Redshift connections.
listEnvironmentRedshiftConnections = gql.QueryField(
    name='listEnvironmentRedshiftConnections',
    args=[gql.Argument('filter', gql.Ref('ConnectionFilter'))],
    type=gql.Ref('RedshiftConnectionSearchResult'),
    resolver=list_environment_redshift_connections,
)

# Names of the database schemas reachable through a connection.
listRedshiftConnectionSchemas = gql.QueryField(
    name='listRedshiftConnectionSchemas',
    args=[gql.Argument('connectionUri', gql.NonNullableType(gql.String))],
    type=gql.ArrayType(gql.String),
    resolver=list_redshift_connection_schemas,
)

# Tables found in one schema of the connection's database.
listRedshiftSchemaTables = gql.QueryField(
    name='listRedshiftSchemaTables',
    args=[
        gql.Argument('connectionUri', gql.NonNullableType(gql.String)),
        gql.Argument('schema', gql.NonNullableType(gql.String)),
    ],
    type=gql.ArrayType(gql.Ref('RedshiftTable')),
    resolver=list_redshift_schema_tables,
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import logging
from typing import Any
from dataall.base.db import exceptions
from dataall.base.api.context import Context

from dataall.modules.redshift_datasets.api.connections.enums import RedshiftType
from dataall.modules.redshift_datasets.services.redshift_connection_service import RedshiftConnectionService

log = logging.getLogger(__name__)


def create_redshift_connection(context: Context, source, input=None):
    """Validate the mutation payload and create a Redshift connection."""
    data = input
    RequestValidator.validate_connection_creation_request(data=data)
    return RedshiftConnectionService.create_redshift_connection(
        uri=data['environmentUri'],
        admin_group=data['SamlGroupName'],
        data=data,
    )


def delete_redshift_connection(context: Context, source, connectionUri: str):
    """Delete the Redshift connection identified by *connectionUri*."""
    RequestValidator.required_param('connectionUri', connectionUri)
    result = RedshiftConnectionService.delete_redshift_connection(uri=connectionUri)
    return result


def list_environment_redshift_connections(context: Context, source, filter: dict = None):
    """List the Redshift connections of an environment (paginated).

    :param filter: ConnectionFilter dict; must carry 'environmentUri'.
    :raises exceptions.RequiredParameter: if environmentUri is missing.
    """
    # Use .get() behind a None-guard so a missing/None filter surfaces as the
    # intended RequiredParameter error instead of an unhandled
    # TypeError (filter is None by default) or KeyError (key absent).
    environmentUri = filter.get('environmentUri') if filter else None
    RequestValidator.required_param('environmentUri', environmentUri)
    return RedshiftConnectionService.list_environment_redshift_connections(uri=environmentUri, filter=filter)


def list_redshift_connection_schemas(context: Context, source, connectionUri):
    """List the database schemas reachable through the given connection."""
    RequestValidator.required_param('connectionUri', connectionUri)
    schemas = RedshiftConnectionService.list_connection_schemas(uri=connectionUri)
    return schemas


def list_redshift_schema_tables(context: Context, source, connectionUri: str, schema: str):
    """List the tables that live in *schema* of the connection's database."""
    for param_name, param_value in (('connectionUri', connectionUri), ('schema', schema)):
        RequestValidator.required_param(param_name, param_value)
    return RedshiftConnectionService.list_schema_tables(uri=connectionUri, schema=schema)


class RequestValidator:
    """Static validation helpers for the connection GraphQL resolvers."""

    # Fix: these methods are invoked on the class (RequestValidator.xxx) but
    # were missing @staticmethod, so any call on an instance would have
    # mis-bound `self` to the first parameter.
    @staticmethod
    def required_param(param_name: str, param_value: Any):
        """Raise RequiredParameter if *param_value* is falsy (None, '', ...)."""
        if not param_value:
            raise exceptions.RequiredParameter(param_name)

    @staticmethod
    def validate_connection_creation_request(data):
        """Validate the createRedshiftConnection input payload.

        Raises exceptions.RequiredParameter when any mandatory field is
        missing, including the type-specific ones (nameSpaceId/workgroup for
        serverless, clusterId for provisioned clusters).
        """
        if not data:
            raise exceptions.RequiredParameter('data')

        RequestValidator.required_param('SamlGroupName', data.get('SamlGroupName'))
        RequestValidator.required_param('environmentUri', data.get('environmentUri'))
        RequestValidator.required_param('connectionName', data.get('connectionName'))
        RequestValidator.required_param('redshiftType', data.get('redshiftType'))
        RequestValidator.required_param('database', data.get('database'))
        # Credentials may come as a database user or a secret ARN; at least
        # one of the two is mandatory.
        if not data.get('redshiftUser') and not data.get('secretArn'):
            raise exceptions.RequiredParameter('RedshiftUser OR secretArn')
        if data.get('redshiftType') == RedshiftType.Serverless.value:
            RequestValidator.required_param('nameSpaceId', data.get('nameSpaceId'))
            RequestValidator.required_param('workgroup', data.get('workgroup'))
        if data.get('redshiftType') == RedshiftType.Cluster.value:
            RequestValidator.required_param('clusterId', data.get('clusterId'))
40 changes: 40 additions & 0 deletions backend/dataall/modules/redshift_datasets/api/connections/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from dataall.base.api import gql

# GraphQL view of a stored Redshift connection.
RedshiftConnection = gql.ObjectType(
    name='RedshiftConnection',
    fields=[
        gql.Field('connectionUri', gql.ID),
        gql.Field('name', gql.String),
        gql.Field('connectionType', gql.String),
        gql.Field('SamlGroupName', gql.String),
        gql.Field('label', gql.String),
        gql.Field('redshiftType', gql.String),
        # Cluster connections populate clusterId; serverless connections
        # populate nameSpaceId/workgroup (see CreateRedshiftConnectionInput).
        gql.Field('clusterId', gql.String),
        gql.Field('nameSpaceId', gql.String),
        gql.Field('workgroup', gql.String),
        gql.Field('database', gql.String),
        gql.Field('redshiftUser', gql.String),
        gql.Field('secretArn', gql.String),
    ],
)

# Paginated result wrapper returned by listEnvironmentRedshiftConnections.
RedshiftConnectionSearchResult = gql.ObjectType(
    name='RedshiftConnectionSearchResult',
    fields=[
        gql.Field('count', gql.Integer),
        gql.Field('page', gql.Integer),
        gql.Field('pages', gql.Integer),
        gql.Field('hasNext', gql.Boolean),
        gql.Field('hasPrevious', gql.Boolean),
        gql.Field('nodes', gql.ArrayType(RedshiftConnection)),
    ],
)

# A table discovered in a Redshift schema, returned by listRedshiftSchemaTables.
RedshiftTable = gql.ObjectType(
    name='RedshiftTable',
    fields=[
        gql.Field('name', gql.String),
        gql.Field('type', gql.String),
        # NOTE(review): 'alreadyAdded' reads like a boolean flag but is
        # exposed as String — confirm the intended representation.
        gql.Field('alreadyAdded', gql.String),
    ],
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# The submodules are imported for their definition side effects: they build
# the GraphQL input types, queries, mutations, resolvers and object types
# for Redshift datasets.
from . import (
    input_types,
    queries,
    mutations,
    resolvers,
    types,
)

__all__ = ['resolvers', 'types', 'input_types', 'queries', 'mutations']
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Contains the enums GraphQL mapping for enums"""
Loading

0 comments on commit 276dceb

Please sign in to comment.