From b30a3548c1c8b3a5155c9a84a006b8e5d588b014 Mon Sep 17 00:00:00 2001 From: Noah Paige <69586985+noah-paige@users.noreply.github.com> Date: Thu, 14 Nov 2024 12:13:32 -0500 Subject: [PATCH] Separating Out Access Logging (#1695) ### Feature or Bugfix - Refactoring ### Detail - Move access logging to a separate environment logging bucket (rather than env default bucket used for athena queries and profiling job artifacts) ### Relates - ### Security Please answer the questions below briefly where applicable, or write `N/A`. Based on [OWASP 10](https://owasp.org/Top10/en/). - Does this PR introduce or modify any input fields or queries - this includes fetching data from storage outside the application (e.g. a database, an S3 bucket)? - Is the input sanitized? - What precautions are you taking before deserializing the data you consume? - Is injection prevented by parametrizing queries? - Have you ensured no `eval` or similar functions are used? - Does this PR introduce any functionality or component that requires authorization? - How have you ensured it respects the existing AuthN/AuthZ mechanisms? - Are you logging failed auth attempts? - Are you using or adding any cryptographic features? - Do you use a standard proven implementations? - Are the used keys controlled by the customer? Where are they stored? - Are you introducing any new policies/roles/users? - Have you used the least-privilege principle? How? By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. --- .checkov.baseline | 52 ++---------------- backend/dataall/core/environment/api/types.py | 1 + .../core/environment/cdk/environment_stack.py | 32 ++++++++--- .../core/environment/db/environment_models.py | 1 + .../services/environment_service.py | 7 +++ .../modules/s3_datasets/cdk/dataset_stack.py | 2 +- .../04d92886fabe_add_consumption_roles.py | 10 +++- ...8e35e39e1e_invite_env_groups_as_readers.py | 14 ++++- ...f31999_backfill_MF_resource_permissions.py | 14 ++++- .../49c6b18ed814_add_env_logs_bucket.py | 55 +++++++++++++++++++ .../852cdf6cf1e0_add_redshift_datasets.py | 15 ++++- ...1ac7a85a2_drop_remove_group_permissions.py | 14 ++++- tests/modules/conftest.py | 1 + .../test_environment_stack_with_dataset.py | 19 +++++++ .../core/environment/global_conftest.py | 3 + 15 files changed, 172 insertions(+), 68 deletions(-) create mode 100644 backend/migrations/versions/49c6b18ed814_add_env_logs_bucket.py diff --git a/.checkov.baseline b/.checkov.baseline index e0bbfce12..19875f2e3 100644 --- a/.checkov.baseline +++ b/.checkov.baseline @@ -417,7 +417,7 @@ ] }, { - "file": "/cdk.out/asset.3045cb6b4340be1e173df6dcf6248d565aa849ceda3e2cf2c2f221ccee4bc1d6/pivotRole.yaml", + "file": "/cdk.out/asset.05d71d8b69cd4483d3c9db9120b556b718c72f349debbb79d461c74c4964b350/pivotRole.yaml", "findings": [ { "resource": "AWS::IAM::ManagedPolicy.PivotRolePolicy0", @@ -490,12 +490,6 @@ { "file": "/checkov_environment_synth.json", "findings": [ - { - "resource": "AWS::IAM::ManagedPolicy.dataallanothergroup111111servicespolicy19AC37181", - "check_ids": [ - "CKV_AWS_111" - ] - }, { "resource": "AWS::IAM::ManagedPolicy.dataallanothergroup111111servicespolicy2E85AF510", "check_ids": [ @@ -508,24 +502,6 @@ "CKV_AWS_111" ] }, - { - "resource": "AWS::IAM::ManagedPolicy.dataallanothergroup111111servicespolicy5A19E75CA", - "check_ids": [ - "CKV_AWS_109" - ] - }, - { - "resource": "AWS::IAM::ManagedPolicy.dataallanothergroup111111servicespolicyCC720210", - "check_ids": [ - "CKV_AWS_109" - ] - }, - { - 
"resource": "AWS::IAM::ManagedPolicy.dataalltestadmins111111servicespolicy1A0C96958", - "check_ids": [ - "CKV_AWS_111" - ] - }, { "resource": "AWS::IAM::ManagedPolicy.dataalltestadmins111111servicespolicy2B12D381A", "check_ids": [ @@ -538,18 +514,6 @@ "CKV_AWS_111" ] }, - { - "resource": "AWS::IAM::ManagedPolicy.dataalltestadmins111111servicespolicy3E3CBA9E", - "check_ids": [ - "CKV_AWS_109" - ] - }, - { - "resource": "AWS::IAM::ManagedPolicy.dataalltestadmins111111servicespolicy56D7DC525", - "check_ids": [ - "CKV_AWS_109" - ] - }, { "resource": "AWS::Lambda::Function.CustomCDKBucketDeployment8693BB64968944B69AAFB0CC9EB8756C81C01536", "check_ids": [ @@ -563,16 +527,14 @@ "resource": "AWS::Lambda::Function.GlueDatabaseLFCustomResourceHandler7FAF0F82", "check_ids": [ "CKV_AWS_115", - "CKV_AWS_117", - "CKV_AWS_173" + "CKV_AWS_117" ] }, { "resource": "AWS::Lambda::Function.LakeformationDefaultSettingsHandler2CBEDB06", "check_ids": [ "CKV_AWS_115", - "CKV_AWS_117", - "CKV_AWS_173" + "CKV_AWS_117" ] }, { @@ -580,8 +542,7 @@ "check_ids": [ "CKV_AWS_115", "CKV_AWS_116", - "CKV_AWS_117", - "CKV_AWS_173" + "CKV_AWS_117" ] }, { @@ -589,12 +550,11 @@ "check_ids": [ "CKV_AWS_115", "CKV_AWS_116", - "CKV_AWS_117", - "CKV_AWS_173" + "CKV_AWS_117" ] }, { - "resource": "AWS::S3::Bucket.EnvironmentDefaultBucket78C3A8B0", + "resource": "AWS::S3::Bucket.EnvironmentDefaultLogBucket7F0EFAB3", "check_ids": [ "CKV_AWS_18" ] diff --git a/backend/dataall/core/environment/api/types.py b/backend/dataall/core/environment/api/types.py index 229593dd8..a95d943c4 100644 --- a/backend/dataall/core/environment/api/types.py +++ b/backend/dataall/core/environment/api/types.py @@ -100,6 +100,7 @@ gql.Field('subscriptionsConsumersTopicName', type=gql.String), gql.Field('subscriptionsProducersTopicName', type=gql.String), gql.Field('EnvironmentDefaultBucketName', type=gql.String), + gql.Field('EnvironmentLogsBucketName', type=gql.String), gql.Field('EnvironmentDefaultAthenaWorkGroup', type=gql.String), gql.Field( name='networks', diff --git a/backend/dataall/core/environment/cdk/environment_stack.py b/backend/dataall/core/environment/cdk/environment_stack.py index 2638001a0..cf6529a68 100644 --- a/backend/dataall/core/environment/cdk/environment_stack.py +++ b/backend/dataall/core/environment/cdk/environment_stack.py @@ -165,30 +165,44 @@ def __init__(self, scope, id, target_uri: str = None, **kwargs): f'arn:aws:iam::{self._environment.AwsAccountId}:role/{self.pivot_role_name}', ) - # Environment S3 Bucket - default_environment_bucket = s3.Bucket( + # Environment Logging S3 Bucket + default_environment_log_bucket = s3.Bucket( self, - 'EnvironmentDefaultBucket', - bucket_name=self._environment.EnvironmentDefaultBucketName, + 'EnvironmentDefaultLogBucket', + bucket_name=self._environment.EnvironmentLogsBucketName, encryption=s3.BucketEncryption.S3_MANAGED, removal_policy=RemovalPolicy.RETAIN, block_public_access=s3.BlockPublicAccess.BLOCK_ALL, versioned=True, enforce_ssl=True, ) - default_environment_bucket.policy.apply_removal_policy(RemovalPolicy.RETAIN) - self.default_environment_bucket = default_environment_bucket - - default_environment_bucket.add_to_resource_policy( + default_environment_log_bucket.policy.apply_removal_policy(RemovalPolicy.RETAIN) + default_environment_log_bucket.add_to_resource_policy( iam.PolicyStatement( sid='AWSLogDeliveryWrite', effect=iam.Effect.ALLOW, principals=[iam.ServicePrincipal('logging.s3.amazonaws.com')], actions=['s3:PutObject', 's3:PutObjectAcl'], - 
resources=[f'{default_environment_bucket.bucket_arn}/*'], + resources=[f'{default_environment_log_bucket.bucket_arn}/*'], ) ) + # Environment S3 Bucket + default_environment_bucket = s3.Bucket( + self, + 'EnvironmentDefaultBucket', + bucket_name=self._environment.EnvironmentDefaultBucketName, + encryption=s3.BucketEncryption.S3_MANAGED, + removal_policy=RemovalPolicy.RETAIN, + block_public_access=s3.BlockPublicAccess.BLOCK_ALL, + versioned=True, + enforce_ssl=True, + server_access_logs_bucket=default_environment_log_bucket, + server_access_logs_prefix=f'access_logs/{self._environment.EnvironmentDefaultBucketName}/', + ) + default_environment_bucket.policy.apply_removal_policy(RemovalPolicy.RETAIN) + self.default_environment_bucket = default_environment_bucket + default_environment_bucket.add_lifecycle_rule( abort_incomplete_multipart_upload_after=Duration.days(7), noncurrent_version_transitions=[ diff --git a/backend/dataall/core/environment/db/environment_models.py b/backend/dataall/core/environment/db/environment_models.py index e56135e97..c4890850a 100644 --- a/backend/dataall/core/environment/db/environment_models.py +++ b/backend/dataall/core/environment/db/environment_models.py @@ -25,6 +25,7 @@ class Environment(Resource, Base): EnvironmentDefaultIAMRoleImported = Column(Boolean, default=False) EnvironmentDefaultIAMRoleArn = Column(String, nullable=False) EnvironmentDefaultBucketName = Column(String) + EnvironmentLogsBucketName = Column(String) EnvironmentDefaultAthenaWorkGroup = Column(String) roleCreated = Column(Boolean, nullable=False, default=False) diff --git a/backend/dataall/core/environment/services/environment_service.py b/backend/dataall/core/environment/services/environment_service.py index 864031c54..4ff768b07 100644 --- a/backend/dataall/core/environment/services/environment_service.py +++ b/backend/dataall/core/environment/services/environment_service.py @@ -255,6 +255,13 @@ def create_environment(uri, data=None): resource_prefix=env.resourcePrefix, ).build_compliant_name() + env.EnvironmentLogsBucketName = NamingConventionService( + target_uri=env.environmentUri, + target_label='env-access-logs', + pattern=NamingConventionPattern.S3, + resource_prefix=env.resourcePrefix, + ).build_compliant_name() + env.EnvironmentDefaultAthenaWorkGroup = NamingConventionService( target_uri=env.environmentUri, target_label=env.label, diff --git a/backend/dataall/modules/s3_datasets/cdk/dataset_stack.py b/backend/dataall/modules/s3_datasets/cdk/dataset_stack.py index 618975b0a..4f3bb8018 100644 --- a/backend/dataall/modules/s3_datasets/cdk/dataset_stack.py +++ b/backend/dataall/modules/s3_datasets/cdk/dataset_stack.py @@ -204,7 +204,7 @@ def __init__(self, scope, id, target_uri: str = None, **kwargs): server_access_logs_bucket=s3.Bucket.from_bucket_name( self, 'EnvAccessLogsBucket', - f'{env.EnvironmentDefaultBucketName}', + f'{env.EnvironmentLogsBucketName}', ), server_access_logs_prefix=f'access_logs/{dataset.S3BucketName}/', enforce_ssl=True, diff --git a/backend/migrations/versions/04d92886fabe_add_consumption_roles.py b/backend/migrations/versions/04d92886fabe_add_consumption_roles.py index e15a34972..b7bde887f 100644 --- a/backend/migrations/versions/04d92886fabe_add_consumption_roles.py +++ b/backend/migrations/versions/04d92886fabe_add_consumption_roles.py @@ -12,9 +12,8 @@ from sqlalchemy.dialects import postgresql from sqlalchemy.ext.declarative import declarative_base -from dataall.core.environment.db.environment_models import Environment from 
dataall.core.environment.services.environment_service import EnvironmentService -from dataall.base.db import utils +from dataall.base.db import utils, Resource from datetime import datetime from dataall.core.permissions.services.permission_service import PermissionService @@ -33,6 +32,11 @@ Base = declarative_base() +class Environment(Resource, Base): + __tablename__ = 'environment' + environmentUri = Column(String, primary_key=True, default=utils.uuid('environment')) + + class EnvironmentGroup(Base): __tablename__ = 'environment_group_permission' groupUri = Column(String, primary_key=True) @@ -123,7 +127,7 @@ def upgrade(): bind = op.get_bind() session = orm.Session(bind=bind) print('Back-filling consumer role permissions for environments...') - envs = EnvironmentService.list_all_active_environments(session=session) + envs = session.query(Environment).filter(Environment.deleted.is_(None)).all() for env in envs: groups = EnvironmentService.get_all_environment_groups(session=session, uri=env.environmentUri, filter=None) for group in groups: diff --git a/backend/migrations/versions/328e35e39e1e_invite_env_groups_as_readers.py b/backend/migrations/versions/328e35e39e1e_invite_env_groups_as_readers.py index 5f243a8ff..43655246e 100644 --- a/backend/migrations/versions/328e35e39e1e_invite_env_groups_as_readers.py +++ b/backend/migrations/versions/328e35e39e1e_invite_env_groups_as_readers.py @@ -7,12 +7,14 @@ """ from alembic import op -from sqlalchemy import orm -from dataall.core.environment.db.environment_models import EnvironmentGroup, Environment +from sqlalchemy import orm, Column, String +from sqlalchemy.ext.declarative import declarative_base +from dataall.core.environment.db.environment_models import EnvironmentGroup from dataall.core.organizations.db.organization_repositories import OrganizationRepository from dataall.core.permissions.services.organization_permissions import GET_ORGANIZATION from dataall.core.organizations.db import organization_models as models from dataall.core.permissions.services.resource_policy_service import ResourcePolicyService +from dataall.base.db import utils, Resource # revision identifiers, used by Alembic. 
revision = '328e35e39e1e' @@ -20,6 +22,14 @@ branch_labels = None depends_on = None +Base = declarative_base() + + +class Environment(Resource, Base): + __tablename__ = 'environment' + environmentUri = Column(String, primary_key=True, default=utils.uuid('environment')) + organizationUri = Column(String, nullable=False) + def get_session(): bind = op.get_bind() diff --git a/backend/migrations/versions/427db8f31999_backfill_MF_resource_permissions.py b/backend/migrations/versions/427db8f31999_backfill_MF_resource_permissions.py index 5209963e8..473dc3f4a 100644 --- a/backend/migrations/versions/427db8f31999_backfill_MF_resource_permissions.py +++ b/backend/migrations/versions/427db8f31999_backfill_MF_resource_permissions.py @@ -7,9 +7,10 @@ """ from alembic import op -from sqlalchemy import orm +from sqlalchemy import orm, Column, String +from sqlalchemy.ext.declarative import declarative_base -from dataall.core.environment.db.environment_models import Environment +from dataall.base.db import utils, Resource from dataall.core.organizations.db.organization_models import Organization from dataall.core.permissions.api.enums import PermissionType from dataall.core.permissions.services.permission_service import PermissionService @@ -28,6 +29,15 @@ branch_labels = None depends_on = None +Base = declarative_base() + + +class Environment(Resource, Base): + __tablename__ = 'environment' + organizationUri = Column(String, nullable=False) + environmentUri = Column(String, primary_key=True, default=utils.uuid('environment')) + SamlGroupName = Column(String, nullable=False) + def get_session(): bind = op.get_bind() diff --git a/backend/migrations/versions/49c6b18ed814_add_env_logs_bucket.py b/backend/migrations/versions/49c6b18ed814_add_env_logs_bucket.py new file mode 100644 index 000000000..815fb8909 --- /dev/null +++ b/backend/migrations/versions/49c6b18ed814_add_env_logs_bucket.py @@ -0,0 +1,55 @@ +"""add_env_logs_bucket + +Revision ID: 49c6b18ed814 +Revises: b21f86882012 +Create Date: 2024-11-13 19:16:18.030415 + +""" + +from alembic import op +import sqlalchemy as sa +from sqlalchemy import orm, Column, String +from sqlalchemy.ext.declarative import declarative_base + +from dataall.base.db import Resource, utils +from dataall.base.utils.naming_convention import ( + NamingConventionService, + NamingConventionPattern, +) + +# revision identifiers, used by Alembic. +revision = '49c6b18ed814' +down_revision = 'b21f86882012' +branch_labels = None +depends_on = None + +Base = declarative_base() + + +class Environment(Resource, Base): + __tablename__ = 'environment' + environmentUri = Column(String, primary_key=True, default=utils.uuid('environment')) + resourcePrefix = Column(String, nullable=False, default='dataall') + EnvironmentLogsBucketName = Column(String, nullable=True) + + +def upgrade(): + op.add_column('environment', sa.Column('EnvironmentLogsBucketName', sa.String(), nullable=True)) + bind = op.get_bind() + session = orm.Session(bind=bind) + environments = session.query(Environment).all() + for env in environments: + env.EnvironmentLogsBucketName = NamingConventionService( + target_uri=env.environmentUri, + target_label='env-access-logs', + pattern=NamingConventionPattern.S3, + resource_prefix=env.resourcePrefix, + ).build_compliant_name() + session.commit() + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column('environment', 'EnvironmentLogsBucketName') + # ### end Alembic commands ### diff --git a/backend/migrations/versions/852cdf6cf1e0_add_redshift_datasets.py b/backend/migrations/versions/852cdf6cf1e0_add_redshift_datasets.py index 422d1ffbb..dbee9238b 100644 --- a/backend/migrations/versions/852cdf6cf1e0_add_redshift_datasets.py +++ b/backend/migrations/versions/852cdf6cf1e0_add_redshift_datasets.py @@ -8,14 +8,14 @@ from alembic import op import sqlalchemy as sa -from sqlalchemy import orm +from sqlalchemy import orm, Column, String +from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.dialects import postgresql -from dataall.core.environment.db.environment_models import Environment from dataall.core.permissions.services.resource_policy_service import ResourcePolicyService from dataall.core.permissions.services.permission_service import PermissionService from dataall.core.permissions.api.enums import PermissionType - +from dataall.base.db import utils, Resource # revision identifiers, used by Alembic. revision = '852cdf6cf1e0' @@ -34,6 +34,15 @@ ENVIRONMENT_REDSHIFT_ALL_WITH_DESC[CREATE_REDSHIFT_CONNECTION] = 'Create Redshift Connection in this environment' ENVIRONMENT_REDSHIFT_ALL_WITH_DESC[IMPORT_REDSHIFT_DATASET] = 'Import Redshift Datasets to this environment' +Base = declarative_base() + + +class Environment(Resource, Base): + __tablename__ = 'environment' + organizationUri = Column(String, nullable=False) + environmentUri = Column(String, primary_key=True, default=utils.uuid('environment')) + SamlGroupName = Column(String, nullable=False) + def upgrade(): op.create_table( diff --git a/backend/migrations/versions/a991ac7a85a2_drop_remove_group_permissions.py b/backend/migrations/versions/a991ac7a85a2_drop_remove_group_permissions.py index 64ded7dd5..424e22f8f 100644 --- a/backend/migrations/versions/a991ac7a85a2_drop_remove_group_permissions.py +++ b/backend/migrations/versions/a991ac7a85a2_drop_remove_group_permissions.py @@ -10,9 +10,11 @@ from dataall.core.permissions.services.environment_permissions import REMOVE_ENVIRONMENT_GROUP from dataall.core.permissions.db.resource_policy.resource_policy_models import ResourcePolicy from dataall.core.permissions.services.resource_policy_service import ResourcePolicyService +from dataall.base.db import utils, Resource + from alembic import op -from sqlalchemy import orm -from dataall.core.environment.db.environment_models import Environment +from sqlalchemy import orm, Column, String +from sqlalchemy.ext.declarative import declarative_base # revision identifiers, used by Alembic. 
revision = 'a991ac7a85a2' @@ -21,6 +23,14 @@ depends_on = None +Base = declarative_base() + + +class Environment(Resource, Base): + __tablename__ = 'environment' + environmentUri = Column(String, primary_key=True, default=utils.uuid('environment')) + + def get_session(): bind = op.get_bind() session = orm.Session(bind=bind) diff --git a/tests/modules/conftest.py b/tests/modules/conftest.py index bf5a373f5..6f76a7a56 100644 --- a/tests/modules/conftest.py +++ b/tests/modules/conftest.py @@ -102,6 +102,7 @@ def factory( EnvironmentDefaultIAMRoleName=role, EnvironmentDefaultIAMRoleArn=f'arn:aws:iam::{account}:role/{role}', EnvironmentDefaultBucketName='defaultbucketname1234567789', + EnvironmentLogsBucketName='logsbucketname1234567789', CDKRoleArn=f'arn:aws::{account}:role/EnvRole', EnvironmentDefaultAthenaWorkGroup='DefaultWorkGroup', **kwargs, diff --git a/tests/modules/s3_datasets/test_environment_stack_with_dataset.py b/tests/modules/s3_datasets/test_environment_stack_with_dataset.py index 249b87daa..24d3e6020 100644 --- a/tests/modules/s3_datasets/test_environment_stack_with_dataset.py +++ b/tests/modules/s3_datasets/test_environment_stack_with_dataset.py @@ -132,6 +132,25 @@ def test_resources_created(env_fixture_fixed_naming, org_fixture, mocker): }, count=1, ) + + # Assert that we have created: + template.resource_properties_count_is( + type='AWS::S3::Bucket', + props={ + 'BucketName': env_fixture_fixed_naming.EnvironmentLogsBucketName, + 'BucketEncryption': { + 'ServerSideEncryptionConfiguration': [{'ServerSideEncryptionByDefault': {'SSEAlgorithm': 'AES256'}}] + }, + 'PublicAccessBlockConfiguration': { + 'BlockPublicAcls': True, + 'BlockPublicPolicy': True, + 'IgnorePublicAcls': True, + 'RestrictPublicBuckets': True, + }, + }, + count=1, + ) + template.resource_properties_count_is( type='AWS::Lambda::Function', props={ diff --git a/tests_new/integration_tests/core/environment/global_conftest.py b/tests_new/integration_tests/core/environment/global_conftest.py index 1bf5057ea..47f58b8e5 100644 --- a/tests_new/integration_tests/core/environment/global_conftest.py +++ b/tests_new/integration_tests/core/environment/global_conftest.py @@ -49,6 +49,9 @@ def create_env(client, env_name, group, org_uri, account_id, region, tags=[], re S3Client(session=session, account=env.AwsAccountId, region=env.region).delete_bucket( env.EnvironmentDefaultBucketName ) + S3Client(session=session, account=env.AwsAccountId, region=env.region).delete_bucket( + env.EnvironmentLogsBucketName + ) delete_env(client, env)
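
For reviewers who want the pattern in isolation: the sketch below is a minimal, self-contained aws-cdk-lib (Python) illustration of the approach this PR applies in `environment_stack.py`, i.e. a dedicated access-logs bucket plus a data bucket whose `server_access_logs_bucket`/`server_access_logs_prefix` point at it. The stack class, construct IDs, bucket prefix, and app wiring are illustrative assumptions, not code from this repository.

```python
# Hypothetical, minimal sketch of the access-logging split (not data.all's actual stack).
from aws_cdk import App, RemovalPolicy, Stack
from aws_cdk import aws_s3 as s3
from constructs import Construct


class AccessLoggingExampleStack(Stack):
    def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
        super().__init__(scope, construct_id, **kwargs)

        # Dedicated bucket that only receives S3 server access logs.
        logs_bucket = s3.Bucket(
            self,
            'ExampleLogsBucket',
            encryption=s3.BucketEncryption.S3_MANAGED,
            block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
            removal_policy=RemovalPolicy.RETAIN,
            versioned=True,
            enforce_ssl=True,
        )

        # Data bucket: setting server_access_logs_bucket/prefix makes CDK render the
        # LoggingConfiguration and grant the S3 log delivery service write access
        # on the logs bucket, so access logs never land in the data bucket itself.
        s3.Bucket(
            self,
            'ExampleDataBucket',
            encryption=s3.BucketEncryption.S3_MANAGED,
            block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
            removal_policy=RemovalPolicy.RETAIN,
            versioned=True,
            enforce_ssl=True,
            server_access_logs_bucket=logs_bucket,
            server_access_logs_prefix='access_logs/example-data-bucket/',
        )


app = App()
AccessLoggingExampleStack(app, 'access-logging-example')
app.synth()
```

Keeping the log target separate from the environment default bucket (which also holds Athena query results and profiling job artifacts, per the Detail section above) avoids mixing access logs with workload data and mirrors what the dataset stacks already expect via `EnvironmentLogsBucketName`.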