data-dot-all · noah-paige · Aug 21, 2024 · Aug 21, 2024 · Aug 21, 2024 · Aug 22, 2024
diff --git a/backend/dataall/base/db/exceptions.py b/backend/dataall/base/db/exceptions.py
@@ -181,3 +181,26 @@ def __init__(self, action, message):
 
     def __str__(self):
         return f'{self.message}'
+
+
+class ResourceThresholdExceeded(Exception):
+    """Error signalling that a user exceeded the max daily invocation count for an action.
+
+    Attributes:
+        username: user named in the error message.
+        action: operation named in the error message.
+        message: pre-formatted text returned by ``str()`` on the exception.
+    """
+
+    def __init__(self, username, action):
+        self.username, self.action = username, action
+        # The line breaks and indentation inside the literal are part of the emitted message.
+        self.message = f"""
+                    An error occurred (ResourceThresholdExceeded) when calling {self.action} operation:
+                    Requests exceeded max daily invocation count for User: {self.username}
+                """
+
+    def __str__(self):
+        """Return the formatted error message."""
+        return str(self.message)
+
+
+class ModelGuardrailException(Exception):
+    """Error reported when invoking the model; wraps the given detail in a formatted message.
+
+    Attributes:
+        message: pre-formatted text returned by ``str()`` on the exception.
+    """
+
+    def __init__(self, message):
+        # The line breaks and indentation inside the literal are part of the emitted message.
+        formatted = f"""
+                    An error occurred (ModelGuardrailException) when invoking the model: {message}
+                """
+        self.message = formatted
+
+    def __str__(self):
+        """Return the formatted error message."""
+        return str(self.message)
diff --git a/backend/dataall/core/resource_threshold/__init__.py b/backend/dataall/core/resource_threshold/__init__.py
diff --git a/backend/dataall/core/resource_threshold/db/resource_threshold_models.py b/backend/dataall/core/resource_threshold/db/resource_threshold_models.py
@@ -0,0 +1,12 @@
+from dataall.base.db import Base, utils
+from sqlalchemy import String, Integer, Column, Date
+from datetime import date
+
+
+class ResourceThreshold(Base):
+    """ORM model for the ``resource_threshold`` table.
+
+    Each row stores a username, an action type, a date and an invocation count.
+    """
+
+    __tablename__ = 'resource_threshold'
+    # Primary key; its default is whatever utils.uuid('resource_threshold') returns (helper not shown here).
+    actionUri = Column(String(64), primary_key=True, default=utils.uuid('resource_threshold'))
+    # User the counter belongs to (required).
+    username = Column(String(64), nullable=False)
+    # Kind of action being counted (required).
+    actionType = Column(String(64), nullable=False)
+    # Day the count applies to; the callable date.today is passed as the default rather than a fixed value.
+    date = Column(Date, default=date.today, nullable=False)
+    # Number of recorded invocations; defaults to 1 for a new row.
+    count = Column(Integer, default=1, nullable=False)
diff --git a/backend/dataall/core/resource_threshold/db/resource_threshold_repositories.py b/backend/dataall/core/resource_threshold/db/resource_threshold_repositories.py
@@ -0,0 +1,56 @@
+from dataall.core.resource_threshold.db.resource_threshold_models import ResourceThreshold
+from sqlalchemy import and_
+from datetime import date
+
+
+class ResourceThresholdRepository:
+    """Database access for per-user, per-action invocation counters (``ResourceThreshold`` rows)."""
+
+    @staticmethod
+    def get_count_today(session, username, action_type):
+        """Return the count stored for this user and action with today's date.
+
+        Returns 0 when the query yields nothing (or a falsy value).
+        """
+        todays_row = and_(
+            ResourceThreshold.username == username,
+            ResourceThreshold.actionType == action_type,
+            ResourceThreshold.date == date.today(),
+        )
+        stored_count = session.query(ResourceThreshold.count).filter(todays_row).scalar()
+        return stored_count or 0
+
+    @staticmethod
+    def add_entry(session, username, action_type):
+        """Start a counter at 1 for today, creating the row or resetting an existing one.
+
+        The change is committed before returning.
+        """
+        existing = ResourceThresholdRepository._get_user_entry(session, username, action_type)
+        if not existing:
+            # No row for this user/action yet: insert one and let the column defaults fill the rest.
+            session.add(ResourceThreshold(username=username, actionType=action_type))
+            session.commit()
+            return
+
+        # A row exists regardless of its date: reset it to a count of 1 dated today.
+        same_user_and_action = and_(
+            ResourceThreshold.username == username,
+            ResourceThreshold.actionType == action_type,
+        )
+        reset_values = {ResourceThreshold.count: 1, ResourceThreshold.date: date.today()}
+        session.query(ResourceThreshold).filter(same_user_and_action).update(
+            reset_values, synchronize_session=False
+        )
+        session.commit()
+
+    @staticmethod
+    def increment_count(session, username, action_type):
+        """Add one to the count of today's row for this user and action, then commit."""
+        todays_row = and_(
+            ResourceThreshold.username == username,
+            ResourceThreshold.actionType == action_type,
+            ResourceThreshold.date == date.today(),
+        )
+        # The increment is expressed in SQL (count = count + 1) rather than computed in Python.
+        plus_one = {ResourceThreshold.count: ResourceThreshold.count + 1}
+        session.query(ResourceThreshold).filter(todays_row).update(plus_one, synchronize_session=False)
+        session.commit()
+
+    @staticmethod
+    def _get_user_entry(session, username, action_type):
+        """Return the first row for this user and action (any date), or None."""
+        same_user_and_action = and_(
+            ResourceThreshold.username == username,
+            ResourceThreshold.actionType == action_type,
+        )
+        return session.query(ResourceThreshold).filter(same_user_and_action).first()
diff --git a/backend/dataall/core/resource_threshold/services/__init__.py b/backend/dataall/core/resource_threshold/services/__init__.py
diff --git a/backend/dataall/core/resource_threshold/services/resource_threshold_service.py b/backend/dataall/core/resource_threshold/services/resource_threshold_service.py
@@ -0,0 +1,42 @@
+from dataall.core.resource_threshold.db.resource_threshold_repositories import ResourceThresholdRepository
+from dataall.base.db import exceptions
+from functools import wraps
+from dataall.base.config import config
+from dataall.base.context import get_context
+
+import logging
+
+log = logging.getLogger(__name__)
+
+
+class ResourceThresholdService:
+    """Provides a decorator that limits how often a user may invoke an action per day."""
+
+    @staticmethod
+    def check_invocation_count(action_type, max_count_config_path):
+        """Build a decorator that counts invocations and rejects calls over the limit.
+
+        Args:
+            action_type: label under which the invocations are counted.
+            max_count_config_path: config property path of the limit; 10 is passed
+                to ``config.get_property`` as the fallback value.
+
+        The decorated function raises ``exceptions.ResourceThresholdExceeded`` when
+        today's count for the current user is not below the limit.
+        """
+
+        def decorator(func):
+            @wraps(func)
+            def wrapper(*args, **kwargs):
+                context = get_context()
+                username = context.username
+                with context.db_engine.scoped_session() as session:
+                    count = ResourceThresholdRepository.get_count_today(
+                        session=session, username=username, action_type=action_type
+                    )
+                    max_count = config.get_property(max_count_config_path, 10)
+                    log.info(
+                        f'User {context.username} has invoked {action_type} {count} times today of max {max_count}'
+                    )
+                    if count >= max_count:
+                        raise exceptions.ResourceThresholdExceeded(username=username, action=action_type)
+
+                    # The invocation is recorded before func runs: a zero count starts
+                    # (or resets) the row, any other count is incremented.
+                    if count == 0:
+                        record_invocation = ResourceThresholdRepository.add_entry
+                    else:
+                        record_invocation = ResourceThresholdRepository.increment_count
+                    record_invocation(session=session, username=username, action_type=action_type)
+                    return func(*args, **kwargs)
+
+            return wrapper
+
+        return decorator
diff --git a/backend/dataall/modules/s3_datasets/api/dataset/queries.py b/backend/dataall/modules/s3_datasets/api/dataset/queries.py
@@ -2,6 +2,7 @@
 from dataall.modules.s3_datasets.api.dataset.resolvers import (
     get_dataset,
     get_dataset_assume_role_url,
+    list_s3_object_keys,
     get_file_upload_presigned_url,
     list_datasets_owned_by_env_group,
 )
@@ -45,3 +46,12 @@
     resolver=list_datasets_owned_by_env_group,
     test_scope='Dataset',
 )
+
+# GraphQL query `listS3ObjectKeys`: takes a required `datasetUri` string and
+# returns an array of strings, resolved by `list_s3_object_keys`.
+listS3ObjectKeys = gql.QueryField(
+    name='listS3ObjectKeys',
+    type=gql.ArrayType(gql.String),
+    args=[
+        gql.Argument(name='datasetUri', type=gql.NonNullableType(gql.String)),
+    ],
+    resolver=list_s3_object_keys,
+)
diff --git a/backend/dataall/modules/s3_datasets/api/dataset/resolvers.py b/backend/dataall/modules/s3_datasets/api/dataset/resolvers.py
@@ -156,6 +156,10 @@ def list_datasets_owned_by_env_group(
     return DatasetService.list_datasets_owned_by_env_group(environmentUri, groupUri, filter)
 
 
+def list_s3_object_keys(context, source, datasetUri: str = None):
+    """Resolver: return whatever ``DatasetService.list_s3_object_keys`` yields for ``datasetUri``."""
+    object_keys = DatasetService.list_s3_object_keys(uri=datasetUri)
+    return object_keys
+
+
 class RequestValidator:
     @staticmethod
     def validate_creation_request(data):

diff --git a/backend/dataall/modules/s3_datasets/aws/s3_dataset_client.py b/backend/dataall/modules/s3_datasets/aws/s3_dataset_client.py
@@ -73,3 +73,17 @@ def get_bucket_encryption(self) -> (str, str, str):
                     f'Data.all Environment Pivot Role does not have s3:GetEncryptionConfiguration Permission for {dataset.S3BucketName} bucket: {e}'
                 )
             raise Exception(f'Cannot fetch the bucket encryption configuration for {dataset.S3BucketName}: {e}')
+
+    def list_object_keys(self, bucket_name):
+        """List the keys of all ``.pdf`` and ``.txt`` objects in a bucket.
+
+        The listing is read page by page through a paginator, so a bucket with
+        more objects than fit in one ``list_objects_v2`` response is listed in
+        full instead of being cut off after the first page.
+
+        :param bucket_name: name of the S3 bucket to list
+        :return: list of object keys ending in ``.pdf`` or ``.txt`` (case-sensitive)
+        :raises ClientError: re-raised after being logged when the listing fails
+        """
+        # A real extension is required: a key that is literally 'pdf' or 'txt'
+        # (no dot) is not treated as a document.
+        wanted_suffixes = ('.pdf', '.txt')
+        try:
+            paginator = self._client.get_paginator('list_objects_v2')
+            keys = []
+            for page in paginator.paginate(Bucket=bucket_name):
+                # 'Contents' is absent from a page when it holds no objects.
+                keys.extend(ob['Key'] for ob in page.get('Contents', []) if ob['Key'].endswith(wanted_suffixes))
+            return keys
+        except ClientError as e:
+            logging.error(f'Failed to list objects in {bucket_name} : {e}')
+            raise e
diff --git a/backend/dataall/modules/s3_datasets/services/dataset_service.py b/backend/dataall/modules/s3_datasets/services/dataset_service.py
@@ -38,6 +38,7 @@
     DATASET_ALL,
     DATASET_READ,
     IMPORT_DATASET,
+    GET_DATASET,
 )
 from dataall.modules.s3_datasets.db.dataset_repositories import DatasetRepository
 from dataall.modules.datasets_base.db.dataset_repositories import DatasetBaseRepository
@@ -556,3 +557,11 @@ def delete_dataset_term_links(session, dataset_uri):
         for table_uri in tables:
             GlossaryRepository.delete_glossary_terms_links(session, table_uri, 'DatasetTable')
         GlossaryRepository.delete_glossary_terms_links(session, dataset_uri, 'Dataset')
+
+    @staticmethod
+    @ResourcePolicyService.has_resource_permission(GET_DATASET)
+    def list_s3_object_keys(uri):
+        """Look up the dataset by URI and list object keys from its S3 bucket.
+
+        The dataset is loaded and the S3 listing is performed while the database
+        session is still open.
+        """
+        context = get_context()
+        with context.db_engine.scoped_session() as session:
+            dataset = DatasetRepository.get_dataset_by_uri(session, uri)
+            s3_client = S3DatasetClient(dataset)
+            bucket_name = dataset.S3BucketName
+            return s3_client.list_object_keys(bucket_name)
diff --git a/backend/dataall/modules/worksheets/api/queries.py b/backend/dataall/modules/worksheets/api/queries.py
@@ -1,5 +1,11 @@
 from dataall.base.api import gql
-from dataall.modules.worksheets.api.resolvers import get_worksheet, list_worksheets, run_sql_query
+from dataall.modules.worksheets.api.resolvers import (
+    get_worksheet,
+    list_worksheets,
+    run_sql_query,
+    text_to_sql,
+    analyze_text_genai,
+)
 
 
 getWorksheet = gql.QueryField(
@@ -28,3 +34,29 @@
     ],
     resolver=run_sql_query,
 )
+
+# GraphQL query `textToSQL`: returns a string and is resolved by `text_to_sql`.
+# `worksheetUri`, `environmentUri`, `prompt` and `databaseName` are required strings;
+# `tableNames` is an array of strings that is not marked non-nullable.
+TextToSQL = gql.QueryField(
+    name='textToSQL',
+    type=gql.String,
+    args=[
+        gql.Argument(name='worksheetUri', type=gql.NonNullableType(gql.String)),
+        gql.Argument(name='environmentUri', type=gql.NonNullableType(gql.String)),
+        gql.Argument(name='prompt', type=gql.NonNullableType(gql.String)),
+        gql.Argument(name='databaseName', type=gql.NonNullableType(gql.String)),
+        gql.Argument(name='tableNames', type=gql.ArrayType(gql.String)),
+    ],
+    resolver=text_to_sql,
+)
+
+# GraphQL query `analyzeTextDocument`: returns a string and is resolved by
+# `analyze_text_genai`. All five arguments (`worksheetUri`, `environmentUri`,
+# `prompt`, `datasetUri`, `key`) are required strings.
+analyzeTextDocument = gql.QueryField(
+    name='analyzeTextDocument',
+    type=gql.String,
+    args=[
+        gql.Argument(name='worksheetUri', type=gql.NonNullableType(gql.String)),
+        gql.Argument(name='environmentUri', type=gql.NonNullableType(gql.String)),
+        gql.Argument(name='prompt', type=gql.NonNullableType(gql.String)),
+        gql.Argument(name='datasetUri', type=gql.NonNullableType(gql.String)),
+        gql.Argument(name='key', type=gql.NonNullableType(gql.String)),
+    ],
+    resolver=analyze_text_genai,
+)
diff --git a/backend/dataall/modules/worksheets/api/resolvers.py b/backend/dataall/modules/worksheets/api/resolvers.py
@@ -3,6 +3,7 @@
 from dataall.modules.worksheets.db.worksheet_models import Worksheet
 from dataall.modules.worksheets.db.worksheet_repositories import WorksheetRepository
 from dataall.modules.worksheets.services.worksheet_service import WorksheetService
+from dataall.base.feature_toggle_checker import is_feature_enabled
 from dataall.base.api.context import Context
 
 
@@ -14,27 +15,19 @@ def create_worksheet(context: Context, source, input: dict = None):
     if not input.get('label'):
         raise exceptions.RequiredParameter('label')
 
-    with context.engine.scoped_session() as session:
-        return WorksheetService.create_worksheet(
-            session=session,
-            username=context.username,
-            data=input,
-        )
+    return WorksheetService.create_worksheet(
+        data=input,
+    )
 
 
-def update_worksheet(context: Context, source, worksheetUri: str = None, input: dict = None):
-    with context.engine.scoped_session() as session:
-        return WorksheetService.update_worksheet(
-            session=session, username=context.username, uri=worksheetUri, data=input
-        )
+def update_worksheet(context: Context, source, worksheetUri: str, input: dict = None):
+    """Resolver: pass the worksheet URI and input payload to ``WorksheetService.update_worksheet``."""
+    result = WorksheetService.update_worksheet(
+        uri=worksheetUri,
+        data=input,
+    )
+    return result
 
 
-def get_worksheet(context: Context, source, worksheetUri: str = None):
-    with context.engine.scoped_session() as session:
-        return WorksheetService.get_worksheet(
-            session=session,
-            uri=worksheetUri,
-        )
+def get_worksheet(context: Context, source, worksheetUri: str):
+    """Resolver: return the result of ``WorksheetService.get_worksheet`` for ``worksheetUri``."""
+    result = WorksheetService.get_worksheet(uri=worksheetUri)
+    return result
 
 
 def resolve_user_role(context: Context, source: Worksheet):
@@ -59,13 +52,47 @@ def list_worksheets(context, source, filter: dict = None):
         )
 
 
-def run_sql_query(context: Context, source, environmentUri: str = None, worksheetUri: str = None, sqlQuery: str = None):
-    with context.engine.scoped_session() as session:
-        return WorksheetService.run_sql_query(
-            session=session, uri=environmentUri, worksheetUri=worksheetUri, sqlQuery=sqlQuery
-        )
+def run_sql_query(context: Context, source, environmentUri: str, worksheetUri: str, sqlQuery: str):
+    """Resolver: delegate to ``WorksheetService.run_sql_query``; the environment URI is passed as ``uri``."""
+    result = WorksheetService.run_sql_query(
+        uri=environmentUri,
+        worksheetUri=worksheetUri,
+        sqlQuery=sqlQuery,
+    )
+    return result
 
 
-def delete_worksheet(context, source, worksheetUri: str = None):
-    with context.engine.scoped_session() as session:
-        return WorksheetService.delete_worksheet(session=session, uri=worksheetUri)
+def delete_worksheet(context, source, worksheetUri: str):
+    """Resolver: return the result of ``WorksheetService.delete_worksheet`` for ``worksheetUri``."""
+    result = WorksheetService.delete_worksheet(uri=worksheetUri)
+    return result
+
+
+@is_feature_enabled('modules.worksheets.features.nlq.active')
+def text_to_sql(
+    context: Context,
+    source,
+    environmentUri: str,
+    worksheetUri: str,
+    prompt: str,
+    databaseName: str,
+    tableNames: list = None,
+):
+    """Resolver for the ``textToSQL`` query; delegates to ``WorksheetService.run_nlq``.
+
+    ``tableNames`` defaults to ``None`` because the query declares it as an
+    optional argument: without a default, a request that omits it fails with a
+    ``TypeError`` before the service is reached.
+
+    The environment URI is passed to the service as ``uri``.
+    """
+    # NOTE(review): confirm WorksheetService.run_nlq tolerates table_names=None.
+    return WorksheetService.run_nlq(
+        uri=environmentUri,
+        prompt=prompt,
+        worksheetUri=worksheetUri,
+        db_name=databaseName,
+        table_names=tableNames,
+    )
+
+
+@is_feature_enabled('modules.worksheets.features.nlq.active')
+def analyze_text_genai(
+    context,
+    source,
+    worksheetUri: str,
+    environmentUri: str,
+    prompt: str,
+    datasetUri: str,
+    key: str,
+):
+    """Resolver: delegate to ``WorksheetService.analyze_text_genai``; the environment URI is passed as ``uri``."""
+    analysis = WorksheetService.analyze_text_genai(
+        uri=environmentUri,
+        worksheetUri=worksheetUri,
+        prompt=prompt,
+        datasetUri=datasetUri,
+        key=key,
+    )
+    return analysis