Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/worksheet nlq beta #1639

Open
wants to merge 52 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 51 commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
270f0ae
Added parts of nlq project
LoveBroman Aug 21, 2024
74dc110
final hahah
LoveBroman Aug 21, 2024
c76983d
Merge branch 'love-main' into feat/worksheet-nlq-beta
noah-paige Aug 21, 2024
3b7d8a7
fixed cdk, unstructured data and migrations
LoveBroman Aug 22, 2024
398a149
Merge latest from fork repo
noah-paige Aug 22, 2024
ab82d50
Deleted some files
LoveBroman Aug 23, 2024
1d579bb
removed account info from cdk.json and reverted to default
LoveBroman Aug 23, 2024
666e272
ruff formatting ran
LoveBroman Aug 23, 2024
bd3b285
config.json set to default
LoveBroman Aug 23, 2024
09dfc22
fixed hardcoding in resource_treshold
LoveBroman Aug 23, 2024
07b100e
Merge remote-tracking branch 'refs/remotes/origin/main' into feat/wor…
dlpzx Oct 9, 2024
6371462
Merge branch 'love-main' into feat/worksheet-nlq-beta
noah-paige Oct 14, 2024
f0ea070
ruff
noah-paige Oct 14, 2024
be51a84
Merge branch 'os-main' into feat/worksheet-nlq-beta
noah-paige Oct 14, 2024
1b7151d
Merge branch 'os-main' into feat/worksheet-nlq-beta
noah-paige Oct 14, 2024
242498a
Resolve incorrect diffs os main + nlq branch
noah-paige Oct 14, 2024
8471484
fix version json
noah-paige Oct 14, 2024
838f46c
fix config json formatting
noah-paige Oct 14, 2024
fec8b79
format cdk json
noah-paige Oct 14, 2024
fb0b4f1
Re format Lambda IAM Permissions CDK App
noah-paige Oct 14, 2024
411d4fa
rename treshold to threshold
noah-paige Oct 14, 2024
5700d34
push changes invocation count decorator
noah-paige Oct 14, 2024
efb11fc
textToSql Edits
noah-paige Oct 14, 2024
0361931
clean up AWS Clients
noah-paige Oct 14, 2024
6068eae
Refactor Worksheet Views
noah-paige Oct 14, 2024
7f28408
rename FE components and fix textToSQL inputs
noah-paige Oct 16, 2024
af0d206
textToSql using env group role
noah-paige Oct 16, 2024
3cb2006
clean up and refactor unstructured text use case
noah-paige Oct 16, 2024
5dbb773
Combine Bedrock Clients to 1 and add new file to store prompts
noah-paige Oct 16, 2024
f7520db
linting and touch ups
noah-paige Oct 16, 2024
ba6f764
update dependencies
noah-paige Oct 16, 2024
58b1bd0
update pypdf
noah-paige Oct 16, 2024
723c82b
remove unused import
noah-paige Oct 16, 2024
21a458b
fixes from testing
noah-paige Oct 17, 2024
2fab438
remove duplicate import
noah-paige Oct 17, 2024
95c49a4
fixes to resource threshold
noah-paige Oct 17, 2024
89cd253
Add Documentation around NLQ features
noah-paige Oct 17, 2024
9bc51a1
Add integ tests
noah-paige Oct 21, 2024
82445ef
Add tests list obj keys
noah-paige Oct 21, 2024
205a2c4
Updates to resource threhsold module based on comments
noah-paige Oct 23, 2024
8816fde
fix when creating session, format worksheet resolver + services
noah-paige Oct 23, 2024
3c39b5a
resolve more PR comments backend
noah-paige Oct 23, 2024
0b66a79
change config json parameter path
noah-paige Oct 23, 2024
3eee507
update typos in docs
noah-paige Oct 23, 2024
b735f2e
positional args FE
noah-paige Oct 23, 2024
5001ad2
fix imports
noah-paige Oct 24, 2024
9bca372
Merge branch 'os-main' into feat/worksheet-nlq-beta
noah-paige Oct 24, 2024
f6f12c1
fix FE and move prompts to text file
noah-paige Oct 24, 2024
a521238
userguide doc updates
noah-paige Oct 24, 2024
6c7649c
rename listObjectkeys to listS3Objectkeys and add tests / upload txt …
noah-paige Oct 24, 2024
c94b0ce
fix bug deleteWorksheet and tests
noah-paige Oct 24, 2024
9a837d2
typo
noah-paige Oct 29, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions backend/dataall/base/db/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,3 +181,26 @@ def __init__(self, action, message):

def __str__(self):
return f'{self.message}'


class ResourceThresholdExceeded(Exception):
def __init__(self, username, action):
self.username = username
self.action = action
self.message = f"""
An error occurred (ResourceThresholdExceeded) when calling {self.action} operation:
Requests exceeded max daily invocation count for User: {self.username}
"""

def __str__(self):
return f'{self.message}'


dlpzx marked this conversation as resolved.
Show resolved Hide resolved
class ModelGuardrailException(Exception):
def __init__(self, message):
self.message = f"""
An error occurred (ModelGuardrailException) when invoking the model: {message}
"""

def __str__(self):
return f'{self.message}'
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from dataall.base.db import Base, utils
from sqlalchemy import String, Integer, Column, Date
from datetime import date


class ResourceThreshold(Base):
__tablename__ = 'resource_threshold'
actionUri = Column(String(64), primary_key=True, default=utils.uuid('resource_threshold'))
username = Column(String(64), nullable=False)
actionType = Column(String(64), nullable=False)
date = Column(Date, default=date.today, nullable=False)
count = Column(Integer, default=1, nullable=False)
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from dataall.core.resource_threshold.db.resource_threshold_models import ResourceThreshold
from sqlalchemy import and_
from datetime import date


class ResourceThresholdRepository:
@staticmethod
def get_count_today(session, username, action_type):
amount = (
session.query(ResourceThreshold.count)
.filter(
and_(
ResourceThreshold.username == username,
ResourceThreshold.actionType == action_type,
ResourceThreshold.date == date.today(),
)
)
.scalar()
)
return amount if amount else 0

@staticmethod
def add_entry(session, username, action_type):
user_entry = ResourceThresholdRepository._get_user_entry(session, username, action_type)
if user_entry:
session.query(ResourceThreshold).filter(
and_(
ResourceThreshold.username == username,
ResourceThreshold.actionType == action_type,
)
).update({ResourceThreshold.count: 1, ResourceThreshold.date: date.today()}, synchronize_session=False)
session.commit()
else:
action_entry = ResourceThreshold(username=username, actionType=action_type)
session.add(action_entry)
session.commit()

@staticmethod
def increment_count(session, username, action_type):
session.query(ResourceThreshold).filter(
and_(
ResourceThreshold.username == username,
ResourceThreshold.actionType == action_type,
ResourceThreshold.date == date.today(),
)
).update({ResourceThreshold.count: ResourceThreshold.count + 1}, synchronize_session=False)
session.commit()

@staticmethod
def _get_user_entry(session, username, action_type):
entry = (
session.query(ResourceThreshold)
.filter(and_(ResourceThreshold.username == username, ResourceThreshold.actionType == action_type))
.first()
)
return entry
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from dataall.core.resource_threshold.db.resource_threshold_repositories import ResourceThresholdRepository
from dataall.base.db import exceptions
from functools import wraps
from dataall.base.config import config
from dataall.base.context import get_context

import logging

log = logging.getLogger(__name__)


class ResourceThresholdService:
@staticmethod
def check_invocation_count(action_type, max_count_config_path):
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
context = get_context()
with context.db_engine.scoped_session() as session:
count = ResourceThresholdRepository.get_count_today(
session=session, username=context.username, action_type=action_type
)
max_count = config.get_property(max_count_config_path, 10)
log.info(
f'User {context.username} has invoked {action_type} {count} times today of max {max_count}'
)
if count < max_count:
if count == 0:
ResourceThresholdRepository.add_entry(
session=session, username=context.username, action_type=action_type
)
else:
ResourceThresholdRepository.increment_count(
session=session, username=context.username, action_type=action_type
)
return func(*args, **kwargs)
else:
raise exceptions.ResourceThresholdExceeded(username=context.username, action=action_type)

return wrapper

return decorator
10 changes: 10 additions & 0 deletions backend/dataall/modules/s3_datasets/api/dataset/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from dataall.modules.s3_datasets.api.dataset.resolvers import (
get_dataset,
get_dataset_assume_role_url,
list_s3_object_keys,
get_file_upload_presigned_url,
list_datasets_owned_by_env_group,
)
Expand Down Expand Up @@ -45,3 +46,12 @@
resolver=list_datasets_owned_by_env_group,
test_scope='Dataset',
)

listS3ObjectKeys = gql.QueryField(
name='listS3ObjectKeys',
type=gql.ArrayType(gql.String),
args=[
gql.Argument(name='datasetUri', type=gql.NonNullableType(gql.String)),
],
resolver=list_s3_object_keys,
)
4 changes: 4 additions & 0 deletions backend/dataall/modules/s3_datasets/api/dataset/resolvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,10 @@ def list_datasets_owned_by_env_group(
return DatasetService.list_datasets_owned_by_env_group(environmentUri, groupUri, filter)


def list_s3_object_keys(context, source, datasetUri: str = None):
return DatasetService.list_s3_object_keys(uri=datasetUri)

dlpzx marked this conversation as resolved.
Show resolved Hide resolved

class RequestValidator:
@staticmethod
def validate_creation_request(data):
Expand Down
14 changes: 14 additions & 0 deletions backend/dataall/modules/s3_datasets/aws/s3_dataset_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,17 @@ def get_bucket_encryption(self) -> (str, str, str):
f'Data.all Environment Pivot Role does not have s3:GetEncryptionConfiguration Permission for {dataset.S3BucketName} bucket: {e}'
)
raise Exception(f'Cannot fetch the bucket encryption configuration for {dataset.S3BucketName}: {e}')

def list_object_keys(self, bucket_name):
try:
response = self._client.list_objects_v2(
Bucket=bucket_name,
)

def txt_or_pdf(s):
return s.split('.')[-1] in ['pdf', 'txt']

dlpzx marked this conversation as resolved.
Show resolved Hide resolved
return [ob['Key'] for ob in response.get('Contents', []) if txt_or_pdf(ob['Key'])]
except ClientError as e:
logging.error(f'Failed to list objects in {bucket_name} : {e}')
raise e
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
DATASET_ALL,
DATASET_READ,
IMPORT_DATASET,
GET_DATASET,
)
from dataall.modules.s3_datasets.db.dataset_repositories import DatasetRepository
from dataall.modules.datasets_base.db.dataset_repositories import DatasetBaseRepository
Expand Down Expand Up @@ -556,3 +557,11 @@ def delete_dataset_term_links(session, dataset_uri):
for table_uri in tables:
GlossaryRepository.delete_glossary_terms_links(session, table_uri, 'DatasetTable')
GlossaryRepository.delete_glossary_terms_links(session, dataset_uri, 'Dataset')

@staticmethod
@ResourcePolicyService.has_resource_permission(GET_DATASET)
def list_s3_object_keys(uri):
with get_context().db_engine.scoped_session() as session:
dataset = DatasetRepository.get_dataset_by_uri(session, uri)

return S3DatasetClient(dataset).list_object_keys(dataset.S3BucketName)
34 changes: 33 additions & 1 deletion backend/dataall/modules/worksheets/api/queries.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
from dataall.base.api import gql
from dataall.modules.worksheets.api.resolvers import get_worksheet, list_worksheets, run_sql_query
from dataall.modules.worksheets.api.resolvers import (
get_worksheet,
list_worksheets,
run_sql_query,
text_to_sql,
analyze_text_genai,
)


getWorksheet = gql.QueryField(
Expand Down Expand Up @@ -28,3 +34,29 @@
],
resolver=run_sql_query,
)

TextToSQL = gql.QueryField(
name='textToSQL',
type=gql.String,
args=[
gql.Argument(name='worksheetUri', type=gql.NonNullableType(gql.String)),
gql.Argument(name='environmentUri', type=gql.NonNullableType(gql.String)),
gql.Argument(name='prompt', type=gql.NonNullableType(gql.String)),
gql.Argument(name='databaseName', type=gql.NonNullableType(gql.String)),
gql.Argument(name='tableNames', type=gql.ArrayType(gql.String)),
],
resolver=text_to_sql,
)

analyzeTextDocument = gql.QueryField(
name='analyzeTextDocument',
type=gql.String,
args=[
gql.Argument(name='worksheetUri', type=gql.NonNullableType(gql.String)),
gql.Argument(name='environmentUri', type=gql.NonNullableType(gql.String)),
gql.Argument(name='prompt', type=gql.NonNullableType(gql.String)),
gql.Argument(name='datasetUri', type=gql.NonNullableType(gql.String)),
gql.Argument(name='key', type=gql.NonNullableType(gql.String)),
],
resolver=analyze_text_genai,
)
77 changes: 52 additions & 25 deletions backend/dataall/modules/worksheets/api/resolvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from dataall.modules.worksheets.db.worksheet_models import Worksheet
from dataall.modules.worksheets.db.worksheet_repositories import WorksheetRepository
from dataall.modules.worksheets.services.worksheet_service import WorksheetService
from dataall.base.feature_toggle_checker import is_feature_enabled
from dataall.base.api.context import Context


Expand All @@ -14,27 +15,19 @@ def create_worksheet(context: Context, source, input: dict = None):
if not input.get('label'):
raise exceptions.RequiredParameter('label')

with context.engine.scoped_session() as session:
return WorksheetService.create_worksheet(
session=session,
username=context.username,
data=input,
)
return WorksheetService.create_worksheet(
data=input,
)


def update_worksheet(context: Context, source, worksheetUri: str = None, input: dict = None):
with context.engine.scoped_session() as session:
return WorksheetService.update_worksheet(
session=session, username=context.username, uri=worksheetUri, data=input
)
def update_worksheet(context: Context, source, worksheetUri: str, input: dict = None):
return WorksheetService.update_worksheet(uri=worksheetUri, data=input)


def get_worksheet(context: Context, source, worksheetUri: str = None):
with context.engine.scoped_session() as session:
return WorksheetService.get_worksheet(
session=session,
uri=worksheetUri,
)
def get_worksheet(context: Context, source, worksheetUri: str):
return WorksheetService.get_worksheet(
uri=worksheetUri,
)


def resolve_user_role(context: Context, source: Worksheet):
Expand All @@ -59,13 +52,47 @@ def list_worksheets(context, source, filter: dict = None):
)


def run_sql_query(context: Context, source, environmentUri: str = None, worksheetUri: str = None, sqlQuery: str = None):
with context.engine.scoped_session() as session:
return WorksheetService.run_sql_query(
session=session, uri=environmentUri, worksheetUri=worksheetUri, sqlQuery=sqlQuery
)
def run_sql_query(context: Context, source, environmentUri: str, worksheetUri: str, sqlQuery: str):
return WorksheetService.run_sql_query(uri=environmentUri, worksheetUri=worksheetUri, sqlQuery=sqlQuery)


def delete_worksheet(context, source, worksheetUri: str = None):
with context.engine.scoped_session() as session:
return WorksheetService.delete_worksheet(session=session, uri=worksheetUri)
def delete_worksheet(context, source, worksheetUri: str):
return WorksheetService.delete_worksheet(uri=worksheetUri)


@is_feature_enabled('modules.worksheets.features.nlq.active')
def text_to_sql(
context: Context,
source,
environmentUri: str,
worksheetUri: str,
prompt: str,
databaseName: str,
tableNames: list,
):
return WorksheetService.run_nlq(
uri=environmentUri,
prompt=prompt,
worksheetUri=worksheetUri,
db_name=databaseName,
table_names=tableNames,
)


@is_feature_enabled('modules.worksheets.features.nlq.active')
def analyze_text_genai(
context,
source,
worksheetUri: str,
environmentUri: str,
prompt: str,
datasetUri: str,
key: str,
):
return WorksheetService.analyze_text_genai(
uri=environmentUri,
worksheetUri=worksheetUri,
prompt=prompt,
datasetUri=datasetUri,
key=key,
)
Loading
Loading