From ac9b39a57ce56a11b1aabe327e586954e11290fe Mon Sep 17 00:00:00 2001 From: Devis Battisti Date: Tue, 8 Oct 2024 15:38:34 +0200 Subject: [PATCH 01/22] feat(chatbot): session GSI --- apps/chatbot/src/app/main.py | 23 ++++++++++++------- .../src/modules/chatbot/dynamodb.tf | 14 +++++++++++ 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/apps/chatbot/src/app/main.py b/apps/chatbot/src/app/main.py index c0e6f9943..caee0c383 100644 --- a/apps/chatbot/src/app/main.py +++ b/apps/chatbot/src/app/main.py @@ -160,12 +160,13 @@ async def sessions_fetching( raise HTTPException(status_code=422, detail=f"[sessions_fetching] userId: {userId}, error: {e}") # TODO: pagination + items = db_response.get('Items', []) result = { - "items": db_response['Items'], + "items": items, "page": 1, "pages": 1, - "size": len(db_response['Items']), - "total": len(db_response['Items']), + "size": len(items), + "total": len(items), } return result @@ -214,20 +215,26 @@ async def queries_fetching( sessionId = last_session_id(userId) try: - # TODO: add userId filter db_response = table_queries.query( - KeyConditionExpression=Key("sessionId").eq(sessionId) + KeyConditionExpression=Key("sessionId").eq(sessionId) & + Key("id").eq(userId) ) except (BotoCoreError, ClientError) as e: raise HTTPException(status_code=422, detail=f"[queries_fetching] sessionId: {sessionId}, error: {e}") - result = db_response['Items'] + result = db_response.get('Items', []) return result def last_session_id(userId: str): - # TODO: retrieve last user session - return '1' + db_response = table_sessions.query( + IndexName='SessionsByCreatedAtIndex', + KeyConditionExpression=Key('userId').eq(userId), + ScanIndexForward=False, + Limit=1 + ) + items = db_response.get('Items', []) + return items[0] if items else None @app.patch("/queries/{id}") async def query_feedback (badAnswer: bool): diff --git a/apps/infrastructure/src/modules/chatbot/dynamodb.tf b/apps/infrastructure/src/modules/chatbot/dynamodb.tf index 
50d0e63b3..eb0a69512 100644 --- a/apps/infrastructure/src/modules/chatbot/dynamodb.tf +++ b/apps/infrastructure/src/modules/chatbot/dynamodb.tf @@ -41,5 +41,19 @@ module "dynamodb_chatbot_sessions" { name = "userId" type = "S" }, + { + name = "createdAt" + type = "S" # String (for ISO 8601 timestamp) + }, ] + + # GSI for query on created_at + global_secondary_index { + name = "SessionsByCreatedAtIndex" + hash_key = "userId" + range_key = "createdAt" + read_capacity = 5 + write_capacity = 5 + projection_type = "ALL" + } } From 36c89af712298728fa0c38bb48ef7c6f1147ead5 Mon Sep 17 00:00:00 2001 From: Devis Battisti Date: Wed, 9 Oct 2024 02:46:49 +0200 Subject: [PATCH 02/22] feat(chatbot): docker compose --- apps/chatbot/.env.example | 3 ++ apps/chatbot/docker/app.local.Dockerfile | 6 ++- apps/chatbot/docker/compose.yaml | 52 +++++++++++++++++++ .../chatbot/docker/docker-run-create-index.sh | 2 + apps/chatbot/docker/docker-run-local-bash.sh | 2 + apps/chatbot/poetry.lock | 12 ++--- apps/chatbot/src/app/main.py | 32 +++++++----- apps/chatbot/src/modules/utils.py | 7 +-- 8 files changed, 92 insertions(+), 24 deletions(-) create mode 100644 apps/chatbot/docker/compose.yaml create mode 100755 apps/chatbot/docker/docker-run-create-index.sh create mode 100755 apps/chatbot/docker/docker-run-local-bash.sh diff --git a/apps/chatbot/.env.example b/apps/chatbot/.env.example index 654f93e55..52bd735dc 100644 --- a/apps/chatbot/.env.example +++ b/apps/chatbot/.env.example @@ -21,3 +21,6 @@ CHB_ENGINE_SIMILARITY_CUTOFF=... CHB_ENGINE_USE_ASYNC=... CHB_ENGINE_USE_STREAMING=... 
CHB_QUERY_TABLE_PREFIX=chatbot-local +GOOGLE_AND_REDIS_AWS_DEFAULT_REGION=eu-west-3 +CHB_REDIS_INDEX_NAME=zero +CHB_DYNAMODB_URL=http://locahost:8080 diff --git a/apps/chatbot/docker/app.local.Dockerfile b/apps/chatbot/docker/app.local.Dockerfile index 21bcd8be8..532ebf0b7 100644 --- a/apps/chatbot/docker/app.local.Dockerfile +++ b/apps/chatbot/docker/app.local.Dockerfile @@ -1,6 +1,10 @@ FROM python:3.12.4-slim-bullseye ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get update && \ + apt-get install -y \ + curl + ENV PYTHONPATH=/app RUN pip install --upgrade pip \ @@ -14,4 +18,4 @@ RUN poetry install COPY . . -CMD ["fastapi", "dev", "src/app/main.py", "--port", "8080"] +CMD ["fastapi", "dev", "src/app/main.py", "--port", "8080", "--host", "0.0.0.0"] diff --git a/apps/chatbot/docker/compose.yaml b/apps/chatbot/docker/compose.yaml new file mode 100644 index 000000000..e3cbc7e52 --- /dev/null +++ b/apps/chatbot/docker/compose.yaml @@ -0,0 +1,52 @@ +services: + api: + build: + context: .. + dockerfile: docker/app.local.Dockerfile + ports: + - "8080:8080" + volumes: + - ..:/app + depends_on: + redis: + condition: service_started + dynamodb: + condition: service_started + networks: + - ntw + + dynamodb: + image: amazon/dynamodb-local:2.5.2 + environment: + - AWS_ACCESS_KEY_ID=dummy + - AWS_SECRET_ACCESS_KEY=dummy + - AWS_DEFAULT_REGION=local + ports: + - "8000:8000" + networks: + - ntw + + redis: + image: redis/redis-stack:7.2.0-v13 + ports: + - "6379:6379" + networks: + - ntw + + create_index: + build: + context: .. 
+ dockerfile: docker/app.local.Dockerfile + ports: + - "8080:8080" + volumes: + - ..:/app + command: "python src/modules/create_vector_index.py --params config/params.yaml" + depends_on: + redis: + condition: service_started + networks: + - ntw + +networks: + ntw: diff --git a/apps/chatbot/docker/docker-run-create-index.sh b/apps/chatbot/docker/docker-run-create-index.sh new file mode 100755 index 000000000..0752c4729 --- /dev/null +++ b/apps/chatbot/docker/docker-run-create-index.sh @@ -0,0 +1,2 @@ +#!/bin/bash +docker compose -f docker/compose.yaml -p chatbot up create_index diff --git a/apps/chatbot/docker/docker-run-local-bash.sh b/apps/chatbot/docker/docker-run-local-bash.sh new file mode 100755 index 000000000..9c5b63499 --- /dev/null +++ b/apps/chatbot/docker/docker-run-local-bash.sh @@ -0,0 +1,2 @@ +#!/bin/bash +docker run -it --env-file ./.env fastapi-local bash diff --git a/apps/chatbot/poetry.lock b/apps/chatbot/poetry.lock index a67da263e..4f4832e1d 100644 --- a/apps/chatbot/poetry.lock +++ b/apps/chatbot/poetry.lock @@ -4926,18 +4926,18 @@ cffi = {version = "*", markers = "implementation_name == \"pypy\""} [[package]] name = "redis" -version = "5.0.8" +version = "5.1.1" description = "Python client for Redis database and key-value store" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "redis-5.0.8-py3-none-any.whl", hash = "sha256:56134ee08ea909106090934adc36f65c9bcbbaecea5b21ba704ba6fb561f8eb4"}, - {file = "redis-5.0.8.tar.gz", hash = "sha256:0c5b10d387568dfe0698c6fad6615750c24170e548ca2deac10c649d463e9870"}, + {file = "redis-5.1.1-py3-none-any.whl", hash = "sha256:f8ea06b7482a668c6475ae202ed8d9bcaa409f6e87fb77ed1043d912afd62e24"}, + {file = "redis-5.1.1.tar.gz", hash = "sha256:f6c997521fedbae53387307c5d0bf784d9acc28d9f1d058abeac566ec4dbed72"}, ] [package.extras] -hiredis = ["hiredis (>1.0.0)"] -ocsp = ["cryptography (>=36.0.1)", "pyopenssl (==20.0.1)", "requests (>=2.26.0)"] +hiredis = ["hiredis (>=3.0.0)"] 
+ocsp = ["cryptography (>=36.0.1)", "pyopenssl (==23.2.1)", "requests (>=2.31.0)"] [[package]] name = "redisvl" diff --git a/apps/chatbot/src/app/main.py b/apps/chatbot/src/app/main.py index c0e6f9943..9d04b5d3d 100644 --- a/apps/chatbot/src/app/main.py +++ b/apps/chatbot/src/app/main.py @@ -28,20 +28,24 @@ class Query(BaseModel): queriedAt: str | None = None if (os.getenv('environment', 'dev') == 'local'): - profile_name='dummy' - endpoint_url='http://localhost:8000' - region_name = AWS_DEFAULT_REGION - -boto3_session = boto3.session.Session( - profile_name = locals().get('profile_name', None), - region_name=locals().get('region_name', None) -) - -dynamodb = boto3_session.resource( - 'dynamodb', - endpoint_url=locals().get('endpoint_url', None), - region_name=locals().get('region_name', None), -) + boto3_session = boto3.session.Session( + aws_access_key_id='dummy', + aws_secret_access_key='dummy', + region_name=AWS_DEFAULT_REGION + ) + dynamodb = boto3_session.resource( + 'dynamodb', + endpoint_url=os.getenv('CHB_DYNAMODB_URL', 'http://localhost:8000'), + region_name=AWS_DEFAULT_REGION + ) +else: + boto3_session = boto3.session.Session( + region_name=AWS_DEFAULT_REGION + ) + dynamodb = boto3_session.resource( + 'dynamodb', + region_name=AWS_DEFAULT_REGION + ) table_queries = dynamodb.Table( f"{os.getenv('CHB_QUERY_TABLE_PREFIX', 'chatbot')}-queries" diff --git a/apps/chatbot/src/modules/utils.py b/apps/chatbot/src/modules/utils.py index 4e511f829..8f4a8ccec 100644 --- a/apps/chatbot/src/modules/utils.py +++ b/apps/chatbot/src/modules/utils.py @@ -8,7 +8,7 @@ AWS_ACCESS_KEY_ID = os.getenv("CHB_AWS_ACCESS_KEY_ID") AWS_SECRET_ACCESS_KEY = os.getenv("CHB_AWS_SECRET_ACCESS_KEY") -AWS_DEFAUL_REGION = os.getenv("GOOGLE_AND_REDIS_AWS_DEFAULT_REGION") +AWS_DEFAULT_REGION = os.getenv("CHB_AWS_DEFAULT_REGION") def get_ssm_parameter(name: str, default: str | None = None) -> str | None: @@ -19,11 +19,12 @@ def get_ssm_parameter(name: str, default: str | None = None) -> str | 
None: :param default: The default value to return if the parameter is not found. :return: The value of the requested parameter. """ + ssm = boto3.client( "ssm", aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY, - region_name=AWS_DEFAUL_REGION + region_name=AWS_DEFAULT_REGION ) logging.debug(f"Getting parameter {name} from SSM") try: @@ -37,4 +38,4 @@ def get_ssm_parameter(name: str, default: str | None = None) -> str | None: return default logging.debug(f"Parameter {name} retrieved from SSM") - return response["Parameter"]["Value"] \ No newline at end of file + return response["Parameter"]["Value"] From a2c16c76c47d5732f6b80cdfba9a563b77d0cf5a Mon Sep 17 00:00:00 2001 From: Devis Battisti Date: Wed, 9 Oct 2024 10:22:50 +0200 Subject: [PATCH 03/22] fix(chatbot): dynamodb and redis for local development with docker compose --- apps/chatbot/docker/compose.yaml | 1 + apps/chatbot/docker/docker-compose-up-api.sh | 2 ++ apps/chatbot/docker/files/.aws/config | 2 ++ apps/chatbot/docker/files/.aws/credentials | 3 +++ apps/chatbot/src/app/main.py | 16 ++++++---------- 5 files changed, 14 insertions(+), 10 deletions(-) create mode 100755 apps/chatbot/docker/docker-compose-up-api.sh create mode 100644 apps/chatbot/docker/files/.aws/config create mode 100644 apps/chatbot/docker/files/.aws/credentials diff --git a/apps/chatbot/docker/compose.yaml b/apps/chatbot/docker/compose.yaml index e3cbc7e52..0fcda156b 100644 --- a/apps/chatbot/docker/compose.yaml +++ b/apps/chatbot/docker/compose.yaml @@ -7,6 +7,7 @@ services: - "8080:8080" volumes: - ..:/app + - ./files/.aws:/root/.aws depends_on: redis: condition: service_started diff --git a/apps/chatbot/docker/docker-compose-up-api.sh b/apps/chatbot/docker/docker-compose-up-api.sh new file mode 100755 index 000000000..05cfd5591 --- /dev/null +++ b/apps/chatbot/docker/docker-compose-up-api.sh @@ -0,0 +1,2 @@ +#!/bin/bash +docker compose -f docker/compose.yaml -p chatbot up api diff --git 
a/apps/chatbot/docker/files/.aws/config b/apps/chatbot/docker/files/.aws/config new file mode 100644 index 000000000..bba4a45b8 --- /dev/null +++ b/apps/chatbot/docker/files/.aws/config @@ -0,0 +1,2 @@ +[profile default] +region = eu-south-1 diff --git a/apps/chatbot/docker/files/.aws/credentials b/apps/chatbot/docker/files/.aws/credentials new file mode 100644 index 000000000..59df68646 --- /dev/null +++ b/apps/chatbot/docker/files/.aws/credentials @@ -0,0 +1,3 @@ +[default] +aws_access_key_id = 123 +aws_secret_access_key = xyz diff --git a/apps/chatbot/src/app/main.py b/apps/chatbot/src/app/main.py index 9d04b5d3d..f12204d7c 100644 --- a/apps/chatbot/src/app/main.py +++ b/apps/chatbot/src/app/main.py @@ -18,30 +18,26 @@ params = yaml.safe_load(open("config/params.yaml", "r")) prompts = yaml.safe_load(open("config/prompts.yaml", "r")) -chatbot = Chatbot(params, prompts) - AWS_DEFAULT_REGION = os.getenv('CHB_AWS_DEFAULT_REGION', os.getenv('AWS_DEFAULT_REGION', None)) +chatbot = Chatbot(params, prompts) + class Query(BaseModel): question: str queriedAt: str | None = None +boto3_session = boto3.session.Session( + region_name=AWS_DEFAULT_REGION +) + if (os.getenv('environment', 'dev') == 'local'): - boto3_session = boto3.session.Session( - aws_access_key_id='dummy', - aws_secret_access_key='dummy', - region_name=AWS_DEFAULT_REGION - ) dynamodb = boto3_session.resource( 'dynamodb', endpoint_url=os.getenv('CHB_DYNAMODB_URL', 'http://localhost:8000'), region_name=AWS_DEFAULT_REGION ) else: - boto3_session = boto3.session.Session( - region_name=AWS_DEFAULT_REGION - ) dynamodb = boto3_session.resource( 'dynamodb', region_name=AWS_DEFAULT_REGION From 08a7fecda7c99b65025ff7c4e18534422d93e828 Mon Sep 17 00:00:00 2001 From: Devis Battisti Date: Wed, 9 Oct 2024 15:33:51 +0200 Subject: [PATCH 04/22] chore(chatbot):remove duplicate imports --- apps/chatbot/src/modules/vector_database.py | 3 --- 1 file changed, 3 deletions(-) diff --git 
a/apps/chatbot/src/modules/vector_database.py b/apps/chatbot/src/modules/vector_database.py index de868a9c1..3e7bc789c 100644 --- a/apps/chatbot/src/modules/vector_database.py +++ b/apps/chatbot/src/modules/vector_database.py @@ -10,9 +10,6 @@ from selenium import webdriver from typing import List, Tuple -from bs4 import BeautifulSoup -from selenium import webdriver -import html2text import s3fs from llama_index.core import ( From 7fe9101a6a0d69a1ab1b0a1da29acb38cdcb0781 Mon Sep 17 00:00:00 2001 From: Devis Battisti Date: Wed, 9 Oct 2024 15:35:21 +0200 Subject: [PATCH 05/22] chore(chatbot): linting --- apps/chatbot/src/modules/vector_database.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/chatbot/src/modules/vector_database.py b/apps/chatbot/src/modules/vector_database.py index 3e7bc789c..013481ed8 100644 --- a/apps/chatbot/src/modules/vector_database.py +++ b/apps/chatbot/src/modules/vector_database.py @@ -175,7 +175,7 @@ def create_documentation( else: title, text = html2markdown(open(file)) - if text == None or text == "" or text == "None": + if text is None or text == "" or text == "None": # print(file) empty_pages.append(file) @@ -250,7 +250,7 @@ def build_automerging_index( leaf_nodes, storage_context=storage_context ) - logging.info(f"Created index successfully.") + logging.info("Created index successfully.") automerging_index.storage_context.persist( persist_dir=save_dir @@ -296,7 +296,7 @@ def build_automerging_index_s3( leaf_nodes, storage_context=storage_context ) - logging.info(f"Created index successfully.") + logging.info("Created index successfully.") if s3_bucket_name: # store hash table with FS.open(f"{s3_bucket_name}/hash_table.json", "w") as f: @@ -468,7 +468,7 @@ def load_automerging_index_redis( schema=REDIS_SCHEMA ) - logging.info(f"Loading vector index from Redis...") + logging.info("Loading vector index from Redis...") storage_context = StorageContext.from_defaults( vector_store=redis_vector_store, 
docstore=REDIS_DOCSTORE, From 9b2eb618cad64161a440a63202e31429f38a805b Mon Sep 17 00:00:00 2001 From: Devis Battisti Date: Wed, 9 Oct 2024 17:38:30 +0200 Subject: [PATCH 06/22] fix(chatbot):create index in docker --- apps/chatbot/docker/compose.yaml | 2 + apps/chatbot/poetry.lock | 13 +++- apps/chatbot/pyproject.toml | 1 + apps/chatbot/src/modules/vector_database.py | 79 +++++++++++++-------- 4 files changed, 65 insertions(+), 30 deletions(-) diff --git a/apps/chatbot/docker/compose.yaml b/apps/chatbot/docker/compose.yaml index 0fcda156b..ef266468a 100644 --- a/apps/chatbot/docker/compose.yaml +++ b/apps/chatbot/docker/compose.yaml @@ -8,6 +8,7 @@ services: volumes: - ..:/app - ./files/.aws:/root/.aws + - ./build-devp/out:/app/build-devp/out depends_on: redis: condition: service_started @@ -43,6 +44,7 @@ services: volumes: - ..:/app command: "python src/modules/create_vector_index.py --params config/params.yaml" + tty: true depends_on: redis: condition: service_started diff --git a/apps/chatbot/poetry.lock b/apps/chatbot/poetry.lock index 4f4832e1d..bb8751de2 100644 --- a/apps/chatbot/poetry.lock +++ b/apps/chatbot/poetry.lock @@ -712,6 +712,17 @@ files = [ [package.dependencies] packaging = ">=23.1" +[[package]] +name = "chromedriver-py" +version = "129.0.6668.91" +description = "chromedriver binaries for all platforms" +optional = false +python-versions = "*" +files = [ + {file = "chromedriver_py-129.0.6668.91-py3-none-any.whl", hash = "sha256:64b5f117e155e3f50840306838a0cfeae6eaff634b6dbf29e4c9fc179c71996e"}, + {file = "chromedriver_py-129.0.6668.91.tar.gz", hash = "sha256:90a2630e90fd0b7287a847c105861da7728b70c26ea48d2ff8939a32f0f63d4b"}, +] + [[package]] name = "click" version = "8.1.7" @@ -6633,4 +6644,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "fdee95ccf4e92495e158ffedad4995b97152c0b5724cad47fde6f109df6e2026" +content-hash = "bb303cb09305152f1ba71c82166d58ef6bff30750c603d33376ec54ebd7ca55d" diff --git 
a/apps/chatbot/pyproject.toml b/apps/chatbot/pyproject.toml index 7585ce7c5..13f603c38 100644 --- a/apps/chatbot/pyproject.toml +++ b/apps/chatbot/pyproject.toml @@ -37,6 +37,7 @@ llama-index-llms-gemini = "^0.3.4" google-generativeai = "^0.5.2" llama-index-embeddings-gemini = "^0.2.0" llama-index-llms-bedrock-converse = "^0.3.0" +chromedriver-py = "^129.0.6668.91" [build-system] diff --git a/apps/chatbot/src/modules/vector_database.py b/apps/chatbot/src/modules/vector_database.py index 013481ed8..f991f4d73 100644 --- a/apps/chatbot/src/modules/vector_database.py +++ b/apps/chatbot/src/modules/vector_database.py @@ -8,7 +8,10 @@ import html2text from bs4 import BeautifulSoup from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.chrome.service import Service from typing import List, Tuple +from chromedriver_py import binary_path import s3fs @@ -35,7 +38,6 @@ load_dotenv() - PROVIDER = os.getenv("CHB_PROVIDER") assert PROVIDER in ["google", "aws"] @@ -114,15 +116,18 @@ def filter_html_files(html_files: List[str]) -> List[str]: pattern = re.compile(r"/v\d{1,2}.") pattern2 = re.compile(r"/\d{1,2}.") filtered_files = [file for file in html_files if not pattern.search(file) and not pattern2.search(file)] + logging.info(f"[vector_database.py] filter_html_files len(filtered_files): {len(filtered_files)}") return filtered_files def get_html_files(root_folder: str) -> List[str]: + logging.info(f"[vector_database.py] get_html_files({root_folder})") html_files = [] for root, _, files in os.walk(root_folder): for file in files: if file.endswith(".html"): html_files.append(os.path.join(root, file)) + logging.info(f"[vector_database.py] get_html_files len(html_files): {len(html_files)}") return sorted(filter_html_files(html_files)) @@ -153,10 +158,15 @@ def create_documentation( if documentation_dir[-1] != "/": documentation_dir += "/" - logging.info(f"Getting documentation from: {documentation_dir}") + 
logging.info(f"[vector_database.py] Getting documentation from: {documentation_dir}") + logging.info(f"[vector_database.py] create_documentation: DYNAMIC_HTML: {DYNAMIC_HTMLS}") + logging.info(f"[vector_database.py] create_documentation: documentation_dir: {documentation_dir}") - html_files = get_html_files(documentation_dir) + # FIX: all docs + html_files = get_html_files(documentation_dir)[:10] + logging.info(f"[vector_database.py] create_documentation: len(html_files): {len(html_files)}") dynamic_htmls = [os.path.join(documentation_dir, path) for path in DYNAMIC_HTMLS] + logging.info(f"[vector_database.py] create_documentation: len(dynamic_htmls): {len(dynamic_htmls)}") documents = [] hash_table = {} empty_pages = [] @@ -165,9 +175,20 @@ def create_documentation( for file in tqdm.tqdm(html_files, total=len(html_files), desc="Extracting HTML"): - if file in dynamic_htmls: +# FIX: resolve webdriver.Chrome "self.assert_process_still_running" error in docker +# if file in dynamic_htmls: + if 6 == 9: url = file.replace(documentation_dir, f"{website_url}/").replace(".html", "") - driver = webdriver.Chrome() + + # svc = webdriver.ChromeService(executable_path=binary_path) + service = Service(executable_path=binary_path) + options = webdriver.ChromeOptions() + options.add_argument('--headless=new') + options.add_argument('--no-sandbox') + options.add_argument('user-agent=fake-useragent') + driver = webdriver.Chrome(service=service, options=options) + + logging.info(f"[vector_database.py] create_documentation: driver.get({url})") driver.get(url) time.sleep(5) title, text = html2markdown(driver.page_source) @@ -209,8 +230,8 @@ def create_documentation( # with open("full_text.txt", "w") as f: # f.write(full_text) - logging.info(f"Number of documents with content: {len(documents)}") - logging.info(f"Number of empty pages in the documentation: {len(empty_pages)}. 
These are left out.") + logging.info(f"[vector_database.py] Number of documents with content: {len(documents)}") + logging.info(f"[vector_database.py] Number of empty pages in the documentation: {len(empty_pages)}. These are left out.") with open("empty_htmls.json", "w") as f: json.dump(empty_pages, f, indent=4) @@ -226,7 +247,7 @@ def build_automerging_index( chunk_overlap: int ) -> VectorStoreIndex: - logging.info("Storing vector index and hash table on AWS bucket S3..") + logging.info("[vector_database.py] Storing vector index and hash table on AWS bucket S3..") Settings.llm = llm Settings.embed_model = embed_model @@ -239,7 +260,7 @@ def build_automerging_index( print(documentation_dir) documents, hash_table = create_documentation(WEBSITE_URL, documentation_dir) - logging.info("Creating index...") + logging.info("[vector_database.py] Creating index...") nodes = Settings.node_parser.get_nodes_from_documents(documents) leaf_nodes = get_leaf_nodes(nodes) @@ -250,7 +271,7 @@ def build_automerging_index( leaf_nodes, storage_context=storage_context ) - logging.info("Created index successfully.") + logging.info("[vector_database.py] Created index successfully.") automerging_index.storage_context.persist( persist_dir=save_dir @@ -258,7 +279,7 @@ def build_automerging_index( with open("hash_table.json", "w") as f: json.dump(hash_table, f, indent=4) - logging.info(f"Saved index successfully to {save_dir}.") + logging.info(f"[vector_database.py] Saved index successfully to {save_dir}.") return automerging_index @@ -273,7 +294,7 @@ def build_automerging_index_s3( chunk_overlap: int ) -> VectorStoreIndex: - logging.info("Storing vector index and hash table on AWS bucket S3..") + logging.info("[vector_database.py] Storing vector index and hash table on AWS bucket S3..") Settings.llm = llm Settings.embed_model = embed_model @@ -285,7 +306,7 @@ def build_automerging_index_s3( assert documentation_dir is not None documents, hash_table = create_documentation(WEBSITE_URL, 
documentation_dir) - logging.info("Creating index...") + logging.info("[vector_database.py] Creating index...") nodes = Settings.node_parser.get_nodes_from_documents(documents) leaf_nodes = get_leaf_nodes(nodes) @@ -296,19 +317,19 @@ def build_automerging_index_s3( leaf_nodes, storage_context=storage_context ) - logging.info("Created index successfully.") + logging.info("[vector_database.py] Created index successfully.") if s3_bucket_name: # store hash table with FS.open(f"{s3_bucket_name}/hash_table.json", "w") as f: json.dump(hash_table, f, indent=4) - logging.info(f"Uploaded URLs hash table successfully to S3 bucket {s3_bucket_name}/hash_table.json") + logging.info(f"[vector_database.py] Uploaded URLs hash table successfully to S3 bucket {s3_bucket_name}/hash_table.json") # store vector index automerging_index.storage_context.persist( persist_dir=f"{s3_bucket_name}/{save_dir}", fs = FS ) - logging.info(f"Uploaded vector index successfully to S3 bucket at {s3_bucket_name}/{save_dir}.") + logging.info(f"[vector_database.py] Uploaded vector index successfully to S3 bucket at {s3_bucket_name}/{save_dir}.") else: automerging_index.storage_context.persist( persist_dir=save_dir @@ -316,7 +337,7 @@ def build_automerging_index_s3( with open("hash_table.json", "w") as f: json.dump(hash_table, f, indent=4) - logging.info(f"Saved index successfully to {save_dir}.") + logging.info(f"[vector_database.py] Saved index successfully to {save_dir}.") return automerging_index @@ -329,7 +350,7 @@ def build_automerging_index_redis( chunk_overlap: int ) -> VectorStoreIndex: - logging.info("Storing vector index and hash table on Redis..") + logging.info("[vector_database.py] Storing vector index and hash table on Redis..") Settings.llm = llm Settings.embed_model = embed_model @@ -345,9 +366,9 @@ def build_automerging_index_redis( key=key, val=value ) - logging.info("Hash table is now on Redis.") + logging.info("[vector_database.py] Hash table is now on Redis.") - logging.info("Creating 
index...") + logging.info("[vector_database.py] Creating index...") nodes = Settings.node_parser.get_nodes_from_documents(documents) leaf_nodes = get_leaf_nodes(nodes) @@ -368,7 +389,7 @@ def build_automerging_index_redis( leaf_nodes, storage_context=storage_context ) - logging.info("Created vector index successfully and stored on Redis.") + logging.info("[vector_database.py] Created vector index successfully and stored on Redis.") return automerging_index @@ -378,16 +399,16 @@ def load_url_hash_table( ) -> dict: if s3_bucket_name: - logging.info("Getting URLs hash table from S3 bucket...") + logging.info("[vector_database.py] Getting URLs hash table from S3 bucket...") with FS.open(f"{s3_bucket_name}/hash_table.json", "r") as f: hash_table = json.load(f) else: - logging.info("Getting URLs hash table from local...") + logging.info("[vector_database.py] Getting URLs hash table from local...") with open("hash_table.json", "r") as f: hash_table = json.load(f) - logging.info("Loaded URLs hash table successfully.") + logging.info("[vector_database.py] Loaded URLs hash table successfully.") return hash_table @@ -407,7 +428,7 @@ def load_automerging_index_s3( chunk_overlap=chunk_overlap ) - logging.info(f"{save_dir} directory exists! Loading vector index...") + logging.info(f"[vector_database.py] {save_dir} directory exists! Loading vector index...") automerging_index = load_index_from_storage( StorageContext.from_defaults( persist_dir = f"{s3_bucket_name}/{save_dir}", @@ -415,7 +436,7 @@ def load_automerging_index_s3( ) ) - logging.info("Loaded vector index successfully!") + logging.info("[vector_database.py] Loaded vector index successfully!") return automerging_index @@ -435,7 +456,7 @@ def load_automerging_index( chunk_overlap=chunk_overlap ) - logging.info(f"{save_dir} directory exists! Loading vector index...") + logging.info(f"[vector_database.py] {save_dir} directory exists! 
Loading vector index...") automerging_index = load_index_from_storage( StorageContext.from_defaults( @@ -443,7 +464,7 @@ def load_automerging_index( ) ) - logging.info("Loaded vector index successfully!") + logging.info("[vector_database.py] Loaded vector index successfully!") return automerging_index @@ -468,7 +489,7 @@ def load_automerging_index_redis( schema=REDIS_SCHEMA ) - logging.info("Loading vector index from Redis...") + logging.info("[vector_database.py] Loading vector index from Redis...") storage_context = StorageContext.from_defaults( vector_store=redis_vector_store, docstore=REDIS_DOCSTORE, From b251837705179b43db1e3bef52bed9bcd2b382cb Mon Sep 17 00:00:00 2001 From: Devis Battisti Date: Thu, 10 Oct 2024 10:09:31 +0200 Subject: [PATCH 07/22] chore(chatbot): llamaindex index id --- apps/chatbot/src/modules/vector_database.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/chatbot/src/modules/vector_database.py b/apps/chatbot/src/modules/vector_database.py index f991f4d73..abbef5415 100644 --- a/apps/chatbot/src/modules/vector_database.py +++ b/apps/chatbot/src/modules/vector_database.py @@ -391,6 +391,7 @@ def build_automerging_index_redis( ) logging.info("[vector_database.py] Created vector index successfully and stored on Redis.") + automerging_index.set_index_id("1234") return automerging_index @@ -498,6 +499,7 @@ def load_automerging_index_redis( automerging_index = load_index_from_storage( storage_context=storage_context, + index_id="1234" ) return automerging_index From 4ecd4b346c74c3037b478207c84cd8038eb92dda Mon Sep 17 00:00:00 2001 From: Devis Battisti Date: Thu, 10 Oct 2024 10:35:31 +0200 Subject: [PATCH 08/22] fix(chatbot): create vector index with all docs --- apps/chatbot/src/modules/vector_database.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/apps/chatbot/src/modules/vector_database.py b/apps/chatbot/src/modules/vector_database.py index abbef5415..85eec01ee 100644 --- 
a/apps/chatbot/src/modules/vector_database.py +++ b/apps/chatbot/src/modules/vector_database.py @@ -162,8 +162,7 @@ def create_documentation( logging.info(f"[vector_database.py] create_documentation: DYNAMIC_HTML: {DYNAMIC_HTMLS}") logging.info(f"[vector_database.py] create_documentation: documentation_dir: {documentation_dir}") - # FIX: all docs - html_files = get_html_files(documentation_dir)[:10] + html_files = get_html_files(documentation_dir) logging.info(f"[vector_database.py] create_documentation: len(html_files): {len(html_files)}") dynamic_htmls = [os.path.join(documentation_dir, path) for path in DYNAMIC_HTMLS] logging.info(f"[vector_database.py] create_documentation: len(dynamic_htmls): {len(dynamic_htmls)}") From ea7d3db913de0e1bcdbcecb4e4020ceb3973161a Mon Sep 17 00:00:00 2001 From: Devis Battisti Date: Thu, 10 Oct 2024 11:08:04 +0200 Subject: [PATCH 09/22] chore(chatbot): terraform lint --- .../infrastructure/src/modules/chatbot/dynamodb.tf | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/apps/infrastructure/src/modules/chatbot/dynamodb.tf b/apps/infrastructure/src/modules/chatbot/dynamodb.tf index eb0a69512..d13542a70 100644 --- a/apps/infrastructure/src/modules/chatbot/dynamodb.tf +++ b/apps/infrastructure/src/modules/chatbot/dynamodb.tf @@ -43,17 +43,17 @@ module "dynamodb_chatbot_sessions" { }, { name = "createdAt" - type = "S" # String (for ISO 8601 timestamp) + type = "S" }, ] # GSI for query on created_at global_secondary_index { - name = "SessionsByCreatedAtIndex" - hash_key = "userId" - range_key = "createdAt" - read_capacity = 5 - write_capacity = 5 - projection_type = "ALL" + name = "SessionsByCreatedAtIndex" + hash_key = "userId" + range_key = "createdAt" + read_capacity = 5 + write_capacity = 5 + projection_type = "ALL" } } From 28695e37142d3902498412141c8be1aea1edbc8c Mon Sep 17 00:00:00 2001 From: Devis Battisti Date: Thu, 10 Oct 2024 11:34:56 +0200 Subject: [PATCH 10/22] fix(chatbot): terraform syntax 
--- .../src/modules/chatbot/dynamodb.tf | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/apps/infrastructure/src/modules/chatbot/dynamodb.tf b/apps/infrastructure/src/modules/chatbot/dynamodb.tf index d13542a70..350dbc95a 100644 --- a/apps/infrastructure/src/modules/chatbot/dynamodb.tf +++ b/apps/infrastructure/src/modules/chatbot/dynamodb.tf @@ -48,12 +48,14 @@ module "dynamodb_chatbot_sessions" { ] # GSI for query on created_at - global_secondary_index { - name = "SessionsByCreatedAtIndex" - hash_key = "userId" - range_key = "createdAt" - read_capacity = 5 - write_capacity = 5 - projection_type = "ALL" - } + global_secondary_indexes = [ + { + name = "SessionsByCreatedAtIndex" + hash_key = "userId" + range_key = "createdAt" + read_capacity = 5 + write_capacity = 5 + projection_type = "ALL" + } + ] } From 238edfd5826582dd76542d551fea5fd49e3fad80 Mon Sep 17 00:00:00 2001 From: Devis Battisti Date: Thu, 10 Oct 2024 11:55:31 +0200 Subject: [PATCH 11/22] chore(chatbot): remove dynamodb options --- apps/infrastructure/src/modules/chatbot/dynamodb.tf | 2 -- 1 file changed, 2 deletions(-) diff --git a/apps/infrastructure/src/modules/chatbot/dynamodb.tf b/apps/infrastructure/src/modules/chatbot/dynamodb.tf index 350dbc95a..a2e488d67 100644 --- a/apps/infrastructure/src/modules/chatbot/dynamodb.tf +++ b/apps/infrastructure/src/modules/chatbot/dynamodb.tf @@ -53,8 +53,6 @@ module "dynamodb_chatbot_sessions" { name = "SessionsByCreatedAtIndex" hash_key = "userId" range_key = "createdAt" - read_capacity = 5 - write_capacity = 5 projection_type = "ALL" } ] From 5f63560d533ae7b5cdafa4e3c6b95486bb71c259 Mon Sep 17 00:00:00 2001 From: Devis Battisti Date: Thu, 10 Oct 2024 12:03:54 +0200 Subject: [PATCH 12/22] chore(chatbot): from global to local secondary index --- apps/infrastructure/src/modules/chatbot/dynamodb.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/infrastructure/src/modules/chatbot/dynamodb.tf 
b/apps/infrastructure/src/modules/chatbot/dynamodb.tf index a2e488d67..2d9bf83e9 100644 --- a/apps/infrastructure/src/modules/chatbot/dynamodb.tf +++ b/apps/infrastructure/src/modules/chatbot/dynamodb.tf @@ -47,8 +47,8 @@ module "dynamodb_chatbot_sessions" { }, ] - # GSI for query on created_at - global_secondary_indexes = [ + # LSI for query on created_at + local_secondary_indexes = [ { name = "SessionsByCreatedAtIndex" hash_key = "userId" From a8f4c9ad0039eaa94491b2f0d9aaf0f181c0b13d Mon Sep 17 00:00:00 2001 From: Devis Battisti Date: Fri, 11 Oct 2024 09:28:31 +0200 Subject: [PATCH 13/22] feat(chatbot): find or create session --- apps/chatbot/src/app/main.py | 56 ++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/apps/chatbot/src/app/main.py b/apps/chatbot/src/app/main.py index cb08735a4..4b7b58928 100644 --- a/apps/chatbot/src/app/main.py +++ b/apps/chatbot/src/app/main.py @@ -6,6 +6,7 @@ import uuid import boto3 import datetime +import time import jwt from typing import Annotated from boto3.dynamodb.conditions import Key @@ -68,13 +69,13 @@ async def query_creation ( query: Query, authorization: Annotated[str | None, Header()] = None ): + now = datetime.datetime.now(datetime.UTC) userId = current_user_id(authorization) - session = find_or_create_session(userId) + session = find_or_create_session(userId, now=now) answer = chatbot.generate(query.question) - now = datetime.datetime.now(datetime.timezone.utc).isoformat() if query.queriedAt is None: - queriedAt = now + queriedAt = now.isoformat() else: queriedAt = query.queriedAt @@ -83,7 +84,7 @@ async def query_creation ( "sessionId": session['id'], "question": query.question, "answer": answer, - "createdAt": now, + "createdAt": now.isoformat(), "queriedAt": queriedAt } @@ -109,24 +110,41 @@ def current_user_id(authorization: str): return decoded['cognito:username'] -def find_or_create_session(userId: str): +def find_or_create_session(userId: str, now: 
datetime.datetime): # TODO: return if userId is None if userId is None: userId = '-' - now = datetime.datetime.now(datetime.timezone.utc).isoformat() - # TODO: calculate title - # TODO: find last session based on SESSION_MAX_DURATION_MINUTES - # TODO: if it's None, create it. - body = { - "id": '1',#f'{uuid.uuid4()}', - "title": "last session", - "userId": userId, - "createdAt": now - } + + SESSION_MAX_DURATION_DAYS = int(os.getenv('SESSION_MAX_DURATION_DAYS', '1')) + datetimeLimit = now - datetime.timedelta(SESSION_MAX_DURATION_DAYS - 1) + startOfDay = datetime.datetime.combine(datetimeLimit, datetime.time.min) + # trovare una sessione con createdAt > datetimeLimit try: - table_sessions.put_item(Item = body) + db_response = table_sessions.query( + KeyConditionExpression=Key("userId").eq(userId) & + Key('createdAt').gt(startOfDay.isoformat()), + IndexName='SessionsByCreatedAtIndex', + ScanIndexForward=False, + Limit=1 + ) except (BotoCoreError, ClientError) as e: - raise HTTPException(status_code=422, detail=f"[find_or_create_session] body: {body}, error: {e}") + raise HTTPException(status_code=422, detail=f"[find_or_create_session] userId: {userId}, error: {e}") + + items = db_response.get('Items', []) + if len(items) == 0: + body = { + "id": f'{uuid.uuid4()}', + "title": now.strftime("%Y-%m-%d"), + "userId": userId, + "createdAt": now.isoformat() + } + try: + table_sessions.put_item(Item = body) + except (BotoCoreError, ClientError) as e: + raise HTTPException(status_code=422, detail=f"[find_or_create_session] body: {body}, error: {e}") + + else: + body = items[0] return body @@ -154,7 +172,9 @@ async def sessions_fetching( try: db_response = table_sessions.query( - KeyConditionExpression=Key("userId").eq(userId) + KeyConditionExpression=Key("userId").eq(userId), + IndexName='SessionsByCreatedAtIndex', + ScanIndexForward=False ) except (BotoCoreError, ClientError) as e: raise HTTPException(status_code=422, detail=f"[sessions_fetching] userId: {userId}, error: 
{e}") From d96a9f99638cee624690d28163fce4d5b5db313e Mon Sep 17 00:00:00 2001 From: Devis Battisti Date: Fri, 11 Oct 2024 15:06:44 +0200 Subject: [PATCH 14/22] chore: remove old var --- apps/chatbot/.env.example | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/chatbot/.env.example b/apps/chatbot/.env.example index 50030275a..5a7068f52 100644 --- a/apps/chatbot/.env.example +++ b/apps/chatbot/.env.example @@ -25,6 +25,5 @@ CHB_ENGINE_SIMILARITY_CUTOFF=... CHB_ENGINE_USE_ASYNC=... CHB_ENGINE_USE_STREAMING=... CHB_QUERY_TABLE_PREFIX=chatbot-local -GOOGLE_AND_REDIS_AWS_DEFAULT_REGION=eu-west-3 CHB_REDIS_INDEX_NAME=zero CHB_DYNAMODB_URL=http://locahost:8080 From a5177df5675189c55ef63a005f3f12cde81f1914 Mon Sep 17 00:00:00 2001 From: Devis Battisti Date: Fri, 11 Oct 2024 17:07:47 +0200 Subject: [PATCH 15/22] Update apps/chatbot/docker/compose.yaml Co-authored-by: marcobottaro <39835990+marcobottaro@users.noreply.github.com> --- apps/chatbot/docker/compose.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/chatbot/docker/compose.yaml b/apps/chatbot/docker/compose.yaml index ef266468a..ecea265d3 100644 --- a/apps/chatbot/docker/compose.yaml +++ b/apps/chatbot/docker/compose.yaml @@ -43,6 +43,7 @@ services: - "8080:8080" volumes: - ..:/app + - ./build-devp/out:/app/build-devp/out command: "python src/modules/create_vector_index.py --params config/params.yaml" tty: true depends_on: From 4daccf8cae1005a7d8ac237237869f0b7868b16b Mon Sep 17 00:00:00 2001 From: Devis Battisti Date: Fri, 11 Oct 2024 17:34:30 +0200 Subject: [PATCH 16/22] chore: remove logs --- apps/chatbot/src/modules/vector_database.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/apps/chatbot/src/modules/vector_database.py b/apps/chatbot/src/modules/vector_database.py index de31621f9..4ce2454dd 100644 --- a/apps/chatbot/src/modules/vector_database.py +++ b/apps/chatbot/src/modules/vector_database.py @@ -117,18 +117,15 @@ def filter_html_files(html_files: List[str]) -> List[str]: 
pattern = re.compile(r"/v\d{1,2}.") pattern2 = re.compile(r"/\d{1,2}.") filtered_files = [file for file in html_files if not pattern.search(file) and not pattern2.search(file)] - logging.info(f"[vector_database.py] filter_html_files len(filtered_files): {len(filtered_files)}") return filtered_files def get_html_files(root_folder: str) -> List[str]: - logging.info(f"[vector_database.py] get_html_files({root_folder})") html_files = [] for root, _, files in os.walk(root_folder): for file in files: if file.endswith(".html"): html_files.append(os.path.join(root, file)) - logging.info(f"[vector_database.py] get_html_files len(html_files): {len(html_files)}") return sorted(filter_html_files(html_files)) @@ -159,14 +156,8 @@ def create_documentation( if documentation_dir[-1] != "/": documentation_dir += "/" - logging.info(f"[vector_database.py] Getting documentation from: {documentation_dir}") - logging.info(f"[vector_database.py] create_documentation: DYNAMIC_HTML: {DYNAMIC_HTMLS}") - logging.info(f"[vector_database.py] create_documentation: documentation_dir: {documentation_dir}") - html_files = get_html_files(documentation_dir) - logging.info(f"[vector_database.py] create_documentation: len(html_files): {len(html_files)}") dynamic_htmls = [os.path.join(documentation_dir, path) for path in DYNAMIC_HTMLS] - logging.info(f"[vector_database.py] create_documentation: len(dynamic_htmls): {len(dynamic_htmls)}") documents = [] hash_table = {} empty_pages = [] @@ -188,7 +179,6 @@ def create_documentation( options.add_argument('user-agent=fake-useragent') driver = webdriver.Chrome(service=service, options=options) - logging.info(f"[vector_database.py] create_documentation: driver.get({url})") driver.get(url) time.sleep(5) title, text = html2markdown(driver.page_source) From c123b5cd4017403ca971e80645633dd872113a41 Mon Sep 17 00:00:00 2001 From: Devis Battisti Date: Sun, 13 Oct 2024 16:47:30 +0200 Subject: [PATCH 17/22] fix(chatbot): compose vars --- apps/chatbot/.env.example | 3 
+-- apps/chatbot/docker/compose.yaml | 4 ++-- apps/chatbot/src/modules/vector_database.py | 1 - 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/apps/chatbot/.env.example b/apps/chatbot/.env.example index e55145757..d69c20322 100644 --- a/apps/chatbot/.env.example +++ b/apps/chatbot/.env.example @@ -21,8 +21,7 @@ CHB_MODEL_MAXTOKENS=... CHB_EMBED_MODEL_ID=... CHB_ENGINE_SIMILARITY_TOPK=... CHB_ENGINE_SIMILARITY_CUTOFF=... -CHB_ENGINE_USE_ASYNC=... +CHB_ENGINE_USE_ASYNC=True CHB_ENGINE_USE_STREAMING=... CHB_QUERY_TABLE_PREFIX=chatbot-local -CHB_REDIS_INDEX_NAME=zero CHB_DYNAMODB_URL=http://locahost:8080 diff --git a/apps/chatbot/docker/compose.yaml b/apps/chatbot/docker/compose.yaml index ecea265d3..e720460e4 100644 --- a/apps/chatbot/docker/compose.yaml +++ b/apps/chatbot/docker/compose.yaml @@ -8,7 +8,7 @@ services: volumes: - ..:/app - ./files/.aws:/root/.aws - - ./build-devp/out:/app/build-devp/out + - ../../nextjs-website/out:/app/build-devp/out depends_on: redis: condition: service_started @@ -43,7 +43,7 @@ services: - "8080:8080" volumes: - ..:/app - - ./build-devp/out:/app/build-devp/out + - ../../nextjs-website/out:/app/build-devp/out command: "python src/modules/create_vector_index.py --params config/params.yaml" tty: true depends_on: diff --git a/apps/chatbot/src/modules/vector_database.py b/apps/chatbot/src/modules/vector_database.py index 4ce2454dd..7ada5a3eb 100644 --- a/apps/chatbot/src/modules/vector_database.py +++ b/apps/chatbot/src/modules/vector_database.py @@ -272,7 +272,6 @@ def build_automerging_index_redis( automerging_index.set_index_id(INDEX_ID) logging.info("Created vector index successfully and stored on Redis.") - automerging_index.set_index_id("1234") return automerging_index From 5e07dbe24c821340cfdb2a07a1d5b35f0c2fcf66 Mon Sep 17 00:00:00 2001 From: mdciri Date: Wed, 16 Oct 2024 11:36:35 +0200 Subject: [PATCH 18/22] Update modules --- apps/chatbot/src/modules/chatbot.py | 17 +++++++++++------ 
apps/chatbot/src/modules/presidio.py | 28 ++++++++++++---------------- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/apps/chatbot/src/modules/chatbot.py b/apps/chatbot/src/modules/chatbot.py index a18796850..5b361e949 100644 --- a/apps/chatbot/src/modules/chatbot.py +++ b/apps/chatbot/src/modules/chatbot.py @@ -16,10 +16,7 @@ from src.modules.presidio import PresidioPII -AWS_S3_BUCKET = os.getenv("CHB_AWS_S3_BUCKET") -ITALIAN_THRESHOLD = 0.85 -NUM_MIN_WORDS_QUERY = 3 -NUM_MIN_REFERENCES = 1 +USE_PRESIDIO = True if os.getenv("CHB_USE_PRESIDIO", "True") == "True" else False RESPONSE_TYPE = Union[ Response, StreamingResponse, AsyncStreamingResponse, PydanticResponse ] @@ -36,7 +33,9 @@ def __init__( self.params = params self.prompts = prompts - self.pii = PresidioPII(config=params["config_presidio"]) + if USE_PRESIDIO: + self.pii = PresidioPII(config=params["config_presidio"]) + self.model = get_llm() self.embed_model = get_embed_model() self.index = load_automerging_index_redis( @@ -111,6 +110,9 @@ def _get_response_str(self, engine_response: RESPONSE_TYPE) -> str: """ else: response_str = self._unmask_reference(response_str, nodes) + + if "Step 2:" in response_str: + response_str = response_str.split("Step 2:")[1].strip() return response_str @@ -142,7 +144,10 @@ def _unmask_reference(self, response_str: str, nodes) -> str: def mask_pii(self, message: str) -> str: - return self.pii.mask_pii(message) + if USE_PRESIDIO: + return self.pii.mask_pii(message) + else: + return message def generate(self, query_str: str) -> str: diff --git a/apps/chatbot/src/modules/presidio.py b/apps/chatbot/src/modules/presidio.py index 7de556087..65e7f114e 100644 --- a/apps/chatbot/src/modules/presidio.py +++ b/apps/chatbot/src/modules/presidio.py @@ -12,7 +12,7 @@ # see supported entities by Presidio with their description at: https://microsoft.github.io/presidio/supported_entities/ -ENTITIES = [ +GLOBAL_ENTITIES = [ "CREDIT_CARD", "CRYPTO", "DATE_TIME", @@ -23,21 +23,16 
@@ "LOCATION", "PERSON", "PHONE_NUMBER", - "MEDICAL_LICENSE", + "MEDICAL_LICENSE" +] + +IT_ENTITIES = [ "IT_FISCAL_CODE", "IT_DRIVER_LICENSE", "IT_VAT_CODE", "IT_PASSPORT", "IT_IDENTITY_CARD", - "IT_PHYSICAL_ADDRESS", # this is a custom entity added to the analyzer registry - # "ES_NIF", - # "ES_NIE", - # "US_BANK_NUMBER", - # "US_DRIVER_LICENSE", - # "US_ITIN", - # "US_PASSPORT", - # "US_SSN", - # "UK_NHS" + "IT_PHYSICAL_ADDRESS" ] ALLOW_LIST = [ @@ -102,9 +97,10 @@ def __init__( analyzer_threshold: float = 0.4 ): self.config = config + self.languages = [item["lang_code"] for item in config["models"]] self.entity_mapping = entity_mapping self.mapping = mapping - self.entities = entities if entities else ENTITIES + self.entities = entities if entities else GLOBAL_ENTITIES self.analyzer_threshold = analyzer_threshold if isinstance(self.config, (Path, str)): @@ -117,7 +113,7 @@ def __init__( self.nlp_engine = nlp_engine self.analyzer = AnalyzerEngine( nlp_engine = self.nlp_engine, - supported_languages = ["it", "en"], # "es", "fr", "de" + supported_languages = self.languages, default_score_threshold = analyzer_threshold ) self._add_italian_physical_address_entity() @@ -136,7 +132,7 @@ def detect_language(self, text: str) -> str: detected_languages = detect_langs(text) lang_list = [] for detected_lang in detected_languages: - if detected_lang.lang in ["it", "en", "es", "fr", "de"]: + if detected_lang.lang in self.languages: lang_list.append(detected_lang.lang) if not lang_list: @@ -145,7 +141,7 @@ def detect_language(self, text: str) -> str: elif "it" in lang_list: lang = "it" else: - lang = "en" # lang_list[0].lang + lang = lang_list[0] except: logging.warning("No detected language.") lang = "it" @@ -160,7 +156,7 @@ def detect_pii(self, text: str) -> List[RecognizerResult]: results = self.analyzer.analyze( text=text, language=lang, - entities=self.entities, + entities=self.entities + IT_ENTITIES if lang == "it" else self.entities, allow_list=ALLOW_LIST ) From 
da8a41cd43e382c31bcecd45813df73c7326e0ba Mon Sep 17 00:00:00 2001 From: mdciri Date: Wed, 16 Oct 2024 11:36:54 +0200 Subject: [PATCH 19/22] Update config prompts --- apps/chatbot/config/prompts.yaml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/apps/chatbot/config/prompts.yaml b/apps/chatbot/config/prompts.yaml index daf7b3416..61b4f4f51 100644 --- a/apps/chatbot/config/prompts.yaml +++ b/apps/chatbot/config/prompts.yaml @@ -12,7 +12,6 @@ qa_prompt_str: | - the answer must be clear, non-redundant, and have not repeated sentences. - the answer must not include the query. - If your answer is based on this retrieved context, include a "Rif" section at the end of the response, listing the titles and filenames from the source nodes used. If no context is used, do not include a reference. - - the answer must be with the same language of the query. -------------------- Output Examples: Query: Cos'è il nodo dei pagamenti? @@ -38,7 +37,14 @@ qa_prompt_str: | -------------------- Task: Given the query: {query_str} - Answer the query according to the `Chatbot Policy` listed above. + + Reply to the user following these two steps: + Step 1: + Pay great attention in detail on the query's language and determine if it is formulated in Italian, English, Spanish, French, German, Greek, Croatian, or Slovenian ('yes' or 'no'). + Step 2: + If Step 1 returns 'yes': reply always in Italian, regardless of the input language, according to the `Chatbot Policy` listed above. + Otherwise: reply you cannot speak that language and ask for a new query written in an accepted language.
+ Answer: From b57d55c42619369143722978db7d31a4cecc2ed6 Mon Sep 17 00:00:00 2001 From: mdciri Date: Wed, 16 Oct 2024 11:37:11 +0200 Subject: [PATCH 20/22] Update env example --- apps/chatbot/.env.example | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/chatbot/.env.example b/apps/chatbot/.env.example index 9dbd012ac..3d9cbb938 100644 --- a/apps/chatbot/.env.example +++ b/apps/chatbot/.env.example @@ -13,6 +13,7 @@ CHB_WEBSITE_URL=... CHB_REDIS_INDEX_NAME=... CHB_LLAMAINDEX_INDEX_ID=... CHB_DOCUMENTATION_DIR=... +CHB_USE_PRESIDIO=... CHB_GOOGLE_API_KEY=... CHB_PROVIDER=... CHB_MODEL_ID=... From 278d56df6eaa80cbded42b59139a00966e19c3cc Mon Sep 17 00:00:00 2001 From: Devis Battisti Date: Wed, 16 Oct 2024 17:32:44 +0200 Subject: [PATCH 21/22] redis admin port --- apps/chatbot/docker/compose.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/chatbot/docker/compose.yaml b/apps/chatbot/docker/compose.yaml index e720460e4..349c71502 100644 --- a/apps/chatbot/docker/compose.yaml +++ b/apps/chatbot/docker/compose.yaml @@ -32,6 +32,7 @@ services: image: redis/redis-stack:7.2.0-v13 ports: - "6379:6379" + - "8001:8001" networks: - ntw From 91c782a73c77ab59299078dd9c64380dd7067b6a Mon Sep 17 00:00:00 2001 From: Devis Battisti Date: Wed, 16 Oct 2024 17:36:27 +0200 Subject: [PATCH 22/22] chore: add env example var --- apps/chatbot/.env.example | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/chatbot/.env.example b/apps/chatbot/.env.example index c7e15cf1d..cf45f3da0 100644 --- a/apps/chatbot/.env.example +++ b/apps/chatbot/.env.example @@ -26,3 +26,4 @@ CHB_ENGINE_USE_ASYNC=True CHB_ENGINE_USE_STREAMING=... CHB_QUERY_TABLE_PREFIX=chatbot-local CHB_DYNAMODB_URL=http://locahost:8080 +CHB_USE_PRESIDIO=True