Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CAI-177] - Docker reads dynamic URLs and creates the index automatically #1228

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion apps/chatbot/docker/app.local.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,15 @@ ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update && \
apt-get install -y \
curl
curl \
wget \
zip

RUN wget https://github.com/rphrp1985/selenium_support/raw/main/chrome_114_amd64.deb && \
apt-get install -y ./chrome_114_amd64.deb && \
wget https://chromedriver.storage.googleapis.com/114.0.5735.90/chromedriver_linux64.zip && \
unzip chromedriver_linux64.zip && \
mv chromedriver /usr/bin/chromedriver
batdevis marked this conversation as resolved.
Show resolved Hide resolved

ENV PYTHONPATH=/app

Expand Down
4 changes: 2 additions & 2 deletions apps/chatbot/docker/compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ services:
condition: service_started
networks:
- ntw

dynamodb:
image: amazon/dynamodb-local:2.5.2
environment:
Expand All @@ -27,7 +27,7 @@ services:
- "8000:8000"
networks:
- ntw

redis:
image: redis/redis-stack:7.2.0-v13
ports:
Expand Down
2 changes: 2 additions & 0 deletions apps/chatbot/docker/docker-compose-build-local.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
docker compose --env-file .env -f docker/compose.yaml -p chatbot build
2 changes: 1 addition & 1 deletion apps/chatbot/docker/docker-compose-run-create_index.sh
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
#!/bin/bash
docker compose -f docker/compose.yaml -p chatbot run create_index
docker compose --env-file .env -f docker/compose.yaml -p chatbot run create_index
2 changes: 1 addition & 1 deletion apps/chatbot/docker/docker-compose-up-api.sh
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
#!/bin/bash
docker compose -f docker/compose.yaml -p chatbot up api
docker compose --env-file .env -f docker/compose.yaml -p chatbot up api
2 changes: 0 additions & 2 deletions apps/chatbot/docker/docker-run-create-index.sh

This file was deleted.

2 changes: 0 additions & 2 deletions apps/chatbot/docker/docker-run-local-bash.sh

This file was deleted.

2 changes: 0 additions & 2 deletions apps/chatbot/docker/docker-run-local.sh

This file was deleted.

7 changes: 0 additions & 7 deletions apps/chatbot/docker/docker-run.sh

This file was deleted.

33 changes: 16 additions & 17 deletions apps/chatbot/src/modules/vector_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,6 @@
from datetime import datetime
from bs4 import BeautifulSoup
from selenium import webdriver
# from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.chrome.service import Service
# from chromedriver_py import binary_path
from typing import List, Tuple

from llama_index.core import (
Expand Down Expand Up @@ -147,22 +144,25 @@ def create_documentation(
hash_table = {}
empty_pages = []

# full_text = ""

driver_exe_path = "/usr/bin/chromedriver"
if os.path.exists(driver_exe_path):
driver_service = webdriver.ChromeService(executable_path=driver_exe_path)
else:
driver_service = None
driver_options = webdriver.ChromeOptions()
driver_options.add_argument('--headless')
driver_options.add_argument('--disable-gpu')
driver_options.add_argument('--no-sandbox')
driver_options.add_argument('--disable-dev-shm-usage')

for file in tqdm.tqdm(html_files, total=len(html_files), desc="Extracting HTML"):

# FIX: resolve webdriver.Chrome "self.assert_process_still_running" error in docker
if file in dynamic_htmls or "/webinars/" in file or "/api/" in file:
# if 6 == 9:
url = file.replace(documentation_dir, f"{website_url}/").replace(".html", "")

# svc = webdriver.ChromeService(executable_path=binary_path)
# service = Service(executable_path=binary_path)
# options = webdriver.ChromeOptions()
# options.add_argument('--headless=new')
# options.add_argument('--no-sandbox')
# options.add_argument('user-agent=fake-useragent')
driver = webdriver.Chrome() #(service=service, options=options)
driver = webdriver.Chrome(
options=driver_options,
service=driver_service
)

driver.get(url)
time.sleep(5)
Expand All @@ -172,7 +172,6 @@ def create_documentation(
title, text = html2markdown(open(file))

if text is None or text == "" or text == "None" or text=="404\n\n#### Pagina non trovata\n\nLa pagina che stai cercando non esiste":
# print(file)
empty_pages.append(file)

else:
Expand Down Expand Up @@ -231,7 +230,7 @@ def build_automerging_index_redis(
key=key,
val=value
)
logging.info(f"[vector_database.py - build_automerging_index_redis] hash_table_{INDEX_ID} is now on Redis.")
logging.info(f"[vector_database.py - build_automerging_index_redis] hash_table_{NEW_INDEX_ID} is now on Redis.")

logging.info(f"[vector_database.py - build_automerging_index_redis] Creating index {NEW_INDEX_ID} ...")
nodes = Settings.node_parser.get_nodes_from_documents(documents)
Expand Down
Loading