diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 6a5e1f60..b59c7de4 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -30,6 +30,14 @@ jobs: ports: - 9200:9200 steps: + - name: Remove irrelevant software # to free up required disk space + run: | + df -h + sudo rm -rf /opt/ghc + sudo rm -rf /opt/hostedtoolcache/CodeQL + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + df -h - name: Checkout uses: actions/checkout@v4 - name: Setup python @@ -37,7 +45,7 @@ jobs: with: python-version: '3.10' - name: Setup nbtest - run: make nbtest + run: make install-nbtest - name: Warm up continue-on-error: true run: sleep 30 && PATCH_ES=1 ELASTIC_CLOUD_ID=foo ELASTIC_API_KEY=bar bin/nbtest notebooks/search/00-quick-start.ipynb diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 97e0e1b3..2856f309 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,3 +13,7 @@ repos: # generic [...]_PASSWORD=[...] pattern - --additional-pattern - '_PASSWORD=[0-9a-zA-Z_-]{10}' +- repo: https://github.com/ambv/black + rev: 24.1.1 # Use latest tag on GitHub + hooks: + - id: black-jupyter diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 45fbda35..2f0a10fb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -5,24 +5,34 @@ If you would like to contribute new example apps to the `elasticsearch-labs` rep ## Before you start Prior to opening a pull request, please: -- Create an issue to [discuss the scope of your proposal](https://github.com/elastic/elasticsearch-labs/issues). We are happy to provide guidance to make for a pleasant contribution experience. -- Sign the [Contributor License Agreement](https://www.elastic.co/contributor-agreement/). We are not asking you to assign copyright to us, but to give us the right to distribute your code without restriction. We ask this of all contributors in order to assure our users of the origin and continuing existence of the code. 
You only need to sign the CLA once. +1. Create an issue to [discuss the scope of your proposal](https://github.com/elastic/elasticsearch-labs/issues). We are happy to provide guidance to make for a pleasant contribution experience. +2. Sign the [Contributor License Agreement](https://www.elastic.co/contributor-agreement/). We are not asking you to assign copyright to us, but to give us the right to distribute your code without restriction. We ask this of all contributors in order to assure our users of the origin and continuing existence of the code. You only need to sign the CLA once. +3. Install pre-commit... ### Pre-commit hook This repository has a pre-commit hook that ensures that your contributed code follows our guidelines. It is strongly recommended that you install the pre-commit hook on your locally cloned repository, as that will allow you to check the correctness of your submission without having to wait for our continuous integration build. To install the pre-commit hook, clone this repository and then run the following command from its top-level directory: ```bash -make pre-commit +make install ``` If you do not have access to the `make` utility, you can also install the pre-commit hook with Python: ```bash python -m venv .venv +.venv/bin/pip install -qqq -r requirements-dev.txt .venv/bin/pre-commit install ``` +Now it can happen that you get an error when you try to commit, for example if your code or your notebook was not formatted with the [black formatter](https://github.com/psf/black). In this case, please run this command from the repo root: + +```bash +make pre-commit +``` + +If you now include the changed files in your commit, it should succeed. + ## General instruction - If the notebook or code sample requires signing up a Elastic cloud instance, make sure to add appropriate `utm_source` and `utm_content` in the cloud registration url. 
For example, the Elastic cloud sign up url for the Python notebooks should have `utm_source=github&utm_content=elasticsearch-labs-notebook` and code examples should have `utm_source=github&utm_content=elasticsearch-labs-samples`. diff --git a/Makefile b/Makefile index 55ffe7c2..cfcba5f6 100644 --- a/Makefile +++ b/Makefile @@ -1,20 +1,24 @@ # this is the list of notebooks that are integrated with the testing framework NOTEBOOKS = $(shell bin/find-notebooks-to-test.sh) +VENV = .venv -.PHONY: install pre-commit nbtest test notebooks +.PHONY: install install-pre-commit install-nbtest test notebooks -test: nbtest notebooks +test: install-nbtest notebooks notebooks: bin/nbtest $(NOTEBOOKS) -install: pre-commit nbtest +pre-commit: install-pre-commit + $(VENV)/bin/pre-commit run --all-files -pre-commit: - python -m venv .venv - .venv/bin/pip install -qqq -r requirements-dev.txt - .venv/bin/pre-commit install +install: install-pre-commit install-nbtest -nbtest: - python3 -m venv .venv - .venv/bin/pip install -qqq elastic-nbtest +install-pre-commit: + python -m venv $(VENV) + $(VENV)/bin/pip install -qqq -r requirements-dev.txt + $(VENV)/bin/pre-commit install + +install-nbtest: + python3 -m venv $(VENV) + $(VENV)/bin/pip install -qqq elastic-nbtest diff --git a/bin/mocks/elasticsearch.py b/bin/mocks/elasticsearch.py index 684996d0..3a2a9fd7 100644 --- a/bin/mocks/elasticsearch.py +++ b/bin/mocks/elasticsearch.py @@ -8,30 +8,30 @@ def patch_elasticsearch(): # remove the path entry that refers to this directory for path in sys.path: - if not path.startswith('/'): + if not path.startswith("/"): path = os.path.join(os.getcwd(), path) - if __file__ == os.path.join(path, 'elasticsearch.py'): + if __file__ == os.path.join(path, "elasticsearch.py"): sys.path.remove(path) break # remove this module, and import the real one instead - del sys.modules['elasticsearch'] + del sys.modules["elasticsearch"] import elasticsearch # restore the import path sys.path = saved_path - # preserve 
the original Elasticsearch.__init__ method + # preserve the original Elasticsearch.__init__ method orig_es_init = elasticsearch.Elasticsearch.__init__ # patched version of Elasticsearch.__init__ that connects to self-hosted # regardless of connection arguments given def patched_es_init(self, *args, **kwargs): - if 'cloud_id' in kwargs: - assert kwargs['cloud_id'] == 'foo' - if 'api_key' in kwargs: - assert kwargs['api_key'] == 'bar' - return orig_es_init(self, 'http://localhost:9200') + if "cloud_id" in kwargs: + assert kwargs["cloud_id"] == "foo" + if "api_key" in kwargs: + assert kwargs["api_key"] == "bar" + return orig_es_init(self, "http://localhost:9200", timeout=60) # patch Elasticsearch.__init__ elasticsearch.Elasticsearch.__init__ = patched_es_init diff --git a/bin/nbtest b/bin/nbtest index 21a622ad..39683994 100755 --- a/bin/nbtest +++ b/bin/nbtest @@ -2,7 +2,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) if [[ ! -f $SCRIPT_DIR/../.venv/bin/nbtest ]]; then - make nbtest + make install-nbtest fi if [[ "$PATCH_ES" != "" ]]; then diff --git a/example-apps/chatbot-rag-app/api/chat.py b/example-apps/chatbot-rag-app/api/chat.py index 87db3c32..8509b4bd 100644 --- a/example-apps/chatbot-rag-app/api/chat.py +++ b/example-apps/chatbot-rag-app/api/chat.py @@ -36,31 +36,39 @@ def ask_question(question, session_id): if len(chat_history.messages) > 0: # create a condensed question condense_question_prompt = render_template( - 'condense_question_prompt.txt', question=question, - chat_history=chat_history.messages) + "condense_question_prompt.txt", + question=question, + chat_history=chat_history.messages, + ) condensed_question = get_llm().invoke(condense_question_prompt).content else: condensed_question = question - current_app.logger.debug('Condensed question: %s', condensed_question) - current_app.logger.debug('Question: %s', question) + current_app.logger.debug("Condensed question: %s", condensed_question) + 
current_app.logger.debug("Question: %s", question) docs = store.as_retriever().invoke(condensed_question) for doc in docs: - doc_source = {**doc.metadata, 'page_content': doc.page_content} - current_app.logger.debug('Retrieved document passage from: %s', doc.metadata['name']) - yield f'data: {SOURCE_TAG} {json.dumps(doc_source)}\n\n' + doc_source = {**doc.metadata, "page_content": doc.page_content} + current_app.logger.debug( + "Retrieved document passage from: %s", doc.metadata["name"] + ) + yield f"data: {SOURCE_TAG} {json.dumps(doc_source)}\n\n" - qa_prompt = render_template('rag_prompt.txt', question=question, docs=docs, - chat_history=chat_history.messages) + qa_prompt = render_template( + "rag_prompt.txt", + question=question, + docs=docs, + chat_history=chat_history.messages, + ) - answer = '' + answer = "" for chunk in get_llm().stream(qa_prompt): - yield f'data: {chunk.content}\n\n' + yield f"data: {chunk.content}\n\n" answer += chunk.content yield f"data: {DONE_TAG}\n\n" - current_app.logger.debug('Answer: %s', answer) + current_app.logger.debug("Answer: %s", answer) chat_history.add_user_message(question) chat_history.add_ai_message(answer) diff --git a/example-apps/chatbot-rag-app/api/llm_integrations.py b/example-apps/chatbot-rag-app/api/llm_integrations.py index 8fe9d2ac..8c20fe27 100644 --- a/example-apps/chatbot-rag-app/api/llm_integrations.py +++ b/example-apps/chatbot-rag-app/api/llm_integrations.py @@ -5,37 +5,54 @@ LLM_TYPE = os.getenv("LLM_TYPE", "openai") + def init_openai_chat(temperature): OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") - return ChatOpenAI(openai_api_key=OPENAI_API_KEY, streaming=True, temperature=temperature) + return ChatOpenAI( + openai_api_key=OPENAI_API_KEY, streaming=True, temperature=temperature + ) + + def init_vertex_chat(temperature): VERTEX_PROJECT_ID = os.getenv("VERTEX_PROJECT_ID") VERTEX_REGION = os.getenv("VERTEX_REGION", "us-central1") vertexai.init(project=VERTEX_PROJECT_ID, location=VERTEX_REGION) return 
ChatVertexAI(streaming=True, temperature=temperature) + + def init_azure_chat(temperature): - OPENAI_VERSION=os.getenv("OPENAI_VERSION", "2023-05-15") - BASE_URL=os.getenv("OPENAI_BASE_URL") - OPENAI_API_KEY=os.getenv("OPENAI_API_KEY") - OPENAI_ENGINE=os.getenv("OPENAI_ENGINE") + OPENAI_VERSION = os.getenv("OPENAI_VERSION", "2023-05-15") + BASE_URL = os.getenv("OPENAI_BASE_URL") + OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") + OPENAI_ENGINE = os.getenv("OPENAI_ENGINE") return AzureChatOpenAI( deployment_name=OPENAI_ENGINE, openai_api_base=BASE_URL, openai_api_version=OPENAI_VERSION, openai_api_key=OPENAI_API_KEY, streaming=True, - temperature=temperature) + temperature=temperature, + ) + + def init_bedrock(temperature): - AWS_ACCESS_KEY=os.getenv("AWS_ACCESS_KEY") - AWS_SECRET_KEY=os.getenv("AWS_SECRET_KEY") - AWS_REGION=os.getenv("AWS_REGION") - AWS_MODEL_ID=os.getenv("AWS_MODEL_ID", "anthropic.claude-v2") - BEDROCK_CLIENT=boto3.client(service_name="bedrock-runtime", region_name=AWS_REGION, aws_access_key_id=AWS_ACCESS_KEY, aws_secret_access_key=AWS_SECRET_KEY) + AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY") + AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY") + AWS_REGION = os.getenv("AWS_REGION") + AWS_MODEL_ID = os.getenv("AWS_MODEL_ID", "anthropic.claude-v2") + BEDROCK_CLIENT = boto3.client( + service_name="bedrock-runtime", + region_name=AWS_REGION, + aws_access_key_id=AWS_ACCESS_KEY, + aws_secret_access_key=AWS_SECRET_KEY, + ) return BedrockChat( client=BEDROCK_CLIENT, model_id=AWS_MODEL_ID, streaming=True, - model_kwargs={"temperature":temperature}) + model_kwargs={"temperature": temperature}, + ) + MAP_LLM_TYPE_TO_CHAT_MODEL = { "azure": init_azure_chat, @@ -44,8 +61,13 @@ def init_bedrock(temperature): "vertex": init_vertex_chat, } + def get_llm(temperature=0): if not LLM_TYPE in MAP_LLM_TYPE_TO_CHAT_MODEL: - raise Exception("LLM type not found. 
Please set LLM_TYPE to one of: " + ", ".join(MAP_LLM_TYPE_TO_CHAT_MODEL.keys()) + ".") + raise Exception( + "LLM type not found. Please set LLM_TYPE to one of: " + + ", ".join(MAP_LLM_TYPE_TO_CHAT_MODEL.keys()) + + "." + ) return MAP_LLM_TYPE_TO_CHAT_MODEL[LLM_TYPE](temperature=temperature) diff --git a/example-apps/chatbot-rag-app/data/index_data.py b/example-apps/chatbot-rag-app/data/index_data.py index f0407b16..eeef7405 100644 --- a/example-apps/chatbot-rag-app/data/index_data.py +++ b/example-apps/chatbot-rag-app/data/index_data.py @@ -61,14 +61,16 @@ def main(): print(f"Loading data from ${FILE}") - metadata_keys = ['name', 'summary', 'url', 'category', 'updated_at'] + metadata_keys = ["name", "summary", "url", "category", "updated_at"] workplace_docs = [] - with open(FILE, 'rt') as f: + with open(FILE, "rt") as f: for doc in json.loads(f.read()): - workplace_docs.append(Document( - page_content=doc['content'], - metadata={k: doc.get(k) for k in metadata_keys} - )) + workplace_docs.append( + Document( + page_content=doc["content"], + metadata={k: doc.get(k) for k in metadata_keys}, + ) + ) print(f"Loaded {len(workplace_docs)} documents") @@ -92,7 +94,7 @@ def main(): index_name=INDEX, strategy=ElasticsearchStore.SparseVectorRetrievalStrategy(model_id=ELSER_MODEL), bulk_kwargs={ - 'request_timeout': 60, + "request_timeout": 60, }, ) diff --git a/example-apps/internal-knowledge-search/api/app.py b/example-apps/internal-knowledge-search/api/app.py index ade85497..bddcec02 100644 --- a/example-apps/internal-knowledge-search/api/app.py +++ b/example-apps/internal-knowledge-search/api/app.py @@ -10,10 +10,8 @@ def get_identities_index(search_app_name): - search_app = elasticsearch_client.search_application.get( - name=search_app_name) - identities_indices = elasticsearch_client.indices.get( - index=".search-acl-filter*") + search_app = elasticsearch_client.search_application.get(name=search_app_name) + identities_indices = 
elasticsearch_client.indices.get(index=".search-acl-filter*") secured_index = [ app_index for app_index in search_app["indices"] @@ -36,7 +34,8 @@ def api_index(): @app.route("/api/default_settings", methods=["GET"]) def default_settings(): return { - "elasticsearch_endpoint": os.getenv("ELASTICSEARCH_URL") or "http://localhost:9200" + "elasticsearch_endpoint": os.getenv("ELASTICSEARCH_URL") + or "http://localhost:9200" } @@ -44,11 +43,13 @@ def default_settings(): def search(text): response = requests.request( method="POST", - url=os.getenv("ELASTICSEARCH_URL") + '/' + text, + url=os.getenv("ELASTICSEARCH_URL") + "/" + text, data=request.get_data(), allow_redirects=False, - headers={"Authorization": request.headers.get( - "Authorization"), "Content-Type": "application/json"} + headers={ + "Authorization": request.headers.get("Authorization"), + "Content-Type": "application/json", + }, ) return response.content @@ -59,8 +60,7 @@ def personas(): try: search_app_name = request.args.get("app_name") identities_index = get_identities_index(search_app_name) - response = elasticsearch_client.search( - index=identities_index, size=1000) + response = elasticsearch_client.search(index=identities_index, size=1000) hits = response["hits"]["hits"] personas = [x["_id"] for x in hits] personas.append("admin") @@ -77,9 +77,8 @@ def personas(): def indices(): try: search_app_name = request.args.get("app_name") - search_app = elasticsearch_client.search_application.get( - name=search_app_name) - return search_app['indices'] + search_app = elasticsearch_client.search_application.get(name=search_app_name) + return search_app["indices"] except Exception as e: current_app.logger.warn( @@ -118,8 +117,7 @@ def api_key(): if persona == "admin": role_descriptor = default_role_descriptor else: - identity = elasticsearch_client.get( - index=identities_index, id=persona) + identity = elasticsearch_client.get(index=identities_index, id=persona) permissions = 
identity["_source"]["query"]["template"]["params"][ "access_control" ] @@ -161,12 +159,14 @@ def api_key(): } } api_key = elasticsearch_client.security.create_api_key( - name=search_app_name+"-internal-knowledge-search-example-"+persona, expiration="1h", role_descriptors=role_descriptor) - return {"api_key": api_key['encoded']} + name=search_app_name + "-internal-knowledge-search-example-" + persona, + expiration="1h", + role_descriptors=role_descriptor, + ) + return {"api_key": api_key["encoded"]} except Exception as e: - current_app.logger.warn( - "Encountered error %s while fetching api key", e) + current_app.logger.warn("Encountered error %s while fetching api key", e) raise e diff --git a/example-apps/internal-knowledge-search/api/elasticsearch_client.py b/example-apps/internal-knowledge-search/api/elasticsearch_client.py index 8a985b20..80d73d71 100644 --- a/example-apps/internal-knowledge-search/api/elasticsearch_client.py +++ b/example-apps/internal-knowledge-search/api/elasticsearch_client.py @@ -31,8 +31,7 @@ ) elif ELASTIC_USERNAME and ELASTIC_PASSWORD: elasticsearch_client = Elasticsearch( - basic_auth=(ELASTIC_USERNAME, ELASTIC_PASSWORD), - cloud_id=ELASTIC_CLOUD_ID + basic_auth=(ELASTIC_USERNAME, ELASTIC_PASSWORD), cloud_id=ELASTIC_CLOUD_ID ) else: raise ValueError( diff --git a/example-apps/relevance-workbench/app-api/app.py b/example-apps/relevance-workbench/app-api/app.py index 7d598439..0ba4044c 100644 --- a/example-apps/relevance-workbench/app-api/app.py +++ b/example-apps/relevance-workbench/app-api/app.py @@ -1,47 +1,54 @@ from flask import Flask, request, jsonify import os -from elasticsearch import Elasticsearch +from elasticsearch import Elasticsearch CLOUD_ID = os.environ["CLOUD_ID"] -ES_USER = os.environ['ELASTICSEARCH_USERNAME'] -ES_PASSWORD = os.environ['ELASTICSEARCH_PASSWORD'] +ES_USER = os.environ["ELASTICSEARCH_USERNAME"] +ES_PASSWORD = os.environ["ELASTICSEARCH_PASSWORD"] datasets = { "movies": { "id": "movies", "label": "Movies", 
"index": "search-movies", - "search_fields": ["title", "overview", "keywords"], - "elser_search_fields": ["ml.inference.overview_expanded.predicted_value", "ml.inference.title_expanded.predicted_value^0.5"], + "search_fields": ["title", "overview", "keywords"], + "elser_search_fields": [ + "ml.inference.overview_expanded.predicted_value", + "ml.inference.title_expanded.predicted_value^0.5", + ], "result_fields": ["title", "overview"], - "mapping_fields": {"text": "overview", "title": "title"} + "mapping_fields": {"text": "overview", "title": "title"}, } } app = Flask(__name__) + @app.route("/api/search/") def route_api_search(index): """ Execute the search """ [query, rrf, type, k, datasetId] = [ - request.args.get('q'), - request.args.get('rrf', default=False, type=lambda v: v.lower() == 'true'), - request.args.get('type', default='bm25'), - request.args.get('k', default=0), - request.args.get('dataset', default='movies') + request.args.get("q"), + request.args.get("rrf", default=False, type=lambda v: v.lower() == "true"), + request.args.get("type", default="bm25"), + request.args.get("k", default=0), + request.args.get("dataset", default="movies"), ] - if type=='elser': - search_result = run_semantic_search(query, index, **{ 'rrf': rrf, 'k': k, 'dataset': datasetId }) - elif type=='bm25': - search_result = run_full_text_search(query, index, **{ 'dataset': datasetId }) - transformed_search_result = transform_search_response(search_result, datasets[datasetId]['mapping_fields']) - return jsonify(response=transformed_search_result) - + if type == "elser": + search_result = run_semantic_search( + query, index, **{"rrf": rrf, "k": k, "dataset": datasetId} + ) + elif type == "bm25": + search_result = run_full_text_search(query, index, **{"dataset": datasetId}) + transformed_search_result = transform_search_response( + search_result, datasets[datasetId]["mapping_fields"] + ) + return jsonify(response=transformed_search_result) -@app.route("/api/datasets", 
methods=['GET']) +@app.route("/api/datasets", methods=["GET"]) def route_api_datasets(): """ Return the available datasets @@ -56,193 +63,174 @@ def resource_not_found(e): """ return jsonify(error=str(e)), 404 -def get_text_expansion_request_body(query, size = 10, **options): + +def get_text_expansion_request_body(query, size=10, **options): """ Generates an ES text expansion search request. """ - fields = datasets[options['dataset']]['elser_search_fields'] - result_fields = datasets[options['dataset']]['result_fields'] + fields = datasets[options["dataset"]]["elser_search_fields"] + result_fields = datasets[options["dataset"]]["result_fields"] text_expansions = [] boost = 1 - + for field in fields: split_field_descriptor = field.split("^") - if len(split_field_descriptor) == 2: + if len(split_field_descriptor) == 2: boost = split_field_descriptor[1] field = split_field_descriptor[0] te = {"text_expansion": {}} - te['text_expansion'][field] = { + te["text_expansion"][field] = { "model_text": query, "model_id": ".elser_model_1", - "boost": boost - } + "boost": boost, + } text_expansions.append(te) return { - '_source': False, - 'fields': result_fields, - 'size': size, - 'query': { - "bool": { - "should": text_expansions - } - } + "_source": False, + "fields": result_fields, + "size": size, + "query": {"bool": {"should": text_expansions}}, } -def get_text_expansion_request_body(query, size = 10, **options): + +def get_text_expansion_request_body(query, size=10, **options): """ Generates an ES text expansion search request. 
""" - fields = datasets[options['dataset']]['elser_search_fields'] - result_fields = datasets[options['dataset']]['result_fields'] + fields = datasets[options["dataset"]]["elser_search_fields"] + result_fields = datasets[options["dataset"]]["result_fields"] text_expansions = [] boost = 1 - + for field in fields: split_field_descriptor = field.split("^") - if len(split_field_descriptor) == 2: + if len(split_field_descriptor) == 2: boost = split_field_descriptor[1] field = split_field_descriptor[0] te = {"text_expansion": {}} - te['text_expansion'][field] = { + te["text_expansion"][field] = { "model_text": query, "model_id": ".elser_model_1", - "boost": boost - } + "boost": boost, + } text_expansions.append(te) return { - '_source': False, - 'fields': result_fields, - 'size': size, - 'query': { - "bool": { - "should": text_expansions - } - } + "_source": False, + "fields": result_fields, + "size": size, + "query": {"bool": {"should": text_expansions}}, } -def get_text_search_request_body(query, size = 10, **options): + +def get_text_search_request_body(query, size=10, **options): """ Generates an ES full text search request. 
""" - fields = datasets[options['dataset']]['result_fields'] - search_fields = datasets[options['dataset']]['search_fields'] + fields = datasets[options["dataset"]]["result_fields"] + search_fields = datasets[options["dataset"]]["search_fields"] return { - '_source': False, - 'fields': fields, - 'size': size, - 'query': { - "multi_match" : { - "query": query, - "fields": search_fields - } - } - } + "_source": False, + "fields": fields, + "size": size, + "query": {"multi_match": {"query": query, "fields": search_fields}}, + } + -def get_hybrid_search_rrf_request_body(query, size = 10, **options): +def get_hybrid_search_rrf_request_body(query, size=10, **options): """ Generates an ES hybrid search with RRF """ - fields = datasets[options['dataset']]['elser_search_fields'] - result_fields = datasets[options['dataset']]['result_fields'] - search_fields = datasets[options['dataset']]['search_fields'] + fields = datasets[options["dataset"]]["elser_search_fields"] + result_fields = datasets[options["dataset"]]["result_fields"] + search_fields = datasets[options["dataset"]]["search_fields"] text_expansions = [] boost = 1 - + for field in fields: split_field_descriptor = field.split("^") - if len(split_field_descriptor) == 2: + if len(split_field_descriptor) == 2: boost = split_field_descriptor[1] field = split_field_descriptor[0] te = {"text_expansion": {}} - te['text_expansion'][field] = { + te["text_expansion"][field] = { "model_text": query, "model_id": ".elser_model_1", - "boost": boost - } + "boost": boost, + } text_expansions.append(te) return { - '_source': False, - 'fields': result_fields, - 'size': size, - "rank": { - "rrf": { - "window_size": 10, - "rank_constant": 2 - } - }, - 'sub_searches': [ - { - 'query': { - "bool": { - "should": text_expansions - } - } - }, - { - 'query': { - "multi_match" : { - "query": query, - "fields": search_fields - } - } - }] - } - - + "_source": False, + "fields": result_fields, + "size": size, + "rank": {"rrf": {"window_size": 10, 
"rank_constant": 2}}, + "sub_searches": [ + {"query": {"bool": {"should": text_expansions}}}, + {"query": {"multi_match": {"query": query, "fields": search_fields}}}, + ], + } + + def execute_search_request(index, body): """ Executes an ES search request and returns the JSON response. """ - es = Elasticsearch( - cloud_id=CLOUD_ID, - basic_auth=(ES_USER,ES_PASSWORD) + es = Elasticsearch(cloud_id=CLOUD_ID, basic_auth=(ES_USER, ES_PASSWORD)) + response = es.search( + index=index, + query=body["query"], + fields=body["fields"], + size=body["size"], + source=body["_source"], ) - response = es.search(index=index,query=body["query"], fields=body["fields"], size=body["size"], source=body["_source"]) return response + def execute_search_request_using_raw_dsl(index, body): """ Executes an ES search request using the request library and returns the JSON response. """ - es = Elasticsearch( - cloud_id=CLOUD_ID, - basic_auth=(ES_USER,ES_PASSWORD) + es = Elasticsearch(cloud_id=CLOUD_ID, basic_auth=(ES_USER, ES_PASSWORD)) + response = es.perform_request( + "POST", + f"/{index}/_search", + headers={"content-type": "application/json", "accept": "application/json"}, + body=body, ) - response = es.perform_request("POST", f"/{index}/_search", headers={"content-type": "application/json", "accept": "application/json"}, body=body) return response + def run_full_text_search(query, index, **options): """ Runs a full text search on the given index using the passed query. """ - if query is None or query.strip() == '': - raise Exception('Query cannot be empty') + if query is None or query.strip() == "": + raise Exception("Query cannot be empty") body = get_text_search_request_body(query, **options) response = execute_search_request(index, body) - return response['hits']['hits'] + return response["hits"]["hits"] def run_semantic_search(query, index, **options): """ Runs a semantic search of the provided query on the target index, and reranks the KNN and BM25 results. 
""" - if options.get('rrf') == True: + if options.get("rrf") == True: body = get_hybrid_search_rrf_request_body(query, **options) # Execute the request using the raw DSL to avoid the ES Python client since sub_searches query are not supported yet response_json = execute_search_request_using_raw_dsl(index, body) - else: + else: body = get_text_expansion_request_body(query, **options) print(body) response_json = execute_search_request(index, body) - return response_json['hits']['hits'] + return response_json["hits"]["hits"] def find_id_index(id: int, hits: list): @@ -251,16 +239,16 @@ def find_id_index(id: int, hits: list): """ for i, v in enumerate(hits): - if v['_id'] == id: + if v["_id"] == id: return i + 1 return 0 + def transform_search_response(searchResults, mappingFields): for hit in searchResults: - fields = hit['fields'] - hit['fields'] = { - 'text': fields[mappingFields['text']], - 'title': fields[mappingFields['title']] + fields = hit["fields"] + hit["fields"] = { + "text": fields[mappingFields["text"]], + "title": fields[mappingFields["title"]], } return searchResults - diff --git a/example-apps/relevance-workbench/data/index-data.py b/example-apps/relevance-workbench/data/index-data.py index dd26cf24..892e5467 100644 --- a/example-apps/relevance-workbench/data/index-data.py +++ b/example-apps/relevance-workbench/data/index-data.py @@ -1,4 +1,3 @@ - from elasticsearch import Elasticsearch, helpers import argparse, os, json import gzip @@ -6,35 +5,41 @@ parser = argparse.ArgumentParser() # required args -parser.add_argument('--data_folder', dest='data_folder', - required=False, default='./data') -parser.add_argument('--es_user', dest='es_user', - required=False, default='elastic') -parser.add_argument('--es_password', dest='es_password', required=True) -parser.add_argument('--cloud_id', dest='cloud_id', required=True) -parser.add_argument('--index_name', dest='index_name', required=False, default='search-movies') -parser.add_argument('--gzip_file', 
dest='gzip_file', required=False, default='movies-sample.json.gz') +parser.add_argument( + "--data_folder", dest="data_folder", required=False, default="./data" +) +parser.add_argument("--es_user", dest="es_user", required=False, default="elastic") +parser.add_argument("--es_password", dest="es_password", required=True) +parser.add_argument("--cloud_id", dest="cloud_id", required=True) +parser.add_argument( + "--index_name", dest="index_name", required=False, default="search-movies" +) +parser.add_argument( + "--gzip_file", dest="gzip_file", required=False, default="movies-sample.json.gz" +) args = parser.parse_args() + def data_generator(file_json, index, pipeline): for doc in file_json: - doc['_run_ml_inference'] = True + doc["_run_ml_inference"] = True yield { "_index": index, - 'pipeline': pipeline, + "pipeline": pipeline, "_source": doc, } + print("Init Elasticsearch client") es = Elasticsearch( cloud_id=args.cloud_id, basic_auth=(args.es_user, args.es_password), - request_timeout=600 + request_timeout=600, ) print("Indexing movies data, this might take a while...") -file = gzip.open(args.gzip_file, 'r') +file = gzip.open(args.gzip_file, "r") json_bytes = file.read() json_str = json_bytes.decode("utf-8") file_json = json.loads(json_str) @@ -43,21 +48,22 @@ def data_generator(file_json, index, pipeline): success_count = 0 -for ok, info in helpers.streaming_bulk(client=es, actions=data_generator(file_json, args.index_name, args.index_name), raise_on_error=False,): +for ok, info in helpers.streaming_bulk( + client=es, + actions=data_generator(file_json, args.index_name, args.index_name), + raise_on_error=False, +): if ok: success_count += 1 - else: - print(f"Unable to index {info['index']['_id']}: {info['index']['error']}") + else: + print(f"Unable to index {info['index']['_id']}: {info['index']['error']}") progress_bar.update(1) progress_bar.set_postfix(success=success_count) - progress_bar.close() # Calculate the success percentage success_percentage = 
(success_count / total_documents) * 100 print(f"Indexing completed! Success percentage: {success_percentage}%") print("Done indexing movies data") - - diff --git a/example-apps/search-tutorial/start/search-tutorial/app.py b/example-apps/search-tutorial/start/search-tutorial/app.py index 3269ab47..5c8745ce 100644 --- a/example-apps/search-tutorial/start/search-tutorial/app.py +++ b/example-apps/search-tutorial/start/search-tutorial/app.py @@ -4,18 +4,17 @@ app = Flask(__name__) -@app.get('/') +@app.get("/") def index(): - return render_template('index.html') + return render_template("index.html") -@app.post('/') +@app.post("/") def handle_search(): - query = request.form.get('query', '') - return render_template( - 'index.html', query=query, results=[], from_=0, total=0) + query = request.form.get("query", "") + return render_template("index.html", query=query, results=[], from_=0, total=0) -@app.get('/document/') +@app.get("/document/") def get_document(id): - return 'Document not found' + return "Document not found" diff --git a/example-apps/search-tutorial/v1/search-tutorial/app.py b/example-apps/search-tutorial/v1/search-tutorial/app.py index 3a439783..efd50a56 100644 --- a/example-apps/search-tutorial/v1/search-tutorial/app.py +++ b/example-apps/search-tutorial/v1/search-tutorial/app.py @@ -6,115 +6,113 @@ es = Search() -@app.get('/') +@app.get("/") def index(): - return render_template('index.html') + return render_template("index.html") -@app.post('/') +@app.post("/") def handle_search(): - query = request.form.get('query', '') + query = request.form.get("query", "") filters, parsed_query = extract_filters(query) - from_ = request.form.get('from_', type=int, default=0) + from_ = request.form.get("from_", type=int, default=0) if parsed_query: search_query = { - 'must': { - 'multi_match': { - 'query': parsed_query, - 'fields': ['name', 'summary', 'content'], + "must": { + "multi_match": { + "query": parsed_query, + "fields": ["name", "summary", "content"], } } 
} else: - search_query = { - 'must': { - 'match_all': {} - } - } + search_query = {"must": {"match_all": {}}} results = es.search( - query={ - 'bool': { - **search_query, - **filters - } - }, + query={"bool": {**search_query, **filters}}, aggs={ - 'category-agg': { - 'terms': { - 'field': 'category.keyword', + "category-agg": { + "terms": { + "field": "category.keyword", } }, - 'year-agg': { - 'date_histogram': { - 'field': 'updated_at', - 'calendar_interval': 'year', - 'format': 'yyyy', + "year-agg": { + "date_histogram": { + "field": "updated_at", + "calendar_interval": "year", + "format": "yyyy", }, }, }, size=5, - from_=from_ + from_=from_, ) aggs = { - 'Category': { - bucket['key']: bucket['doc_count'] - for bucket in results['aggregations']['category-agg']['buckets'] + "Category": { + bucket["key"]: bucket["doc_count"] + for bucket in results["aggregations"]["category-agg"]["buckets"] }, - 'Year': { - bucket['key_as_string']: bucket['doc_count'] - for bucket in results['aggregations']['year-agg']['buckets'] - if bucket['doc_count'] > 0 + "Year": { + bucket["key_as_string"]: bucket["doc_count"] + for bucket in results["aggregations"]["year-agg"]["buckets"] + if bucket["doc_count"] > 0 }, } - return render_template('index.html', results=results['hits']['hits'], - query=query, from_=from_, - total=results['hits']['total']['value'], aggs=aggs) + return render_template( + "index.html", + results=results["hits"]["hits"], + query=query, + from_=from_, + total=results["hits"]["total"]["value"], + aggs=aggs, + ) -@app.get('/document/') +@app.get("/document/") def get_document(id): document = es.retrieve_document(id) - title = document['_source']['name'] - paragraphs = document['_source']['content'].split('\n') - return render_template('document.html', title=title, paragraphs=paragraphs) + title = document["_source"]["name"] + paragraphs = document["_source"]["content"].split("\n") + return render_template("document.html", title=title, paragraphs=paragraphs) 
@app.cli.command() def reindex(): """Regenerate the Elasticsearch index.""" response = es.reindex() - print(f'Index with {len(response["items"])} documents created ' - f'in {response["took"]} milliseconds.') + print( + f'Index with {len(response["items"])} documents created ' + f'in {response["took"]} milliseconds.' + ) def extract_filters(query): filters = [] - filter_regex = r'category:([^\s]+)\s*' + filter_regex = r"category:([^\s]+)\s*" m = re.search(filter_regex, query) if m: - filters.append({ - 'term': { - 'category.keyword': { - 'value': m.group(1) - } - }, - }) - query = re.sub(filter_regex, '', query).strip() + filters.append( + { + "term": {"category.keyword": {"value": m.group(1)}}, + } + ) + query = re.sub(filter_regex, "", query).strip() - filter_regex = r'year:([^\s]+)\s*' + filter_regex = r"year:([^\s]+)\s*" m = re.search(filter_regex, query) if m: - filters.append({ - 'range': { - 'updated_at': { - 'gte': f'{m.group(1)}||/y', - 'lte': f'{m.group(1)}||/y', - } - }, - }) - query = re.sub(filter_regex, '', query).strip() + filters.append( + { + "range": { + "updated_at": { + "gte": f"{m.group(1)}||/y", + "lte": f"{m.group(1)}||/y", + } + }, + } + ) + query = re.sub(filter_regex, "", query).strip() - return {'filter': filters}, query + return {"filter": filters}, query diff --git a/example-apps/search-tutorial/v1/search-tutorial/search.py b/example-apps/search-tutorial/v1/search-tutorial/search.py index 9c97b11e..f251918a 100644 --- a/example-apps/search-tutorial/v1/search-tutorial/search.py +++ b/example-apps/search-tutorial/v1/search-tutorial/search.py @@ -10,35 +10,36 @@ class Search: def __init__(self): - self.es = Elasticsearch(cloud_id=os.environ['ELASTIC_CLOUD_ID'], - api_key=os.environ['ELASTIC_API_KEY']) + self.es = Elasticsearch( + cloud_id=os.environ["ELASTIC_CLOUD_ID"], + api_key=os.environ["ELASTIC_API_KEY"], + ) client_info = self.es.info() - print('Connected to Elasticsearch!') + print("Connected to Elasticsearch!") 
pprint(client_info.body) def create_index(self): - self.es.indices.delete(index='my_documents', ignore_unavailable=True) - self.es.indices.create(index='my_documents') + self.es.indices.delete(index="my_documents", ignore_unavailable=True) + self.es.indices.create(index="my_documents") def insert_document(self, document): - return self.es.index(index='my_documents', document=document) + return self.es.index(index="my_documents", document=document) def insert_documents(self, documents): operations = [] for document in documents: - operations.append({'index': {'_index': 'my_documents'}}) + operations.append({"index": {"_index": "my_documents"}}) operations.append(document) return self.es.bulk(operations=operations) def reindex(self): self.create_index() - with open('data.json', 'rt') as f: + with open("data.json", "rt") as f: documents = json.loads(f.read()) return self.insert_documents(documents) def search(self, **query_args): - return self.es.search(index='my_documents', **query_args) + return self.es.search(index="my_documents", **query_args) def retrieve_document(self, id): - return self.es.get(index='my_documents', id=id) - + return self.es.get(index="my_documents", id=id) diff --git a/example-apps/search-tutorial/v2/search-tutorial/app.py b/example-apps/search-tutorial/v2/search-tutorial/app.py index 9b3e994b..b3c5ac1f 100644 --- a/example-apps/search-tutorial/v2/search-tutorial/app.py +++ b/example-apps/search-tutorial/v2/search-tutorial/app.py @@ -6,61 +6,50 @@ es = Search() -@app.get('/') +@app.get("/") def index(): - return render_template('index.html') + return render_template("index.html") -@app.post('/') +@app.post("/") def handle_search(): - query = request.form.get('query', '') + query = request.form.get("query", "") filters, parsed_query = extract_filters(query) - from_ = request.form.get('from_', type=int, default=0) + from_ = request.form.get("from_", type=int, default=0) if parsed_query: search_query = { - 'must': { - 'multi_match': { - 'query': 
parsed_query, - 'fields': ['name', 'summary', 'content'], + "must": { + "multi_match": { + "query": parsed_query, + "fields": ["name", "summary", "content"], } } } else: - search_query = { - 'must': { - 'match_all': {} - } - } + search_query = {"must": {"match_all": {}}} results = es.search( - query={ - 'bool': { - **search_query, - **filters - } - }, + query={"bool": {**search_query, **filters}}, knn={ - 'field': 'embedding', - 'query_vector': es.get_embedding(parsed_query), - 'k': 10, - 'num_candidates': 50, + "field": "embedding", + "query_vector": es.get_embedding(parsed_query), + "k": 10, + "num_candidates": 50, **filters, }, - rank={ - 'rrf': {} - }, + rank={"rrf": {}}, aggs={ - 'category-agg': { - 'terms': { - 'field': 'category.keyword', + "category-agg": { + "terms": { + "field": "category.keyword", } }, - 'year-agg': { - 'date_histogram': { - 'field': 'updated_at', - 'calendar_interval': 'year', - 'format': 'yyyy', + "year-agg": { + "date_histogram": { + "field": "updated_at", + "calendar_interval": "year", + "format": "yyyy", }, }, }, @@ -68,50 +57,49 @@ def handle_search(): from_=from_, ) aggs = { - 'Category': { - bucket['key']: bucket['doc_count'] - for bucket in results['aggregations']['category-agg']['buckets'] + "Category": { + bucket["key"]: bucket["doc_count"] + for bucket in results["aggregations"]["category-agg"]["buckets"] }, - 'Year': { - bucket['key_as_string']: bucket['doc_count'] - for bucket in results['aggregations']['year-agg']['buckets'] - if bucket['doc_count'] > 0 + "Year": { + bucket["key_as_string"]: bucket["doc_count"] + for bucket in results["aggregations"]["year-agg"]["buckets"] + if bucket["doc_count"] > 0 }, } - return render_template('index.html', results=results['hits']['hits'], - query=query, from_=from_, - total=results['hits']['total']['value'], aggs=aggs) + return render_template( + "index.html", + results=results["hits"]["hits"], + query=query, + from_=from_, + total=results["hits"]["total"]["value"], + aggs=aggs, + ) 
-@app.get('/document/') +@app.get("/document/") def get_document(id): document = es.retrieve_document(id) - title = document['_source']['name'] - paragraphs = document['_source']['content'].split('\n') - return render_template('document.html', title=title, paragraphs=paragraphs) + title = document["_source"]["name"] + paragraphs = document["_source"]["content"].split("\n") + return render_template("document.html", title=title, paragraphs=paragraphs) @app.cli.command() def reindex(): """Regenerate the Elasticsearch index.""" response = es.reindex() - print(f'Index with {len(response["items"])} documents created ' - f'in {response["took"]} milliseconds.') + print( + f'Index with {len(response["items"])} documents created ' + f'in {response["took"]} milliseconds.' + ) def extract_filters(query): - filter_regex = r'category:([^\s]+)\s*' + filter_regex = r"category:([^\s]+)\s*" m = re.search(filter_regex, query) if m is None: return {}, query # no filters - filters = { - 'filter': [{ - 'term': { - 'category.keyword': { - 'value': m.group(1) - } - } - }] - } - query = re.sub(filter_regex, '', query).strip() + filters = {"filter": [{"term": {"category.keyword": {"value": m.group(1)}}}]} + query = re.sub(filter_regex, "", query).strip() return filters, query diff --git a/example-apps/search-tutorial/v2/search-tutorial/search.py b/example-apps/search-tutorial/v2/search-tutorial/search.py index b61004ad..e7bac0eb 100644 --- a/example-apps/search-tutorial/v2/search-tutorial/search.py +++ b/example-apps/search-tutorial/v2/search-tutorial/search.py @@ -11,50 +11,60 @@ class Search: def __init__(self): - self.model = SentenceTransformer('all-MiniLM-L6-v2') - self.es = Elasticsearch(cloud_id=os.environ['ELASTIC_CLOUD_ID'], - api_key=os.environ['ELASTIC_API_KEY']) + self.model = SentenceTransformer("all-MiniLM-L6-v2") + self.es = Elasticsearch( + cloud_id=os.environ["ELASTIC_CLOUD_ID"], + api_key=os.environ["ELASTIC_API_KEY"], + ) client_info = self.es.info() - print('Connected to 
Elasticsearch!') + print("Connected to Elasticsearch!") pprint(client_info.body) def create_index(self): - self.es.indices.delete(index='my_documents', ignore_unavailable=True) - self.es.indices.create(index='my_documents', mappings={ - 'properties': { - 'embedding': { - 'type': 'dense_vector', + self.es.indices.delete(index="my_documents", ignore_unavailable=True) + self.es.indices.create( + index="my_documents", + mappings={ + "properties": { + "embedding": { + "type": "dense_vector", + } } - } - }) + }, + ) def get_embedding(self, text): return self.model.encode(text) def insert_document(self, document): - return self.es.index(index='my_documents', document={ - **document, - 'embedding': self.get_embedding(document['summary']), - }) + return self.es.index( + index="my_documents", + document={ + **document, + "embedding": self.get_embedding(document["summary"]), + }, + ) def insert_documents(self, documents): operations = [] for document in documents: - operations.append({'index': {'_index': 'my_documents'}}) - operations.append({ - **document, - 'embedding': self.get_embedding(document['summary']), - }) + operations.append({"index": {"_index": "my_documents"}}) + operations.append( + { + **document, + "embedding": self.get_embedding(document["summary"]), + } + ) return self.es.bulk(operations=operations) def reindex(self): self.create_index() - with open('data.json', 'rt') as f: + with open("data.json", "rt") as f: documents = json.loads(f.read()) return self.insert_documents(documents) def search(self, **query_args): - return self.es.search(index='my_documents', **query_args) + return self.es.search(index="my_documents", **query_args) def retrieve_document(self, id): - return self.es.get(index='my_documents', id=id) + return self.es.get(index="my_documents", id=id) diff --git a/example-apps/search-tutorial/v3/search-tutorial/app.py b/example-apps/search-tutorial/v3/search-tutorial/app.py index e5949da5..b560adfe 100644 --- 
a/example-apps/search-tutorial/v3/search-tutorial/app.py +++ b/example-apps/search-tutorial/v3/search-tutorial/app.py @@ -6,42 +6,42 @@ es = Search() -@app.get('/') +@app.get("/") def index(): - return render_template('index.html') + return render_template("index.html") -@app.post('/') +@app.post("/") def handle_search(): - query = request.form.get('query', '') + query = request.form.get("query", "") filters, parsed_query = extract_filters(query) - from_ = request.form.get('from_', type=int, default=0) + from_ = request.form.get("from_", type=int, default=0) if parsed_query: search_query = { - 'sub_searches': [ + "sub_searches": [ { - 'query': { - 'bool': { - 'must': { - 'multi_match': { - 'query': parsed_query, - 'fields': ['name', 'summary', 'content'], + "query": { + "bool": { + "must": { + "multi_match": { + "query": parsed_query, + "fields": ["name", "summary", "content"], } }, - **filters + **filters, } } }, { - 'query': { - 'bool': { - 'must': [ + "query": { + "bool": { + "must": [ { - 'text_expansion': { - 'elser_embedding': { - 'model_id': '.elser_model_2', - 'model_text': parsed_query, + "text_expansion": { + "elser_embedding": { + "model_id": ".elser_model_2", + "model_text": parsed_query, } }, } @@ -51,35 +51,24 @@ def handle_search(): }, }, ], - 'rank': { - 'rrf': {} - }, + "rank": {"rrf": {}}, } else: - search_query = { - 'query': { - 'bool': { - 'must': { - 'match_all': {} - }, - **filters - } - } - } + search_query = {"query": {"bool": {"must": {"match_all": {}}, **filters}}} results = es.search( **search_query, aggs={ - 'category-agg': { - 'terms': { - 'field': 'category.keyword', + "category-agg": { + "terms": { + "field": "category.keyword", } }, - 'year-agg': { - 'date_histogram': { - 'field': 'updated_at', - 'calendar_interval': 'year', - 'format': 'yyyy', + "year-agg": { + "date_histogram": { + "field": "updated_at", + "calendar_interval": "year", + "format": "yyyy", }, }, }, @@ -87,35 +76,42 @@ def handle_search(): from_=from_, ) aggs = { - 
'Category': { - bucket['key']: bucket['doc_count'] - for bucket in results['aggregations']['category-agg']['buckets'] + "Category": { + bucket["key"]: bucket["doc_count"] + for bucket in results["aggregations"]["category-agg"]["buckets"] }, - 'Year': { - bucket['key_as_string']: bucket['doc_count'] - for bucket in results['aggregations']['year-agg']['buckets'] - if bucket['doc_count'] > 0 + "Year": { + bucket["key_as_string"]: bucket["doc_count"] + for bucket in results["aggregations"]["year-agg"]["buckets"] + if bucket["doc_count"] > 0 }, } - return render_template('index.html', results=results['hits']['hits'], - query=query, from_=from_, - total=results['hits']['total']['value'], aggs=aggs) + return render_template( + "index.html", + results=results["hits"]["hits"], + query=query, + from_=from_, + total=results["hits"]["total"]["value"], + aggs=aggs, + ) -@app.get('/document/') +@app.get("/document/") def get_document(id): document = es.retrieve_document(id) - title = document['_source']['name'] - paragraphs = document['_source']['content'].split('\n') - return render_template('document.html', title=title, paragraphs=paragraphs) + title = document["_source"]["name"] + paragraphs = document["_source"]["content"].split("\n") + return render_template("document.html", title=title, paragraphs=paragraphs) @app.cli.command() def reindex(): """Regenerate the Elasticsearch index.""" response = es.reindex() - print(f'Index with {len(response["items"])} documents created ' - f'in {response["took"]} milliseconds.') + print( + f'Index with {len(response["items"])} documents created ' + f'in {response["took"]} milliseconds.' 
+ ) @app.cli.command() @@ -124,25 +120,16 @@ def deploy_elser(): try: es.deploy_elser() except Exception as exc: - print(f'Error: {exc}') + print(f"Error: {exc}") else: - print(f'ELSER model deployed.') + print(f"ELSER model deployed.") def extract_filters(query): - filter_regex = r'category:([^\s]+)\s*' + filter_regex = r"category:([^\s]+)\s*" m = re.search(filter_regex, query) if m is None: return {}, query # no filters - filters = { - 'filter': [{ - 'term': { - 'category.keyword': { - 'value': m.group(1) - } - } - }] - } - query = re.sub(filter_regex, '', query).strip() + filters = {"filter": [{"term": {"category.keyword": {"value": m.group(1)}}}]} + query = re.sub(filter_regex, "", query).strip() return filters, query - diff --git a/example-apps/search-tutorial/v3/search-tutorial/search.py b/example-apps/search-tutorial/v3/search-tutorial/search.py index ac883580..f1d2e128 100644 --- a/example-apps/search-tutorial/v3/search-tutorial/search.py +++ b/example-apps/search-tutorial/v3/search-tutorial/search.py @@ -12,107 +12,111 @@ class Search: def __init__(self): - self.model = SentenceTransformer('all-MiniLM-L6-v2') - self.es = Elasticsearch(cloud_id=os.environ['ELASTIC_CLOUD_ID'], - api_key=os.environ['ELASTIC_API_KEY']) + self.model = SentenceTransformer("all-MiniLM-L6-v2") + self.es = Elasticsearch( + cloud_id=os.environ["ELASTIC_CLOUD_ID"], + api_key=os.environ["ELASTIC_API_KEY"], + ) client_info = self.es.info() - print('Connected to Elasticsearch!') + print("Connected to Elasticsearch!") pprint(client_info.body) def create_index(self): - self.es.indices.delete(index='my_documents', ignore_unavailable=True) + self.es.indices.delete(index="my_documents", ignore_unavailable=True) self.es.indices.create( - index='my_documents', + index="my_documents", mappings={ - 'properties': { - 'embedding': { - 'type': 'dense_vector', + "properties": { + "embedding": { + "type": "dense_vector", }, - 'elser_embedding': { - 'type': 'sparse_vector', + "elser_embedding": { + 
"type": "sparse_vector", }, } }, - settings={ - 'index': { - 'default_pipeline': 'elser-ingest-pipeline' - } - } + settings={"index": {"default_pipeline": "elser-ingest-pipeline"}}, ) def get_embedding(self, text): return self.model.encode(text) def insert_document(self, document): - return self.es.index(index='my_documents', document={ - **document, - 'embedding': self.get_embedding(document['summary']), - }) + return self.es.index( + index="my_documents", + document={ + **document, + "embedding": self.get_embedding(document["summary"]), + }, + ) def insert_documents(self, documents): operations = [] for document in documents: - operations.append({'index': {'_index': 'my_documents'}}) - operations.append({ - **document, - 'embedding': self.get_embedding(document['summary']), - }) + operations.append({"index": {"_index": "my_documents"}}) + operations.append( + { + **document, + "embedding": self.get_embedding(document["summary"]), + } + ) return self.es.bulk(operations=operations) def reindex(self): self.create_index() - with open('data.json', 'rt') as f: + with open("data.json", "rt") as f: documents = json.loads(f.read()) return self.insert_documents(documents) def search(self, **query_args): # sub_searches is not currently supported in the client, so we send # search requests as raw requests - if 'from_' in query_args: - query_args['from'] = query_args['from_'] - del query_args['from_'] + if "from_" in query_args: + query_args["from"] = query_args["from_"] + del query_args["from_"] return self.es.perform_request( - 'GET', - f'/my_documents/_search', + "GET", + f"/my_documents/_search", body=json.dumps(query_args), - headers={'Content-Type': 'application/json', - 'Accept': 'application/json'}, + headers={"Content-Type": "application/json", "Accept": "application/json"}, ) def retrieve_document(self, id): - return self.es.get(index='my_documents', id=id) + return self.es.get(index="my_documents", id=id) def deploy_elser(self): # download ELSER v2 - 
self.es.ml.put_trained_model(model_id='.elser_model_2', - input={'field_names': ['text_field']}) + self.es.ml.put_trained_model( + model_id=".elser_model_2", input={"field_names": ["text_field"]} + ) # wait until ready while True: - status = self.es.ml.get_trained_models(model_id='.elser_model_2', - include='definition_status') - if status['trained_model_configs'][0]['fully_defined']: + status = self.es.ml.get_trained_models( + model_id=".elser_model_2", include="definition_status" + ) + if status["trained_model_configs"][0]["fully_defined"]: # model is ready break time.sleep(1) # deploy the model - self.es.ml.start_trained_model_deployment(model_id='.elser_model_2') + self.es.ml.start_trained_model_deployment(model_id=".elser_model_2") # define a pipeline self.es.ingest.put_pipeline( - id='elser-ingest-pipeline', + id="elser-ingest-pipeline", processors=[ { - 'inference': { - 'model_id': '.elser_model_2', - 'input_output': [ + "inference": { + "model_id": ".elser_model_2", + "input_output": [ { - 'input_field': 'summary', - 'output_field': 'elser_embedding', + "input_field": "summary", + "output_field": "elser_embedding", } - ] + ], } } - ] + ], ) diff --git a/notebooks/document-chunking/_nbtest.teardown.with-index-pipelines.ipynb b/notebooks/document-chunking/_nbtest.teardown.with-index-pipelines.ipynb index 5c1800fd..0240b792 100644 --- a/notebooks/document-chunking/_nbtest.teardown.with-index-pipelines.ipynb +++ b/notebooks/document-chunking/_nbtest.teardown.with-index-pipelines.ipynb @@ -19,7 +19,7 @@ "# Create the client instance\n", "client = Elasticsearch(\n", " # For local development\n", - " # hosts=[\"http://localhost:9200\"] \n", + " # hosts=[\"http://localhost:9200\"]\n", " cloud_id=ELASTIC_CLOUD_ID,\n", " api_key=ELASTIC_API_KEY,\n", ")" @@ -47,7 +47,9 @@ "outputs": [], "source": [ "try:\n", - " client.ml.delete_trained_model(model_id=\"sentence-transformers__all-minilm-l6-v2\", force=True)\n", + " client.ml.delete_trained_model(\n", + " 
model_id=\"sentence-transformers__all-minilm-l6-v2\", force=True\n", + " )\n", "except:\n", " pass" ] diff --git a/notebooks/document-chunking/_nbtest.teardown.with-langchain-splitters.ipynb b/notebooks/document-chunking/_nbtest.teardown.with-langchain-splitters.ipynb index 89586cbc..e71d8897 100644 --- a/notebooks/document-chunking/_nbtest.teardown.with-langchain-splitters.ipynb +++ b/notebooks/document-chunking/_nbtest.teardown.with-langchain-splitters.ipynb @@ -19,7 +19,7 @@ "# Create the client instance\n", "client = Elasticsearch(\n", " # For local development\n", - " # hosts=[\"http://localhost:9200\"] \n", + " # hosts=[\"http://localhost:9200\"]\n", " cloud_id=ELASTIC_CLOUD_ID,\n", " api_key=ELASTIC_API_KEY,\n", ")" @@ -47,7 +47,9 @@ "outputs": [], "source": [ "try:\n", - " client.ml.delete_trained_model(model_id=\"sentence-transformers__all-minilm-l6-v2\", force=True)\n", + " client.ml.delete_trained_model(\n", + " model_id=\"sentence-transformers__all-minilm-l6-v2\", force=True\n", + " )\n", "except:\n", " pass" ] diff --git a/notebooks/document-chunking/tokenization.ipynb b/notebooks/document-chunking/tokenization.ipynb index 04b59506..25a688fe 100644 --- a/notebooks/document-chunking/tokenization.ipynb +++ b/notebooks/document-chunking/tokenization.ipynb @@ -80,8 +80,9 @@ "metadata": {}, "outputs": [], "source": [ - "bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n", - "e5_tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-base')\n", + "bert_tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n", + "e5_tokenizer = AutoTokenizer.from_pretrained(\"intfloat/multilingual-e5-base\")\n", + "\n", "\n", "def whitespace_tokenize(text):\n", " return text.split()" @@ -149,6 +150,7 @@ " e5_tokens = len(e5_tokenizer.encode(text))\n", " return [whitespace_tokens, bert_tokens, e5_tokens, f\"{text[:80]}...\"]\n", "\n", + "\n", "counts = [count_tokens(movie[\"plot\"]) for movie in movies]\n", "\n", 
"print(tabulate(sorted(counts), [\"whitespace\", \"BERT\", \"E5\", \"text\"]))" @@ -219,11 +221,14 @@ "SEMANTIC_SEARCH_TOKEN_LIMIT = 510 # 512 minus space for the 2 special tokens\n", "ELSER_TOKEN_OVERLAP = 0.5 # 50% token overlap between chunks is recommended for ELSER\n", "\n", - "def chunk(tokens, chunk_size=SEMANTIC_SEARCH_TOKEN_LIMIT, overlap_ratio=ELSER_TOKEN_OVERLAP):\n", + "\n", + "def chunk(\n", + " tokens, chunk_size=SEMANTIC_SEARCH_TOKEN_LIMIT, overlap_ratio=ELSER_TOKEN_OVERLAP\n", + "):\n", " step_size = round(chunk_size * overlap_ratio)\n", "\n", " for i in range(0, len(tokens), step_size):\n", - " yield tokens[i:i+chunk_size]" + " yield tokens[i : i + chunk_size]" ] }, { @@ -281,11 +286,10 @@ } ], "source": [ - "tokens = bert_tokenizer.encode(long_text)[1:-1] # exclude special tokens at the beginning and end\n", - "chunked = [\n", - " bert_tokenizer.decode(tokens_chunk)\n", - " for tokens_chunk in chunk(tokens)\n", - "]\n", + "tokens = bert_tokenizer.encode(long_text)[\n", + " 1:-1\n", + "] # exclude special tokens at the beginning and end\n", + "chunked = [bert_tokenizer.decode(tokens_chunk) for tokens_chunk in chunk(tokens)]\n", "chunked" ] }, diff --git a/notebooks/document-chunking/with-index-pipelines.ipynb b/notebooks/document-chunking/with-index-pipelines.ipynb index ec3359de..5e7535df 100644 --- a/notebooks/document-chunking/with-index-pipelines.ipynb +++ b/notebooks/document-chunking/with-index-pipelines.ipynb @@ -103,7 +103,7 @@ "# Create the client instance\n", "client = Elasticsearch(\n", " # For local development\n", - " # hosts=[\"http://localhost:9200\"] \n", + " # hosts=[\"http://localhost:9200\"]\n", " cloud_id=ELASTIC_CLOUD_ID,\n", " api_key=ELASTIC_API_KEY,\n", ")" @@ -208,13 +208,13 @@ "CHUNK_SIZE = 400\n", "\n", "client.ingest.put_pipeline(\n", - " id=\"chunk_text_to_passages\",\n", - " processors=[\n", - " {\n", - " \"script\": {\n", - " \"description\": \"Chunk body_content into sentences by looking for . 
followed by a space\",\n", - " \"lang\": \"painless\",\n", - " \"source\": \"\"\"\n", + " id=\"chunk_text_to_passages\",\n", + " processors=[\n", + " {\n", + " \"script\": {\n", + " \"description\": \"Chunk body_content into sentences by looking for . followed by a space\",\n", + " \"lang\": \"painless\",\n", + " \"source\": \"\"\"\n", " String[] envSplit = /((? dict:\n", " metadata[\"name\"] = record.get(\"name\")\n", @@ -117,6 +117,7 @@ "\n", " return metadata\n", "\n", + "\n", "# For more loaders https://python.langchain.com/docs/modules/data_connection/document_loaders/\n", "# And 3rd party loaders https://python.langchain.com/docs/modules/data_connection/document_loaders/#third-party-loaders\n", "loader = JSONLoader(\n", @@ -186,67 +187,61 @@ "\n", "# Create the pipeline\n", "client.ingest.put_pipeline(\n", - " id=PIPELINE_ID, \n", - " processors=[\n", - " {\n", - " \"foreach\": {\n", - " \"field\": \"passages\",\n", - " \"processor\": {\n", - " \"inference\": {\n", - " \"field_map\": {\n", - " \"_ingest._value.text\": \"text_field\"\n", - " },\n", - " \"model_id\": MODEL_ID,\n", - " \"target_field\": \"_ingest._value.vector\",\n", - " \"on_failure\": [\n", - " {\n", - " \"append\": {\n", - " \"field\": \"_source._ingest.inference_errors\",\n", - " \"value\": [\n", - " {\n", - " \"message\": \"Processor 'inference' in pipeline 'ml-inference-title-vector' failed with message '{{ _ingest.on_failure_message }}'\",\n", - " \"pipeline\": \"ml-inference-title-vector\",\n", - " \"timestamp\": \"{{{ _ingest.timestamp }}}\"\n", + " id=PIPELINE_ID,\n", + " processors=[\n", + " {\n", + " \"foreach\": {\n", + " \"field\": \"passages\",\n", + " \"processor\": {\n", + " \"inference\": {\n", + " \"field_map\": {\"_ingest._value.text\": \"text_field\"},\n", + " \"model_id\": MODEL_ID,\n", + " \"target_field\": \"_ingest._value.vector\",\n", + " \"on_failure\": [\n", + " {\n", + " \"append\": {\n", + " \"field\": \"_source._ingest.inference_errors\",\n", + " \"value\": [\n", + 
" {\n", + " \"message\": \"Processor 'inference' in pipeline 'ml-inference-title-vector' failed with message '{{ _ingest.on_failure_message }}'\",\n", + " \"pipeline\": \"ml-inference-title-vector\",\n", + " \"timestamp\": \"{{{ _ingest.timestamp }}}\",\n", + " }\n", + " ],\n", + " }\n", + " }\n", + " ],\n", " }\n", - " ]\n", - " }\n", - " }\n", - " ]\n", - " }\n", + " },\n", + " }\n", " }\n", - " }\n", - " }\n", - " ]\n", + " ],\n", ")\n", "\n", "# Create the index\n", - "client.indices.create( \n", - " index=INDEX_NAME, \n", - " settings={\n", - " \"index\": {\n", - " \"default_pipeline\": PIPELINE_ID\n", - " }\n", - " },\n", - " mappings={\n", - " \"dynamic\": \"true\",\n", - " \"properties\": {\n", - " \"passages\": {\n", - " \"type\": \"nested\",\n", + "client.indices.create(\n", + " index=INDEX_NAME,\n", + " settings={\"index\": {\"default_pipeline\": PIPELINE_ID}},\n", + " mappings={\n", + " \"dynamic\": \"true\",\n", " \"properties\": {\n", - " \"vector\": {\n", - " \"properties\": {\n", - " \"predicted_value\": {\n", - " \"type\": \"dense_vector\",\n", - " \"index\": True,\n", - " \"dims\": MODEL_DIMS,\n", - " \"similarity\": \"dot_product\"\n", - " }\n", + " \"passages\": {\n", + " \"type\": \"nested\",\n", + " \"properties\": {\n", + " \"vector\": {\n", + " \"properties\": {\n", + " \"predicted_value\": {\n", + " \"type\": \"dense_vector\",\n", + " \"index\": True,\n", + " \"dims\": MODEL_DIMS,\n", + " \"similarity\": \"dot_product\",\n", + " }\n", + " }\n", + " }\n", + " },\n", " }\n", - " }\n", - " }\n", - " }\n", - " }\n", - " }\n", + " },\n", + " },\n", ")" ] }, @@ -268,27 +263,30 @@ "source": [ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", "\n", + "\n", "def parent_child_splitter(documents, chunk_size: int = 200):\n", "\n", - " child_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)\n", - "\n", - " docs = []\n", - " for i, doc in enumerate(documents):\n", - " passages = []\n", - "\n", - " for _doc in 
child_splitter.split_documents([doc]):\n", - " passages.append({\n", - " \"text\": _doc.page_content,\n", - " })\n", - "\n", - " doc = {\n", - " \"content\": doc.page_content,\n", - " \"metadata\": doc.metadata,\n", - " \"passages\": passages\n", - " }\n", - " docs.append(doc)\n", - " \n", - " return docs\n" + " child_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)\n", + "\n", + " docs = []\n", + " for i, doc in enumerate(documents):\n", + " passages = []\n", + "\n", + " for _doc in child_splitter.split_documents([doc]):\n", + " passages.append(\n", + " {\n", + " \"text\": _doc.page_content,\n", + " }\n", + " )\n", + "\n", + " doc = {\n", + " \"content\": doc.page_content,\n", + " \"metadata\": doc.metadata,\n", + " \"passages\": passages,\n", + " }\n", + " docs.append(doc)\n", + "\n", + " return docs" ] }, { @@ -306,26 +304,26 @@ "outputs": [], "source": [ "def pretty_response(response, show_parent_text=False):\n", - " if len(response['hits']['hits']) == 0:\n", - " print('Your search returned no results.')\n", - " else:\n", - " for hit in response['hits']['hits']:\n", - " id = hit['_id']\n", - " score = hit['_score']\n", - " doc_title = hit['_source'][\"metadata\"]['name']\n", - " parent_text = \"\"\n", - "\n", - " if show_parent_text:\n", - " parent_text = hit['_source'][\"content\"]\n", - "\n", - " passage_text = \"\"\n", - "\n", - " for passage in hit['inner_hits']['passages']['hits']['hits']:\n", - " passage_text += passage[\"fields\"][\"passages\"][0]['text'][0] + \"\\n\\n\"\n", - "\n", - " pretty_output = (f\"\\nID: {id}\\nDoc Title: {doc_title}\\nparent text:\\n{parent_text}\\nPassage Text:\\n{passage_text}\\nScore: {score}\\n\")\n", - " print(pretty_output)\n", - " print(\"---\")" + " if len(response[\"hits\"][\"hits\"]) == 0:\n", + " print(\"Your search returned no results.\")\n", + " else:\n", + " for hit in response[\"hits\"][\"hits\"]:\n", + " id = hit[\"_id\"]\n", + " score = hit[\"_score\"]\n", + " doc_title = 
hit[\"_source\"][\"metadata\"][\"name\"]\n", + " parent_text = \"\"\n", + "\n", + " if show_parent_text:\n", + " parent_text = hit[\"_source\"][\"content\"]\n", + "\n", + " passage_text = \"\"\n", + "\n", + " for passage in hit[\"inner_hits\"][\"passages\"][\"hits\"][\"hits\"]:\n", + " passage_text += passage[\"fields\"][\"passages\"][0][\"text\"][0] + \"\\n\\n\"\n", + "\n", + " pretty_output = f\"\\nID: {id}\\nDoc Title: {doc_title}\\nparent text:\\n{parent_text}\\nPassage Text:\\n{passage_text}\\nScore: {score}\\n\"\n", + " print(pretty_output)\n", + " print(\"---\")" ] }, { @@ -360,15 +358,12 @@ "\n", "chunked_docs = parent_child_splitter(loader.load(), chunk_size=600)\n", "\n", - "count, errors = helpers.bulk(\n", - " client, \n", - " chunked_docs,\n", - " index=INDEX_NAME\n", - ")\n", + "count, errors = helpers.bulk(client, chunked_docs, index=INDEX_NAME)\n", "\n", "print(f\"Indexed {count} documents with {errors} errors\")\n", "\n", "import time\n", + "\n", "time.sleep(5)" ] }, @@ -475,25 +470,19 @@ ], "source": [ "response = client.search(\n", - " index=INDEX_NAME, \n", - " knn={\n", - " \"inner_hits\": {\n", - " \"size\": 1,\n", - " \"_source\": False,\n", - " \"fields\": [\n", - " \"passages.text\"\n", - " ]\n", + " index=INDEX_NAME,\n", + " knn={\n", + " \"inner_hits\": {\"size\": 1, \"_source\": False, \"fields\": [\"passages.text\"]},\n", + " \"field\": \"passages.vector.predicted_value\",\n", + " \"k\": 5,\n", + " \"num_candidates\": 100,\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \"sentence-transformers__all-minilm-l6-v2\",\n", + " \"model_text\": \"Whats the work from home policy?\",\n", + " }\n", + " },\n", " },\n", - " \"field\": \"passages.vector.predicted_value\",\n", - " \"k\": 5,\n", - " \"num_candidates\": 100,\n", - " \"query_vector_builder\": {\n", - " \"text_embedding\": {\n", - " \"model_id\": \"sentence-transformers__all-minilm-l6-v2\",\n", - " \"model_text\": \"Whats the work from home 
policy?\"\n", - " }\n", - " }\n", - " }\n", ")\n", "\n", "pretty_response(response)" @@ -559,42 +548,41 @@ } ], "source": [ - "from langchain.vectorstores.elasticsearch import ElasticsearchStore, ApproxRetrievalStrategy\n", + "from langchain.vectorstores.elasticsearch import (\n", + " ElasticsearchStore,\n", + " ApproxRetrievalStrategy,\n", + ")\n", "from typing import List, Union\n", "from langchain_core.documents import Document\n", "\n", + "\n", "class CustomRetrievalStrategy(ApproxRetrievalStrategy):\n", "\n", " def query(\n", - " self,\n", - " query: Union[str, None],\n", - " filter: List[dict],\n", - " **kwargs,\n", + " self,\n", + " query: Union[str, None],\n", + " filter: List[dict],\n", + " **kwargs,\n", " ):\n", - " \n", - " es_query = {\n", - " \"knn\": {\n", - " \"inner_hits\": {\n", - " \"_source\": False,\n", - " \"fields\": [\n", - " \"passages.text\"\n", - " ]\n", - " },\n", - " \"field\": \"passages.vector.predicted_value\",\n", - " \"filter\": filter,\n", - " \"k\": 5,\n", - " \"num_candidates\": 100,\n", - " \"query_vector_builder\": {\n", - " \"text_embedding\": {\n", - " \"model_id\": \"sentence-transformers__all-minilm-l6-v2\",\n", - " \"model_text\": query\n", + "\n", + " es_query = {\n", + " \"knn\": {\n", + " \"inner_hits\": {\"_source\": False, \"fields\": [\"passages.text\"]},\n", + " \"field\": \"passages.vector.predicted_value\",\n", + " \"filter\": filter,\n", + " \"k\": 5,\n", + " \"num_candidates\": 100,\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \"sentence-transformers__all-minilm-l6-v2\",\n", + " \"model_text\": query,\n", + " }\n", + " },\n", " }\n", - " }\n", " }\n", - " }\n", "\n", - " return es_query\n", - " \n", + " return es_query\n", + "\n", "\n", "vector_store = ElasticsearchStore(\n", " index_name=INDEX_NAME,\n", @@ -603,22 +591,28 @@ " strategy=CustomRetrievalStrategy(),\n", ")\n", "\n", + "\n", "def doc_builder(hit):\n", - " passage_hits = hit.get(\"inner_hits\", 
{}).get(\"passages\", {}).get(\"hits\", {}).get(\"hits\", [])\n", - " page_content = \"\"\n", - " for passage_hit in passage_hits:\n", - " passage_fields = passage_hit.get(\"fields\", {}).get(\"passages\", [])[0]\n", - " page_content += passage_fields.get(\"text\", [])[0] + \"\\n\\n\"\n", - "\n", - " return Document(\n", - " page_content=page_content,\n", - " metadata=hit[\"_source\"][\"metadata\"],\n", + " passage_hits = (\n", + " hit.get(\"inner_hits\", {}).get(\"passages\", {}).get(\"hits\", {}).get(\"hits\", [])\n", " )\n", + " page_content = \"\"\n", + " for passage_hit in passage_hits:\n", + " passage_fields = passage_hit.get(\"fields\", {}).get(\"passages\", [])[0]\n", + " page_content += passage_fields.get(\"text\", [])[0] + \"\\n\\n\"\n", "\n", - "results = vector_store.similarity_search(query=\"Whats the work from home policy?\", doc_builder=doc_builder)\n", + " return Document(\n", + " page_content=page_content,\n", + " metadata=hit[\"_source\"][\"metadata\"],\n", + " )\n", + "\n", + "\n", + "results = vector_store.similarity_search(\n", + " query=\"Whats the work from home policy?\", doc_builder=doc_builder\n", + ")\n", "for result in results:\n", " print(f'Doc title: {result.metadata[\"name\"]}')\n", - " print(f'Text:\\n{result.page_content}')" + " print(f\"Text:\\n{result.page_content}\")" ] }, { diff --git a/notebooks/generative-ai/chatbot.ipynb b/notebooks/generative-ai/chatbot.ipynb index aac8199a..b5f39a0a 100644 --- a/notebooks/generative-ai/chatbot.ipynb +++ b/notebooks/generative-ai/chatbot.ipynb @@ -171,10 +171,7 @@ "\n", "for doc in workplace_docs:\n", " content.append(doc[\"content\"])\n", - " metadata.append({\n", - " \"name\": doc[\"name\"],\n", - " \"summary\": doc[\"summary\"]\n", - " })\n", + " metadata.append({\"name\": doc[\"name\"], \"summary\": doc[\"summary\"]})\n", "\n", "text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n", " chunk_size=512,\n", @@ -205,10 +202,10 @@ "\n", "vector_store = 
ElasticsearchStore.from_documents(\n", " docs,\n", - " es_cloud_id=ELASTIC_CLOUD_ID, \n", + " es_cloud_id=ELASTIC_CLOUD_ID,\n", " es_api_key=ELASTIC_API_KEY,\n", " index_name=\"workplace-docs\",\n", - " embedding=embeddings\n", + " embedding=embeddings,\n", ")" ] }, @@ -238,9 +235,7 @@ "llm = OpenAI(openai_api_key=OPENAI_API_KEY)\n", "\n", "chat = ConversationalRetrievalChain.from_llm(\n", - " llm=llm,\n", - " retriever=retriever,\n", - " return_source_documents=True\n", + " llm=llm, retriever=retriever, return_source_documents=True\n", ")\n", "\n", "session_id = str(uuid4())\n", @@ -248,7 +243,7 @@ " es_cloud_id=ELASTIC_CLOUD_ID,\n", " es_api_key=ELASTIC_API_KEY,\n", " session_id=session_id,\n", - " index=\"workplace-docs-chat-history\"\n", + " index=\"workplace-docs-chat-history\",\n", ")" ] }, @@ -287,12 +282,15 @@ "# Define a convenience function for Q&A\n", "def ask(question, chat_history):\n", " result = chat({\"question\": question, \"chat_history\": chat_history.messages})\n", - " print(f\"\"\"[QUESTION] {question}\n", + " print(\n", + " f\"\"\"[QUESTION] {question}\n", "[ANSWER] {result[\"answer\"]}\n", - " [SUPPORTING DOCUMENTS] {list(map(lambda d: d.metadata[\"name\"], list(result[\"source_documents\"])))}\"\"\")\n", + " [SUPPORTING DOCUMENTS] {list(map(lambda d: d.metadata[\"name\"], list(result[\"source_documents\"])))}\"\"\"\n", + " )\n", " chat_history.add_user_message(result[\"question\"])\n", " chat_history.add_ai_message(result[\"answer\"])\n", "\n", + "\n", "# Chat away!\n", "print(f\"[CHAT SESSION ID] {session_id}\")\n", "ask(\"What does NASA stand for?\", chat_history)\n", @@ -349,8 +347,8 @@ } ], "source": [ - "vector_store.client.indices.delete(index='workplace-docs')\n", - "vector_store.client.indices.delete(index='workplace-docs-chat-history')" + "vector_store.client.indices.delete(index=\"workplace-docs\")\n", + "vector_store.client.indices.delete(index=\"workplace-docs-chat-history\")" ] } ], diff --git 
a/notebooks/generative-ai/question-answering.ipynb b/notebooks/generative-ai/question-answering.ipynb index 4d61706f..3fea57c6 100644 --- a/notebooks/generative-ai/question-answering.ipynb +++ b/notebooks/generative-ai/question-answering.ipynb @@ -1,693 +1,704 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "tZnIXBfrRpex" - }, - "source": [ - "# Question Answering with Langchain and OpenAI\n", - "\n", - "\"Open\n", - "\n", - "This interactive notebook uses Langchain to split fictional workplace documents into passages and uses OpenAI to transform these passages into embeddings and store them into Elasticsearch.\n", - "\n", - "\n", - "![image.png]()\n", - "\n", - "Then when we ask a question, we retrieve the relevant passages from the vector store and use langchain and OpenAI to provide a summary for the question." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "GyAst2W-VpHb" - }, - "source": [ - "## Install required packages\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "33A-cP-XvFCr" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", - "langserve 0.0.21 requires pydantic<2,>=1, but you have pydantic 2.3.0 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3.1\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" - ] - } - ], - "source": [ - "!python3 -m pip install -qU langchain openai==0.28.1 elasticsearch tiktoken jq" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "qtEOCsCLWCZp" - }, - "source": [ - "## Connect to Elasticsearch\n", - "\n", - "ℹ️ We're using an Elastic Cloud deployment of Elasticsearch for this notebook. If you don't have an Elastic Cloud deployment, sign up [here](https://cloud.elastic.co/registration?utm_source=github&utm_content=elasticsearch-labs-notebook) for a free trial. \n", - "\n", - "We'll use the **Cloud ID** to identify our deployment, because we are using Elastic Cloud deployment. To find the Cloud ID for your deployment, go to https://cloud.elastic.co/deployments and select your deployment.\n", - "\n", - "\n", - "We will use [ElasticsearchStore](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.elasticsearch.ElasticsearchStore.html) to connect to our elastic cloud deployment. This would help create and index data easily. In the ElasticsearchStore instance, will set embedding to [OpenAIEmbeddings](https://api.python.langchain.com/en/latest/embeddings/langchain.embeddings.openai.OpenAIEmbeddings.html) to embed the texts and elasticsearch index name that will be used in this example." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "a-t1mglib54F" - }, - "outputs": [], - "source": [ - "from langchain.vectorstores import ElasticsearchStore\n", - "from langchain.embeddings.openai import OpenAIEmbeddings\n", - "from getpass import getpass\n", - "\n", - "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#finding-your-cloud-id\n", - "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", - "\n", - "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#creating-an-api-key\n", - "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", - "\n", - "# https://platform.openai.com/api-keys\n", - "OPENAI_API_KEY = getpass(\"OpenAI API key: \")\n", - "\n", - "embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)\n", - "\n", - "vector_store = ElasticsearchStore(\n", - " es_cloud_id=ELASTIC_CLOUD_ID,\n", - " es_api_key=ELASTIC_API_KEY,\n", - " index_name= \"workplace_index\",\n", - " embedding=embeddings\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Indexing Data into Elasticsearch\n", - "\n", - "Let's download the sample dataset and deserialize the document. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "J8-93TiJsNyK" - }, - "outputs": [], - "source": [ - "from urllib.request import urlopen\n", - "from langchain.llms import OpenAI\n", - "import json\n", - "\n", - "url = \"https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/example-apps/chatbot-rag-app/data/data.json\"\n", - "\n", - "response = urlopen(url)\n", - "data = json.load(response)\n", - "\n", - "with open('temp.json', 'w') as json_file:\n", - " json.dump(data, json_file)\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "p0cQFDl1b9v4" - }, - "source": [ - "### Split Documents into Passages\n", - "\n", - "We’ll chunk documents into passages in order to improve the retrieval specificity and to ensure that we can provide multiple passages within the context window of the final question answering prompt.\n", - "\n", - "Here we are chunking documents into 800 token passages with an overlap of 400 tokens.\n", - "\n", - "Here we are using a simple splitter but Langchain offers more advanced splitters to reduce the chance of context being lost." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "dbHEoTF6vBXE" - }, - "outputs": [], - "source": [ - "from langchain.document_loaders import JSONLoader \n", - "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", - "\n", - "def metadata_func(record: dict, metadata: dict) -> dict:\n", - " metadata[\"name\"] = record.get(\"name\")\n", - " metadata[\"summary\"] = record.get(\"summary\")\n", - " metadata[\"url\"] = record.get(\"url\")\n", - " metadata[\"category\"] = record.get(\"category\")\n", - " metadata[\"updated_at\"] = record.get(\"updated_at\")\n", - "\n", - " return metadata\n", - "\n", - "# For more loaders https://python.langchain.com/docs/modules/data_connection/document_loaders/\n", - "# And 3rd party loaders https://python.langchain.com/docs/modules/data_connection/document_loaders/#third-party-loaders\n", - "loader = JSONLoader(\n", - " file_path=\"temp.json\",\n", - " jq_schema=\".[]\",\n", - " content_key=\"content\",\n", - " metadata_func=metadata_func,\n", - ")\n", - "\n", - "text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=512, chunk_overlap=256)\n", - "docs = loader.load_and_split(text_splitter=text_splitter)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "RmCUl0hxW4lG" - }, - "source": [ - "### Bulk Import Passages\n", - "\n", - "Now that we have split each document into the chunk size of 800, we will now index data to elasticsearch using [ElasticsearchStore.from_documents](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.elasticsearch.ElasticsearchStore.html#langchain.vectorstores.elasticsearch.ElasticsearchStore.from_documents).\n", - "\n", - "We will use Cloud ID, Password and Index name values set in the `Create cloud deployment` step." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "documents = vector_store.from_documents(\n", - " docs, \n", - " embeddings, \n", - " index_name=\"workplace_index\",\n", - " es_cloud_id=ELASTIC_CLOUD_ID,\n", - " es_api_key=ELASTIC_API_KEY\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "rXJH_MiWejv7" - }, - "source": [ - "## Asking a question\n", - "Now that we have the passages stored in Elasticsearch, we can now ask a question to get the relevant passages." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "OobeBT6rek7Q", - "outputId": "ba7b3a7a-253e-4e7f-83b9-cec07ebdac09" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "---- Answer ----\n", - "\n", - "The NASA Sales Team is responsible for understanding the unique market dynamics and cultural nuances of North and South America. It is led by Area Vice-Presidents Laura Martinez (North America) and Gary Johnson (South America), and consists of dedicated account managers, sales representatives, and support staff. The team works to effectively target and engage with customers across the region.\n" - ] - } - ], - "source": [ - "from langchain.schema.runnable import RunnablePassthrough\n", - "from langchain.prompts import ChatPromptTemplate\n", - "from langchain.schema.output_parser import StrOutputParser\n", - "\n", - "retriever = vector_store.as_retriever()\n", - "\n", - "llm = OpenAI(openai_api_key=OPENAI_API_KEY)\n", - "\n", - "ANSWER_PROMPT = ChatPromptTemplate.from_template(\n", - " \"\"\"You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Be as verbose and educational in your response as possible. 
\n", - " \n", - " context: {context}\n", - " Question: \"{question}\"\n", - " Answer:\n", - " \"\"\"\n", - ")\n", - "\n", - "chain = (\n", - " {\"context\": retriever, \"question\": RunnablePassthrough()}\n", - " | ANSWER_PROMPT\n", - " | llm\n", - " | StrOutputParser()\n", - ")\n", - "\n", - "ans = chain.invoke(\"what is the nasa sales team?\")\n", - "\n", - "print(\"---- Answer ----\")\n", - "print(ans)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Add Source Tracing\n", - "RAG can provide clear traceability of the source knowledge used to answer a question. This is important for compliance and regulatory reasons and limiting LLM hallucinations. This is known as source tracking.\n", - "\n", - "In this example, we extend the Prompt template to ask the LLM to cite the source of the answer." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "---- Answer ----\n", - "The North America South America (NASA) sales team is responsible for serving customers and achieving business objectives across North and South America. The team is led by two Area Vice-Presidents: Laura Martinez is the Area Vice-President of North America, and Gary Johnson is the Area Vice-President of South America. The team consists of dedicated account managers, sales representatives, and support staff. 
They are responsible for identifying and pursuing new business opportunities, nurturing existing client relationships, and ensuring customer satisfaction.\n", - "SOURCE: Sales Organization Overview\n" - ] - } - ], - "source": [ - "from langchain.schema.runnable import RunnablePassthrough\n", - "from langchain.prompts import ChatPromptTemplate, PromptTemplate\n", - "from langchain.schema.output_parser import StrOutputParser\n", - "from langchain.schema import format_document\n", - "\n", - "retriever = vector_store.as_retriever()\n", - "\n", - "llm = OpenAI(openai_api_key=OPENAI_API_KEY)\n", - "\n", - "ANSWER_PROMPT = ChatPromptTemplate.from_template(\n", - "\"\"\"\n", - "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Be as verbose and educational in your response as possible. \n", - "Each passage has a SOURCE which is the title of the document. When answering, cite source name of the passages you are answering from below the answer, on a new line, with a prefix of \"SOURCE:\".\n", - "\n", - "\n", - "context: {context}\n", - "Question: \"{question}\"\n", - "Answer:\n", - "\"\"\"\n", - ")\n", - "\n", - "DOCUMENT_PROMPT = PromptTemplate.from_template(\"\"\"\n", - "---\n", - "SOURCE: {name}\n", - "{page_content}\n", - "---\n", - "\"\"\")\n", - "\n", - "def _combine_documents(\n", - " docs, document_prompt=DOCUMENT_PROMPT, document_separator=\"\\n\\n\"\n", - "):\n", - " doc_strings = [format_document(doc, document_prompt) for doc in docs]\n", - " return document_separator.join(doc_strings)\n", - "\n", - "_context = {\n", - " \"context\": retriever | _combine_documents,\n", - " \"question\": RunnablePassthrough(),\n", - "}\n", - "\n", - "chain = (\n", - " _context\n", - " | ANSWER_PROMPT\n", - " | llm\n", - " | StrOutputParser()\n", - ")\n", - "\n", - "ans = chain.invoke(\"what is the nasa sales team?\")\n", - "\n", - "print(\"---- Answer 
----\")\n", - "print(ans)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Returning Passages with Answer\n", - "\n", - "In this example, we extend the chain to return the passages back with the answer. This is helpful for the UI to display the source passages, should the user want to read more on the topic. " - ] - }, + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "tZnIXBfrRpex" + }, + "source": [ + "# Question Answering with Langchain and OpenAI\n", + "\n", + "\"Open\n", + "\n", + "This interactive notebook uses Langchain to split fictional workplace documents into passages and uses OpenAI to transform these passages into embeddings and store them into Elasticsearch.\n", + "\n", + "\n", + "![image.png]()\n", + "\n", + "Then when we ask a question, we retrieve the relevant passages from the vector store and use langchain and OpenAI to provide a summary for the question." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "GyAst2W-VpHb" + }, + "source": [ + "## Install required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "33A-cP-XvFCr" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "---- Answer ----\n", - "The North America South America (NASA) region has two Area Vice-Presidents: Laura Martinez is the Area Vice-President of North America, and Gary Johnson is the Area Vice-President of South America. The NASA sales team consists of dedicated account managers, sales representatives, and support staff, led by their respective Area Vice-Presidents. They are responsible for identifying and pursuing new business opportunities, nurturing existing client relationships, and ensuring customer satisfaction. 
The teams collaborate closely with other departments, such as marketing, product development, and customer support, to ensure we consistently deliver high-quality products and services to our clients.\n", - "\n", - "SOURCE: Sales Organization Overview\n", - "\n", - "---- Documents ----\n", - "Sales Organization Overview\n", - "Our sales organization is structured to effectively serve our customers and achieve our business objectives across multiple regions. The organization is divided into the following main regions:\n", - "\n", - "The Americas: This region includes the United States, Canada, Mexico, as well as Central and South America. The North America South America region (NASA) has two Area Vice-Presidents: Laura Martinez is the Area Vice-President of North America, and Gary Johnson is the Area Vice-President of South America.\n", - "\n", - "Europe: Our European sales team covers the entire continent, including the United Kingdom, Germany, France, Spain, Italy, and other countries. The team is responsible for understanding the unique market dynamics and cultural nuances, enabling them to effectively target and engage with customers across the region. The Area Vice-President for Europe is Rajesh Patel.\n", - "Asia-Pacific: This region encompasses countries such as China, Japan, South Korea, India, Australia, and New Zealand. Our sales team in the Asia-Pacific region works diligently to capitalize on growth opportunities and address the diverse needs of customers in this vast and rapidly evolving market. The Area Vice-President for Asia-Pacific is Mei Li.\n", - "Middle East & Africa: This region comprises countries across the Middle East and Africa, such as the United Arab Emirates, Saudi Arabia, South Africa, and Nigeria. Our sales team in this region is responsible for navigating the unique market challenges and identifying opportunities to expand our presence and better serve our customers. 
The Area Vice-President for Middle East & Africa is Jamal Abdi.\n", - "\n", - "Each regional sales team consists of dedicated account managers, sales representatives, and support staff, led by their respective Area Vice-Presidents. They are responsible for identifying and pursuing new business opportunities, nurturing existing client relationships, and ensuring customer satisfaction. The teams collaborate closely with other departments, such as marketing, product development, and customer support, to ensure we consistently deliver high-quality products and services to our clients.\n", - "----\n", - "Sales Engineering Collaboration\n", - "As an engineer, it is important to understand the sales team's goals and objectives, as this will help you to provide them with the necessary information, tools, and support to successfully sell your company's products and services.\n", - "Communication:\n", - "Effective communication is key to successfully working with the sales team. Make sure to maintain open lines of communication, and be responsive to their questions and concerns. This includes:\n", - "\n", - "a. Attending sales meetings and conference calls when required.\n", - "b. Providing regular product updates and training sessions to the sales team.\n", - "c. Being available to answer technical questions and clarifications.\n", - "Collaboration:\n", - "Collaborate with the sales team in developing and refining sales materials, such as product presentations, demos, and technical documents. This will ensure that the sales team has accurate and up-to-date information to present to clients.\n", - "\n", - "Additionally, work closely with the sales team on customer projects or product customizations, providing technical guidance, and ensuring that the solutions meet the customer's requirements.\n", - "Customer Engagement:\n", - "At times, engineers may be asked to join sales meetings or calls with potential clients to provide technical expertise. 
In these situations, it is important to:\n", - "\n", - "a. Be prepared and understand the customer's needs and pain points.\n", - "b. Clearly explain the technical aspects of the product or solution in a simple language that the customer can understand.\n", - "c. Address any concerns or questions the customer may have.\n", - "Continuous Improvement:\n", - "Actively seek feedback from the sales team regarding product performance, customer experiences, and market trends. Use this feedback to identify areas of improvement and collaborate with other engineers to enhance the product or service offerings.\n", - "Mutual Respect and Support:\n", - "It is essential to treat your colleagues in the sales team with respect and professionalism. Recognize and appreciate their efforts in promoting and selling the company's products and services. In turn, the sales team should also respect and appreciate the technical expertise and knowledge of the engineering team.\n", - "\n", - "By working together, both the engineering and sales teams can contribute to the overall success of the company.\n", - "\n", - "Conclusion:\n", - "Collaboration between engineers and the sales team is crucial for a tech company's success. By understanding each other's roles, maintaining effective communication, collaborating on projects, and supporting one another, both teams can work together to achieve the company's goals and ensure customer satisfaction.\n", - "----\n", - "Fy2024 Company Sales Strategy\n", - "III. Action Plans\n", - "A. Sales Team Development:\n", - "Expand the sales team to cover new markets and industries.\n", - "Provide ongoing training to sales staff on product knowledge, sales techniques, and industry trends.\n", - "Implement a performance-based incentive system to reward top performers.\n", - "\n", - "B. 
Marketing and Promotion:\n", - "Develop targeted marketing campaigns for different customer segments and industries.\n", - "Leverage digital marketing channels to increase brand visibility and lead generation.\n", - "Participate in industry events and trade shows to showcase our products and services.\n", - "\n", - "C. Partner Ecosystem:\n", - "Strengthen existing partnerships and establish new strategic alliances to expand market reach.\n", - "Collaborate with partners on joint marketing and sales initiatives.\n", - "Provide partner training and support to ensure they effectively represent our products and services.\n", - "\n", - "D. Customer Success:\n", - "Implement a proactive customer success program to improve customer retention and satisfaction.\n", - "Develop a dedicated customer support team to address customer inquiries and concerns promptly.\n", - "Collect and analyze customer feedback to identify areas for improvement in our products, services, and processes.\n", - "\n", - "IV. Monitoring and Evaluation\n", - "Establish key performance indicators (KPIs) to track progress toward our objectives.\n", - "Conduct regular sales team meetings to review performance, share best practices, and address challenges.\n", - "Conduct quarterly reviews of our sales strategy to ensure alignment with market trends and adjust as needed.\n", - "\n", - "By following this sales strategy for fiscal year 2024, our tech company aims to achieve significant growth and success in our target markets, while also providing exceptional value and service to our customers.\n", - "----\n", - "Sales Engineering Collaboration\n", - "Title: Working with the Sales Team as an Engineer in a Tech Company\n", - "\n", - "Introduction:\n", - "As an engineer in a tech company, collaboration with the sales team is essential to ensure the success of the company's products and services. 
This guidance document aims to provide an overview of how engineers can effectively work with the sales team, fostering a positive and productive working environment.\n", - "Understanding the Sales Team's Role:\n", - "The sales team is responsible for promoting and selling the company's products and services to potential clients. Their role involves establishing relationships with customers, understanding their needs, and ensuring that the offered solutions align with their requirements.\n", - "\n", - "As an engineer, it is important to understand the sales team's goals and objectives, as this will help you to provide them with the necessary information, tools, and support to successfully sell your company's products and services.\n", - "Communication:\n", - "Effective communication is key to successfully working with the sales team. Make sure to maintain open lines of communication, and be responsive to their questions and concerns. This includes:\n", - "\n", - "a. Attending sales meetings and conference calls when required.\n", - "b. Providing regular product updates and training sessions to the sales team.\n", - "c. Being available to answer technical questions and clarifications.\n", - "Collaboration:\n", - "Collaborate with the sales team in developing and refining sales materials, such as product presentations, demos, and technical documents. This will ensure that the sales team has accurate and up-to-date information to present to clients.\n", - "\n", - "Additionally, work closely with the sales team on customer projects or product customizations, providing technical guidance, and ensuring that the solutions meet the customer's requirements.\n", - "Customer Engagement:\n", - "At times, engineers may be asked to join sales meetings or calls with potential clients to provide technical expertise. 
In these situations, it is important to:\n", - "----\n" - ] - } - ], - "source": [ - "from langchain.schema.runnable import RunnableMap\n", - "from langchain.prompts import ChatPromptTemplate, PromptTemplate\n", - "from langchain.schema import format_document\n", - "from operator import itemgetter\n", - "\n", - "retriever = vector_store.as_retriever()\n", - "\n", - "llm = OpenAI(openai_api_key=OPENAI_API_KEY)\n", - "\n", - "ANSWER_PROMPT = ChatPromptTemplate.from_template(\n", - "\"\"\"\n", - "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Be as verbose and educational in your response as possible. \n", - "Each passage has a SOURCE which is the title of the document. When answering, cite source name of the passages you are answering from below the answer, on a new line, with a prefix of \"SOURCE:\".\n", - "\n", - "context: {context}\n", - "Question: {question}\n", - "Answer:\n", - "\n", - "\"\"\"\n", - ")\n", - "\n", - "DOCUMENT_PROMPT = PromptTemplate.from_template(\"\"\"\n", - "---\n", - "SOURCE: {name}\n", - "{page_content}\n", - "---\n", - "\"\"\")\n", - "\n", - "def _combine_documents(\n", - " docs, document_prompt=DOCUMENT_PROMPT, document_separator=\"\\n\\n\"\n", - "):\n", - " doc_strings = [format_document(doc, document_prompt) for doc in docs]\n", - " return document_separator.join(doc_strings)\n", - "\n", - "retrieved_documents = RunnableMap(\n", - " docs=itemgetter(\"question\") | retriever,\n", - " question=itemgetter(\"question\"),\n", - ")\n", - "\n", - "_context = {\n", - " \"context\": lambda x: _combine_documents(x[\"docs\"]),\n", - " \"question\": lambda x: x[\"question\"],\n", - "}\n", - "\n", - "answer = {\n", - " \"answer\": _context | ANSWER_PROMPT | llm,\n", - " \"docs\": itemgetter(\"docs\"),\n", - "}\n", - "\n", - "chain = (\n", - " retrieved_documents | answer\n", - ")\n", - "\n", - "ans = chain.invoke({ 
\"question\": \"what is the nasa sales team?\"})\n", - "\n", - "print(\"---- Answer ----\")\n", - "print(ans[\"answer\"])\n", - "print()\n", - "print(\"---- Documents ----\")\n", - "for doc in ans[\"docs\"]:\n", - " print(doc.metadata[\"name\"])\n", - " print(doc.page_content)\n", - " print(\"----\")\n" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "langserve 0.0.21 requires pydantic<2,>=1, but you have pydantic 2.3.0 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "!python3 -m pip install -qU langchain openai==0.28.1 elasticsearch tiktoken jq" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "qtEOCsCLWCZp" + }, + "source": [ + "## Connect to Elasticsearch\n", + "\n", + "ℹ️ We're using an Elastic Cloud deployment of Elasticsearch for this notebook. If you don't have an Elastic Cloud deployment, sign up [here](https://cloud.elastic.co/registration?utm_source=github&utm_content=elasticsearch-labs-notebook) for a free trial. \n", + "\n", + "We'll use the **Cloud ID** to identify our deployment, because we are using Elastic Cloud deployment. 
To find the Cloud ID for your deployment, go to https://cloud.elastic.co/deployments and select your deployment.\n", + "\n", + "\n", + "We will use [ElasticsearchStore](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.elasticsearch.ElasticsearchStore.html) to connect to our Elastic Cloud deployment. This makes it easy to create the index and add data to it. In the ElasticsearchStore instance, we will set the embedding to [OpenAIEmbeddings](https://api.python.langchain.com/en/latest/embeddings/langchain.embeddings.openai.OpenAIEmbeddings.html) to embed the texts, and set the Elasticsearch index name that will be used in this example." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "a-t1mglib54F" + }, + "outputs": [], + "source": [ + "from langchain.vectorstores import ElasticsearchStore\n", + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from getpass import getpass\n", + "\n", + "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#finding-your-cloud-id\n", + "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", + "\n", + "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#creating-an-api-key\n", + "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", + "\n", + "# https://platform.openai.com/api-keys\n", + "OPENAI_API_KEY = getpass(\"OpenAI API key: \")\n", + "\n", + "embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)\n", + "\n", + "vector_store = ElasticsearchStore(\n", + " es_cloud_id=ELASTIC_CLOUD_ID,\n", + " es_api_key=ELASTIC_API_KEY,\n", + " index_name=\"workplace_index\",\n", + " embedding=embeddings,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Indexing Data into Elasticsearch\n", + "\n", + "Let's download the sample dataset and deserialize the document.
" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "J8-93TiJsNyK" + }, + "outputs": [], + "source": [ + "from urllib.request import urlopen\n", + "from langchain.llms import OpenAI\n", + "import json\n", + "\n", + "url = \"https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/example-apps/chatbot-rag-app/data/data.json\"\n", + "\n", + "response = urlopen(url)\n", + "data = json.load(response)\n", + "\n", + "with open(\"temp.json\", \"w\") as json_file:\n", + " json.dump(data, json_file)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "p0cQFDl1b9v4" + }, + "source": [ + "### Split Documents into Passages\n", + "\n", + "We’ll chunk documents into passages in order to improve the retrieval specificity and to ensure that we can provide multiple passages within the context window of the final question answering prompt.\n", + "\n", + "Here we are chunking documents into 512 token passages with an overlap of 256 tokens.\n", + "\n", + "Here we are using a simple splitter but Langchain offers more advanced splitters to reduce the chance of context being lost."
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "dbHEoTF6vBXE" + }, + "outputs": [], + "source": [ + "from langchain.document_loaders import JSONLoader\n", + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "\n", + "\n", + "def metadata_func(record: dict, metadata: dict) -> dict:\n", + " metadata[\"name\"] = record.get(\"name\")\n", + " metadata[\"summary\"] = record.get(\"summary\")\n", + " metadata[\"url\"] = record.get(\"url\")\n", + " metadata[\"category\"] = record.get(\"category\")\n", + " metadata[\"updated_at\"] = record.get(\"updated_at\")\n", + "\n", + " return metadata\n", + "\n", + "\n", + "# For more loaders https://python.langchain.com/docs/modules/data_connection/document_loaders/\n", + "# And 3rd party loaders https://python.langchain.com/docs/modules/data_connection/document_loaders/#third-party-loaders\n", + "loader = JSONLoader(\n", + " file_path=\"temp.json\",\n", + " jq_schema=\".[]\",\n", + " content_key=\"content\",\n", + " metadata_func=metadata_func,\n", + ")\n", + "\n", + "text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n", + " chunk_size=512, chunk_overlap=256\n", + ")\n", + "docs = loader.load_and_split(text_splitter=text_splitter)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "RmCUl0hxW4lG" + }, + "source": [ + "### Bulk Import Passages\n", + "\n", + "Now that we have split each document into passages of at most 512 tokens, we will index the data into Elasticsearch using [ElasticsearchStore.from_documents](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.elasticsearch.ElasticsearchStore.html#langchain.vectorstores.elasticsearch.ElasticsearchStore.from_documents).\n", + "\n", + "We will use the Cloud ID, API key and index name values set in the `Connect to Elasticsearch` step."
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "documents = vector_store.from_documents(\n", + " docs,\n", + " embeddings,\n", + " index_name=\"workplace_index\",\n", + " es_cloud_id=ELASTIC_CLOUD_ID,\n", + " es_api_key=ELASTIC_API_KEY,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "rXJH_MiWejv7" + }, + "source": [ + "## Asking a question\n", + "Now that we have the passages stored in Elasticsearch, we can now ask a question to get the relevant passages." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "OobeBT6rek7Q", + "outputId": "ba7b3a7a-253e-4e7f-83b9-cec07ebdac09" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Conversational Question Answering\n", - "We have achieved getting answers to questions, but what if we want to ask follow up questions? We can use the answer from the previous question as the context for the next question. This is known as conversational question answering.\n", - "\n", - "In this example, we extend the chain to use the answer from the previous question as the context for the next question." - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "---- Answer ----\n", + "\n", + "The NASA Sales Team is responsible for understanding the unique market dynamics and cultural nuances of North and South America. It is led by Area Vice-Presidents Laura Martinez (North America) and Gary Johnson (South America), and consists of dedicated account managers, sales representatives, and support staff. 
The team works to effectively target and engage with customers across the region.\n" + ] + } + ], + "source": [ + "from langchain.schema.runnable import RunnablePassthrough\n", + "from langchain.prompts import ChatPromptTemplate\n", + "from langchain.schema.output_parser import StrOutputParser\n", + "\n", + "retriever = vector_store.as_retriever()\n", + "\n", + "llm = OpenAI(openai_api_key=OPENAI_API_KEY)\n", + "\n", + "ANSWER_PROMPT = ChatPromptTemplate.from_template(\n", + " \"\"\"You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Be as verbose and educational in your response as possible. \n", + " \n", + " context: {context}\n", + " Question: \"{question}\"\n", + " Answer:\n", + " \"\"\"\n", + ")\n", + "\n", + "chain = (\n", + " {\"context\": retriever, \"question\": RunnablePassthrough()}\n", + " | ANSWER_PROMPT\n", + " | llm\n", + " | StrOutputParser()\n", + ")\n", + "\n", + "ans = chain.invoke(\"what is the nasa sales team?\")\n", + "\n", + "print(\"---- Answer ----\")\n", + "print(ans)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Add Source Tracing\n", + "RAG can provide clear traceability of the source knowledge used to answer a question. This is important for compliance and regulatory reasons and limiting LLM hallucinations. This is known as source tracking.\n", + "\n", + "In this example, we extend the Prompt template to ask the LLM to cite the source of the answer." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "---- Answer ----\n", - "The objectives for fiscal year 2024 are to increase revenue by 20% compared to fiscal year 2023, expand market share in key segments by 15%, retain 95% of existing customers and increase customer satisfaction ratings, and launch at least two new products or services in high-demand market segments. SOURCE: Fy2024 Company Sales Strategy\n" - ] - } - ], - "source": [ - "from langchain.schema.runnable import RunnableMap\n", - "from langchain.prompts import ChatPromptTemplate, PromptTemplate\n", - "from langchain.schema import format_document\n", - "from operator import itemgetter\n", - "\n", - "retriever = vector_store.as_retriever()\n", - "\n", - "llm = OpenAI(openai_api_key=OPENAI_API_KEY)\n", - "\n", - "ANSWER_PROMPT = ChatPromptTemplate.from_template(\n", - "\"\"\"\n", - "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Be as verbose and educational in your response as possible. \n", - "Each passage has a SOURCE which is the title of the document. 
When answering, cite source name of the passages you are answering from below the answer, on a new line, with a prefix of \"SOURCE:\".\n", - "\n", - "context: \n", - "{context}\n", - "\n", - "Question: {question}\n", - "Answer:\n", - "\"\"\"\n", - ")\n", - "\n", - "DOCUMENT_PROMPT = PromptTemplate.from_template(\"\"\"\n", - "---\n", - "SOURCE: {name}\n", - "{page_content}\n", - "---\n", - "\"\"\")\n", - "\n", - "CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(\n", - "\"\"\"Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.\n", - "\n", - "Chat History:\n", - "{chat_history}\n", - "Follow Up Input: {question}\n", - "\"\"\")\n", - "\n", - "standalone_question = RunnableMap(\n", - " standalone_question=RunnablePassthrough.assign(\n", - " chat_history=lambda x: _format_chat_history(x[\"chat_history\"])\n", - " )\n", - " | CONDENSE_QUESTION_PROMPT\n", - " | llm\n", - " | StrOutputParser(),\n", - ")\n", - "\n", - "def _format_chat_history(chat_history) -> str:\n", - " buffer = \"\"\n", - " for dialogue_turn in chat_history:\n", - " human = \"Human: \" + dialogue_turn[0]\n", - " ai = \"Assistant: \" + dialogue_turn[1]\n", - " buffer += \"\\n\" + \"\\n\".join([human, ai])\n", - " return buffer\n", - "\n", - "def _combine_documents(\n", - " docs, document_prompt=DOCUMENT_PROMPT, document_separator=\"\\n\\n\"\n", - "):\n", - " doc_strings = [format_document(doc, document_prompt) for doc in docs]\n", - " return document_separator.join(doc_strings)\n", - "\n", - "retrieved_documents = RunnableMap(\n", - " docs=itemgetter(\"standalone_question\") | retriever,\n", - " question=itemgetter(\"standalone_question\"),\n", - ")\n", - "\n", - "_context = {\n", - " \"context\": lambda x: _combine_documents(x[\"docs\"]),\n", - " \"question\": lambda x: x[\"question\"],\n", - "}\n", - "\n", - "answer = {\n", - " \"answer\": _context | ANSWER_PROMPT | llm,\n", - " \"docs\": 
itemgetter(\"docs\"),\n", - "}\n", - "\n", - "chain = (\n", - " standalone_question | retrieved_documents | answer\n", - ")\n", - "\n", - "ans = chain.invoke({ \n", - " \"question\": \"What are their objectives?\", \n", - " \"chat_history\": [\n", - " \"What is the nasa sales team?\",\n", - " \"The sales team of NASA consists of Laura Martinez, the Area \"\n", - " \"Vice-President of North America, and Gary Johnson, the Area \"\n", - " \"Vice-President of South America.\"\n", - " \"SOURCE: Sales Organization Overview\"\n", - " ]\n", - "})\n", - "\n", - "print(\"---- Answer ----\")\n", - "print(ans[\"answer\"])" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "---- Answer ----\n", + "The North America South America (NASA) sales team is responsible for serving customers and achieving business objectives across North and South America. The team is led by two Area Vice-Presidents: Laura Martinez is the Area Vice-President of North America, and Gary Johnson is the Area Vice-President of South America. The team consists of dedicated account managers, sales representatives, and support staff. They are responsible for identifying and pursuing new business opportunities, nurturing existing client relationships, and ensuring customer satisfaction.\n", + "SOURCE: Sales Organization Overview\n" + ] + } + ], + "source": [ + "from langchain.schema.runnable import RunnablePassthrough\n", + "from langchain.prompts import ChatPromptTemplate, PromptTemplate\n", + "from langchain.schema.output_parser import StrOutputParser\n", + "from langchain.schema import format_document\n", + "\n", + "retriever = vector_store.as_retriever()\n", + "\n", + "llm = OpenAI(openai_api_key=OPENAI_API_KEY)\n", + "\n", + "ANSWER_PROMPT = ChatPromptTemplate.from_template(\n", + " \"\"\"\n", + "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. 
Be as verbose and educational in your response as possible. \n", + "Each passage has a SOURCE which is the title of the document. When answering, cite source name of the passages you are answering from below the answer, on a new line, with a prefix of \"SOURCE:\".\n", + "\n", + "\n", + "context: {context}\n", + "Question: \"{question}\"\n", + "Answer:\n", + "\"\"\"\n", + ")\n", + "\n", + "DOCUMENT_PROMPT = PromptTemplate.from_template(\n", + " \"\"\"\n", + "---\n", + "SOURCE: {name}\n", + "{page_content}\n", + "---\n", + "\"\"\"\n", + ")\n", + "\n", + "\n", + "def _combine_documents(\n", + " docs, document_prompt=DOCUMENT_PROMPT, document_separator=\"\\n\\n\"\n", + "):\n", + " doc_strings = [format_document(doc, document_prompt) for doc in docs]\n", + " return document_separator.join(doc_strings)\n", + "\n", + "\n", + "_context = {\n", + " \"context\": retriever | _combine_documents,\n", + " \"question\": RunnablePassthrough(),\n", + "}\n", + "\n", + "chain = _context | ANSWER_PROMPT | llm | StrOutputParser()\n", + "\n", + "ans = chain.invoke(\"what is the nasa sales team?\")\n", + "\n", + "print(\"---- Answer ----\")\n", + "print(ans)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Returning Passages with Answer\n", + "\n", + "In this example, we extend the chain to return the passages back with the answer. This is helpful for the UI to display the source passages, should the user want to read more on the topic. " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Next Steps\n", - "We have shown how to use Langchain to build a question answering system. We have shown how to index data into Elasticsearch, ask a question and use the answer from the previous question as the context for the next question." 
- ] + "name": "stdout", + "output_type": "stream", + "text": [ + "---- Answer ----\n", + "The North America South America (NASA) region has two Area Vice-Presidents: Laura Martinez is the Area Vice-President of North America, and Gary Johnson is the Area Vice-President of South America. The NASA sales team consists of dedicated account managers, sales representatives, and support staff, led by their respective Area Vice-Presidents. They are responsible for identifying and pursuing new business opportunities, nurturing existing client relationships, and ensuring customer satisfaction. The teams collaborate closely with other departments, such as marketing, product development, and customer support, to ensure we consistently deliver high-quality products and services to our clients.\n", + "\n", + "SOURCE: Sales Organization Overview\n", + "\n", + "---- Documents ----\n", + "Sales Organization Overview\n", + "Our sales organization is structured to effectively serve our customers and achieve our business objectives across multiple regions. The organization is divided into the following main regions:\n", + "\n", + "The Americas: This region includes the United States, Canada, Mexico, as well as Central and South America. The North America South America region (NASA) has two Area Vice-Presidents: Laura Martinez is the Area Vice-President of North America, and Gary Johnson is the Area Vice-President of South America.\n", + "\n", + "Europe: Our European sales team covers the entire continent, including the United Kingdom, Germany, France, Spain, Italy, and other countries. The team is responsible for understanding the unique market dynamics and cultural nuances, enabling them to effectively target and engage with customers across the region. The Area Vice-President for Europe is Rajesh Patel.\n", + "Asia-Pacific: This region encompasses countries such as China, Japan, South Korea, India, Australia, and New Zealand. 
Our sales team in the Asia-Pacific region works diligently to capitalize on growth opportunities and address the diverse needs of customers in this vast and rapidly evolving market. The Area Vice-President for Asia-Pacific is Mei Li.\n", + "Middle East & Africa: This region comprises countries across the Middle East and Africa, such as the United Arab Emirates, Saudi Arabia, South Africa, and Nigeria. Our sales team in this region is responsible for navigating the unique market challenges and identifying opportunities to expand our presence and better serve our customers. The Area Vice-President for Middle East & Africa is Jamal Abdi.\n", + "\n", + "Each regional sales team consists of dedicated account managers, sales representatives, and support staff, led by their respective Area Vice-Presidents. They are responsible for identifying and pursuing new business opportunities, nurturing existing client relationships, and ensuring customer satisfaction. The teams collaborate closely with other departments, such as marketing, product development, and customer support, to ensure we consistently deliver high-quality products and services to our clients.\n", + "----\n", + "Sales Engineering Collaboration\n", + "As an engineer, it is important to understand the sales team's goals and objectives, as this will help you to provide them with the necessary information, tools, and support to successfully sell your company's products and services.\n", + "Communication:\n", + "Effective communication is key to successfully working with the sales team. Make sure to maintain open lines of communication, and be responsive to their questions and concerns. This includes:\n", + "\n", + "a. Attending sales meetings and conference calls when required.\n", + "b. Providing regular product updates and training sessions to the sales team.\n", + "c. 
Being available to answer technical questions and clarifications.\n", + "Collaboration:\n", + "Collaborate with the sales team in developing and refining sales materials, such as product presentations, demos, and technical documents. This will ensure that the sales team has accurate and up-to-date information to present to clients.\n", + "\n", + "Additionally, work closely with the sales team on customer projects or product customizations, providing technical guidance, and ensuring that the solutions meet the customer's requirements.\n", + "Customer Engagement:\n", + "At times, engineers may be asked to join sales meetings or calls with potential clients to provide technical expertise. In these situations, it is important to:\n", + "\n", + "a. Be prepared and understand the customer's needs and pain points.\n", + "b. Clearly explain the technical aspects of the product or solution in a simple language that the customer can understand.\n", + "c. Address any concerns or questions the customer may have.\n", + "Continuous Improvement:\n", + "Actively seek feedback from the sales team regarding product performance, customer experiences, and market trends. Use this feedback to identify areas of improvement and collaborate with other engineers to enhance the product or service offerings.\n", + "Mutual Respect and Support:\n", + "It is essential to treat your colleagues in the sales team with respect and professionalism. Recognize and appreciate their efforts in promoting and selling the company's products and services. In turn, the sales team should also respect and appreciate the technical expertise and knowledge of the engineering team.\n", + "\n", + "By working together, both the engineering and sales teams can contribute to the overall success of the company.\n", + "\n", + "Conclusion:\n", + "Collaboration between engineers and the sales team is crucial for a tech company's success. 
By understanding each other's roles, maintaining effective communication, collaborating on projects, and supporting one another, both teams can work together to achieve the company's goals and ensure customer satisfaction.\n", + "----\n", + "Fy2024 Company Sales Strategy\n", + "III. Action Plans\n", + "A. Sales Team Development:\n", + "Expand the sales team to cover new markets and industries.\n", + "Provide ongoing training to sales staff on product knowledge, sales techniques, and industry trends.\n", + "Implement a performance-based incentive system to reward top performers.\n", + "\n", + "B. Marketing and Promotion:\n", + "Develop targeted marketing campaigns for different customer segments and industries.\n", + "Leverage digital marketing channels to increase brand visibility and lead generation.\n", + "Participate in industry events and trade shows to showcase our products and services.\n", + "\n", + "C. Partner Ecosystem:\n", + "Strengthen existing partnerships and establish new strategic alliances to expand market reach.\n", + "Collaborate with partners on joint marketing and sales initiatives.\n", + "Provide partner training and support to ensure they effectively represent our products and services.\n", + "\n", + "D. Customer Success:\n", + "Implement a proactive customer success program to improve customer retention and satisfaction.\n", + "Develop a dedicated customer support team to address customer inquiries and concerns promptly.\n", + "Collect and analyze customer feedback to identify areas for improvement in our products, services, and processes.\n", + "\n", + "IV. 
Monitoring and Evaluation\n", + "Establish key performance indicators (KPIs) to track progress toward our objectives.\n", + "Conduct regular sales team meetings to review performance, share best practices, and address challenges.\n", + "Conduct quarterly reviews of our sales strategy to ensure alignment with market trends and adjust as needed.\n", + "\n", + "By following this sales strategy for fiscal year 2024, our tech company aims to achieve significant growth and success in our target markets, while also providing exceptional value and service to our customers.\n", + "----\n", + "Sales Engineering Collaboration\n", + "Title: Working with the Sales Team as an Engineer in a Tech Company\n", + "\n", + "Introduction:\n", + "As an engineer in a tech company, collaboration with the sales team is essential to ensure the success of the company's products and services. This guidance document aims to provide an overview of how engineers can effectively work with the sales team, fostering a positive and productive working environment.\n", + "Understanding the Sales Team's Role:\n", + "The sales team is responsible for promoting and selling the company's products and services to potential clients. Their role involves establishing relationships with customers, understanding their needs, and ensuring that the offered solutions align with their requirements.\n", + "\n", + "As an engineer, it is important to understand the sales team's goals and objectives, as this will help you to provide them with the necessary information, tools, and support to successfully sell your company's products and services.\n", + "Communication:\n", + "Effective communication is key to successfully working with the sales team. Make sure to maintain open lines of communication, and be responsive to their questions and concerns. This includes:\n", + "\n", + "a. Attending sales meetings and conference calls when required.\n", + "b. 
Providing regular product updates and training sessions to the sales team.\n", + "c. Being available to answer technical questions and clarifications.\n", + "Collaboration:\n", + "Collaborate with the sales team in developing and refining sales materials, such as product presentations, demos, and technical documents. This will ensure that the sales team has accurate and up-to-date information to present to clients.\n", + "\n", + "Additionally, work closely with the sales team on customer projects or product customizations, providing technical guidance, and ensuring that the solutions meet the customer's requirements.\n", + "Customer Engagement:\n", + "At times, engineers may be asked to join sales meetings or calls with potential clients to provide technical expertise. In these situations, it is important to:\n", + "----\n" + ] } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3.11.3 64-bit", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.3" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" - } + ], + "source": [ + "from langchain.schema.runnable import RunnableMap\n", + "from langchain.prompts import ChatPromptTemplate, PromptTemplate\n", + "from langchain.schema import format_document\n", + "from operator import itemgetter\n", + "\n", + "retriever = vector_store.as_retriever()\n", + "\n", + "llm = OpenAI(openai_api_key=OPENAI_API_KEY)\n", + "\n", + "ANSWER_PROMPT = ChatPromptTemplate.from_template(\n", + " \"\"\"\n", + "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. Be as verbose and educational in your response as possible. \n", + "Each passage has a SOURCE which is the title of the document. When answering, cite source name of the passages you are answering from below the answer, on a new line, with a prefix of \"SOURCE:\".\n", + "\n", + "context: {context}\n", + "Question: {question}\n", + "Answer:\n", + "\n", + "\"\"\"\n", + ")\n", + "\n", + "DOCUMENT_PROMPT = PromptTemplate.from_template(\n", + " \"\"\"\n", + "---\n", + "SOURCE: {name}\n", + "{page_content}\n", + "---\n", + "\"\"\"\n", + ")\n", + "\n", + "\n", + "def _combine_documents(\n", + " docs, document_prompt=DOCUMENT_PROMPT, document_separator=\"\\n\\n\"\n", + "):\n", + " doc_strings = [format_document(doc, document_prompt) for doc in docs]\n", + " return document_separator.join(doc_strings)\n", + "\n", + "\n", + "retrieved_documents = RunnableMap(\n", + " docs=itemgetter(\"question\") | retriever,\n", + " question=itemgetter(\"question\"),\n", + ")\n", + "\n", + "_context = {\n", + " \"context\": lambda x: _combine_documents(x[\"docs\"]),\n", + " \"question\": lambda x: x[\"question\"],\n", + "}\n", + "\n", + "answer = {\n", + " \"answer\": _context | ANSWER_PROMPT | llm,\n", + " \"docs\": itemgetter(\"docs\"),\n", + "}\n", + "\n", + "chain = retrieved_documents | answer\n", + "\n", + "ans = chain.invoke({\"question\": \"what is the nasa sales team?\"})\n", + "\n", + "print(\"---- Answer ----\")\n", + "print(ans[\"answer\"])\n", + "print()\n", + "print(\"---- Documents ----\")\n", + "for doc in ans[\"docs\"]:\n", + " print(doc.metadata[\"name\"])\n", + " print(doc.page_content)\n", + " print(\"----\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Conversational Question Answering\n", + "We have achieved getting answers to questions, but what if we want to ask follow up questions? We can use the answer from the previous question as the context for the next question. 
This is known as conversational question answering.\n", + "\n", + "In this example, we extend the chain to use the answer from the previous question as the context for the next question." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---- Answer ----\n", + "The objectives for fiscal year 2024 are to increase revenue by 20% compared to fiscal year 2023, expand market share in key segments by 15%, retain 95% of existing customers and increase customer satisfaction ratings, and launch at least two new products or services in high-demand market segments. SOURCE: Fy2024 Company Sales Strategy\n" + ] } + ], + "source": [ + "from langchain.schema.runnable import RunnableMap\n", + "from langchain.prompts import ChatPromptTemplate, PromptTemplate\n", + "from langchain.schema import format_document\n", + "from operator import itemgetter\n", + "\n", + "retriever = vector_store.as_retriever()\n", + "\n", + "llm = OpenAI(openai_api_key=OPENAI_API_KEY)\n", + "\n", + "ANSWER_PROMPT = ChatPromptTemplate.from_template(\n", + " \"\"\"\n", + "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Be as verbose and educational in your response as possible. \n", + "Each passage has a SOURCE which is the title of the document. 
When answering, cite source name of the passages you are answering from below the answer, on a new line, with a prefix of \"SOURCE:\".\n", + "\n", + "context: \n", + "{context}\n", + "\n", + "Question: {question}\n", + "Answer:\n", + "\"\"\"\n", + ")\n", + "\n", + "DOCUMENT_PROMPT = PromptTemplate.from_template(\n", + " \"\"\"\n", + "---\n", + "SOURCE: {name}\n", + "{page_content}\n", + "---\n", + "\"\"\"\n", + ")\n", + "\n", + "CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(\n", + " \"\"\"Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.\n", + "\n", + "Chat History:\n", + "{chat_history}\n", + "Follow Up Input: {question}\n", + "\"\"\"\n", + ")\n", + "\n", + "standalone_question = RunnableMap(\n", + " standalone_question=RunnablePassthrough.assign(\n", + " chat_history=lambda x: _format_chat_history(x[\"chat_history\"])\n", + " )\n", + " | CONDENSE_QUESTION_PROMPT\n", + " | llm\n", + " | StrOutputParser(),\n", + ")\n", + "\n", + "\n", + "def _format_chat_history(chat_history) -> str:\n", + " buffer = \"\"\n", + " for dialogue_turn in chat_history:\n", + " human = \"Human: \" + dialogue_turn[0]\n", + " ai = \"Assistant: \" + dialogue_turn[1]\n", + " buffer += \"\\n\" + \"\\n\".join([human, ai])\n", + " return buffer\n", + "\n", + "\n", + "def _combine_documents(\n", + " docs, document_prompt=DOCUMENT_PROMPT, document_separator=\"\\n\\n\"\n", + "):\n", + " doc_strings = [format_document(doc, document_prompt) for doc in docs]\n", + " return document_separator.join(doc_strings)\n", + "\n", + "\n", + "retrieved_documents = RunnableMap(\n", + " docs=itemgetter(\"standalone_question\") | retriever,\n", + " question=itemgetter(\"standalone_question\"),\n", + ")\n", + "\n", + "_context = {\n", + " \"context\": lambda x: _combine_documents(x[\"docs\"]),\n", + " \"question\": lambda x: x[\"question\"],\n", + "}\n", + "\n", + "answer = {\n", + " \"answer\": _context | 
ANSWER_PROMPT | llm,\n", + " \"docs\": itemgetter(\"docs\"),\n", + "}\n", + "\n", + "chain = standalone_question | retrieved_documents | answer\n", + "\n", + "ans = chain.invoke(\n", + " {\n", + " \"question\": \"What are their objectives?\",\n", + " \"chat_history\": [\n", + " \"What is the nasa sales team?\",\n", + " \"The sales team of NASA consists of Laura Martinez, the Area \"\n", + " \"Vice-President of North America, and Gary Johnson, the Area \"\n", + " \"Vice-President of South America.\"\n", + " \"SOURCE: Sales Organization Overview\",\n", + " ],\n", + " }\n", + ")\n", + "\n", + "print(\"---- Answer ----\")\n", + "print(ans[\"answer\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next Steps\n", + "We have shown how to use Langchain to build a question answering system. We have shown how to index data into Elasticsearch, ask a question and use the answer from the previous question as the context for the next question." + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3.11.3 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.3" }, - "nbformat": 4, - "nbformat_minor": 0 + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/notebooks/integrations/amazon-bedrock/langchain-qa-example.ipynb b/notebooks/integrations/amazon-bedrock/langchain-qa-example.ipynb index 823d8363..15cfa11e 100644 --- a/notebooks/integrations/amazon-bedrock/langchain-qa-example.ipynb +++ b/notebooks/integrations/amazon-bedrock/langchain-qa-example.ipynb @@ -1,385 +1,382 @@ { - "cells": [ - { - 
"cell_type": "markdown", - "metadata": { - "id": "IQt5lMKvxios" - }, - "source": [ - "# Use Amazon Bedrock\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/main/notebooks/integrations/amazon-bedrock/langchain-qa-example.ipynb)\n", - "\n", - "This workbook demonstrates how to work with Langchain [Amazon Bedrock](https://aws.amazon.com/bedrock/). Amazon Bedrock is a managed service that makes foundation models from leading AI startup and Amazon's own Titan models available through APIs.\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fWuHgEHjyRMt" - }, - "source": [ - "## Install packages and import modules" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7byqCX6VyWYW" - }, - "outputs": [], - "source": [ - "# install packages\n", - "!python3 -m pip install -qU langchain elasticsearch boto3\n", - "\n", - "# import modules\n", - "from getpass import getpass\n", - "from urllib.request import urlopen\n", - "from langchain.vectorstores import ElasticsearchStore\n", - "from langchain.embeddings.bedrock import BedrockEmbeddings\n", - "from langchain.llms import Bedrock\n", - "from langchain.chains import RetrievalQA\n", - "import boto3\n", - "import json" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bWCXMAi58M3G" - }, - "source": [ - "Note: boto3 is part of AWS SDK for Python and is required to use Bedrock LLM" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "F84cH96QqG6_" - }, - "source": [ - "## Init Bedrock client\n", - "\n", - "To authorize in AWS service we can use `~/.aws/config` file with [configuring credentials](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#configuring-credentials) or pass `AWS_ACCESS_KEY`, `AWS_SECRET_KEY`, `AWS_REGION` to boto3 module.\n", - "\n", - "We're using second approach for our 
example." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kG76APtmp6dH" - }, - "outputs": [], - "source": [ - "default_region = \"us-east-1\"\n", - "AWS_ACCESS_KEY = getpass(\"AWS Acces key: \")\n", - "AWS_SECRET_KEY = getpass(\"AWS Secret key: \")\n", - "AWS_REGION = input(f\"AWS Region [default: {default_region}]: \") or default_region\n", - "\n", - "bedrock_client = boto3.client(\n", - " service_name=\"bedrock-runtime\",\n", - " region_name=AWS_REGION,\n", - " aws_access_key_id=AWS_ACCESS_KEY,\n", - " aws_secret_access_key=AWS_SECRET_KEY\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Utg-ZqS_QS1G" - }, - "source": [ - "## Connect to Elasticsearch\n", - "\n", - "ℹ️ We're using an Elastic Cloud deployment of Elasticsearch for this notebook. If you don't have an Elastic Cloud deployment, sign up [here](https://cloud.elastic.co/registration?utm_source=github&utm_content=elasticsearch-labs-notebook) for a free trial.\n", - "\n", - "We'll use the **Cloud ID** to identify our deployment, because we are using Elastic Cloud deployment. To find the Cloud ID for your deployment, go to https://cloud.elastic.co/deployments and select your deployment.\n", - "\n", - "\n", - "We will use [ElasticsearchStore](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.elasticsearch.ElasticsearchStore.html) to connect to our elastic cloud deployment. This would help create and index data easily. In the ElasticsearchStore instance, will set embedding to [BedrockEmbeddings](https://api.python.langchain.com/en/latest/embeddings/langchain.embeddings.bedrock.BedrockEmbeddings.html) to embed the texts and elasticsearch index name that will be used in this example. 
In the instance, we will set `strategy` to [ElasticsearchStore.SparseVectorRetrievalStrategy()](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.elasticsearch.SparseRetrievalStrategy.html#langchain.vectorstores.elasticsearch.SparseRetrievalStrategy) as we use this strategy to split documents.\n", - "\n", - "As we're using [ELSER](https://www.elastic.co/guide/en/machine-learning/current/ml-nlp-elser.html) we use [SparseVectorRetrievalStrategy](https://python.langchain.com/docs/integrations/vectorstores/elasticsearch#sparsevectorretrievalstrategy-elser) strategy. This strategy uses Elasticsearch's sparse vector retrieval to retrieve the top-k results. There is more other [strategies](https://python.langchain.com/docs/integrations/vectorstores/elasticsearch#approxretrievalstrategy) in langchain that might be used base on your needs." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "idJiMEZpQfP7" - }, - "outputs": [], - "source": [ - "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#finding-your-cloud-id\n", - "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", - "\n", - "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#creating-an-api-key\n", - "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", - "\n", - "embeddings = BedrockEmbeddings(client=bedrock_client)\n", - "\n", - "vector_store = ElasticsearchStore(\n", - " es_cloud_id=ELASTIC_CLOUD_ID,\n", - " es_api_key=ELASTIC_API_KEY,\n", - " index_name= \"workplace_index\",\n", - " embedding=embeddings,\n", - " strategy=ElasticsearchStore.SparseVectorRetrievalStrategy()\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qAkNwd_lQ7HZ" - }, - "source": [ - "## Download the dataset\n", - "\n", - "Let's download the sample dataset and deserialize the document." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sjwpw_IxQ72L" - }, - "outputs": [], - "source": [ - "url = \"https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/example-apps/chatbot-rag-app/data/data.json\"\n", - "\n", - "response = urlopen(url)\n", - "\n", - "workplace_docs = json.loads(response.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YWCTPOgnRHiZ" - }, - "source": [ - "## Split Documents into Passages\n", - "\n", - "We’ll chunk documents into passages in order to improve the retrieval specificity and to ensure that we can provide multiple passages within the context window of the final question answering prompt.\n", - "\n", - "Here we are chunking documents into 500 token passages with an overlap of 0 tokens.\n", - "\n", - "Here we are using a simple splitter but Langchain offers more advanced splitters to reduce the chance of context being lost." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mAtGD7GjRIIf" - }, - "outputs": [], - "source": [ - "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", - "\n", - "metadata = []\n", - "content = []\n", - "\n", - "for doc in workplace_docs:\n", - " content.append(doc[\"content\"])\n", - " metadata.append({\n", - " \"name\": doc[\"name\"],\n", - " \"summary\": doc[\"summary\"],\n", - " \"rolePermissions\":doc[\"rolePermissions\"]\n", - " })\n", - "\n", - "text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=512, chunk_overlap=256)\n", - "docs = text_splitter.create_documents(content, metadatas=metadata)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MRXt1tnXRK_M" - }, - "source": [ - "## Index data into elasticsearch\n", - "\n", - "Next, we will index data to elasticsearch using 
[ElasticsearchStore.from_documents](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.elasticsearch.ElasticsearchStore.html#langchain.vectorstores.elasticsearch.ElasticsearchStore.from_documents). We will use Cloud ID, Password and Index name values set in the `Create cloud deployment` step.\n", - "\n", - "In the instance, we will set `strategy` to [ElasticsearchStore.SparseVectorRetrievalStrategy()](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.elasticsearch.SparseRetrievalStrategy.html#langchain.vectorstores.elasticsearch.SparseRetrievalStrategy)\n", - "\n", - "Note: Before we begin indexing, ensure you have [downloaded and deployed ELSER model](https://www.elastic.co/guide/en/machine-learning/current/ml-nlp-elser.html#download-deploy-elser) in your deployment and is running in ml node.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-T2P8_ltRNgy" - }, - "outputs": [], - "source": [ - "documents = vector_store.from_documents(\n", - " docs,\n", - " es_cloud_id=ELASTIC_CLOUD_ID,\n", - " es_api_key=ELASTIC_API_KEY,\n", - " index_name=\"workplace_index\",\n", - " strategy=ElasticsearchStore.SparseVectorRetrievalStrategy()\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "azqaOaChswVv" - }, - "source": [ - "## Init Bedrock LLM\n", - "\n", - "Next, we will initialize Bedrock LLM. In the Bedrock instance, will pass `bedrock_client` and specific `model_id`: `amazon.titan-text-express-v1`, `ai21.j2-ultra-v1`, `anthropic.claude-v2`, `cohere.command-text-v14` or etc. 
You can see list of available base models on [Amazon Bedrock User Guide](https://docs.aws.amazon.com/bedrock/latest/userguide/model-ids-arns.html)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fRtZ_dfXsjaL" - }, - "outputs": [], - "source": [ - "default_model_id = \"amazon.titan-text-express-v1\"\n", - "AWS_MODEL_ID = input(f\"AWS model [default: {default_model_id}]: \") or default_model_id\n", - "llm = Bedrock(\n", - " client=bedrock_client,\n", - " model_id=AWS_MODEL_ID\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9UZskhJRRTQV" - }, - "source": [ - "## Asking a question\n", - "Now that we have the passages stored in Elasticsearch and llm is initialized, we can now ask a question to get the relevant passages.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "gWGfbz2TkuJt", - "outputId": "12af9c94-9113-4f34-b9de-f60681951206" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Question: What is our work from home policy?\n", - "\n", - "\u001b[92m ---- Answer ---- \u001b[0m\n", - " We have a full-time work from home policy that provides guidelines and support for employees to work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\n", - "\n", - "\u001b[94m ---- Sources ---- \u001b[0m\n", - "Name: Work From Home Policy\n", - "Content: Effective: March 2020\n", - "Purpose\n", - "\n", - "The purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\n", - "Scope\n", - "\n", - "This policy applies to all employees who are eligible for remote work as determined by their role and responsibilities. 
It is designed to allow employees to work from home full time while maintaining the same level of performance and collaboration as they would in the office.\n", - "Eligibility\n", - "\n", - "Employees who can perform their work duties remotely and have received approval from their direct supervisor and the HR department are eligible for this work-from-home arrangement.\n", - "Equipment and Resources\n", - "-------\n", - "\n", - "Name: Work From Home Policy\n", - "Content: The company encourages employees to prioritize their health and well-being while working from home. This includes taking regular breaks, maintaining a work-life balance, and seeking support from supervisors and colleagues when needed.\n", - "Policy Review and Updates\n", - "\n", - "This work-from-home policy will be reviewed periodically and updated as necessary, taking into account changes in public health guidance, business needs, and employee feedback.\n", - "Questions and Concerns\n", - "\n", - "Employees are encouraged to direct any questions or concerns about this policy to their supervisor or the HR department.\n", - "-------\n", - "\n", - "Name: Work From Home Policy\n", - "Content: Employees are required to accurately track their work hours using the company's time tracking system. Non-exempt employees must obtain approval from their supervisor before working overtime.\n", - "Confidentiality and Data Security\n", - "\n", - "Employees must adhere to the company's confidentiality and data security policies while working from home. This includes safeguarding sensitive information, securing personal devices and internet connections, and reporting any security breaches to the IT department.\n", - "Health and Well-being\n", - "\n", - "The company encourages employees to prioritize their health and well-being while working from home. 
This includes taking regular breaks, maintaining a work-life balance, and seeking support from supervisors and colleagues when needed.\n", - "Policy Review and Updates\n", - "-------\n", - "\n", - "Name: Work From Home Policy\n", - "Content: Employees who can perform their work duties remotely and have received approval from their direct supervisor and the HR department are eligible for this work-from-home arrangement.\n", - "Equipment and Resources\n", - "\n", - "The necessary equipment and resources will be provided to employees for remote work, including a company-issued laptop, software licenses, and access to secure communication tools. Employees are responsible for maintaining and protecting the company's equipment and data.\n", - "Workspace\n", - "\n", - "Employees working from home are responsible for creating a comfortable and safe workspace that is conducive to productivity. This includes ensuring that their home office is ergonomically designed, well-lit, and free from distractions.\n", - "Communication\n", - "-------\n", - "\n" - ] - } - ], - "source": [ - "retriever = vector_store.as_retriever()\n", - "\n", - "qa = RetrievalQA.from_llm(\n", - " llm=llm,\n", - " retriever=retriever,\n", - " return_source_documents=True\n", - ")\n", - "\n", - "questions = [\n", - " 'What is the nasa sales team?',\n", - " 'What is our work from home policy?',\n", - " 'Does the company own my personal project?',\n", - " 'What job openings do we have?',\n", - " 'How does compensation work?'\n", - "]\n", - "question = questions[1]\n", - "print(f\"Question: {question}\\n\")\n", - "\n", - "ans = qa({\"query\": question})\n", - "\n", - "print(\"\\033[92m ---- Answer ---- \\033[0m\")\n", - "print(ans[\"result\"] + \"\\n\")\n", - "print(\"\\033[94m ---- Sources ---- \\033[0m\")\n", - "for doc in ans[\"source_documents\"]:\n", - " print(\"Name: \" + doc.metadata[\"name\"])\n", - " print(\"Content: \"+ doc.page_content)\n", - " print(\"-------\\n\")" - ] - } - ], - "metadata": { + 
"cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "IQt5lMKvxios" + }, + "source": [ + "# Use Amazon Bedrock\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/main/notebooks/integrations/amazon-bedrock/langchain-qa-example.ipynb)\n", + "\n", + "This workbook demonstrates how to work with Langchain [Amazon Bedrock](https://aws.amazon.com/bedrock/). Amazon Bedrock is a managed service that makes foundation models from leading AI startup and Amazon's own Titan models available through APIs.\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fWuHgEHjyRMt" + }, + "source": [ + "## Install packages and import modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7byqCX6VyWYW" + }, + "outputs": [], + "source": [ + "# install packages\n", + "!python3 -m pip install -qU langchain elasticsearch boto3\n", + "\n", + "# import modules\n", + "from getpass import getpass\n", + "from urllib.request import urlopen\n", + "from langchain.vectorstores import ElasticsearchStore\n", + "from langchain.embeddings.bedrock import BedrockEmbeddings\n", + "from langchain.llms import Bedrock\n", + "from langchain.chains import RetrievalQA\n", + "import boto3\n", + "import json" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bWCXMAi58M3G" + }, + "source": [ + "Note: boto3 is part of AWS SDK for Python and is required to use Bedrock LLM" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F84cH96QqG6_" + }, + "source": [ + "## Init Bedrock client\n", + "\n", + "To authorize in AWS service we can use `~/.aws/config` file with [configuring credentials](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#configuring-credentials) or pass `AWS_ACCESS_KEY`, `AWS_SECRET_KEY`, `AWS_REGION` to boto3 module.\n", + "\n", + "We're using second 
approach for our example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kG76APtmp6dH" + }, + "outputs": [], + "source": [ + "default_region = \"us-east-1\"\n", + "AWS_ACCESS_KEY = getpass(\"AWS Acces key: \")\n", + "AWS_SECRET_KEY = getpass(\"AWS Secret key: \")\n", + "AWS_REGION = input(f\"AWS Region [default: {default_region}]: \") or default_region\n", + "\n", + "bedrock_client = boto3.client(\n", + " service_name=\"bedrock-runtime\",\n", + " region_name=AWS_REGION,\n", + " aws_access_key_id=AWS_ACCESS_KEY,\n", + " aws_secret_access_key=AWS_SECRET_KEY,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Utg-ZqS_QS1G" + }, + "source": [ + "## Connect to Elasticsearch\n", + "\n", + "ℹ️ We're using an Elastic Cloud deployment of Elasticsearch for this notebook. If you don't have an Elastic Cloud deployment, sign up [here](https://cloud.elastic.co/registration?utm_source=github&utm_content=elasticsearch-labs-notebook) for a free trial.\n", + "\n", + "We'll use the **Cloud ID** to identify our deployment, because we are using Elastic Cloud deployment. To find the Cloud ID for your deployment, go to https://cloud.elastic.co/deployments and select your deployment.\n", + "\n", + "\n", + "We will use [ElasticsearchStore](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.elasticsearch.ElasticsearchStore.html) to connect to our elastic cloud deployment. This would help create and index data easily. In the ElasticsearchStore instance, will set embedding to [BedrockEmbeddings](https://api.python.langchain.com/en/latest/embeddings/langchain.embeddings.bedrock.BedrockEmbeddings.html) to embed the texts and elasticsearch index name that will be used in this example. 
In the instance, we will set `strategy` to [ElasticsearchStore.SparseVectorRetrievalStrategy()](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.elasticsearch.SparseRetrievalStrategy.html#langchain.vectorstores.elasticsearch.SparseRetrievalStrategy) as we use this strategy to split documents.\n", + "\n", + "As we're using [ELSER](https://www.elastic.co/guide/en/machine-learning/current/ml-nlp-elser.html), we use the [SparseVectorRetrievalStrategy](https://python.langchain.com/docs/integrations/vectorstores/elasticsearch#sparsevectorretrievalstrategy-elser) strategy. This strategy uses Elasticsearch's sparse vector retrieval to retrieve the top-k results. There are other [strategies](https://python.langchain.com/docs/integrations/vectorstores/elasticsearch#approxretrievalstrategy) in langchain that might be used based on your needs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "idJiMEZpQfP7" + }, + "outputs": [], + "source": [ + "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#finding-your-cloud-id\n", + "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", + "\n", + "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#creating-an-api-key\n", + "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", + "\n", + "embeddings = BedrockEmbeddings(client=bedrock_client)\n", + "\n", + "vector_store = ElasticsearchStore(\n", + " es_cloud_id=ELASTIC_CLOUD_ID,\n", + " es_api_key=ELASTIC_API_KEY,\n", + " index_name=\"workplace_index\",\n", + " embedding=embeddings,\n", + " strategy=ElasticsearchStore.SparseVectorRetrievalStrategy(),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qAkNwd_lQ7HZ" + }, + "source": [ + "## Download the dataset\n", + "\n", + "Let's download the sample dataset and deserialize the document."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sjwpw_IxQ72L" + }, + "outputs": [], + "source": [ + "url = \"https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/example-apps/chatbot-rag-app/data/data.json\"\n", + "\n", + "response = urlopen(url)\n", + "\n", + "workplace_docs = json.loads(response.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YWCTPOgnRHiZ" + }, + "source": [ + "## Split Documents into Passages\n", + "\n", + "We’ll chunk documents into passages in order to improve the retrieval specificity and to ensure that we can provide multiple passages within the context window of the final question answering prompt.\n", + "\n", + "Here we are chunking documents into 512-token passages with an overlap of 256 tokens.\n", + "\n", + "Here we are using a simple splitter but Langchain offers more advanced splitters to reduce the chance of context being lost." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mAtGD7GjRIIf" + }, + "outputs": [], + "source": [ + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "\n", + "metadata = []\n", + "content = []\n", + "\n", + "for doc in workplace_docs:\n", + " content.append(doc[\"content\"])\n", + " metadata.append(\n", + " {\n", + " \"name\": doc[\"name\"],\n", + " \"summary\": doc[\"summary\"],\n", + " \"rolePermissions\": doc[\"rolePermissions\"],\n", + " }\n", + " )\n", + "\n", + "text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n", + " chunk_size=512, chunk_overlap=256\n", + ")\n", + "docs = text_splitter.create_documents(content, metadatas=metadata)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MRXt1tnXRK_M" + }, + "source": [ + "## Index data into elasticsearch\n", + "\n", + "Next, we will index data to elasticsearch using 
[ElasticsearchStore.from_documents](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.elasticsearch.ElasticsearchStore.html#langchain.vectorstores.elasticsearch.ElasticsearchStore.from_documents). We will use Cloud ID, Password and Index name values set in the `Create cloud deployment` step.\n", + "\n", + "In the instance, we will set `strategy` to [ElasticsearchStore.SparseVectorRetrievalStrategy()](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.elasticsearch.SparseRetrievalStrategy.html#langchain.vectorstores.elasticsearch.SparseRetrievalStrategy)\n", + "\n", + "Note: Before we begin indexing, ensure you have [downloaded and deployed the ELSER model](https://www.elastic.co/guide/en/machine-learning/current/ml-nlp-elser.html#download-deploy-elser) in your deployment and that it is running on an ML node.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-T2P8_ltRNgy" + }, + "outputs": [], + "source": [ + "documents = vector_store.from_documents(\n", + " docs,\n", + " es_cloud_id=ELASTIC_CLOUD_ID,\n", + " es_api_key=ELASTIC_API_KEY,\n", + " index_name=\"workplace_index\",\n", + " strategy=ElasticsearchStore.SparseVectorRetrievalStrategy(),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "azqaOaChswVv" + }, + "source": [ + "## Init Bedrock LLM\n", + "\n", + "Next, we will initialize Bedrock LLM. In the Bedrock instance, we will pass `bedrock_client` and a specific `model_id`: `amazon.titan-text-express-v1`, `ai21.j2-ultra-v1`, `anthropic.claude-v2`, `cohere.command-text-v14`, etc. 
You can see list of available base models on [Amazon Bedrock User Guide](https://docs.aws.amazon.com/bedrock/latest/userguide/model-ids-arns.html)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fRtZ_dfXsjaL" + }, + "outputs": [], + "source": [ + "default_model_id = \"amazon.titan-text-express-v1\"\n", + "AWS_MODEL_ID = input(f\"AWS model [default: {default_model_id}]: \") or default_model_id\n", + "llm = Bedrock(client=bedrock_client, model_id=AWS_MODEL_ID)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9UZskhJRRTQV" + }, + "source": [ + "## Asking a question\n", + "Now that we have the passages stored in Elasticsearch and llm is initialized, we can now ask a question to get the relevant passages.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { "colab": { - "provenance": [] + "base_uri": "https://localhost:8080/" }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" + "id": "gWGfbz2TkuJt", + "outputId": "12af9c94-9113-4f34-b9de-f60681951206" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Question: What is our work from home policy?\n", + "\n", + "\u001b[92m ---- Answer ---- \u001b[0m\n", + " We have a full-time work from home policy that provides guidelines and support for employees to work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\n", + "\n", + "\u001b[94m ---- Sources ---- \u001b[0m\n", + "Name: Work From Home Policy\n", + "Content: Effective: March 2020\n", + "Purpose\n", + "\n", + "The purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\n", + "Scope\n", + "\n", + "This policy applies to all employees who are eligible 
for remote work as determined by their role and responsibilities. It is designed to allow employees to work from home full time while maintaining the same level of performance and collaboration as they would in the office.\n", + "Eligibility\n", + "\n", + "Employees who can perform their work duties remotely and have received approval from their direct supervisor and the HR department are eligible for this work-from-home arrangement.\n", + "Equipment and Resources\n", + "-------\n", + "\n", + "Name: Work From Home Policy\n", + "Content: The company encourages employees to prioritize their health and well-being while working from home. This includes taking regular breaks, maintaining a work-life balance, and seeking support from supervisors and colleagues when needed.\n", + "Policy Review and Updates\n", + "\n", + "This work-from-home policy will be reviewed periodically and updated as necessary, taking into account changes in public health guidance, business needs, and employee feedback.\n", + "Questions and Concerns\n", + "\n", + "Employees are encouraged to direct any questions or concerns about this policy to their supervisor or the HR department.\n", + "-------\n", + "\n", + "Name: Work From Home Policy\n", + "Content: Employees are required to accurately track their work hours using the company's time tracking system. Non-exempt employees must obtain approval from their supervisor before working overtime.\n", + "Confidentiality and Data Security\n", + "\n", + "Employees must adhere to the company's confidentiality and data security policies while working from home. This includes safeguarding sensitive information, securing personal devices and internet connections, and reporting any security breaches to the IT department.\n", + "Health and Well-being\n", + "\n", + "The company encourages employees to prioritize their health and well-being while working from home. 
This includes taking regular breaks, maintaining a work-life balance, and seeking support from supervisors and colleagues when needed.\n", + "Policy Review and Updates\n", + "-------\n", + "\n", + "Name: Work From Home Policy\n", + "Content: Employees who can perform their work duties remotely and have received approval from their direct supervisor and the HR department are eligible for this work-from-home arrangement.\n", + "Equipment and Resources\n", + "\n", + "The necessary equipment and resources will be provided to employees for remote work, including a company-issued laptop, software licenses, and access to secure communication tools. Employees are responsible for maintaining and protecting the company's equipment and data.\n", + "Workspace\n", + "\n", + "Employees working from home are responsible for creating a comfortable and safe workspace that is conducive to productivity. This includes ensuring that their home office is ergonomically designed, well-lit, and free from distractions.\n", + "Communication\n", + "-------\n", + "\n" + ] } + ], + "source": [ + "retriever = vector_store.as_retriever()\n", + "\n", + "qa = RetrievalQA.from_llm(llm=llm, retriever=retriever, return_source_documents=True)\n", + "\n", + "questions = [\n", + " \"What is the nasa sales team?\",\n", + " \"What is our work from home policy?\",\n", + " \"Does the company own my personal project?\",\n", + " \"What job openings do we have?\",\n", + " \"How does compensation work?\",\n", + "]\n", + "question = questions[1]\n", + "print(f\"Question: {question}\\n\")\n", + "\n", + "ans = qa({\"query\": question})\n", + "\n", + "print(\"\\033[92m ---- Answer ---- \\033[0m\")\n", + "print(ans[\"result\"] + \"\\n\")\n", + "print(\"\\033[94m ---- Sources ---- \\033[0m\")\n", + "for doc in ans[\"source_documents\"]:\n", + " print(\"Name: \" + doc.metadata[\"name\"])\n", + " print(\"Content: \" + doc.page_content)\n", + " print(\"-------\\n\")" + ] + } + ], + "metadata": { + "colab": { + 
"provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/notebooks/integrations/gemini/qa-langchain-gemini-elasticsearch.ipynb b/notebooks/integrations/gemini/qa-langchain-gemini-elasticsearch.ipynb index 6ce750db..eb7c01c3 100644 --- a/notebooks/integrations/gemini/qa-langchain-gemini-elasticsearch.ipynb +++ b/notebooks/integrations/gemini/qa-langchain-gemini-elasticsearch.ipynb @@ -141,11 +141,13 @@ "\n", "for doc in workplace_docs:\n", " content.append(doc[\"content\"])\n", - " metadata.append({\n", - " \"name\": doc[\"name\"],\n", - " \"summary\": doc[\"summary\"],\n", - " \"rolePermissions\":doc[\"rolePermissions\"]\n", - " })\n", + " metadata.append(\n", + " {\n", + " \"name\": doc[\"name\"],\n", + " \"summary\": doc[\"summary\"],\n", + " \"rolePermissions\": doc[\"rolePermissions\"],\n", + " }\n", + " )\n", "\n", "text_splitter = CharacterTextSplitter(chunk_size=50, chunk_overlap=0)\n", "docs = text_splitter.create_documents(content, metadatas=metadata)" @@ -175,7 +177,7 @@ " es_cloud_id=ELASTIC_CLOUD_ID,\n", " es_api_key=ELASTIC_API_KEY,\n", " index_name=elastic_index_name,\n", - " embedding=query_embedding\n", + " embedding=query_embedding,\n", ")" ] }, @@ -202,7 +204,7 @@ " es_cloud_id=ELASTIC_CLOUD_ID,\n", " es_api_key=ELASTIC_API_KEY,\n", " embedding=query_embedding,\n", - " index_name=elastic_index_name\n", + " index_name=elastic_index_name,\n", ")\n", "\n", "retriever = es.as_retriever(search_kwargs={\"k\": 3})" @@ -252,9 +254,9 @@ "\n", "\n", "chain = (\n", - " {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()} \n", - " | prompt \n", - " | ChatGoogleGenerativeAI(model=\"gemini-pro\", temperature=0.7) \n", + " {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n", + " | prompt\n", + " | 
ChatGoogleGenerativeAI(model=\"gemini-pro\", temperature=0.7)\n", " | StrOutputParser()\n", ")\n", "\n", diff --git a/notebooks/integrations/gemini/vector-search-gemini-elastic.ipynb b/notebooks/integrations/gemini/vector-search-gemini-elastic.ipynb index 09fef317..0fbd4b71 100644 --- a/notebooks/integrations/gemini/vector-search-gemini-elastic.ipynb +++ b/notebooks/integrations/gemini/vector-search-gemini-elastic.ipynb @@ -60,10 +60,10 @@ "from elasticsearch import Elasticsearch, helpers\n", "from getpass import getpass\n", "\n", - "GOOGLE_API_KEY=getpass(\"Google API Key :\")\n", - "ELASTIC_API_KEY=getpass(\"Elastic API Key :\")\n", - "ELASTIC_CLOUD_ID=getpass(\"Elastic Cloud ID :\")\n", - "elastic_index_name='gemini-demo'" + "GOOGLE_API_KEY = getpass(\"Google API Key :\")\n", + "ELASTIC_API_KEY = getpass(\"Elastic API Key :\")\n", + "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID :\")\n", + "elastic_index_name = \"gemini-demo\"" ] }, { @@ -85,13 +85,12 @@ "genai.configure(api_key=GOOGLE_API_KEY)\n", "\n", "title = \"Climate in India\"\n", - "sample_text = (\"India generally experiences a hot summer from March to June, with temperatures often exceeding 40°C in central and northern regions. Monsoon season, from June to September, brings heavy rainfall, especially in the western coast and northeastern areas. Post-monsoon months, October and November, mark a transition with decreasing rainfall. Winter, from December to February, varies in temperature across the country, with colder conditions in the north and milder weather in the south. India's diverse climate is influenced by its geographical features, resulting in regional \")\n", + "sample_text = \"India generally experiences a hot summer from March to June, with temperatures often exceeding 40°C in central and northern regions. Monsoon season, from June to September, brings heavy rainfall, especially in the western coast and northeastern areas. 
Post-monsoon months, October and November, mark a transition with decreasing rainfall. Winter, from December to February, varies in temperature across the country, with colder conditions in the north and milder weather in the south. India's diverse climate is influenced by its geographical features, resulting in regional \"\n", "\n", - "model = 'models/embedding-001'\n", - "embedding = genai.embed_content(model=model,\n", - " content=sample_text,\n", - " task_type=\"retrieval_document\",\n", - " title=title)\n" + "model = \"models/embedding-001\"\n", + "embedding = genai.embed_content(\n", + " model=model, content=sample_text, task_type=\"retrieval_document\", title=title\n", + ")" ] }, { @@ -109,10 +108,7 @@ "metadata": {}, "outputs": [], "source": [ - "es = Elasticsearch(\n", - " cloud_id = ELASTIC_CLOUD_ID,\n", - " api_key= ELASTIC_API_KEY\n", - ")" + "es = Elasticsearch(cloud_id=ELASTIC_CLOUD_ID, api_key=ELASTIC_API_KEY)" ] }, { @@ -130,10 +126,7 @@ "metadata": {}, "outputs": [], "source": [ - "doc = {\n", - " 'text' : sample_text,\n", - " 'text_embedding' : embedding['embedding'] \n", - "}\n", + "doc = {\"text\": sample_text, \"text_embedding\": embedding[\"embedding\"]}\n", "\n", "resp = es.index(index=elastic_index_name, document=doc)\n", "\n", @@ -157,23 +150,21 @@ "source": [ "q = \"How's weather in India?\"\n", "\n", - "embedding = genai.embed_content(model=model,\n", - " content=q,\n", - " task_type=\"retrieval_query\")\n", + "embedding = genai.embed_content(model=model, content=q, task_type=\"retrieval_query\")\n", "\n", "resp = es.search(\n", - " index = elastic_index_name,\n", - " knn={\n", - " \"field\": \"text_embedding\",\n", - " \"query_vector\": embedding['embedding'],\n", - " \"k\": 10,\n", - " \"num_candidates\": 100\n", - " }\n", + " index=elastic_index_name,\n", + " knn={\n", + " \"field\": \"text_embedding\",\n", + " \"query_vector\": embedding[\"embedding\"],\n", + " \"k\": 10,\n", + " \"num_candidates\": 100,\n", + " },\n", ")\n", "\n", 
"\n", - "for result in resp['hits']['hits']:\n", - " pretty_output = (f\"\\n\\nID: {result['_id']}\\n\\nText: {result['_source']['text']}\\n\\nEmbedding: {result['_source']['text_embedding']}\")\n", + "for result in resp[\"hits\"][\"hits\"]:\n", + " pretty_output = f\"\\n\\nID: {result['_id']}\\n\\nText: {result['_source']['text']}\\n\\nEmbedding: {result['_source']['text_embedding']}\"\n", " print(pretty_output)" ] } diff --git a/notebooks/integrations/hugging-face/_nbtest.teardown.loading-model-from-hugging-face.ipynb b/notebooks/integrations/hugging-face/_nbtest.teardown.loading-model-from-hugging-face.ipynb index cbdb13e7..ae5f4c9d 100644 --- a/notebooks/integrations/hugging-face/_nbtest.teardown.loading-model-from-hugging-face.ipynb +++ b/notebooks/integrations/hugging-face/_nbtest.teardown.loading-model-from-hugging-face.ipynb @@ -13,7 +13,10 @@ "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", "\n", - "client = Elasticsearch(cloud_id=ELASTIC_CLOUD_ID, api_key=ELASTIC_API_KEY,)\n", + "client = Elasticsearch(\n", + " cloud_id=ELASTIC_CLOUD_ID,\n", + " api_key=ELASTIC_API_KEY,\n", + ")\n", "\n", "# delete the notebook's index\n", "client.indices.delete(index=\"blogs\", ignore_unavailable=True)\n", @@ -26,7 +29,9 @@ "\n", "# delete the model\n", "try:\n", - " client.ml.delete_trained_model(model_id=\"sentence-transformers__all-minilm-l6-v2\", force=True)\n", + " client.ml.delete_trained_model(\n", + " model_id=\"sentence-transformers__all-minilm-l6-v2\", force=True\n", + " )\n", "except:\n", " pass" ] diff --git a/notebooks/integrations/hugging-face/loading-model-from-hugging-face.ipynb b/notebooks/integrations/hugging-face/loading-model-from-hugging-face.ipynb index f2688cd5..485582b6 100644 --- a/notebooks/integrations/hugging-face/loading-model-from-hugging-face.ipynb +++ b/notebooks/integrations/hugging-face/loading-model-from-hugging-face.ipynb @@ -141,12 +141,10 @@ "outputs": [], "source": [ "es = 
Elasticsearch(\n", - " cloud_id=ELASTIC_CLOUD_ID,\n", - " api_key=ELASTIC_API_KEY,\n", - " request_timeout=600\n", + " cloud_id=ELASTIC_CLOUD_ID, api_key=ELASTIC_API_KEY, request_timeout=600\n", ")\n", "\n", - "es.info() # should return cluster info" + "es.info() # should return cluster info" ] }, { @@ -175,17 +173,20 @@ "outputs": [], "source": [ "# ingest pipeline definition\n", - "PIPELINE_ID=\"vectorize_blogs\"\n", - "\n", - "es.ingest.put_pipeline(id=PIPELINE_ID, processors=[{\n", - " \"inference\": {\n", - " \"model_id\": \"sentence-transformers__all-minilm-l6-v2\",\n", - " \"target_field\": \"text_embedding\",\n", - " \"field_map\": {\n", - " \"title\": \"text_field\"\n", - " }\n", + "PIPELINE_ID = \"vectorize_blogs\"\n", + "\n", + "es.ingest.put_pipeline(\n", + " id=PIPELINE_ID,\n", + " processors=[\n", + " {\n", + " \"inference\": {\n", + " \"model_id\": \"sentence-transformers__all-minilm-l6-v2\",\n", + " \"target_field\": \"text_embedding\",\n", + " \"field_map\": {\"title\": \"text_field\"},\n", + " }\n", " }\n", - " }])" + " ],\n", + ")" ] }, { @@ -213,66 +214,54 @@ "outputs": [], "source": [ "# define index name\n", - "INDEX_NAME=\"blogs\"\n", + "INDEX_NAME = \"blogs\"\n", "\n", "# flag to check if index has to be deleted before creating\n", - "SHOULD_DELETE_INDEX=True\n", + "SHOULD_DELETE_INDEX = True\n", "\n", "# define index mapping\n", "INDEX_MAPPING = {\n", " \"properties\": {\n", - " \"title\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\n", - " \"keyword\": {\n", - " \"type\": \"keyword\",\n", - " \"ignore_above\": 256\n", - " }\n", - " }\n", - " },\n", - " \n", - " \"text_embedding\": {\n", - " \"properties\": {\n", - " \"is_truncated\": {\n", - " \"type\": \"boolean\"\n", - " },\n", - " \"model_id\": {\n", + " \"title\": {\n", " \"type\": \"text\",\n", - " \"fields\": {\n", - " \"keyword\": {\n", - " \"type\": \"keyword\",\n", - " \"ignore_above\": 256\n", - " }\n", + " \"fields\": {\"keyword\": {\"type\": \"keyword\", 
\"ignore_above\": 256}},\n", + " },\n", + " \"text_embedding\": {\n", + " \"properties\": {\n", + " \"is_truncated\": {\"type\": \"boolean\"},\n", + " \"model_id\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", + " },\n", + " \"predicted_value\": {\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 384,\n", + " \"index\": True,\n", + " \"similarity\": \"l2_norm\",\n", + " },\n", " }\n", - " },\n", - " \"predicted_value\": {\n", - " \"type\": \"dense_vector\",\n", - " \"dims\": 384,\n", - " \"index\": True,\n", - " \"similarity\": \"l2_norm\"\n", - " }\n", - " }\n", - " }\n", + " },\n", " }\n", - " }\n", + "}\n", "\n", "INDEX_SETTINGS = {\n", " \"index\": {\n", - " \"number_of_replicas\": \"1\",\n", - " \"number_of_shards\": \"1\",\n", - " \"default_pipeline\": PIPELINE_ID\n", + " \"number_of_replicas\": \"1\",\n", + " \"number_of_shards\": \"1\",\n", + " \"default_pipeline\": PIPELINE_ID,\n", " }\n", "}\n", "\n", "# check if we want to delete index before creating the index\n", - "if(SHOULD_DELETE_INDEX):\n", - " if es.indices.exists(index=INDEX_NAME):\n", - " print(\"Deleting existing %s\" % INDEX_NAME)\n", - " es.indices.delete(index=INDEX_NAME, ignore=[400, 404])\n", + "if SHOULD_DELETE_INDEX:\n", + " if es.indices.exists(index=INDEX_NAME):\n", + " print(\"Deleting existing %s\" % INDEX_NAME)\n", + " es.indices.delete(index=INDEX_NAME, ignore=[400, 404])\n", "\n", "print(\"Creating index %s\" % INDEX_NAME)\n", - "es.indices.create(index=INDEX_NAME, mappings=INDEX_MAPPING, settings=INDEX_SETTINGS,\n", - " ignore=[400, 404])\n" + "es.indices.create(\n", + " index=INDEX_NAME, mappings=INDEX_MAPPING, settings=INDEX_SETTINGS, ignore=[400, 404]\n", + ")" ] }, { @@ -370,32 +359,31 @@ } ], "source": [ - "INDEX_NAME=\"blogs\"\n", + "INDEX_NAME = \"blogs\"\n", "\n", - "source_fields = [ \"id\", \"title\"]\n", + "source_fields = [\"id\", \"title\"]\n", "\n", "query = {\n", - " \"field\": 
\"text_embedding.predicted_value\",\n", - " \"k\": 10,\n", - " \"num_candidates\": 50,\n", - " \"query_vector_builder\": {\n", - " \"text_embedding\": {\"model_id\": \"sentence-transformers__all-minilm-l6-v2\",\n", - " \"model_text\": \"how to track network connections\"\n", - " }\n", - " }\n", + " \"field\": \"text_embedding.predicted_value\",\n", + " \"k\": 10,\n", + " \"num_candidates\": 50,\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \"sentence-transformers__all-minilm-l6-v2\",\n", + " \"model_text\": \"how to track network connections\",\n", + " }\n", + " },\n", "}\n", "\n", - "response = es.search(\n", - " index=INDEX_NAME,\n", - " fields=source_fields,\n", - " knn=query,\n", - " source=False)\n", + "response = es.search(index=INDEX_NAME, fields=source_fields, knn=query, source=False)\n", + "\n", "\n", "def show_results(results):\n", " for result in results:\n", " print(f'{result[\"fields\"][\"title\"]}\\nScore: {result[\"_score\"]}\\n')\n", - " \n", - "show_results(response.body['hits']['hits'])" + "\n", + "\n", + "show_results(response.body[\"hits\"][\"hits\"])" ] }, { diff --git a/notebooks/integrations/openai/openai-KNN-RAG.ipynb b/notebooks/integrations/openai/openai-KNN-RAG.ipynb index 8cba820a..4df72cd7 100644 --- a/notebooks/integrations/openai/openai-KNN-RAG.ipynb +++ b/notebooks/integrations/openai/openai-KNN-RAG.ipynb @@ -76,17 +76,13 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#finding-your-cloud-id\n", "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", "\n", "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#creating-an-api-key\n", "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", "\n", - "client = Elasticsearch(\n", - " cloud_id = ELASTIC_CLOUD_ID,\n", - " api_key=ELASTIC_API_KEY\n", - ")\n", + "client = Elasticsearch(cloud_id=ELASTIC_CLOUD_ID, 
api_key=ELASTIC_API_KEY)\n", "\n", "# Test connection to Elasticsearch\n", "print(client.info())" @@ -109,11 +105,10 @@ "metadata": {}, "outputs": [], "source": [ - "embeddings_url = 'https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip'\n", + "embeddings_url = \"https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip\"\n", "wget.download(embeddings_url)\n", "\n", - "with zipfile.ZipFile(\"vector_database_wikipedia_articles_embedded.zip\",\n", - "\"r\") as zip_ref:\n", + "with zipfile.ZipFile(\"vector_database_wikipedia_articles_embedded.zip\", \"r\") as zip_ref:\n", " zip_ref.extractall(\"data\")" ] }, @@ -134,8 +129,9 @@ "metadata": {}, "outputs": [], "source": [ - "\n", - "wikipedia_dataframe = pd.read_csv(\"data/vector_database_wikipedia_articles_embedded.csv\")" + "wikipedia_dataframe = pd.read_csv(\n", + " \"data/vector_database_wikipedia_articles_embedded.csv\"\n", + ")" ] }, { @@ -159,25 +155,24 @@ "metadata": {}, "outputs": [], "source": [ - "index_mapping= {\n", + "index_mapping = {\n", " \"properties\": {\n", - " \"title_vector\": {\n", - " \"type\": \"dense_vector\",\n", - " \"dims\": 1536,\n", - " \"index\": \"true\",\n", - " \"similarity\": \"cosine\"\n", - " },\n", - " \"content_vector\": {\n", - " \"type\": \"dense_vector\",\n", - " \"dims\": 1536,\n", - " \"index\": \"true\",\n", - " \"similarity\": \"cosine\"\n", - " },\n", - " \"text\": {\"type\": \"text\"},\n", - " \"title\": {\"type\": \"text\"},\n", - " \"url\": { \"type\": \"keyword\"},\n", - " \"vector_id\": {\"type\": \"long\"}\n", - " \n", + " \"title_vector\": {\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 1536,\n", + " \"index\": \"true\",\n", + " \"similarity\": \"cosine\",\n", + " },\n", + " \"content_vector\": {\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 1536,\n", + " \"index\": \"true\",\n", + " \"similarity\": \"cosine\",\n", + " },\n", + " \"text\": {\"type\": \"text\"},\n", + " \"title\": 
{\"type\": \"text\"},\n", + " \"url\": {\"type\": \"keyword\"},\n", + " \"vector_id\": {\"type\": \"long\"},\n", " }\n", "}\n", "client.indices.create(index=\"wikipedia_vector_index\", mappings=index_mapping)" @@ -205,16 +200,16 @@ "def dataframe_to_bulk_actions(df):\n", " for index, row in df.iterrows():\n", " yield {\n", - " \"_index\": 'wikipedia_vector_index',\n", - " \"_id\": row['id'],\n", + " \"_index\": \"wikipedia_vector_index\",\n", + " \"_id\": row[\"id\"],\n", " \"_source\": {\n", - " 'url' : row[\"url\"],\n", - " 'title' : row[\"title\"],\n", - " 'text' : row[\"text\"],\n", - " 'title_vector' : json.loads(row[\"title_vector\"]),\n", - " 'content_vector' : json.loads(row[\"content_vector\"]),\n", - " 'vector_id' : row[\"vector_id\"]\n", - " }\n", + " \"url\": row[\"url\"],\n", + " \"title\": row[\"title\"],\n", + " \"text\": row[\"text\"],\n", + " \"title_vector\": json.loads(row[\"title_vector\"]),\n", + " \"content_vector\": json.loads(row[\"content_vector\"]),\n", + " \"vector_id\": row[\"vector_id\"],\n", + " },\n", " }" ] }, @@ -258,13 +253,12 @@ "metadata": {}, "outputs": [], "source": [ - "print(client.search(index=\"wikipedia_vector_index\", query={\n", - " \"match\": {\n", - " \"text\": {\n", - " \"query\": \"Hummingbird\"\n", - " }\n", - " }\n", - "}))" + "print(\n", + " client.search(\n", + " index=\"wikipedia_vector_index\",\n", + " query={\"match\": {\"text\": {\"query\": \"Hummingbird\"}}},\n", + " )\n", + ")" ] }, { @@ -297,10 +291,10 @@ "EMBEDDING_MODEL = \"text-embedding-ada-002\"\n", "\n", "# Define question\n", - "question = 'How big is the Atlantic ocean?'\n", + "question = \"How big is the Atlantic ocean?\"\n", "\n", "# Create embedding\n", - "question_embedding = openai.Embedding.create(input=question, model=EMBEDDING_MODEL)\n" + "question_embedding = openai.Embedding.create(input=question, model=EMBEDDING_MODEL)" ] }, { @@ -324,13 +318,14 @@ "source": [ "# Function to pretty print Elasticsearch results\n", "\n", + "\n", "def 
pretty_response(response):\n", - " for hit in response['hits']['hits']:\n", - " id = hit['_id']\n", - " score = hit['_score']\n", - " title = hit['_source']['title']\n", - " text = hit['_source']['text']\n", - " pretty_output = (f\"\\nID: {id}\\nTitle: {title}\\nSummary: {text}\\nScore: {score}\")\n", + " for hit in response[\"hits\"][\"hits\"]:\n", + " id = hit[\"_id\"]\n", + " score = hit[\"_score\"]\n", + " title = hit[\"_source\"][\"title\"]\n", + " text = hit[\"_source\"][\"text\"]\n", + " pretty_output = f\"\\nID: {id}\\nTitle: {title}\\nSummary: {text}\\nScore: {score}\"\n", " print(pretty_output)" ] }, @@ -350,16 +345,18 @@ "outputs": [], "source": [ "response = client.search(\n", - " index = \"wikipedia_vector_index\",\n", - " knn={\n", - " \"field\": \"content_vector\",\n", - " \"query_vector\": question_embedding[\"data\"][0][\"embedding\"],\n", - " \"k\": 10,\n", - " \"num_candidates\": 100\n", - " }\n", + " index=\"wikipedia_vector_index\",\n", + " knn={\n", + " \"field\": \"content_vector\",\n", + " \"query_vector\": question_embedding[\"data\"][0][\"embedding\"],\n", + " \"k\": 10,\n", + " \"num_candidates\": 100,\n", + " },\n", ")\n", "pretty_response(response)\n", - "top_hit_summary = response['hits']['hits'][0]['_source']['text'] # Store content of top hit for final step" + "top_hit_summary = response[\"hits\"][\"hits\"][0][\"_source\"][\n", + " \"text\"\n", + "] # Store content of top hit for final step" ] }, { @@ -396,14 +393,17 @@ "outputs": [], "source": [ "summary = openai.ChatCompletion.create(\n", - " model=\"gpt-3.5-turbo\",\n", - " messages=[\n", + " model=\"gpt-3.5-turbo\",\n", + " messages=[\n", " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n", - " {\"role\": \"user\", \"content\": \"Answer the following question:\" \n", - " + question \n", - " + \"by using the following text:\" \n", - " + top_hit_summary},\n", - " ]\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"Answer the following 
question:\"\n", + " + question\n", + " + \"by using the following text:\"\n", + " + top_hit_summary,\n", + " },\n", + " ],\n", ")\n", "\n", "choices = summary.choices\n", diff --git a/notebooks/langchain/_nbtest.setup.langchain-vector-store-using-elser.ipynb b/notebooks/langchain/_nbtest.setup.langchain-vector-store-using-elser.ipynb index 5c1d4cac..f7b7e01e 100644 --- a/notebooks/langchain/_nbtest.setup.langchain-vector-store-using-elser.ipynb +++ b/notebooks/langchain/_nbtest.setup.langchain-vector-store-using-elser.ipynb @@ -25,7 +25,10 @@ "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", "\n", - "client = Elasticsearch(cloud_id=ELASTIC_CLOUD_ID, api_key=ELASTIC_API_KEY,)" + "client = Elasticsearch(\n", + " cloud_id=ELASTIC_CLOUD_ID,\n", + " api_key=ELASTIC_API_KEY,\n", + ")" ] }, { @@ -37,46 +40,40 @@ "source": [ "# delete model if already downloaded and deployed\n", "try:\n", - " client.ml.delete_trained_model(model_id=\".elser_model_2\", force=True)\n", - " print(\"Model deleted successfully, We will proceed with creating one\")\n", + " client.ml.delete_trained_model(model_id=\".elser_model_2\", force=True)\n", + " print(\"Model deleted successfully, We will proceed with creating one\")\n", "except exceptions.NotFoundError:\n", - " print(\"Model doesn't exist, but We will proceed with creating one\")\n", + " print(\"Model doesn't exist, but We will proceed with creating one\")\n", "\n", - "# Creates the ELSER model configuration. Automatically downloads the model if it doesn't exist. \n", + "# Creates the ELSER model configuration. 
Automatically downloads the model if it doesn't exist.\n", "client.ml.put_trained_model(\n", - " model_id=\".elser_model_2\",\n", - " input={\n", - " \"field_names\": [\"text_field\"]\n", - " }\n", - " )\n", + " model_id=\".elser_model_2\", input={\"field_names\": [\"text_field\"]}\n", + ")\n", "\n", "while True:\n", " status = client.ml.get_trained_models(\n", - " model_id=\".elser_model_2\",\n", - " include=\"definition_status\"\n", + " model_id=\".elser_model_2\", include=\"definition_status\"\n", " )\n", - " \n", - " if (status[\"trained_model_configs\"][0][\"fully_defined\"]):\n", + "\n", + " if status[\"trained_model_configs\"][0][\"fully_defined\"]:\n", " break\n", " time.sleep(5)\n", "\n", "# Start trained model deployment if not already deployed\n", "client.ml.start_trained_model_deployment(\n", - " model_id=\".elser_model_2\",\n", - " number_of_allocations=1,\n", - " wait_for=\"starting\"\n", + " model_id=\".elser_model_2\", number_of_allocations=1, wait_for=\"starting\"\n", ")\n", "\n", "while True:\n", - " status = client.ml.get_trained_models_stats(\n", - " model_id=\".elser_model_2\",\n", - " )\n", - " if (status[\"trained_model_stats\"][0][\"deployment_stats\"][\"state\"] == \"started\"):\n", - " print(\"ELSER Model has been successfully deployed.\")\n", - " break\n", - " else:\n", - " print(\"ELSER Model is currently being deployed.\")\n", - " time.sleep(5)\n", + " status = client.ml.get_trained_models_stats(\n", + " model_id=\".elser_model_2\",\n", + " )\n", + " if status[\"trained_model_stats\"][0][\"deployment_stats\"][\"state\"] == \"started\":\n", + " print(\"ELSER Model has been successfully deployed.\")\n", + " break\n", + " else:\n", + " print(\"ELSER Model is currently being deployed.\")\n", + " time.sleep(5)\n", "\n", "time.sleep(5)" ] diff --git a/notebooks/langchain/_nbtest.teardown.langchain-using-own-model.ipynb b/notebooks/langchain/_nbtest.teardown.langchain-using-own-model.ipynb index aac893b7..98cbf25b 100644 --- 
a/notebooks/langchain/_nbtest.teardown.langchain-using-own-model.ipynb +++ b/notebooks/langchain/_nbtest.teardown.langchain-using-own-model.ipynb @@ -13,7 +13,10 @@ "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", "\n", - "client = Elasticsearch(cloud_id=ELASTIC_CLOUD_ID, api_key=ELASTIC_API_KEY,)\n", + "client = Elasticsearch(\n", + " cloud_id=ELASTIC_CLOUD_ID,\n", + " api_key=ELASTIC_API_KEY,\n", + ")\n", "\n", "# delete the notebook's index\n", "client.indices.delete(index=\"approx-search-demo\", ignore_unavailable=True)\n", @@ -26,7 +29,9 @@ "\n", "# delete the model\n", "try:\n", - " client.ml.delete_trained_model(model_id=\"sentence-transformers__all-minilm-l6-v2\", force=True)\n", + " client.ml.delete_trained_model(\n", + " model_id=\"sentence-transformers__all-minilm-l6-v2\", force=True\n", + " )\n", "except:\n", " pass" ] diff --git a/notebooks/langchain/_nbtest.teardown.langchain-vector-store-using-elser.ipynb b/notebooks/langchain/_nbtest.teardown.langchain-vector-store-using-elser.ipynb index de691c7d..4225fc11 100644 --- a/notebooks/langchain/_nbtest.teardown.langchain-vector-store-using-elser.ipynb +++ b/notebooks/langchain/_nbtest.teardown.langchain-vector-store-using-elser.ipynb @@ -13,7 +13,10 @@ "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", "\n", - "client = Elasticsearch(cloud_id=ELASTIC_CLOUD_ID, api_key=ELASTIC_API_KEY,)\n", + "client = Elasticsearch(\n", + " cloud_id=ELASTIC_CLOUD_ID,\n", + " api_key=ELASTIC_API_KEY,\n", + ")\n", "\n", "# delete the notebook's index\n", "client.indices.delete(index=\"workplace_index\", ignore_unavailable=True)\n", diff --git a/notebooks/langchain/langchain-using-own-model.ipynb b/notebooks/langchain/langchain-using-own-model.ipynb index dfa75ae4..1c95c9a9 100644 --- a/notebooks/langchain/langchain-using-own-model.ipynb +++ b/notebooks/langchain/langchain-using-own-model.ipynb @@ -68,13 
+68,15 @@ "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", "\n", "vector_store = ElasticsearchStore(\n", - " es_cloud_id=ELASTIC_CLOUD_ID, \n", - " es_api_key=ELASTIC_API_KEY, \n", + " es_cloud_id=ELASTIC_CLOUD_ID,\n", + " es_api_key=ELASTIC_API_KEY,\n", " query_field=\"text_field\",\n", " vector_query_field=\"vector_query_field.predicted_value\",\n", - " index_name= \"approx-search-demo\",\n", - " strategy=ElasticsearchStore.ApproxRetrievalStrategy(query_model_id=\"sentence-transformers__all-minilm-l6-v2\")\n", - ")\n" + " index_name=\"approx-search-demo\",\n", + " strategy=ElasticsearchStore.ApproxRetrievalStrategy(\n", + " query_model_id=\"sentence-transformers__all-minilm-l6-v2\"\n", + " ),\n", + ")" ] }, { @@ -118,7 +120,7 @@ "metadata": {}, "outputs": [], "source": [ - "url = \"https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/example-apps/chatbot-rag-app/data/data.json\" \n", + "url = \"https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/example-apps/chatbot-rag-app/data/data.json\"\n", "response = urlopen(url)\n", "\n", "workplace_docs = json.loads(response.read())" @@ -152,17 +154,20 @@ } ], "source": [ - "PIPELINE_ID=\"vectorize_workplace\"\n", - "\n", - "vector_store.client.ingest.put_pipeline(id=PIPELINE_ID, processors=[{\n", - " \"inference\": {\n", - " \"model_id\": \"sentence-transformers__all-minilm-l6-v2\",\n", - " \"field_map\": {\n", - " \"query_field\": \"text_field\"\n", - " },\n", - " \"target_field\": \"vector_query_field\",\n", + "PIPELINE_ID = \"vectorize_workplace\"\n", + "\n", + "vector_store.client.ingest.put_pipeline(\n", + " id=PIPELINE_ID,\n", + " processors=[\n", + " {\n", + " \"inference\": {\n", + " \"model_id\": \"sentence-transformers__all-minilm-l6-v2\",\n", + " \"field_map\": {\"query_field\": \"text_field\"},\n", + " \"target_field\": \"vector_query_field\",\n", + " }\n", " }\n", - " }])" + " ],\n", + ")" ] }, { @@ -188,41 +193,40 @@ "outputs": [], "source": [ "# define index name\n", - 
"INDEX_NAME=\"approx-search-demo\"\n", + "INDEX_NAME = \"approx-search-demo\"\n", "\n", "# flag to check if index has to be deleted before creating\n", - "SHOULD_DELETE_INDEX=True\n", + "SHOULD_DELETE_INDEX = True\n", "\n", "# define index mapping\n", "INDEX_MAPPING = {\n", " \"properties\": {\n", - " \"text_field\": {\"type\": \"text\"},\n", - " \"vector_query_field\": {\n", - " \"properties\": {\n", - " \"is_truncated\": {\n", - " \"type\": \"boolean\"\n", - " },\n", - " \"predicted_value\": {\n", - " \"type\": \"dense_vector\",\n", - " \"dims\": 384,\n", - " \"index\": True,\n", - " \"similarity\": \"l2_norm\"\n", - " }\n", - " }\n", - " }\n", + " \"text_field\": {\"type\": \"text\"},\n", + " \"vector_query_field\": {\n", + " \"properties\": {\n", + " \"is_truncated\": {\"type\": \"boolean\"},\n", + " \"predicted_value\": {\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 384,\n", + " \"index\": True,\n", + " \"similarity\": \"l2_norm\",\n", + " },\n", + " }\n", + " },\n", " }\n", - " }\n", + "}\n", "\n", "\n", - "INDEX_SETTINGS = {\"index\": { \"default_pipeline\": PIPELINE_ID}}\n", + "INDEX_SETTINGS = {\"index\": {\"default_pipeline\": PIPELINE_ID}}\n", "\n", "# check if we want to delete index before creating the index\n", - "if(SHOULD_DELETE_INDEX):\n", - " if vector_store.client.indices.exists(index=INDEX_NAME):\n", - " vector_store.client.indices.delete(index=INDEX_NAME, ignore=[400, 404])\n", + "if SHOULD_DELETE_INDEX:\n", + " if vector_store.client.indices.exists(index=INDEX_NAME):\n", + " vector_store.client.indices.delete(index=INDEX_NAME, ignore=[400, 404])\n", "\n", - "vector_store.client.indices.create(index=INDEX_NAME, mappings=INDEX_MAPPING, settings=INDEX_SETTINGS,\n", - " ignore=[400, 404])\n" + "vector_store.client.indices.create(\n", + " index=INDEX_NAME, mappings=INDEX_MAPPING, settings=INDEX_SETTINGS, ignore=[400, 404]\n", + ")" ] }, { @@ -245,13 +249,12 @@ "\n", "# data.json\n", "for doc in workplace_docs:\n", - " 
content.append(doc[\"content\"])\n", - " metadata.append({\n", - " \"name\": doc[\"name\"],\n", - " \"summary\": doc[\"summary\"]\n", - " })\n", + " content.append(doc[\"content\"])\n", + " metadata.append({\"name\": doc[\"name\"], \"summary\": doc[\"summary\"]})\n", "\n", - "text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=512, chunk_overlap=256)\n", + "text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n", + " chunk_size=512, chunk_overlap=256\n", + ")\n", "docs = text_splitter.create_documents(content, metadatas=metadata)" ] }, @@ -275,11 +278,15 @@ "outputs": [], "source": [ "documents = ElasticsearchStore.from_documents(\n", - " docs, es_cloud_id=ELASTIC_CLOUD_ID, es_api_key=ELASTIC_API_KEY, \n", - " index_name= \"approx-search-demo\",\n", + " docs,\n", + " es_cloud_id=ELASTIC_CLOUD_ID,\n", + " es_api_key=ELASTIC_API_KEY,\n", + " index_name=\"approx-search-demo\",\n", " query_field=\"text_field\",\n", " vector_query_field=\"vector_query_field.predicted_value\",\n", - " strategy=ElasticsearchStore.ApproxRetrievalStrategy(query_model_id=\"sentence-transformers__all-minilm-l6-v2\")\n", + " strategy=ElasticsearchStore.ApproxRetrievalStrategy(\n", + " query_model_id=\"sentence-transformers__all-minilm-l6-v2\"\n", + " ),\n", ")" ] }, diff --git a/notebooks/langchain/langchain-vector-store-using-elser.ipynb b/notebooks/langchain/langchain-vector-store-using-elser.ipynb index 48b738b0..b96bc22e 100644 --- a/notebooks/langchain/langchain-vector-store-using-elser.ipynb +++ b/notebooks/langchain/langchain-vector-store-using-elser.ipynb @@ -63,7 +63,6 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#finding-your-cloud-id\n", "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", "\n", @@ -71,9 +70,9 @@ "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", "\n", "vector_store = ElasticsearchStore(\n", - " es_cloud_id=ELASTIC_CLOUD_ID, 
\n", - " es_api_key=ELASTIC_API_KEY,\n", - " index_name= \"workplace_index\",\n", + " es_cloud_id=ELASTIC_CLOUD_ID,\n", + " es_api_key=ELASTIC_API_KEY,\n", + " index_name=\"workplace_index\",\n", ")" ] }, @@ -119,14 +118,18 @@ "content = []\n", "\n", "for doc in workplace_docs:\n", - " content.append(doc[\"content\"])\n", - " metadata.append({\n", - " \"name\": doc[\"name\"],\n", - " \"summary\": doc[\"summary\"],\n", - " \"rolePermissions\":doc[\"rolePermissions\"]\n", - " })\n", + " content.append(doc[\"content\"])\n", + " metadata.append(\n", + " {\n", + " \"name\": doc[\"name\"],\n", + " \"summary\": doc[\"summary\"],\n", + " \"rolePermissions\": doc[\"rolePermissions\"],\n", + " }\n", + " )\n", "\n", - "text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=512, chunk_overlap=256)\n", + "text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n", + " chunk_size=512, chunk_overlap=256\n", + ")\n", "docs = text_splitter.create_documents(content, metadatas=metadata)" ] }, @@ -150,14 +153,16 @@ "outputs": [], "source": [ "documents = vector_store.from_documents(\n", - " docs, \n", + " docs,\n", " es_cloud_id=ELASTIC_CLOUD_ID,\n", " es_api_key=ELASTIC_API_KEY,\n", " index_name=\"workplace_index\",\n", - " strategy=ElasticsearchStore.SparseVectorRetrievalStrategy(model_id=\".elser_model_2\"),\n", + " strategy=ElasticsearchStore.SparseVectorRetrievalStrategy(\n", + " model_id=\".elser_model_2\"\n", + " ),\n", " bulk_kwargs={\n", " \"request_timeout\": 60,\n", - " }\n", + " },\n", ")" ] }, @@ -176,9 +181,9 @@ "outputs": [], "source": [ "def showResults(output):\n", - " print(\"Total results: \", len(output))\n", - " for index in range(len(output)):\n", - " print(output[index])" + " print(\"Total results: \", len(output))\n", + " for index in range(len(output)):\n", + " print(output[index])" ] }, { @@ -275,7 +280,10 @@ } ], "source": [ - "results = documents.similarity_search(\"How does the compensation work\", filter=[{ 'match': 
{ \"metadata.rolePermissions\": \"manager\" }}])\n", + "results = documents.similarity_search(\n", + " \"How does the compensation work\",\n", + " filter=[{\"match\": {\"metadata.rolePermissions\": \"manager\"}}],\n", + ")\n", "showResults(results)" ] }, diff --git a/notebooks/langchain/langchain-vector-store.ipynb b/notebooks/langchain/langchain-vector-store.ipynb index bdb89a56..5db8392c 100644 --- a/notebooks/langchain/langchain-vector-store.ipynb +++ b/notebooks/langchain/langchain-vector-store.ipynb @@ -56,7 +56,6 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#finding-your-cloud-id\n", "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", "\n", @@ -69,11 +68,11 @@ "embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)\n", "\n", "vector_store = ElasticsearchStore(\n", - " es_cloud_id=ELASTIC_CLOUD_ID, \n", - " es_api_key=ELASTIC_API_KEY,\n", - " index_name= \"workplace_index\", \n", - " embedding=embeddings\n", - ")\n" + " es_cloud_id=ELASTIC_CLOUD_ID,\n", + " es_api_key=ELASTIC_API_KEY,\n", + " index_name=\"workplace_index\",\n", + " embedding=embeddings,\n", + ")" ] }, { @@ -118,14 +117,18 @@ "content = []\n", "\n", "for doc in workplace_docs:\n", - " content.append(doc[\"content\"])\n", - " metadata.append({\n", - " \"name\": doc[\"name\"],\n", - " \"summary\": doc[\"summary\"],\n", - " \"rolePermissions\":doc[\"rolePermissions\"],\n", - " })\n", + " content.append(doc[\"content\"])\n", + " metadata.append(\n", + " {\n", + " \"name\": doc[\"name\"],\n", + " \"summary\": doc[\"summary\"],\n", + " \"rolePermissions\": doc[\"rolePermissions\"],\n", + " }\n", + " )\n", "\n", - "text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=512, chunk_overlap=256)\n", + "text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n", + " chunk_size=512, chunk_overlap=256\n", + ")\n", "docs = text_splitter.create_documents(content, 
metadatas=metadata)" ] }, @@ -147,11 +150,11 @@ "outputs": [], "source": [ "documents = vector_store.from_documents(\n", - " docs, \n", - " embeddings, \n", + " docs,\n", + " embeddings,\n", " es_cloud_id=ELASTIC_CLOUD_ID,\n", " es_api_key=ELASTIC_API_KEY,\n", - " index_name=\"workplace_index\"\n", + " index_name=\"workplace_index\",\n", ")" ] }, @@ -170,9 +173,9 @@ "outputs": [], "source": [ "def showResults(output):\n", - " print(\"Total results: \", len(output))\n", - " for index in range(len(output)):\n", - " print(output[index])" + " print(\"Total results: \", len(output))\n", + " for index in range(len(output)):\n", + " print(output[index])" ] }, { @@ -275,9 +278,11 @@ ], "source": [ "query = \"How does the compensation work?\"\n", - "results = documents.similarity_search(query, filter=[{ 'match': { \"metadata.rolePermissions\": \"manager\" }}])\n", + "results = documents.similarity_search(\n", + " query, filter=[{\"match\": {\"metadata.rolePermissions\": \"manager\"}}]\n", + ")\n", "\n", - "showResults(results)\n" + "showResults(results)" ] } ], diff --git a/notebooks/langchain/multi-query-retriever-examples/chatbot-with-multi-query-retriever.ipynb b/notebooks/langchain/multi-query-retriever-examples/chatbot-with-multi-query-retriever.ipynb index 8f308d48..80dc3405 100644 --- a/notebooks/langchain/multi-query-retriever-examples/chatbot-with-multi-query-retriever.ipynb +++ b/notebooks/langchain/multi-query-retriever-examples/chatbot-with-multi-query-retriever.ipynb @@ -77,10 +77,10 @@ "embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)\n", "\n", "vectorstore = ElasticsearchStore(\n", - " es_cloud_id=ELASTIC_CLOUD_ID, \n", + " es_cloud_id=ELASTIC_CLOUD_ID,\n", " es_api_key=ELASTIC_API_KEY,\n", " index_name=\"chatbot-multi-query-demo\",\n", - " embedding= embeddings,\n", + " embedding=embeddings,\n", ")" ] }, @@ -106,7 +106,7 @@ "response = urlopen(url)\n", "data = json.load(response)\n", "\n", - "with open('temp.json', 'w') as json_file:\n", + "with 
open(\"temp.json\", \"w\") as json_file:\n", " json.dump(data, json_file)" ] }, @@ -129,9 +129,10 @@ "metadata": {}, "outputs": [], "source": [ - "from langchain.document_loaders import JSONLoader \n", + "from langchain.document_loaders import JSONLoader\n", "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", "\n", + "\n", "def metadata_func(record: dict, metadata: dict) -> dict:\n", " metadata[\"name\"] = record.get(\"name\")\n", " metadata[\"summary\"] = record.get(\"summary\")\n", @@ -141,6 +142,7 @@ "\n", " return metadata\n", "\n", + "\n", "# For more loaders https://python.langchain.com/docs/modules/data_connection/document_loaders/\n", "# And 3rd party loaders https://python.langchain.com/docs/modules/data_connection/document_loaders/#third-party-loaders\n", "loader = JSONLoader(\n", @@ -150,7 +152,9 @@ " metadata_func=metadata_func,\n", ")\n", "\n", - "text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=512, chunk_overlap=256)\n", + "text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n", + " chunk_size=512, chunk_overlap=256\n", + ")\n", "docs = loader.load_and_split(text_splitter=text_splitter)" ] }, @@ -172,11 +176,11 @@ "outputs": [], "source": [ "documents = vectorstore.from_documents(\n", - " docs, \n", - " embeddings, \n", + " docs,\n", + " embeddings,\n", " index_name=\"chatbot-multi-query-demo\",\n", " es_cloud_id=ELASTIC_CLOUD_ID,\n", - " es_api_key=ELASTIC_API_KEY\n", + " es_api_key=ELASTIC_API_KEY,\n", ")\n", "\n", "llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)\n", @@ -233,12 +237,15 @@ " \"\"\"\n", ")\n", "\n", - "LLM_DOCUMENT_PROMPT = PromptTemplate.from_template(\"\"\"\n", + "LLM_DOCUMENT_PROMPT = PromptTemplate.from_template(\n", + " \"\"\"\n", "---\n", "SOURCE: {name}\n", "{page_content}\n", "---\n", - "\"\"\")\n", + "\"\"\"\n", + ")\n", + "\n", "\n", "def _combine_documents(\n", " docs, document_prompt=LLM_DOCUMENT_PROMPT, document_separator=\"\\n\\n\"\n", @@ 
-246,12 +253,13 @@ " doc_strings = [format_document(doc, document_prompt) for doc in docs]\n", " return document_separator.join(doc_strings)\n", "\n", + "\n", "_context = RunnableParallel(\n", " context=retriever | _combine_documents,\n", " question=RunnablePassthrough(),\n", ")\n", "\n", - "chain = (_context | LLM_CONTEXT_PROMPT | llm)\n", + "chain = _context | LLM_CONTEXT_PROMPT | llm\n", "\n", "ans = chain.invoke(\"what is the nasa sales team?\")\n", "\n", diff --git a/notebooks/langchain/multi-query-retriever-examples/langchain-multi-query-retriever.ipynb b/notebooks/langchain/multi-query-retriever-examples/langchain-multi-query-retriever.ipynb index 13b13bc0..863b4d73 100644 --- a/notebooks/langchain/multi-query-retriever-examples/langchain-multi-query-retriever.ipynb +++ b/notebooks/langchain/multi-query-retriever-examples/langchain-multi-query-retriever.ipynb @@ -66,11 +66,11 @@ " Document(\n", " page_content=\"A bunch of scientists bring back dinosaurs and mayhem breaks loose\",\n", " metadata={\n", - " \"year\": 1993, \n", - " \"rating\": 7.7, \n", - " \"genre\": \"science fiction\", \n", - " \"director\": \"Steven Spielberg\", \n", - " \"title\": \"Jurassic Park\"\n", + " \"year\": 1993,\n", + " \"rating\": 7.7,\n", + " \"genre\": \"science fiction\",\n", + " \"director\": \"Steven Spielberg\",\n", + " \"title\": \"Jurassic Park\",\n", " },\n", " ),\n", " Document(\n", @@ -79,35 +79,35 @@ " \"year\": 2010,\n", " \"director\": \"Christopher Nolan\",\n", " \"rating\": 8.2,\n", - " \"title\": \"Inception\"\n", + " \"title\": \"Inception\",\n", " },\n", " ),\n", " Document(\n", " page_content=\"A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea\",\n", " metadata={\n", - " \"year\": 2006, \n", - " \"director\": \"Satoshi Kon\", \n", - " \"rating\": 8.6, \n", - " \"title\": \"Paprika\"\n", + " \"year\": 2006,\n", + " \"director\": \"Satoshi Kon\",\n", + " \"rating\": 8.6,\n", + " \"title\": 
\"Paprika\",\n", " },\n", " ),\n", " Document(\n", " page_content=\"A bunch of normal-sized women are supremely wholesome and some men pine after them\",\n", " metadata={\n", - " \"year\": 2019, \n", - " \"director\": \"Greta Gerwig\", \n", - " \"rating\": 8.3, \n", - " \"title\": \"Little Women\"\n", + " \"year\": 2019,\n", + " \"director\": \"Greta Gerwig\",\n", + " \"rating\": 8.3,\n", + " \"title\": \"Little Women\",\n", " },\n", " ),\n", " Document(\n", " page_content=\"Toys come alive and have a blast doing so\",\n", " metadata={\n", " \"year\": 1995,\n", - " \"genre\": \"animated\", \n", - " \"director\": \"John Lasseter\", \n", - " \"rating\": 8.3, \n", - " \"title\": \"Toy Story\"\n", + " \"genre\": \"animated\",\n", + " \"director\": \"John Lasseter\",\n", + " \"rating\": 8.3,\n", + " \"title\": \"Toy Story\",\n", " },\n", " ),\n", " Document(\n", @@ -160,7 +160,7 @@ " embeddings,\n", " index_name=\"elasticsearch-multi-query-demo\",\n", " es_cloud_id=ELASTIC_CLOUD_ID,\n", - " es_api_key=ELASTIC_API_KEY\n", + " es_api_key=ELASTIC_API_KEY,\n", ")" ] }, diff --git a/notebooks/langchain/self-query-retriever-examples/chatbot-example.ipynb b/notebooks/langchain/self-query-retriever-examples/chatbot-example.ipynb index af260e2e..e51815a7 100644 --- a/notebooks/langchain/self-query-retriever-examples/chatbot-example.ipynb +++ b/notebooks/langchain/self-query-retriever-examples/chatbot-example.ipynb @@ -67,23 +67,50 @@ "docs = [\n", " Document(\n", " page_content=\"A bunch of scientists bring back dinosaurs and mayhem breaks loose\",\n", - " metadata={\"year\": 1993, \"rating\": 7.7, \"genre\": \"science fiction\", \"director\": \"Steven Spielberg\", \"title\": \"Jurassic Park\"},\n", + " metadata={\n", + " \"year\": 1993,\n", + " \"rating\": 7.7,\n", + " \"genre\": \"science fiction\",\n", + " \"director\": \"Steven Spielberg\",\n", + " \"title\": \"Jurassic Park\",\n", + " },\n", " ),\n", " Document(\n", " page_content=\"Leo DiCaprio gets lost in a dream within 
a dream within a dream within a ...\",\n", - " metadata={\"year\": 2010, \"director\": \"Christopher Nolan\", \"rating\": 8.2, \"title\": \"Inception\"},\n", + " metadata={\n", + " \"year\": 2010,\n", + " \"director\": \"Christopher Nolan\",\n", + " \"rating\": 8.2,\n", + " \"title\": \"Inception\",\n", + " },\n", " ),\n", " Document(\n", " page_content=\"A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea\",\n", - " metadata={\"year\": 2006, \"director\": \"Satoshi Kon\", \"rating\": 8.6, \"title\": \"Paprika\"},\n", + " metadata={\n", + " \"year\": 2006,\n", + " \"director\": \"Satoshi Kon\",\n", + " \"rating\": 8.6,\n", + " \"title\": \"Paprika\",\n", + " },\n", " ),\n", " Document(\n", " page_content=\"A bunch of normal-sized women are supremely wholesome and some men pine after them\",\n", - " metadata={\"year\": 2019, \"director\": \"Greta Gerwig\", \"rating\": 8.3, \"title\": \"Little Women\"},\n", + " metadata={\n", + " \"year\": 2019,\n", + " \"director\": \"Greta Gerwig\",\n", + " \"rating\": 8.3,\n", + " \"title\": \"Little Women\",\n", + " },\n", " ),\n", " Document(\n", " page_content=\"Toys come alive and have a blast doing so\",\n", - " metadata={\"year\": 1995, \"genre\": \"animated\", \"director\": \"John Lasseter\", \"rating\": 8.3, \"title\": \"Toy Story\"},\n", + " metadata={\n", + " \"year\": 1995,\n", + " \"genre\": \"animated\",\n", + " \"director\": \"John Lasseter\",\n", + " \"rating\": 8.3,\n", + " \"title\": \"Toy Story\",\n", + " },\n", " ),\n", " Document(\n", " page_content=\"Three men walk into the Zone, three men walk out of the Zone\",\n", @@ -132,12 +159,12 @@ "\n", "\n", "vectorstore = ElasticsearchStore.from_documents(\n", - " docs, \n", - " embeddings, \n", - " index_name=\"elasticsearch-self-query-demo\", \n", - " es_cloud_id=ELASTIC_CLOUD_ID, \n", - " es_api_key=ELASTIC_API_KEY\n", - ")\n" + " docs,\n", + " embeddings,\n", + " 
index_name=\"elasticsearch-self-query-demo\",\n", + " es_cloud_id=ELASTIC_CLOUD_ID,\n", + " es_api_key=ELASTIC_API_KEY,\n", + ")" ] }, { @@ -187,7 +214,7 @@ "# instantiate retriever\n", "retriever = SelfQueryRetriever.from_llm(\n", " llm, vectorstore, document_content_description, metadata_field_info, verbose=True\n", - ")\n" + ")" ] }, { @@ -221,7 +248,8 @@ "from langchain.prompts import ChatPromptTemplate, PromptTemplate\n", "from langchain.schema import format_document\n", "\n", - "LLM_CONTEXT_PROMPT = ChatPromptTemplate.from_template(\"\"\"\n", + "LLM_CONTEXT_PROMPT = ChatPromptTemplate.from_template(\n", + " \"\"\"\n", "Use the following context movies that matched the user question. Use the movies below only to answer the user's question.\n", "\n", "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n", @@ -231,15 +259,19 @@ "----\n", "Question: {question}\n", "Answer:\n", - "\"\"\")\n", + "\"\"\"\n", + ")\n", "\n", - "DOCUMENT_PROMPT = PromptTemplate.from_template(\"\"\"\n", + "DOCUMENT_PROMPT = PromptTemplate.from_template(\n", + " \"\"\"\n", "---\n", "title: {title} \n", "year: {year} \n", "director: {director} \n", "---\n", - "\"\"\")\n", + "\"\"\"\n", + ")\n", + "\n", "\n", "def _combine_documents(\n", " docs, document_prompt=DOCUMENT_PROMPT, document_separator=\"\\n\\n\"\n", @@ -253,9 +285,11 @@ " question=RunnablePassthrough(),\n", ")\n", "\n", - "chain = (_context | LLM_CONTEXT_PROMPT | llm)\n", + "chain = _context | LLM_CONTEXT_PROMPT | llm\n", "\n", - "chain.invoke(\"What movies are about dreams and was released after the year 2007 but before 2012?\")" + "chain.invoke(\n", + " \"What movies are about dreams and was released after the year 2007 but before 2012?\"\n", + ")" ] } ], diff --git a/notebooks/langchain/self-query-retriever-examples/chatbot-with-bm25-only-example.ipynb b/notebooks/langchain/self-query-retriever-examples/chatbot-with-bm25-only-example.ipynb index 04acd6ec..3928f086 100644 --- 
a/notebooks/langchain/self-query-retriever-examples/chatbot-with-bm25-only-example.ipynb +++ b/notebooks/langchain/self-query-retriever-examples/chatbot-with-bm25-only-example.ipynb @@ -58,35 +58,62 @@ "docs = [\n", " {\n", " \"text\": \"A bunch of scientists bring back dinosaurs and mayhem breaks loose\",\n", - " \"metadata\": {\"year\": 1993, \"rating\": 7.7, \"genre\": \"science fiction\", \"director\": \"Steven Spielberg\", \"title\": \"Jurassic Park\"},\n", + " \"metadata\": {\n", + " \"year\": 1993,\n", + " \"rating\": 7.7,\n", + " \"genre\": \"science fiction\",\n", + " \"director\": \"Steven Spielberg\",\n", + " \"title\": \"Jurassic Park\",\n", + " },\n", " },\n", " {\n", " \"text\": \"Leo DiCaprio gets lost in a dream within a dream within a dream within a ...\",\n", - " \"metadata\": {\"year\": 2010, \"director\": \"Christopher Nolan\", \"rating\": 8.2, \"title\": \"Inception\"},\n", + " \"metadata\": {\n", + " \"year\": 2010,\n", + " \"director\": \"Christopher Nolan\",\n", + " \"rating\": 8.2,\n", + " \"title\": \"Inception\",\n", + " },\n", " },\n", " {\n", " \"text\": \"A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea\",\n", - " \"metadata\": {\"year\": 2006, \"director\": \"Satoshi Kon\", \"rating\": 8.6, \"title\": \"Paprika\"},\n", + " \"metadata\": {\n", + " \"year\": 2006,\n", + " \"director\": \"Satoshi Kon\",\n", + " \"rating\": 8.6,\n", + " \"title\": \"Paprika\",\n", + " },\n", " },\n", " {\n", - " \"text\":\"A bunch of normal-sized women are supremely wholesome and some men pine after them\",\n", - " \"metadata\":{\"year\": 2019, \"director\": \"Greta Gerwig\", \"rating\": 8.3, \"title\": \"Little Women\"},\n", + " \"text\": \"A bunch of normal-sized women are supremely wholesome and some men pine after them\",\n", + " \"metadata\": {\n", + " \"year\": 2019,\n", + " \"director\": \"Greta Gerwig\",\n", + " \"rating\": 8.3,\n", + " \"title\": \"Little Women\",\n", + " },\n", 
" },\n", " {\n", - " \"text\":\"Toys come alive and have a blast doing so\",\n", - " \"metadata\":{\"year\": 1995, \"genre\": \"animated\", \"director\": \"John Lasseter\", \"rating\": 8.3, \"title\": \"Toy Story\"},\n", + " \"text\": \"Toys come alive and have a blast doing so\",\n", + " \"metadata\": {\n", + " \"year\": 1995,\n", + " \"genre\": \"animated\",\n", + " \"director\": \"John Lasseter\",\n", + " \"rating\": 8.3,\n", + " \"title\": \"Toy Story\",\n", + " },\n", " },\n", " {\n", - " \"text\":\"Three men walk into the Zone, three men walk out of the Zone\",\n", - " \"metadata\":{\n", + " \"text\": \"Three men walk into the Zone, three men walk out of the Zone\",\n", + " \"metadata\": {\n", " \"year\": 1979,\n", " \"rating\": 9.9,\n", " \"director\": \"Andrei Tarkovsky\",\n", " \"genre\": \"science fiction\",\n", " \"rating\": 9.9,\n", " \"title\": \"Stalker\",\n", - " }\n", - " }\n", + " },\n", + " },\n", "]" ] }, @@ -123,8 +150,8 @@ "OPENAI_API_KEY = getpass(\"OpenAI API key: \")\n", "\n", "client = Elasticsearch(\n", - " cloud_id=ELASTIC_CLOUD_ID,\n", - " api_key=ELASTIC_API_KEY,\n", + " cloud_id=ELASTIC_CLOUD_ID,\n", + " api_key=ELASTIC_API_KEY,\n", ")" ] }, @@ -150,17 +177,18 @@ "\n", "operations = [\n", " {\n", - " \"_index\": \"movies_self_query\",\n", - " \"_id\": i,\n", - " \"text\": doc[\"text\"],\n", - " \"metadata\": doc[\"metadata\"]\n", - " } for i, doc in enumerate(docs)\n", + " \"_index\": \"movies_self_query\",\n", + " \"_id\": i,\n", + " \"text\": doc[\"text\"],\n", + " \"metadata\": doc[\"metadata\"],\n", + " }\n", + " for i, doc in enumerate(docs)\n", "]\n", "\n", "# Add the documents to the index directly\n", "response = helpers.bulk(\n", - " client,\n", - " operations,\n", + " client,\n", + " operations,\n", ")" ] }, @@ -215,11 +243,10 @@ "# Set up openAI llm with sampling temperature 0\n", "llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)\n", "\n", + "\n", "class BM25RetrievalStrategy(ApproxRetrievalStrategy):\n", "\n", - " 
def __init__(\n", - " self\n", - " ):\n", + " def __init__(self):\n", " pass\n", "\n", " def query(\n", @@ -228,26 +255,22 @@ " filter: List[dict],\n", " **kwargs,\n", " ):\n", - " \n", + "\n", " if query:\n", - " query_clause = [{\n", - " \"multi_match\": {\n", - " \"query\": query,\n", - " \"fields\": [\"text\"],\n", - " \"fuzziness\": \"AUTO\",\n", + " query_clause = [\n", + " {\n", + " \"multi_match\": {\n", + " \"query\": query,\n", + " \"fields\": [\"text\"],\n", + " \"fuzziness\": \"AUTO\",\n", + " }\n", " }\n", - " }]\n", + " ]\n", " else:\n", " query_clause = []\n", "\n", - "\n", " bm25_query = {\n", - " \"query\": {\n", - " \"bool\": {\n", - " \"filter\": filter,\n", - " \"must\": query_clause\n", - " }\n", - " },\n", + " \"query\": {\"bool\": {\"filter\": filter, \"must\": query_clause}},\n", " }\n", "\n", " print(\"query\", bm25_query)\n", @@ -256,10 +279,10 @@ "\n", "\n", "vectorstore = ElasticsearchStore(\n", - " index_name=\"movies_self_query\",\n", - " es_connection=client,\n", - " strategy=BM25RetrievalStrategy()\n", - ")\n" + " index_name=\"movies_self_query\",\n", + " es_connection=client,\n", + " strategy=BM25RetrievalStrategy(),\n", + ")" ] }, { @@ -304,14 +327,11 @@ "from langchain.schema import format_document\n", "\n", "retriever = SelfQueryRetriever.from_llm(\n", - " llm, \n", - " vectorstore, \n", - " document_content_description, \n", - " metadata_field_info, \n", - " verbose=True\n", + " llm, vectorstore, document_content_description, metadata_field_info, verbose=True\n", ")\n", "\n", - "LLM_CONTEXT_PROMPT = ChatPromptTemplate.from_template(\"\"\"\n", + "LLM_CONTEXT_PROMPT = ChatPromptTemplate.from_template(\n", + " \"\"\"\n", "Use the following context movies that matched the user question. 
Use the movies below only to answer the user's question.\n", "\n", "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n", @@ -321,15 +341,19 @@ "----\n", "Question: {question}\n", "Answer:\n", - "\"\"\")\n", + "\"\"\"\n", + ")\n", "\n", - "DOCUMENT_PROMPT = PromptTemplate.from_template(\"\"\"\n", + "DOCUMENT_PROMPT = PromptTemplate.from_template(\n", + " \"\"\"\n", "---\n", "title: {title} \n", "year: {year} \n", "director: {director} \n", "---\n", - "\"\"\")\n", + "\"\"\"\n", + ")\n", + "\n", "\n", "def _combine_documents(\n", " docs, document_prompt=DOCUMENT_PROMPT, document_separator=\"\\n\\n\"\n", @@ -344,9 +368,11 @@ " question=RunnablePassthrough(),\n", ")\n", "\n", - "chain = (_context | LLM_CONTEXT_PROMPT | llm)\n", + "chain = _context | LLM_CONTEXT_PROMPT | llm\n", "\n", - "chain.invoke(\"Which director directed movies about dinosaurs that was released after the year 1992 but before 2007?\")" + "chain.invoke(\n", + " \"Which director directed movies about dinosaurs that was released after the year 1992 but before 2007?\"\n", + ")" ] }, { diff --git a/notebooks/langchain/self-query-retriever-examples/langchain-self-query-retriever.ipynb b/notebooks/langchain/self-query-retriever-examples/langchain-self-query-retriever.ipynb index 37c027a7..8d3ee218 100644 --- a/notebooks/langchain/self-query-retriever-examples/langchain-self-query-retriever.ipynb +++ b/notebooks/langchain/self-query-retriever-examples/langchain-self-query-retriever.ipynb @@ -67,23 +67,50 @@ "docs = [\n", " Document(\n", " page_content=\"A bunch of scientists bring back dinosaurs and mayhem breaks loose\",\n", - " metadata={\"year\": 1993, \"rating\": 7.7, \"genre\": \"science fiction\", \"director\": \"Steven Spielberg\", \"title\": \"Jurassic Park\"},\n", + " metadata={\n", + " \"year\": 1993,\n", + " \"rating\": 7.7,\n", + " \"genre\": \"science fiction\",\n", + " \"director\": \"Steven Spielberg\",\n", + " \"title\": \"Jurassic Park\",\n", + " 
},\n", " ),\n", " Document(\n", " page_content=\"Leo DiCaprio gets lost in a dream within a dream within a dream within a ...\",\n", - " metadata={\"year\": 2010, \"director\": \"Christopher Nolan\", \"rating\": 8.2, \"title\": \"Inception\"},\n", + " metadata={\n", + " \"year\": 2010,\n", + " \"director\": \"Christopher Nolan\",\n", + " \"rating\": 8.2,\n", + " \"title\": \"Inception\",\n", + " },\n", " ),\n", " Document(\n", " page_content=\"A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea\",\n", - " metadata={\"year\": 2006, \"director\": \"Satoshi Kon\", \"rating\": 8.6, \"title\": \"Paprika\"},\n", + " metadata={\n", + " \"year\": 2006,\n", + " \"director\": \"Satoshi Kon\",\n", + " \"rating\": 8.6,\n", + " \"title\": \"Paprika\",\n", + " },\n", " ),\n", " Document(\n", " page_content=\"A bunch of normal-sized women are supremely wholesome and some men pine after them\",\n", - " metadata={\"year\": 2019, \"director\": \"Greta Gerwig\", \"rating\": 8.3, \"title\": \"Little Women\"},\n", + " metadata={\n", + " \"year\": 2019,\n", + " \"director\": \"Greta Gerwig\",\n", + " \"rating\": 8.3,\n", + " \"title\": \"Little Women\",\n", + " },\n", " ),\n", " Document(\n", " page_content=\"Toys come alive and have a blast doing so\",\n", - " metadata={\"year\": 1995, \"genre\": \"animated\", \"director\": \"John Lasseter\", \"rating\": 8.3, \"title\": \"Toy Story\"},\n", + " metadata={\n", + " \"year\": 1995,\n", + " \"genre\": \"animated\",\n", + " \"director\": \"John Lasseter\",\n", + " \"rating\": 8.3,\n", + " \"title\": \"Toy Story\",\n", + " },\n", " ),\n", " Document(\n", " page_content=\"Three men walk into the Zone, three men walk out of the Zone\",\n", @@ -132,12 +159,12 @@ "\n", "\n", "vectorstore = ElasticsearchStore.from_documents(\n", - " docs, \n", - " embeddings, \n", - " index_name=\"elasticsearch-self-query-demo\", \n", - " es_cloud_id=ELASTIC_CLOUD_ID, \n", - " 
es_api_key=ELASTIC_API_KEY\n", - ")\n" + " docs,\n", + " embeddings,\n", + " index_name=\"elasticsearch-self-query-demo\",\n", + " es_cloud_id=ELASTIC_CLOUD_ID,\n", + " es_api_key=ELASTIC_API_KEY,\n", + ")" ] }, { @@ -187,7 +214,7 @@ "# instantiate retriever\n", "retriever = SelfQueryRetriever.from_llm(\n", " llm, vectorstore, document_content_description, metadata_field_info, verbose=True\n", - ")\n" + ")" ] }, { @@ -221,7 +248,7 @@ } ], "source": [ - "# This example only specifies a relevant query \n", + "# This example only specifies a relevant query\n", "retriever.get_relevant_documents(\"What are some movies about dream\")" ] }, @@ -253,7 +280,9 @@ } ], "source": [ - "retriever.get_relevant_documents(\"Has Andrei Tarkovsky directed any science fiction movies\")" + "retriever.get_relevant_documents(\n", + " \"Has Andrei Tarkovsky directed any science fiction movies\"\n", + ")" ] }, { @@ -344,7 +373,9 @@ } ], "source": [ - "retriever.get_relevant_documents(\"Show that one movie which was about dream and was released after the year 1992 but before 2007?\")" + "retriever.get_relevant_documents(\n", + " \"Show that one movie which was about dream and was released after the year 1992 but before 2007?\"\n", + ")" ] } ], diff --git a/notebooks/model-upgrades/_nbtest.setup.upgrading-index-to-use-elser.ipynb b/notebooks/model-upgrades/_nbtest.setup.upgrading-index-to-use-elser.ipynb index f9580bb2..938d624a 100644 --- a/notebooks/model-upgrades/_nbtest.setup.upgrading-index-to-use-elser.ipynb +++ b/notebooks/model-upgrades/_nbtest.setup.upgrading-index-to-use-elser.ipynb @@ -34,7 +34,10 @@ "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", "\n", - "client = Elasticsearch(cloud_id=ELASTIC_CLOUD_ID, api_key=ELASTIC_API_KEY,)" + "client = Elasticsearch(\n", + " cloud_id=ELASTIC_CLOUD_ID,\n", + " api_key=ELASTIC_API_KEY,\n", + ")" ] }, { @@ -46,46 +49,40 @@ "source": [ "# delete model if already downloaded and 
deployed\n", "try:\n", - " client.ml.delete_trained_model(model_id=\".elser_model_2\",force=True)\n", - " print(\"Model deleted successfully, We will proceed with creating one\")\n", + " client.ml.delete_trained_model(model_id=\".elser_model_2\", force=True)\n", + " print(\"Model deleted successfully, We will proceed with creating one\")\n", "except exceptions.NotFoundError:\n", - " print(\"Model doesn't exist, but We will proceed with creating one\")\n", + " print(\"Model doesn't exist, but We will proceed with creating one\")\n", "\n", - "# Creates the ELSER model configuration. Automatically downloads the model if it doesn't exist. \n", + "# Creates the ELSER model configuration. Automatically downloads the model if it doesn't exist.\n", "client.ml.put_trained_model(\n", - " model_id=\".elser_model_2\",\n", - " input={\n", - " \"field_names\": [\"text_field\"]\n", - " }\n", - " )\n", + " model_id=\".elser_model_2\", input={\"field_names\": [\"text_field\"]}\n", + ")\n", "\n", "while True:\n", " status = client.ml.get_trained_models(\n", - " model_id=\".elser_model_2\",\n", - " include=\"definition_status\"\n", + " model_id=\".elser_model_2\", include=\"definition_status\"\n", " )\n", - " \n", - " if (status[\"trained_model_configs\"][0][\"fully_defined\"]):\n", + "\n", + " if status[\"trained_model_configs\"][0][\"fully_defined\"]:\n", " break\n", " time.sleep(5)\n", "\n", "# Start trained model deployment if not already deployed\n", "client.ml.start_trained_model_deployment(\n", - " model_id=\".elser_model_2\",\n", - " number_of_allocations=1,\n", - " wait_for=\"starting\"\n", + " model_id=\".elser_model_2\", number_of_allocations=1, wait_for=\"starting\"\n", ")\n", "\n", "while True:\n", - " status = client.ml.get_trained_models_stats(\n", - " model_id=\".elser_model_2\",\n", - " )\n", - " if (status[\"trained_model_stats\"][0][\"deployment_stats\"][\"state\"] == \"started\"):\n", - " print(\"ELSER Model has been successfully deployed.\")\n", - " break\n", - " 
else:\n", - " print(\"ELSER Model is currently being deployed.\")\n", - " time.sleep(5)" + " status = client.ml.get_trained_models_stats(\n", + " model_id=\".elser_model_2\",\n", + " )\n", + " if status[\"trained_model_stats\"][0][\"deployment_stats\"][\"state\"] == \"started\":\n", + " print(\"ELSER Model has been successfully deployed.\")\n", + " break\n", + " else:\n", + " print(\"ELSER Model is currently being deployed.\")\n", + " time.sleep(5)" ] }, { @@ -119,17 +116,17 @@ " \"type\": \"dense_vector\",\n", " \"dims\": 384,\n", " \"index\": \"true\",\n", - " \"similarity\": \"cosine\"\n", + " \"similarity\": \"cosine\",\n", " }\n", " }\n", "}\n", - "client.indices.create(index='books', mappings=mappings)\n", + "client.indices.create(index=\"books\", mappings=mappings)\n", "\n", "url = \"https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/notebooks/search/data.json\"\n", "response = urlopen(url)\n", "books = json.loads(response.read())\n", "\n", - "model = SentenceTransformer('all-MiniLM-L6-v2')\n", + "model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n", "operations = []\n", "for book in books:\n", " operations.append({\"index\": {\"_index\": \"books\"}})\n", diff --git a/notebooks/model-upgrades/_nbtest.teardown.upgrading-index-to-use-elser.ipynb b/notebooks/model-upgrades/_nbtest.teardown.upgrading-index-to-use-elser.ipynb index 4d479722..1aba84e5 100644 --- a/notebooks/model-upgrades/_nbtest.teardown.upgrading-index-to-use-elser.ipynb +++ b/notebooks/model-upgrades/_nbtest.teardown.upgrading-index-to-use-elser.ipynb @@ -13,7 +13,10 @@ "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", "\n", - "client = Elasticsearch(cloud_id=ELASTIC_CLOUD_ID, api_key=ELASTIC_API_KEY,)\n", + "client = Elasticsearch(\n", + " cloud_id=ELASTIC_CLOUD_ID,\n", + " api_key=ELASTIC_API_KEY,\n", + ")\n", "\n", "# delete the indices\n", "client.indices.delete(index=\"books\", ignore_unavailable=True)\n", diff 
--git a/notebooks/model-upgrades/upgrading-index-to-use-elser.ipynb b/notebooks/model-upgrades/upgrading-index-to-use-elser.ipynb index 1cfa6952..3b0bdd00 100644 --- a/notebooks/model-upgrades/upgrading-index-to-use-elser.ipynb +++ b/notebooks/model-upgrades/upgrading-index-to-use-elser.ipynb @@ -154,17 +154,10 @@ } ], "source": [ - "\n", "client.ingest.put_pipeline(\n", - " id=\"ingest-pipeline-lowercase\", \n", + " id=\"ingest-pipeline-lowercase\",\n", " description=\"Ingest pipeline to change title to lowercase\",\n", - " processors=[\n", - " {\n", - " \"lowercase\": {\n", - " \"field\": \"title\"\n", - " }\n", - " }\n", - " ]\n", + " processors=[{\"lowercase\": {\"field\": \"title\"}}],\n", ")" ] }, @@ -194,29 +187,24 @@ } ], "source": [ - "client.indices.delete(index=\"movies\",ignore_unavailable=True)\n", + "client.indices.delete(index=\"movies\", ignore_unavailable=True)\n", "client.indices.create(\n", - " index=\"movies\",\n", - " settings={\n", - " \"index\": {\n", - " \"number_of_shards\": 1,\n", - " \"number_of_replicas\": 1,\n", - " \"default_pipeline\": \"ingest-pipeline-lowercase\"\n", - " }\n", - " },\n", - " mappings={\n", - " \"properties\": {\n", - " \"plot\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\n", - " \"keyword\": {\n", - " \"type\": \"keyword\",\n", - " \"ignore_above\": 256\n", - " }\n", + " index=\"movies\",\n", + " settings={\n", + " \"index\": {\n", + " \"number_of_shards\": 1,\n", + " \"number_of_replicas\": 1,\n", + " \"default_pipeline\": \"ingest-pipeline-lowercase\",\n", + " }\n", + " },\n", + " mappings={\n", + " \"properties\": {\n", + " \"plot\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", + " },\n", " }\n", - " },\n", - " }\n", - " }\n", + " },\n", ")" ] }, @@ -251,10 +239,12 @@ "# Prepare the documents to be indexed\n", "documents = []\n", "for doc in data_json:\n", - " documents.append({\n", - " \"_index\": \"movies\",\n", - " \"_source\": 
doc,\n", - " })\n", + " documents.append(\n", + " {\n", + " \"_index\": \"movies\",\n", + " \"_source\": doc,\n", + " }\n", + " )\n", "\n", "# Use helpers.bulk to index\n", "helpers.bulk(client, documents)\n", @@ -293,21 +283,18 @@ ], "source": [ "client.ingest.put_pipeline(\n", - " id=\"elser-ingest-pipeline\", \n", + " id=\"elser-ingest-pipeline\",\n", " description=\"Ingest pipeline for ELSER\",\n", " processors=[\n", - " {\n", - " \"inference\": {\n", - " \"model_id\": \".elser_model_2\",\n", - " \"input_output\": [\n", - " {\n", - " \"input_field\": \"plot\",\n", - " \"output_field\": \"plot_embedding\"\n", + " {\n", + " \"inference\": {\n", + " \"model_id\": \".elser_model_2\",\n", + " \"input_output\": [\n", + " {\"input_field\": \"plot\", \"output_field\": \"plot_embedding\"}\n", + " ],\n", " }\n", - " ]\n", - " }\n", - " }\n", - " ]\n", + " }\n", + " ],\n", ")" ] }, @@ -337,25 +324,18 @@ } ], "source": [ - "client.indices.delete(index=\"elser-movies\",ignore_unavailable=True)\n", + "client.indices.delete(index=\"elser-movies\", ignore_unavailable=True)\n", "client.indices.create(\n", - " index=\"elser-movies\",\n", - " mappings={\n", - " \"properties\": {\n", - " \"plot\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\n", - " \"keyword\": {\n", - " \"type\": \"keyword\",\n", - " \"ignore_above\": 256\n", - " }\n", + " index=\"elser-movies\",\n", + " mappings={\n", + " \"properties\": {\n", + " \"plot\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", + " },\n", + " \"plot_embedding\": {\"type\": \"sparse_vector\"},\n", " }\n", - " },\n", - " \"plot_embedding\": { \n", - " \"type\": \"sparse_vector\" \n", - " }\n", - " }\n", - " }\n", + " },\n", ")" ] }, @@ -383,12 +363,10 @@ "metadata": {}, "outputs": [], "source": [ - "client.reindex(source={\n", - " \"index\": \"movies\"\n", - " }, dest={\n", - " \"index\": \"elser-movies\",\n", - " \"pipeline\": \"elser-ingest-pipeline\"\n", - " 
})\n", + "client.reindex(\n", + " source={\"index\": \"movies\"},\n", + " dest={\"index\": \"elser-movies\", \"pipeline\": \"elser-ingest-pipeline\"},\n", + ")\n", "time.sleep(7)" ] }, @@ -435,23 +413,23 @@ ], "source": [ "response = client.search(\n", - " index='elser-movies', \n", + " index=\"elser-movies\",\n", " size=3,\n", " query={\n", " \"text_expansion\": {\n", " \"plot_embedding\": {\n", - " \"model_id\":\".elser_model_2\",\n", - " \"model_text\":\"investigation\"\n", + " \"model_id\": \".elser_model_2\",\n", + " \"model_text\": \"investigation\",\n", " }\n", " }\n", - " }\n", + " },\n", ")\n", "\n", - "for hit in response['hits']['hits']:\n", - " doc_id = hit['_id']\n", - " score = hit['_score']\n", - " title = hit['_source']['title']\n", - " plot = hit['_source']['plot']\n", + "for hit in response[\"hits\"][\"hits\"]:\n", + " doc_id = hit[\"_id\"]\n", + " score = hit[\"_score\"]\n", + " title = hit[\"_source\"][\"title\"]\n", + " plot = hit[\"_source\"][\"plot\"]\n", " print(f\"Score: {score}\\nTitle: {title}\\nPlot: {plot}\\n\")" ] }, @@ -493,21 +471,18 @@ ], "source": [ "client.ingest.put_pipeline(\n", - " id=\"elser-pipeline-upgrade-demo\", \n", + " id=\"elser-pipeline-upgrade-demo\",\n", " description=\"Ingest pipeline for ELSER upgrade demo\",\n", " processors=[\n", - " {\n", - " \"inference\": {\n", - " \"model_id\": \".elser_model_2\",\n", - " \"input_output\": [\n", - " {\n", - " \"input_field\": \"title\",\n", - " \"output_field\": \"title_embedding\"\n", + " {\n", + " \"inference\": {\n", + " \"model_id\": \".elser_model_2\",\n", + " \"input_output\": [\n", + " {\"input_field\": \"title\", \"output_field\": \"title_embedding\"}\n", + " ],\n", " }\n", - " ]\n", - " }\n", - " }\n", - " ]\n", + " }\n", + " ],\n", ")" ] }, @@ -538,23 +513,16 @@ "source": [ "client.indices.delete(index=\"elser-upgrade-index-demo\", ignore_unavailable=True)\n", "client.indices.create(\n", - " index=\"elser-upgrade-index-demo\",\n", - " mappings={\n", - " 
\"properties\": {\n", - " \"title\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\n", - " \"keyword\": {\n", - " \"type\": \"keyword\",\n", - " \"ignore_above\": 256\n", - " }\n", + " index=\"elser-upgrade-index-demo\",\n", + " mappings={\n", + " \"properties\": {\n", + " \"title\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", + " },\n", + " \"title_embedding\": {\"type\": \"sparse_vector\"},\n", " }\n", - " },\n", - " \"title_embedding\": {\n", - " \"type\": \"sparse_vector\"\n", - " },\n", - " }\n", - " }\n", + " },\n", ")" ] }, @@ -575,15 +543,20 @@ "metadata": {}, "outputs": [], "source": [ - "client.reindex(source={\n", - " \"index\": \"books\", # replace with your index name\n", - " \"_source\": {\n", - " \"excludes\": [\"title_vector\"] # replace with the field-name from your index, that has previously generated tokens\n", - " }}, \n", + "client.reindex(\n", + " source={\n", + " \"index\": \"books\", # replace with your index name\n", + " \"_source\": {\n", + " \"excludes\": [\n", + " \"title_vector\"\n", + " ] # replace with the field-name from your index, that has previously generated tokens\n", + " },\n", + " },\n", " dest={\n", - " \"index\": \"elser-upgrade-index-demo\",\n", - " \"pipeline\": \"elser-pipeline-upgrade-demo\"\n", - " })\n", + " \"index\": \"elser-upgrade-index-demo\",\n", + " \"pipeline\": \"elser-pipeline-upgrade-demo\",\n", + " },\n", + ")\n", "time.sleep(5)" ] }, @@ -622,24 +595,24 @@ ], "source": [ "response = client.search(\n", - " index='elser-upgrade-index-demo', \n", + " index=\"elser-upgrade-index-demo\",\n", " size=3,\n", " query={\n", " \"text_expansion\": {\n", " \"title_embedding\": {\n", - " \"model_id\":\".elser_model_2\",\n", - " \"model_text\":\"Programming Course\"\n", + " \"model_id\": \".elser_model_2\",\n", + " \"model_text\": \"Programming Course\",\n", " }\n", " }\n", - " }\n", + " },\n", ")\n", "\n", - "for hit in 
response['hits']['hits']:\n", - " doc_id = hit['_id']\n", - " score = hit['_score']\n", - " title = hit['_source']['title']\n", - " plot = hit['_source']['title']\n", - " print(f\"Score: {score}\\nTitle: {title}\\nPlot: {plot}\\n\")\n" + "for hit in response[\"hits\"][\"hits\"]:\n", + " doc_id = hit[\"_id\"]\n", + " score = hit[\"_score\"]\n", + " title = hit[\"_source\"][\"title\"]\n", + " plot = hit[\"_source\"][\"title\"]\n", + " print(f\"Score: {score}\\nTitle: {title}\\nPlot: {plot}\\n\")" ] }, { @@ -721,21 +694,18 @@ ], "source": [ "client.ingest.put_pipeline(\n", - " id=\"elser-pipeline-books\", \n", + " id=\"elser-pipeline-books\",\n", " description=\"Ingest pipeline for ELSER upgrade\",\n", " processors=[\n", - " {\n", - " \"inference\": {\n", - " \"model_id\": \".elser_model_2\",\n", - " \"input_output\": [\n", - " {\n", - " \"input_field\": \"title\",\n", - " \"output_field\": \"title_embedding\"\n", - " }\n", - " ]\n", - " }\n", - " }\n", - " ]\n", + " {\n", + " \"inference\": {\n", + " \"model_id\": \".elser_model_2\",\n", + " \"input_output\": [\n", + " {\"input_field\": \"title\", \"output_field\": \"title_embedding\"}\n", + " ],\n", + " }\n", + " }\n", + " ],\n", ")" ] }, @@ -767,23 +737,16 @@ "source": [ "client.indices.delete(index=\"elser-books\", ignore_unavailable=True)\n", "client.indices.create(\n", - " index=\"elser-books\",\n", - " mappings={\n", - " \"properties\": {\n", - " \"title\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\n", - " \"keyword\": {\n", - " \"type\": \"keyword\",\n", - " \"ignore_above\": 256\n", - " }\n", + " index=\"elser-books\",\n", + " mappings={\n", + " \"properties\": {\n", + " \"title\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", + " },\n", + " \"title_embedding\": {\"type\": \"sparse_vector\"},\n", " }\n", - " },\n", - " \"title_embedding\": {\n", - " \"type\": \"sparse_vector\"\n", - " },\n", - " }\n", - " }\n", + " },\n", ")" ] 
}, @@ -802,15 +765,10 @@ "metadata": {}, "outputs": [], "source": [ - "client.reindex(source={\n", - " \"index\": \"books\",\n", - " \"_source\": {\n", - " \"excludes\": [\"title_vector\"]\n", - " }\n", - " }, dest={\n", - " \"index\": \"elser-books\",\n", - " \"pipeline\": \"elser-pipeline-books\"\n", - " })\n", + "client.reindex(\n", + " source={\"index\": \"books\", \"_source\": {\"excludes\": [\"title_vector\"]}},\n", + " dest={\"index\": \"elser-books\", \"pipeline\": \"elser-pipeline-books\"},\n", + ")\n", "time.sleep(5)" ] }, @@ -849,23 +807,23 @@ ], "source": [ "response = client.search(\n", - " index='elser-books', \n", + " index=\"elser-books\",\n", " size=3,\n", " query={\n", " \"text_expansion\": {\n", " \"title_embedding\": {\n", - " \"model_id\":\".elser_model_2\",\n", - " \"model_text\":\"Python tutorial\"\n", + " \"model_id\": \".elser_model_2\",\n", + " \"model_text\": \"Python tutorial\",\n", " }\n", " }\n", - " }\n", + " },\n", ")\n", "\n", - "for hit in response['hits']['hits']:\n", - " doc_id = hit['_id']\n", - " score = hit['_score']\n", - " title = hit['_source']['title']\n", - " print(f\"Score: {score}\\nTitle: {title}\")\n" + "for hit in response[\"hits\"][\"hits\"]:\n", + " doc_id = hit[\"_id\"]\n", + " score = hit[\"_score\"]\n", + " title = hit[\"_source\"][\"title\"]\n", + " print(f\"Score: {score}\\nTitle: {title}\")" ] }, { diff --git a/notebooks/search/00-quick-start.ipynb b/notebooks/search/00-quick-start.ipynb index 1cccafc3..3462ab9f 100644 --- a/notebooks/search/00-quick-start.ipynb +++ b/notebooks/search/00-quick-start.ipynb @@ -79,7 +79,7 @@ "source": [ "from sentence_transformers import SentenceTransformer\n", "\n", - "model = SentenceTransformer('all-MiniLM-L6-v2')" + "model = SentenceTransformer(\"all-MiniLM-L6-v2\")" ] }, { @@ -119,7 +119,7 @@ "# Create the client instance\n", "client = Elasticsearch(\n", " # For local development\n", - " # hosts=[\"http://localhost:9200\"] \n", + " # hosts=[\"http://localhost:9200\"]\n", " 
cloud_id=ELASTIC_CLOUD_ID,\n", " api_key=ELASTIC_API_KEY,\n", ")" @@ -257,13 +257,13 @@ " \"type\": \"dense_vector\",\n", " \"dims\": 384,\n", " \"index\": \"true\",\n", - " \"similarity\": \"cosine\"\n", + " \"similarity\": \"cosine\",\n", " }\n", " }\n", "}\n", "\n", "# Create the index\n", - "client.indices.create(index='book_index', mappings=mappings)" + "client.indices.create(index=\"book_index\", mappings=mappings)" ] }, { @@ -338,19 +338,19 @@ "outputs": [], "source": [ "def pretty_response(response):\n", - " if len(response['hits']['hits']) == 0:\n", - " print('Your search returned no results.')\n", + " if len(response[\"hits\"][\"hits\"]) == 0:\n", + " print(\"Your search returned no results.\")\n", " else:\n", - " for hit in response['hits']['hits']:\n", - " id = hit['_id']\n", - " publication_date = hit['_source']['publish_date']\n", - " score = hit['_score']\n", - " title = hit['_source']['title']\n", - " summary = hit['_source']['summary']\n", + " for hit in response[\"hits\"][\"hits\"]:\n", + " id = hit[\"_id\"]\n", + " publication_date = hit[\"_source\"][\"publish_date\"]\n", + " score = hit[\"_score\"]\n", + " title = hit[\"_source\"][\"title\"]\n", + " summary = hit[\"_source\"][\"summary\"]\n", " publisher = hit[\"_source\"][\"publisher\"]\n", " num_reviews = hit[\"_source\"][\"num_reviews\"]\n", " authors = hit[\"_source\"][\"authors\"]\n", - " pretty_output = (f\"\\nID: {id}\\nPublication date: {publication_date}\\nTitle: {title}\\nSummary: {summary}\\nPublisher: {publisher}\\nReviews: {num_reviews}\\nAuthors: {authors}\\nScore: {score}\")\n", + " pretty_output = f\"\\nID: {id}\\nPublication date: {publication_date}\\nTitle: {title}\\nSummary: {summary}\\nPublisher: {publisher}\\nReviews: {num_reviews}\\nAuthors: {authors}\\nScore: {score}\"\n", " print(pretty_output)" ] }, @@ -480,11 +480,11 @@ "response = client.search(\n", " index=\"book_index\",\n", " knn={\n", - " \"field\": \"title_vector\",\n", - " \"query_vector\": 
model.encode(\"javascript books\"),\n", - " \"k\": 10,\n", - " \"num_candidates\": 100\n", - " }\n", + " \"field\": \"title_vector\",\n", + " \"query_vector\": model.encode(\"javascript books\"),\n", + " \"k\": 10,\n", + " \"num_candidates\": 100,\n", + " },\n", ")\n", "\n", "pretty_response(response)" @@ -560,16 +560,12 @@ "response = client.search(\n", " index=\"book_index\",\n", " knn={\n", - " \"field\": \"title_vector\",\n", - " \"query_vector\": model.encode(\"javascript books\"),\n", - " \"k\": 10,\n", - " \"num_candidates\": 100,\n", - " \"filter\": {\n", - " \"term\": {\n", - " \"publisher.keyword\": \"addison-wesley\"\n", - " }\n", - " }\n", - " }\n", + " \"field\": \"title_vector\",\n", + " \"query_vector\": model.encode(\"javascript books\"),\n", + " \"k\": 10,\n", + " \"num_candidates\": 100,\n", + " \"filter\": {\"term\": {\"publisher.keyword\": \"addison-wesley\"}},\n", + " },\n", ")\n", "\n", "pretty_response(response)" diff --git a/notebooks/search/01-keyword-querying-filtering.ipynb b/notebooks/search/01-keyword-querying-filtering.ipynb index 03dae98c..84f1830e 100644 --- a/notebooks/search/01-keyword-querying-filtering.ipynb +++ b/notebooks/search/01-keyword-querying-filtering.ipynb @@ -36,7 +36,7 @@ "outputs": [], "source": [ "from elasticsearch import Elasticsearch\n", - "from getpass import getpass " + "from getpass import getpass" ] }, { @@ -61,7 +61,7 @@ "# Create the client instance\n", "client = Elasticsearch(\n", " # For local development\n", - " # hosts=[\"http://localhost:9200\"] \n", + " # hosts=[\"http://localhost:9200\"]\n", " cloud_id=ELASTIC_CLOUD_ID,\n", " api_key=ELASTIC_API_KEY,\n", ")" @@ -83,19 +83,19 @@ "outputs": [], "source": [ "def pretty_response(response):\n", - " if len(response['hits']['hits']) == 0:\n", - " print('Your search returned no results.')\n", + " if len(response[\"hits\"][\"hits\"]) == 0:\n", + " print(\"Your search returned no results.\")\n", " else:\n", - " for hit in response['hits']['hits']:\n", - " id = 
hit['_id']\n", - " publication_date = hit['_source']['publish_date']\n", - " score = hit['_score']\n", - " title = hit['_source']['title']\n", - " summary = hit['_source']['summary']\n", + " for hit in response[\"hits\"][\"hits\"]:\n", + " id = hit[\"_id\"]\n", + " publication_date = hit[\"_source\"][\"publish_date\"]\n", + " score = hit[\"_score\"]\n", + " title = hit[\"_source\"][\"title\"]\n", + " summary = hit[\"_source\"][\"summary\"]\n", " publisher = hit[\"_source\"][\"publisher\"]\n", " num_reviews = hit[\"_source\"][\"num_reviews\"]\n", " authors = hit[\"_source\"][\"authors\"]\n", - " pretty_output = (f\"\\nID: {id}\\nPublication date: {publication_date}\\nTitle: {title}\\nSummary: {summary}\\nPublisher: {publisher}\\nReviews: {num_reviews}\\nAuthors: {authors}\\nScore: {score}\")\n", + " pretty_output = f\"\\nID: {id}\\nPublication date: {publication_date}\\nTitle: {title}\\nSummary: {summary}\\nPublisher: {publisher}\\nReviews: {num_reviews}\\nAuthors: {authors}\\nScore: {score}\"\n", " print(pretty_output)" ] }, @@ -200,13 +200,9 @@ } ], "source": [ - "response = client.search(index=\"book_index\", query={\n", - " \"match\": {\n", - " \"summary\": {\n", - " \"query\": \"guide\"\n", - " }\n", - " }\n", - "})\n", + "response = client.search(\n", + " index=\"book_index\", query={\"match\": {\"summary\": {\"query\": \"guide\"}}}\n", + ")\n", "\n", "pretty_response(response)" ] @@ -271,12 +267,10 @@ } ], "source": [ - "response = client.search(index=\"book_index\", query={\n", - " \"multi_match\": {\n", - " \"query\": \"javascript\",\n", - " \"fields\": [\"summary\", \"title\"]\n", - " }\n", - "})\n", + "response = client.search(\n", + " index=\"book_index\",\n", + " query={\"multi_match\": {\"query\": \"javascript\", \"fields\": [\"summary\", \"title\"]}},\n", + ")\n", "\n", "pretty_response(response)" ] @@ -337,12 +331,10 @@ } ], "source": [ - "response = client.search(index=\"book_index\", query={\n", - " \"multi_match\": {\n", - " \"query\": 
\"javascript\",\n", - " \"fields\": [\"summary\", \"title^3\"]\n", - " }\n", - "})\n", + "response = client.search(\n", + " index=\"book_index\",\n", + " query={\"multi_match\": {\"query\": \"javascript\", \"fields\": [\"summary\", \"title^3\"]}},\n", + ")\n", "\n", "pretty_response(response)" ] @@ -398,11 +390,9 @@ } ], "source": [ - "response = client.search(index=\"book_index\", query={\n", - " \"term\": {\n", - " \"publisher.keyword\": \"addison-wesley\"\n", - " }\n", - "})\n", + "response = client.search(\n", + " index=\"book_index\", query={\"term\": {\"publisher.keyword\": \"addison-wesley\"}}\n", + ")\n", "\n", "pretty_response(response)" ] @@ -458,13 +448,9 @@ } ], "source": [ - "response = client.search(index=\"book_index\", query={\n", - " \"range\": {\n", - " \"num_reviews\": {\n", - " \"gte\": 45\n", - " }\n", - " }\n", - "})\n", + "response = client.search(\n", + " index=\"book_index\", query={\"range\": {\"num_reviews\": {\"gte\": 45}}}\n", + ")\n", "\n", "pretty_response(response)" ] @@ -518,13 +504,9 @@ } ], "source": [ - "response = client.search(index=\"book_index\", query={\n", - " \"prefix\": {\n", - " \"title\": {\n", - " \"value\": 'java'\n", - " }\n", - " }\n", - "})\n", + "response = client.search(\n", + " index=\"book_index\", query={\"prefix\": {\"title\": {\"value\": \"java\"}}}\n", + ")\n", "\n", "pretty_response(response)" ] @@ -588,13 +570,9 @@ } ], "source": [ - "response = client.search(index=\"book_index\", query={\n", - " \"fuzzy\": {\n", - " \"title\": {\n", - " \"value\": 'pyvascript'\n", - " }\n", - " }\n", - "})\n", + "response = client.search(\n", + " index=\"book_index\", query={\"fuzzy\": {\"title\": {\"value\": \"pyvascript\"}}}\n", + ")\n", "\n", "pretty_response(response)" ] @@ -647,19 +625,17 @@ } ], "source": [ - "response = client.search(index=\"book_index\", query={\n", - " \"bool\": {\n", - " \"must\": [{\n", - " \"term\": {\n", - " \"publisher.keyword\": \"addison-wesley\"\n", - " }\n", - " }, {\n", - " \"term\": 
{\n", - " \"authors.keyword\": \"richard helm\"\n", - " }\n", - " }]\n", - " }\n", - "})\n", + "response = client.search(\n", + " index=\"book_index\",\n", + " query={\n", + " \"bool\": {\n", + " \"must\": [\n", + " {\"term\": {\"publisher.keyword\": \"addison-wesley\"}},\n", + " {\"term\": {\"authors.keyword\": \"richard helm\"}},\n", + " ]\n", + " }\n", + " },\n", + ")\n", "\n", "pretty_response(response)" ] @@ -722,19 +698,17 @@ } ], "source": [ - "response = client.search(index=\"book_index\", query={\n", - " \"bool\": {\n", - " \"should\": [{\n", - " \"term\": {\n", - " \"publisher.keyword\": \"addison-wesley\"\n", - " }\n", - " }, {\n", - " \"term\": {\n", - " \"authors.keyword\": \"douglas crockford\"\n", - " }\n", - " }]\n", - " }\n", - "})\n", + "response = client.search(\n", + " index=\"book_index\",\n", + " query={\n", + " \"bool\": {\n", + " \"should\": [\n", + " {\"term\": {\"publisher.keyword\": \"addison-wesley\"}},\n", + " {\"term\": {\"authors.keyword\": \"douglas crockford\"}},\n", + " ]\n", + " }\n", + " },\n", + ")\n", "\n", "pretty_response(response)" ] @@ -805,15 +779,10 @@ } ], "source": [ - "response = client.search(index=\"book_index\", query={\n", - " \"bool\": {\n", - " \"filter\": [{\n", - " \"term\": {\n", - " \"publisher.keyword\": \"prentice hall\"\n", - " }\n", - " }]\n", - " }\n", - "})\n", + "response = client.search(\n", + " index=\"book_index\",\n", + " query={\"bool\": {\"filter\": [{\"term\": {\"publisher.keyword\": \"prentice hall\"}}]}},\n", + ")\n", "\n", "pretty_response(response)" ] @@ -857,17 +826,10 @@ } ], "source": [ - "response = client.search(index=\"book_index\", query={\n", - " \"bool\": {\n", - " \"must_not\": [{\n", - " \"range\": {\n", - " \"num_reviews\": {\n", - " \"lte\": 45\n", - " }\n", - " }\n", - " }]\n", - " }\n", - "})\n", + "response = client.search(\n", + " index=\"book_index\",\n", + " query={\"bool\": {\"must_not\": [{\"range\": {\"num_reviews\": {\"lte\": 45}}}]}},\n", + ")\n", "\n", 
"pretty_response(response)" ] @@ -904,28 +866,15 @@ } ], "source": [ - "response = client.search(index=\"book_index\", query={\n", - " \"bool\": {\n", - " \"must\": [\n", - " {\n", - " \"match\": {\n", - " \"title\": {\n", - " \"query\": \"javascript\"\n", - " }\n", - " }\n", - " }\n", - " ], \n", - " \"must_not\": [\n", - " {\n", - " \"range\": {\n", - " \"num_reviews\": {\n", - " \"lte\": 45\n", - " }\n", - " }\n", - " }\n", - " ]\n", - " }\n", - "})\n", + "response = client.search(\n", + " index=\"book_index\",\n", + " query={\n", + " \"bool\": {\n", + " \"must\": [{\"match\": {\"title\": {\"query\": \"javascript\"}}}],\n", + " \"must_not\": [{\"range\": {\"num_reviews\": {\"lte\": 45}}}],\n", + " }\n", + " },\n", + ")\n", "\n", "pretty_response(response)" ] diff --git a/notebooks/search/02-hybrid-search.ipynb b/notebooks/search/02-hybrid-search.ipynb index c99af1d7..c5e9fd22 100644 --- a/notebooks/search/02-hybrid-search.ipynb +++ b/notebooks/search/02-hybrid-search.ipynb @@ -70,7 +70,7 @@ "from sentence_transformers import SentenceTransformer\n", "from getpass import getpass\n", "\n", - "model = SentenceTransformer('all-MiniLM-L6-v2')" + "model = SentenceTransformer(\"all-MiniLM-L6-v2\")" ] }, { @@ -109,7 +109,7 @@ "# Create the client instance\n", "client = Elasticsearch(\n", " # For local development\n", - " # hosts=[\"http://localhost:9200\"] \n", + " # hosts=[\"http://localhost:9200\"]\n", " cloud_id=ELASTIC_CLOUD_ID,\n", " api_key=ELASTIC_API_KEY,\n", ")" @@ -179,16 +179,16 @@ "outputs": [], "source": [ "def pretty_response(response):\n", - " if len(response['hits']['hits']) == 0:\n", - " print('Your search returned no results.')\n", + " if len(response[\"hits\"][\"hits\"]) == 0:\n", + " print(\"Your search returned no results.\")\n", " else:\n", - " for hit in response['hits']['hits']:\n", - " id = hit['_id']\n", - " publication_date = hit['_source']['publish_date']\n", - " rank = hit['_rank']\n", - " title = hit['_source']['title']\n", - " summary = 
hit['_source']['summary']\n", - " pretty_output = (f\"\\nID: {id}\\nPublication date: {publication_date}\\nTitle: {title}\\nSummary: {summary}\\nRank: {rank}\")\n", + " for hit in response[\"hits\"][\"hits\"]:\n", + " id = hit[\"_id\"]\n", + " publication_date = hit[\"_source\"][\"publish_date\"]\n", + " rank = hit[\"_rank\"]\n", + " title = hit[\"_source\"][\"title\"]\n", + " summary = hit[\"_source\"][\"summary\"]\n", + " pretty_output = f\"\\nID: {id}\\nPublication date: {publication_date}\\nTitle: {title}\\nSummary: {summary}\\nRank: {rank}\"\n", " print(pretty_output)" ] }, @@ -256,22 +256,18 @@ ], "source": [ "response = client.search(\n", - " index=\"book_index\", \n", - " size=5, \n", - " query={\n", - " \"match\": {\n", - " \"summary\": \"python programming\"\n", - " }\n", - " }, \n", + " index=\"book_index\",\n", + " size=5,\n", + " query={\"match\": {\"summary\": \"python programming\"}},\n", " knn={\n", " \"field\": \"title_vector\",\n", - " \"query_vector\" : model.encode(\"python programming\").tolist(), # generate embedding for query so it can be compared to `title_vector`\n", + " \"query_vector\": model.encode(\n", + " \"python programming\"\n", + " ).tolist(), # generate embedding for query so it can be compared to `title_vector`\n", " \"k\": 5,\n", - " \"num_candidates\": 10\n", + " \"num_candidates\": 10,\n", " },\n", - " rank={\n", - " \"rrf\": {}\n", - " }\n", + " rank={\"rrf\": {}},\n", ")\n", "\n", "pretty_response(response)" diff --git a/notebooks/search/03-ELSER.ipynb b/notebooks/search/03-ELSER.ipynb index 16edf6bc..c6c5e6af 100644 --- a/notebooks/search/03-ELSER.ipynb +++ b/notebooks/search/03-ELSER.ipynb @@ -111,7 +111,7 @@ "# Create the client instance\n", "client = Elasticsearch(\n", " # For local development\n", - " # hosts=[\"http://localhost:9200\"] \n", + " # hosts=[\"http://localhost:9200\"]\n", " cloud_id=ELASTIC_CLOUD_ID,\n", " api_key=ELASTIC_API_KEY,\n", ")" @@ -179,18 +179,15 @@ "source": [ "# delete model if already 
downloaded and deployed\n", "try:\n", - " client.ml.delete_trained_model(model_id=\".elser_model_2\",force=True)\n", - " print(\"Model deleted successfully, We will proceed with creating one\")\n", + " client.ml.delete_trained_model(model_id=\".elser_model_2\", force=True)\n", + " print(\"Model deleted successfully, We will proceed with creating one\")\n", "except exceptions.NotFoundError:\n", - " print(\"Model doesn't exist, but We will proceed with creating one\")\n", + " print(\"Model doesn't exist, but We will proceed with creating one\")\n", "\n", - "# Creates the ELSER model configuration. Automatically downloads the model if it doesn't exist. \n", + "# Creates the ELSER model configuration. Automatically downloads the model if it doesn't exist.\n", "client.ml.put_trained_model(\n", - " model_id=\".elser_model_2\",\n", - " input={\n", - " \"field_names\": [\"text_field\"]\n", - " }\n", - " )\n" + " model_id=\".elser_model_2\", input={\"field_names\": [\"text_field\"]}\n", + ")" ] }, { @@ -208,11 +205,10 @@ "source": [ "while True:\n", " status = client.ml.get_trained_models(\n", - " model_id=\".elser_model_2\",\n", - " include=\"definition_status\"\n", + " model_id=\".elser_model_2\", include=\"definition_status\"\n", " )\n", - " \n", - " if (status[\"trained_model_configs\"][0][\"fully_defined\"]):\n", + "\n", + " if status[\"trained_model_configs\"][0][\"fully_defined\"]:\n", " print(\"ELSER Model is downloaded and ready to be deployed.\")\n", " break\n", " else:\n", @@ -235,23 +231,19 @@ "source": [ "# Start trained model deployment if not already deployed\n", "client.ml.start_trained_model_deployment(\n", - " model_id=\".elser_model_2\",\n", - " number_of_allocations=1,\n", - " wait_for=\"starting\"\n", + " model_id=\".elser_model_2\", number_of_allocations=1, wait_for=\"starting\"\n", ")\n", "\n", "while True:\n", - " status = client.ml.get_trained_models_stats(\n", - " model_id=\".elser_model_2\",\n", - " )\n", - " if 
(status[\"trained_model_stats\"][0][\"deployment_stats\"][\"state\"] == \"started\"):\n", - " print(\"ELSER Model has been successfully deployed.\")\n", - " break\n", - " else:\n", - " print(\"ELSER Model is currently being deployed.\")\n", - " time.sleep(5)\n", - "\n", - "\n" + " status = client.ml.get_trained_models_stats(\n", + " model_id=\".elser_model_2\",\n", + " )\n", + " if status[\"trained_model_stats\"][0][\"deployment_stats\"][\"state\"] == \"started\":\n", + " print(\"ELSER Model has been successfully deployed.\")\n", + " break\n", + " else:\n", + " print(\"ELSER Model is currently being deployed.\")\n", + " time.sleep(5)" ] }, { @@ -298,21 +290,18 @@ ], "source": [ "client.ingest.put_pipeline(\n", - " id=\"elser-ingest-pipeline\", \n", + " id=\"elser-ingest-pipeline\",\n", " description=\"Ingest pipeline for ELSER\",\n", " processors=[\n", - " {\n", - " \"inference\": {\n", - " \"model_id\": \".elser_model_2\",\n", - " \"input_output\": [\n", - " {\n", - " \"input_field\": \"plot\",\n", - " \"output_field\": \"plot_embedding\"\n", + " {\n", + " \"inference\": {\n", + " \"model_id\": \".elser_model_2\",\n", + " \"input_output\": [\n", + " {\"input_field\": \"plot\", \"output_field\": \"plot_embedding\"}\n", + " ],\n", " }\n", - " ]\n", - " }\n", - " }\n", - " ]\n", + " }\n", + " ],\n", ")" ] }, @@ -373,28 +362,17 @@ "source": [ "client.indices.delete(index=\"elser-example-movies\", ignore_unavailable=True)\n", "client.indices.create(\n", - " index=\"elser-example-movies\",\n", - " settings={\n", - " \"index\": {\n", - " \"default_pipeline\": \"elser-ingest-pipeline\"\n", - " }\n", - " },\n", - " mappings={\n", - " \"properties\": {\n", - " \"plot\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\n", - " \"keyword\": {\n", - " \"type\": \"keyword\",\n", - " \"ignore_above\": 256\n", - " }\n", + " index=\"elser-example-movies\",\n", + " settings={\"index\": {\"default_pipeline\": \"elser-ingest-pipeline\"}},\n", + " mappings={\n", + " \"properties\": 
{\n", + " \"plot\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", + " },\n", + " \"plot_embedding\": {\"type\": \"sparse_vector\"},\n", " }\n", - " },\n", - " \"plot_embedding\": { \n", - " \"type\": \"sparse_vector\" \n", - " }\n", - " }\n", - " }\n", + " },\n", ")" ] }, @@ -440,10 +418,12 @@ "# Prepare the documents to be indexed\n", "documents = []\n", "for doc in data_json:\n", - " documents.append({\n", - " \"_index\": \"elser-example-movies\",\n", - " \"_source\": doc,\n", - " })\n", + " documents.append(\n", + " {\n", + " \"_index\": \"elser-example-movies\",\n", + " \"_source\": doc,\n", + " }\n", + " )\n", "\n", "# Use helpers.bulk to index\n", "helpers.bulk(client, documents)\n", @@ -511,23 +491,23 @@ ], "source": [ "response = client.search(\n", - " index='elser-example-movies', \n", + " index=\"elser-example-movies\",\n", " size=3,\n", " query={\n", " \"text_expansion\": {\n", " \"plot_embedding\": {\n", - " \"model_id\":\".elser_model_2\",\n", - " \"model_text\":\"fighting movie\"\n", + " \"model_id\": \".elser_model_2\",\n", + " \"model_text\": \"fighting movie\",\n", " }\n", " }\n", - " }\n", + " },\n", ")\n", "\n", - "for hit in response['hits']['hits']:\n", - " doc_id = hit['_id']\n", - " score = hit['_score']\n", - " title = hit['_source']['title']\n", - " plot = hit['_source']['plot']\n", + "for hit in response[\"hits\"][\"hits\"]:\n", + " doc_id = hit[\"_id\"]\n", + " score = hit[\"_score\"]\n", + " title = hit[\"_source\"][\"title\"]\n", + " plot = hit[\"_source\"][\"plot\"]\n", " print(f\"Score: {score}\\nTitle: {title}\\nPlot: {plot}\\n\")" ] }, diff --git a/notebooks/search/04-multilingual.ipynb b/notebooks/search/04-multilingual.ipynb index 3d0d8005..9b41984a 100644 --- a/notebooks/search/04-multilingual.ipynb +++ b/notebooks/search/04-multilingual.ipynb @@ -249,7 +249,7 @@ "# Create the client instance\n", "client = Elasticsearch(\n", " # For local development\n", - " # 
hosts=[\"http://localhost:9200\"] \n", + " # hosts=[\"http://localhost:9200\"]\n", " cloud_id=ELASTIC_CLOUD_ID,\n", " api_key=ELASTIC_API_KEY,\n", ")" @@ -347,8 +347,8 @@ " \"type\": \"dense_vector\",\n", " \"dims\": 768,\n", " \"index\": \"true\",\n", - " \"similarity\": \"cosine\"\n", - " }\n", + " \"similarity\": \"cosine\",\n", + " },\n", " }\n", "}\n", "\n", @@ -451,9 +451,9 @@ "def query(q, language=None):\n", " knn = {\n", " \"field\": \"passage_embedding\",\n", - " \"query_vector\" : model.encode(f\"query: {q}\").tolist(),\n", + " \"query_vector\": model.encode(f\"query: {q}\").tolist(),\n", " \"k\": 2,\n", - " \"num_candidates\": 5\n", + " \"num_candidates\": 5,\n", " }\n", "\n", " if language:\n", @@ -480,8 +480,8 @@ "outputs": [], "source": [ "def pretty_response(response):\n", - " if len(response['hits']['hits']) == 0:\n", - " print('Your search returned no results.')\n", + " if len(response[\"hits\"][\"hits\"]) == 0:\n", + " print(\"Your search returned no results.\")\n", " else:\n", " for hit in response[\"hits\"][\"hits\"]:\n", " score = hit[\"_score\"]\n", diff --git a/notebooks/search/05-query-rules.ipynb b/notebooks/search/05-query-rules.ipynb index f96de870..3d9e172d 100644 --- a/notebooks/search/05-query-rules.ipynb +++ b/notebooks/search/05-query-rules.ipynb @@ -93,7 +93,7 @@ "# Create the client instance\n", "client = Elasticsearch(\n", " # For local development\n", - " # hosts=[\"http://localhost:9200\"] \n", + " # hosts=[\"http://localhost:9200\"]\n", " cloud_id=ELASTIC_CLOUD_ID,\n", " api_key=ELASTIC_API_KEY,\n", ")" @@ -232,34 +232,35 @@ "outputs": [], "source": [ "def pretty_response(response):\n", - " if len(response['hits']['hits']) == 0:\n", - " print('Your search returned no results.')\n", + " if len(response[\"hits\"][\"hits\"]) == 0:\n", + " print(\"Your search returned no results.\")\n", " else:\n", - " for hit in response['hits']['hits']:\n", - " id = hit['_id']\n", - " score = hit['_score']\n", - " name = 
hit['_source']['name']\n", - " description = hit['_source']['description']\n", + " for hit in response[\"hits\"][\"hits\"]:\n", + " id = hit[\"_id\"]\n", + " score = hit[\"_score\"]\n", + " name = hit[\"_source\"][\"name\"]\n", + " description = hit[\"_source\"][\"description\"]\n", " price = hit[\"_source\"][\"price\"]\n", " currency = hit[\"_source\"][\"currency\"]\n", " plug_type = hit[\"_source\"][\"plug_type\"]\n", " voltage = hit[\"_source\"][\"voltage\"]\n", - " pretty_output = (f\"\\nID: {id}\\nName: {name}\\nDescription: {description}\\nPrice: {price}\\nCurrency: {currency}\\nPlug type: {plug_type}\\nVoltage: {voltage}\\nScore: {score}\")\n", + " pretty_output = f\"\\nID: {id}\\nName: {name}\\nDescription: {description}\\nPrice: {price}\\nCurrency: {currency}\\nPlug type: {plug_type}\\nVoltage: {voltage}\\nScore: {score}\"\n", " print(pretty_output)\n", "\n", + "\n", "def pretty_ruleset(response):\n", - " print(\"Ruleset ID: \" + response['ruleset_id'])\n", - " for rule in response['rules']:\n", - " rule_id = rule['rule_id']\n", - " type = rule['type']\n", + " print(\"Ruleset ID: \" + response[\"ruleset_id\"])\n", + " for rule in response[\"rules\"]:\n", + " rule_id = rule[\"rule_id\"]\n", + " type = rule[\"type\"]\n", " print(f\"\\nRule ID: {rule_id}\\n\\tType: {type}\\n\\tCriteria:\")\n", - " criteria = rule['criteria']\n", + " criteria = rule[\"criteria\"]\n", " for rule_criteria in criteria:\n", - " criteria_type = rule_criteria['type']\n", - " metadata = rule_criteria['metadata']\n", - " values = rule_criteria['values']\n", + " criteria_type = rule_criteria[\"type\"]\n", + " metadata = rule_criteria[\"metadata\"]\n", + " values = rule_criteria[\"values\"]\n", " print(f\"\\t\\t{metadata} {criteria_type} {values}\")\n", - " ids = rule['actions']['ids']\n", + " ids = rule[\"actions\"][\"ids\"]\n", " print(f\"\\tPinned ids: {ids}\")" ] }, @@ -321,12 +322,15 @@ } ], "source": [ - "response = client.search(index=\"products_index\", query={\n", - " 
\"multi_match\": {\n", - " \"query\": \"reliable wireless charger for iPhone\",\n", - " \"fields\": [ \"name^5\", \"description\" ]\n", - " }\n", - "})\n", + "response = client.search(\n", + " index=\"products_index\",\n", + " query={\n", + " \"multi_match\": {\n", + " \"query\": \"reliable wireless charger for iPhone\",\n", + " \"fields\": [\"name^5\", \"description\"],\n", + " }\n", + " },\n", + ")\n", "\n", "pretty_response(response)" ] @@ -371,50 +375,37 @@ } ], "source": [ - "client.query_ruleset.put(ruleset_id=\"promotion-rules\", rules=[\n", - " {\n", - " \"rule_id\": \"us-charger\",\n", - " \"type\": \"pinned\",\n", - " \"criteria\": [\n", + "client.query_ruleset.put(\n", + " ruleset_id=\"promotion-rules\",\n", + " rules=[\n", " {\n", - " \"type\": \"contains\",\n", - " \"metadata\": \"my_query\",\n", - " \"values\": [\"wireless charger\"]\n", + " \"rule_id\": \"us-charger\",\n", + " \"type\": \"pinned\",\n", + " \"criteria\": [\n", + " {\n", + " \"type\": \"contains\",\n", + " \"metadata\": \"my_query\",\n", + " \"values\": [\"wireless charger\"],\n", + " },\n", + " {\"type\": \"exact\", \"metadata\": \"country\", \"values\": [\"us\"]},\n", + " ],\n", + " \"actions\": {\"ids\": [\"us1\"]},\n", " },\n", " {\n", - " \"type\": \"exact\",\n", - " \"metadata\": \"country\",\n", - " \"values\": [\"us\"]\n", - " }\n", - " ],\n", - " \"actions\": {\n", - " \"ids\": [\n", - " \"us1\"\n", - " ]\n", - " }\n", - " },\n", - " {\n", - " \"rule_id\": \"uk-charger\",\n", - " \"type\": \"pinned\",\n", - " \"criteria\": [\n", - " {\n", - " \"type\": \"contains\",\n", - " \"metadata\": \"my_query\",\n", - " \"values\": [\"wireless charger\"]\n", + " \"rule_id\": \"uk-charger\",\n", + " \"type\": \"pinned\",\n", + " \"criteria\": [\n", + " {\n", + " \"type\": \"contains\",\n", + " \"metadata\": \"my_query\",\n", + " \"values\": [\"wireless charger\"],\n", + " },\n", + " {\"type\": \"exact\", \"metadata\": \"country\", \"values\": [\"uk\"]},\n", + " ],\n", + " \"actions\": 
{\"ids\": [\"uk1\"]},\n", " },\n", - " {\n", - " \"type\": \"exact\",\n", - " \"metadata\": \"country\",\n", - " \"values\": [\"uk\"]\n", - " }\n", - " ],\n", - " \"actions\": {\n", - " \"ids\": [\n", - " \"uk1\"\n", - " ]\n", - " }\n", - " }\n", - " ])" + " ],\n", + ")" ] }, { @@ -527,21 +518,24 @@ } ], "source": [ - "response = client.search(index=\"products_index\", query={\n", - " \"rule_query\": {\n", - " \"organic\": {\n", - " \"multi_match\": {\n", - " \"query\": \"reliable wireless charger for iPhone\",\n", - " \"fields\": [ \"name^5\", \"description\" ]\n", - " }\n", - " },\n", - " \"match_criteria\": {\n", - " \"my_query\": \"reliable wireless charger for iPhone\",\n", - " \"country\": \"us\"\n", - " },\n", - " \"ruleset_id\": \"promotion-rules\"\n", - " }\n", - "})\n", + "response = client.search(\n", + " index=\"products_index\",\n", + " query={\n", + " \"rule_query\": {\n", + " \"organic\": {\n", + " \"multi_match\": {\n", + " \"query\": \"reliable wireless charger for iPhone\",\n", + " \"fields\": [\"name^5\", \"description\"],\n", + " }\n", + " },\n", + " \"match_criteria\": {\n", + " \"my_query\": \"reliable wireless charger for iPhone\",\n", + " \"country\": \"us\",\n", + " },\n", + " \"ruleset_id\": \"promotion-rules\",\n", + " }\n", + " },\n", + ")\n", "\n", "pretty_response(response)" ] @@ -606,21 +600,24 @@ } ], "source": [ - "response = client.search(index=\"products_index\", query={\n", - " \"rule_query\": {\n", - " \"organic\": {\n", - " \"multi_match\": {\n", - " \"query\": \"reliable wireless charger for iPhone\",\n", - " \"fields\": [ \"name^5\", \"description\" ]\n", - " }\n", - " },\n", - " \"match_criteria\": {\n", - " \"my_query\": \"reliable wireless charger for iPhone\",\n", - " \"country\": \"ca\"\n", - " },\n", - " \"ruleset_id\": \"promotion-rules\"\n", - " }\n", - "})\n", + "response = client.search(\n", + " index=\"products_index\",\n", + " query={\n", + " \"rule_query\": {\n", + " \"organic\": {\n", + " \"multi_match\": {\n", 
+ " \"query\": \"reliable wireless charger for iPhone\",\n", + " \"fields\": [\"name^5\", \"description\"],\n", + " }\n", + " },\n", + " \"match_criteria\": {\n", + " \"my_query\": \"reliable wireless charger for iPhone\",\n", + " \"country\": \"ca\",\n", + " },\n", + " \"ruleset_id\": \"promotion-rules\",\n", + " }\n", + " },\n", + ")\n", "\n", "pretty_response(response)" ] @@ -653,64 +650,43 @@ } ], "source": [ - "client.query_ruleset.put(ruleset_id=\"promotion-rules\", rules=[\n", - " {\n", - " \"rule_id\": \"preorder\",\n", - " \"type\": \"pinned\",\n", - " \"criteria\": [\n", - " {\n", - " \"type\": \"always\"\n", - " }\n", - " ],\n", - " \"actions\": {\n", - " \"ids\": [\n", - " \"preview1\"\n", - " ]\n", - " }\n", - " },\n", - " {\n", - " \"rule_id\": \"us-charger\",\n", - " \"type\": \"pinned\",\n", - " \"criteria\": [\n", + "client.query_ruleset.put(\n", + " ruleset_id=\"promotion-rules\",\n", + " rules=[\n", " {\n", - " \"type\": \"contains\",\n", - " \"metadata\": \"my_query\",\n", - " \"values\": [\"wireless charger\"]\n", + " \"rule_id\": \"preorder\",\n", + " \"type\": \"pinned\",\n", + " \"criteria\": [{\"type\": \"always\"}],\n", + " \"actions\": {\"ids\": [\"preview1\"]},\n", " },\n", " {\n", - " \"type\": \"exact\",\n", - " \"metadata\": \"country\",\n", - " \"values\": [\"us\"]\n", - " }\n", - " ],\n", - " \"actions\": {\n", - " \"ids\": [\n", - " \"us1\"\n", - " ]\n", - " }\n", - " },\n", - " {\n", - " \"rule_id\": \"uk-charger\",\n", - " \"type\": \"pinned\",\n", - " \"criteria\": [\n", - " {\n", - " \"type\": \"contains\",\n", - " \"metadata\": \"my_query\",\n", - " \"values\": [\"wireless charger\"]\n", + " \"rule_id\": \"us-charger\",\n", + " \"type\": \"pinned\",\n", + " \"criteria\": [\n", + " {\n", + " \"type\": \"contains\",\n", + " \"metadata\": \"my_query\",\n", + " \"values\": [\"wireless charger\"],\n", + " },\n", + " {\"type\": \"exact\", \"metadata\": \"country\", \"values\": [\"us\"]},\n", + " ],\n", + " \"actions\": {\"ids\": 
[\"us1\"]},\n", " },\n", " {\n", - " \"type\": \"exact\",\n", - " \"metadata\": \"country\",\n", - " \"values\": [\"uk\"]\n", - " }\n", - " ],\n", - " \"actions\": {\n", - " \"ids\": [\n", - " \"uk1\"\n", - " ]\n", - " }\n", - " }\n", - " ])" + " \"rule_id\": \"uk-charger\",\n", + " \"type\": \"pinned\",\n", + " \"criteria\": [\n", + " {\n", + " \"type\": \"contains\",\n", + " \"metadata\": \"my_query\",\n", + " \"values\": [\"wireless charger\"],\n", + " },\n", + " {\"type\": \"exact\", \"metadata\": \"country\", \"values\": [\"uk\"]},\n", + " ],\n", + " \"actions\": {\"ids\": [\"uk1\"]},\n", + " },\n", + " ],\n", + ")" ] }, { @@ -771,21 +747,24 @@ } ], "source": [ - "response = client.search(index=\"products_index\", query={\n", - " \"rule_query\": {\n", - " \"organic\": {\n", - " \"multi_match\": {\n", - " \"query\": \"reliable wireless charger for iPhone\",\n", - " \"fields\": [ \"name^5\", \"description\" ]\n", - " }\n", - " },\n", - " \"match_criteria\": {\n", - " \"my_query\": \"reliable wireless charger for iPhone\",\n", - " \"country\": \"uk\"\n", - " },\n", - " \"ruleset_id\": \"promotion-rules\"\n", - " }\n", - "})\n", + "response = client.search(\n", + " index=\"products_index\",\n", + " query={\n", + " \"rule_query\": {\n", + " \"organic\": {\n", + " \"multi_match\": {\n", + " \"query\": \"reliable wireless charger for iPhone\",\n", + " \"fields\": [\"name^5\", \"description\"],\n", + " }\n", + " },\n", + " \"match_criteria\": {\n", + " \"my_query\": \"reliable wireless charger for iPhone\",\n", + " \"country\": \"uk\",\n", + " },\n", + " \"ruleset_id\": \"promotion-rules\",\n", + " }\n", + " },\n", + ")\n", "\n", "pretty_response(response)" ] diff --git a/notebooks/search/06-synonyms-api.ipynb b/notebooks/search/06-synonyms-api.ipynb index 5c6c5025..209b4ad4 100644 --- a/notebooks/search/06-synonyms-api.ipynb +++ b/notebooks/search/06-synonyms-api.ipynb @@ -1,589 +1,583 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": 
"87773ce7" - }, - "source": [ - "# Synonyms API quick start\n", - "\n", - "\"Open\n", - "\n", - "This interactive notebook will introduce you to the Synonyms API ([blog post](https://www.elastic.co/blog/update-synonyms-elasticsearch-introducing-synonyms-api), [API documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/synonyms-apis.html)) using the official [Elasticsearch Python client](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html). Synonyms allow you to enhance search relevancy by defining relationships between terms that have the similar meanings. In this notebook, you'll create & update synonyms sets, configure an index to use synonyms, and run queries that leverage synonyms for enhanced relevancy." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a32202e2" - }, - "source": [ - "## Create Elastic Cloud deployment\n", - "\n", - "If you don't have an Elastic Cloud deployment, sign up [here](https://cloud.elastic.co/registration?utm_source=github&utm_content=elasticsearch-labs-notebook) for a free trial.\n", - "\n", - "Once logged in to your Elastic Cloud account, go to the [Create deployment](https://cloud.elastic.co/deployments/create) page and select **Create deployment**. Leave all settings with their default values." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "52a6a607" - }, - "source": [ - "## Install packages and import modules\n", - "\n", - "To get started, we'll need to connect to our Elastic deployment using the Python client.\n", - "Because we're using an Elastic Cloud deployment, we'll use the **Cloud ID** to identify our deployment.\n", - "\n", - "First we need to install the `elasticsearch` Python client." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ffc5fa6f", - "outputId": "2afe8842-15be-4d34-9e0f-e4de7ffc7a13" - }, - "outputs": [], - "source": [ - "!pip install -qU elasticsearch" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0241694c" - }, - "source": [ - "## Initialize the Elasticsearch client\n", - "\n", - "Now we can instantiate the [Elasticsearch python client](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/index.html), providing the cloud id and password in your deployment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "f38e0397", - "outputId": "33239952-fa18-46f0-b4ee-285b0b4054ee" - }, - "outputs": [], - "source": [ - "from elasticsearch import Elasticsearch\n", - "from getpass import getpass\n", - "\n", - "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#finding-your-cloud-id\n", - "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", - "\n", - "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#creating-an-api-key\n", - "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", - "\n", - "# Create the client instance\n", - "client = Elasticsearch(\n", - " # For local development\n", - " # hosts=[\"http://localhost:9200\"] \n", - " cloud_id=ELASTIC_CLOUD_ID,\n", - " api_key=ELASTIC_API_KEY,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fcd165fa" - }, - "source": [ - "If you're running Elasticsearch locally or self-managed, you can pass in the Elasticsearch host instead. [Read more](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html#_verifying_https_with_certificate_fingerprints_python_3_10_or_later) on how to connect to Elasticsearch locally." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1462ebd8" - }, - "source": [ - "Confirm that the client has connected with this test." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "25c618eb", - "outputId": "9eb26926-d63e-478b-8aa1-8bdb2a5dfbd8" - }, - "outputs": [], - "source": [ - "print(client.info())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_ROfAyq7CL60" - }, - "source": [ - "## Configure & populate the index\n", - "\n", - "Our client is set up and connected to our Elastic deployment. Now we need to configure the index that will store our test data and populate it with some documents. We'll use a small index of books with the following fields:\n", - "\n", - "- `title`\n", - "- `authors`\n", - "- `publish_date`\n", - "- `num_reviews`\n", - "- `publisher`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create synonyms set\n", - "\n", - "Let's create our initial synonyms set first." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "synonyms_set = [\n", - " {\n", - " \"id\": \"synonym-1\",\n", - " \"synonyms\": \"js, javascript, java script\"\n", - " }\n", - "]\n", - "\n", - "client.synonyms.put_synonym(id=\"my-synonyms-set\", synonyms_set=synonyms_set)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-phOM4SOFopW" - }, - "source": [ - "### Configure the index\n", - "\n", - "Ensure that you do not have a previously created index with the name `book_index`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "pIl2dCpJGu1R", - "outputId": "294ae0c4-0cc0-45d8-ffd1-541115fdd31a" - }, - "outputs": [], - "source": [ - "client.indices.delete(index=\"book_index\", ignore_unavailable=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0fNVJ_JCHe04" - }, - "source": [ - "🔐 NOTE: at any time you can come back to this section and run the `delete` function above to remove your index and start from scratch." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IRMTg7siGykU" - }, - "source": [ - "\n", - "\n", - "In order to use synonyms, we need to define a [custom analyzer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-custom-analyzer.html) that uses the [`synonym`](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html) or [`synonym_graph`](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html) token filter. 
Let's create an index that's configured to use an appropriate custom analyzer.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "4AXB9IR8JjCT", - "outputId": "31d59878-88a8-4294-a727-0271d3890e1c" - }, - "outputs": [], - "source": [ - "settings = {\n", - " \"analysis\": {\n", - " \"analyzer\": {\n", - " \"my_custom_index_analyzer\": {\n", - " \"tokenizer\": \"standard\",\n", - " \"filter\": [\n", - " \"lowercase\"\n", - " ]\n", - " },\n", - " \"my_custom_search_analyzer\": {\n", - " \"tokenizer\": \"standard\",\n", - " \"filter\": [\n", - " \"lowercase\",\n", - " \"my_synonym_filter\"\n", - " ]\n", - " }\n", - " },\n", - " \"filter\": {\n", - " \"my_synonym_filter\": {\n", - " \"type\": \"synonym_graph\",\n", - " \"synonyms_set\": \"my-synonyms-set\",\n", - " \"updateable\": True\n", - " }\n", - " }\n", - " }\n", - "}\n", - "\n", - "mappings = {\n", - " \"properties\": {\n", - " \"title\": {\n", - " \"type\": \"text\",\n", - " \"analyzer\": \"my_custom_index_analyzer\",\n", - " \"search_analyzer\": \"my_custom_search_analyzer\"\n", - " },\n", - " \"summary\": {\n", - " \"type\": \"text\",\n", - " \"analyzer\": \"my_custom_index_analyzer\",\n", - " \"search_analyzer\": \"my_custom_search_analyzer\"\n", - " }\n", - " }\n", - "}\n", - "\n", - "client.indices.create(index='book_index', mappings=mappings, settings=settings)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YYa3kdKvJtZW" - }, - "source": [ - "There are a few things to note in the configuration:\n", - "\n", - "- We are using the [`synonym_graph` token filter](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html).\n", - "- We have defined two analyzers: `my_custom_index_analyzer` and `my_custom_search_analyzer`. 
`my_custom_search_analyzer` is used as a [search analyzer](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-analyzer.html).\n", - "- `my_synonym_filter` is used only in `my_custom_search_analyzer`.\n", - "\n", - "The `synonym_graph` token filter allows us to use multi-word synonyms. However, it is important to apply this filter only at search time, hence why we use it only in `my_custom_search_analyzer`. And since synonyms are only applied at search time, we can update them without reindexing.\n", - "\n", - "See [_The same, but different: Boosting the power of Elasticsearch with synonyms_](https://www.elastic.co/blog/boosting-the-power-of-elasticsearch-with-synonyms) for more background information about search-time synonyms." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e6uvE1K9GeMm" - }, - "source": [ - "### Populate the index\n", - "\n", - "Run the following command to upload some test data, containing information about 10 popular programming books from this [dataset](https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/notebooks/search/data.json)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "qX2jo_TzVwqR", - "outputId": "5a749972-a960-4218-b2df-58060dee265b" - }, - "outputs": [], - "source": [ - "import json\n", - "from urllib.request import urlopen\n", - "\n", - "url = \"https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/notebooks/search/data.json\"\n", - "response = urlopen(url)\n", - "books = json.loads(response.read())\n", - "\n", - "operations = []\n", - "for book in books:\n", - " operations.append({\"index\": {\"_index\": \"book_index\"}})\n", - " operations.append(book)\n", - "client.bulk(index=\"book_index\", operations=operations, refresh=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "50ghTAEYV4Yu" - }, - "source": [ - "## Aside: Pretty printing Elasticsearch search results\n", - "\n", - "Your `search` API calls will return hard-to-read nested JSON.\n", - "We'll create a little function called `pretty_search_response` to return nice, human-readable outputs from our examples." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "e1HgqDC4V_HW" - }, - "outputs": [], - "source": [ - "def pretty_search_response(response):\n", - " if len(response['hits']['hits']) == 0:\n", - " print('Your search returned no results.')\n", - " else:\n", - " for hit in response['hits']['hits']:\n", - " id = hit['_id']\n", - " publication_date = hit['_source']['publish_date']\n", - " score = hit['_score']\n", - " title = hit['_source']['title']\n", - " summary = hit['_source']['summary']\n", - " publisher = hit[\"_source\"][\"publisher\"]\n", - " num_reviews = hit[\"_source\"][\"num_reviews\"]\n", - " authors = hit[\"_source\"][\"authors\"]\n", - " pretty_output = (f\"\\nID: {id}\\nPublication date: {publication_date}\\nTitle: {title}\\nSummary: {summary}\\nPublisher: {publisher}\\nReviews: {num_reviews}\\nAuthors: {authors}\\nScore: {score}\")\n", - " print(pretty_output)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "OGwvVLQMW6lA" - }, - "source": [ - "## Run queries\n", - "\n", - "Let's use our synonyms in some Elasticsearch queries. We'll start by searching for books about Javascript." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "KPvOrmTBYDet", - "outputId": "8d9f3de5-2508-4ca0-91b1-ece5e6099bea" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "ID: 3NfpXIsBGHjk6-WLlqOE\n", - "Publication date: 2018-12-04\n", - "Title: Eloquent JavaScript\n", - "Summary: A modern introduction to programming\n", - "Publisher: no starch press\n", - "Reviews: 38\n", - "Authors: ['marijn haverbeke']\n", - "Score: 20.307524\n", - "\n", - "ID: 29fpXIsBGHjk6-WLlqOE\n", - "Publication date: 2015-03-27\n", - "Title: You Don't Know JS: Up & Going\n", - "Summary: Introduction to JavaScript and programming as a whole\n", - "Publisher: oreilly\n", - "Reviews: 36\n", - "Authors: ['kyle simpson']\n", - "Score: 19.787104\n", - "\n", - "ID: 39fpXIsBGHjk6-WLlqOE\n", - "Publication date: 2008-05-15\n", - "Title: JavaScript: The Good Parts\n", - "Summary: A deep dive into the parts of JavaScript that are essential to writing maintainable code\n", - "Publisher: oreilly\n", - "Reviews: 51\n", - "Authors: ['douglas crockford']\n", - "Score: 17.064087\n" - ] - } - ], - "source": [ - "response = client.search(\n", - " index=\"book_index\",\n", - " query={\n", - " \"multi_match\": {\n", - " \"query\": \"java script\",\n", - " \"fields\": [\n", - " \"title^10\",\n", - " \"summary\",\n", - " ]\n", - " }\n", - " }\n", - ")\n", - "\n", - "pretty_search_response(response)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9KFJaht4Yxvh" - }, - "source": [ - "Notice that even though we searched for the term \"java script\", we got results containing the terms \"JS\" and \"JavaScript\". Our synonyms are working!\n", - "\n", - "Now let's try searching for books about AI." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "oj1ynL5nZz0u", - "outputId": "f1968d2c-83a5-4b3c-f397-44b16e7ab46e" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Your search returned no results.\n" - ] - } - ], - "source": [ - "response = client.search(\n", - " index=\"book_index\",\n", - " query={\n", - " \"multi_match\": {\n", - " \"query\": \"AI\",\n", - " \"fields\": [\n", - " \"title^10\",\n", - " \"summary\",\n", - " ]\n", - " }\n", - " }\n", - ")\n", - "\n", - "pretty_search_response(response)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RtXj_JwyZ3DZ" - }, - "source": [ - "We didn't get any results! There are some books that use the terms \"artificial intelligence\", but not \"AI\". Let's try using the Synonyms API to add a new synonym rule for \"AI\" so the previous query returns results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "8sZ4nkpzgwMy", - "outputId": "d425906a-3f6e-4dc2-89ed-ca6bbef70b0b" - }, - "outputs": [], - "source": [ - "client.synonyms.put_synonym_rule(set_id=\"my-synonyms-set\", rule_id=\"synonym-2\", synonyms=\"ai, artificial intelligence\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KFgKAma1hMT_" - }, - "source": [ - "If we run the query again, we should now get some results." 
- ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "87773ce7" + }, + "source": [ + "# Synonyms API quick start\n", + "\n", + "\"Open\n", + "\n", + "This interactive notebook will introduce you to the Synonyms API ([blog post](https://www.elastic.co/blog/update-synonyms-elasticsearch-introducing-synonyms-api), [API documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/synonyms-apis.html)) using the official [Elasticsearch Python client](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html). Synonyms allow you to enhance search relevancy by defining relationships between terms that have the similar meanings. In this notebook, you'll create & update synonyms sets, configure an index to use synonyms, and run queries that leverage synonyms for enhanced relevancy." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "a32202e2" + }, + "source": [ + "## Create Elastic Cloud deployment\n", + "\n", + "If you don't have an Elastic Cloud deployment, sign up [here](https://cloud.elastic.co/registration?utm_source=github&utm_content=elasticsearch-labs-notebook) for a free trial.\n", + "\n", + "Once logged in to your Elastic Cloud account, go to the [Create deployment](https://cloud.elastic.co/deployments/create) page and select **Create deployment**. Leave all settings with their default values." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "52a6a607" + }, + "source": [ + "## Install packages and import modules\n", + "\n", + "To get started, we'll need to connect to our Elastic deployment using the Python client.\n", + "Because we're using an Elastic Cloud deployment, we'll use the **Cloud ID** to identify our deployment.\n", + "\n", + "First we need to install the `elasticsearch` Python client." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ffc5fa6f", + "outputId": "2afe8842-15be-4d34-9e0f-e4de7ffc7a13" + }, + "outputs": [], + "source": [ + "!pip install -qU elasticsearch" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0241694c" + }, + "source": [ + "## Initialize the Elasticsearch client\n", + "\n", + "Now we can instantiate the [Elasticsearch python client](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/index.html), providing the cloud id and password in your deployment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "f38e0397", + "outputId": "33239952-fa18-46f0-b4ee-285b0b4054ee" + }, + "outputs": [], + "source": [ + "from elasticsearch import Elasticsearch\n", + "from getpass import getpass\n", + "\n", + "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#finding-your-cloud-id\n", + "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", + "\n", + "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#creating-an-api-key\n", + "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", + "\n", + "# Create the client instance\n", + "client = Elasticsearch(\n", + " # For local development\n", + " # hosts=[\"http://localhost:9200\"]\n", + " cloud_id=ELASTIC_CLOUD_ID,\n", + " api_key=ELASTIC_API_KEY,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fcd165fa" + }, + "source": [ + "If you're running Elasticsearch locally or self-managed, you can pass in the Elasticsearch host instead. [Read more](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html#_verifying_https_with_certificate_fingerprints_python_3_10_or_later) on how to connect to Elasticsearch locally." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1462ebd8" + }, + "source": [ + "Confirm that the client has connected with this test." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "25c618eb", + "outputId": "9eb26926-d63e-478b-8aa1-8bdb2a5dfbd8" + }, + "outputs": [], + "source": [ + "print(client.info())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_ROfAyq7CL60" + }, + "source": [ + "## Configure & populate the index\n", + "\n", + "Our client is set up and connected to our Elastic deployment. Now we need to configure the index that will store our test data and populate it with some documents. We'll use a small index of books with the following fields:\n", + "\n", + "- `title`\n", + "- `authors`\n", + "- `publish_date`\n", + "- `num_reviews`\n", + "- `publisher`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create synonyms set\n", + "\n", + "Let's create our initial synonyms set first." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "synonyms_set = [{\"id\": \"synonym-1\", \"synonyms\": \"js, javascript, java script\"}]\n", + "\n", + "client.synonyms.put_synonym(id=\"my-synonyms-set\", synonyms_set=synonyms_set)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-phOM4SOFopW" + }, + "source": [ + "### Configure the index\n", + "\n", + "Ensure that you do not have a previously created index with the name `book_index`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pIl2dCpJGu1R", + "outputId": "294ae0c4-0cc0-45d8-ffd1-541115fdd31a" + }, + "outputs": [], + "source": [ + "client.indices.delete(index=\"book_index\", ignore_unavailable=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0fNVJ_JCHe04" + }, + "source": [ + "🔐 NOTE: at any time you can come back to this section and run the `delete` function above to remove your index and start from scratch." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IRMTg7siGykU" + }, + "source": [ + "\n", + "\n", + "In order to use synonyms, we need to define a [custom analyzer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-custom-analyzer.html) that uses the [`synonym`](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html) or [`synonym_graph`](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html) token filter. 
Let's create an index that's configured to use an appropriate custom analyzer.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4AXB9IR8JjCT", + "outputId": "31d59878-88a8-4294-a727-0271d3890e1c" + }, + "outputs": [], + "source": [ + "settings = {\n", + " \"analysis\": {\n", + " \"analyzer\": {\n", + " \"my_custom_index_analyzer\": {\n", + " \"tokenizer\": \"standard\",\n", + " \"filter\": [\"lowercase\"],\n", + " },\n", + " \"my_custom_search_analyzer\": {\n", + " \"tokenizer\": \"standard\",\n", + " \"filter\": [\"lowercase\", \"my_synonym_filter\"],\n", + " },\n", + " },\n", + " \"filter\": {\n", + " \"my_synonym_filter\": {\n", + " \"type\": \"synonym_graph\",\n", + " \"synonyms_set\": \"my-synonyms-set\",\n", + " \"updateable\": True,\n", + " }\n", + " },\n", + " }\n", + "}\n", + "\n", + "mappings = {\n", + " \"properties\": {\n", + " \"title\": {\n", + " \"type\": \"text\",\n", + " \"analyzer\": \"my_custom_index_analyzer\",\n", + " \"search_analyzer\": \"my_custom_search_analyzer\",\n", + " },\n", + " \"summary\": {\n", + " \"type\": \"text\",\n", + " \"analyzer\": \"my_custom_index_analyzer\",\n", + " \"search_analyzer\": \"my_custom_search_analyzer\",\n", + " },\n", + " }\n", + "}\n", + "\n", + "client.indices.create(index=\"book_index\", mappings=mappings, settings=settings)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YYa3kdKvJtZW" + }, + "source": [ + "There are a few things to note in the configuration:\n", + "\n", + "- We are using the [`synonym_graph` token filter](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html).\n", + "- We have defined two analyzers: `my_custom_index_analyzer` and `my_custom_search_analyzer`. 
`my_custom_search_analyzer` is used as a [search analyzer](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-analyzer.html).\n", + "- `my_synonym_filter` is used only in `my_custom_search_analyzer`.\n", + "\n", + "The `synonym_graph` token filter allows us to use multi-word synonyms. However, it is important to apply this filter only at search time, hence why we use it only in `my_custom_search_analyzer`. And since synonyms are only applied at search time, we can update them without reindexing.\n", + "\n", + "See [_The same, but different: Boosting the power of Elasticsearch with synonyms_](https://www.elastic.co/blog/boosting-the-power-of-elasticsearch-with-synonyms) for more background information about search-time synonyms." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e6uvE1K9GeMm" + }, + "source": [ + "### Populate the index\n", + "\n", + "Run the following command to upload some test data, containing information about 10 popular programming books from this [dataset](https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/notebooks/search/data.json)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qX2jo_TzVwqR", + "outputId": "5a749972-a960-4218-b2df-58060dee265b" + }, + "outputs": [], + "source": [ + "import json\n", + "from urllib.request import urlopen\n", + "\n", + "url = \"https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/notebooks/search/data.json\"\n", + "response = urlopen(url)\n", + "books = json.loads(response.read())\n", + "\n", + "operations = []\n", + "for book in books:\n", + " operations.append({\"index\": {\"_index\": \"book_index\"}})\n", + " operations.append(book)\n", + "client.bulk(index=\"book_index\", operations=operations, refresh=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "50ghTAEYV4Yu" + }, + "source": [ + "## Aside: Pretty printing Elasticsearch search results\n", + "\n", + "Your `search` API calls will return hard-to-read nested JSON.\n", + "We'll create a little function called `pretty_search_response` to return nice, human-readable outputs from our examples." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "e1HgqDC4V_HW" + }, + "outputs": [], + "source": [ + "def pretty_search_response(response):\n", + " if len(response[\"hits\"][\"hits\"]) == 0:\n", + " print(\"Your search returned no results.\")\n", + " else:\n", + " for hit in response[\"hits\"][\"hits\"]:\n", + " id = hit[\"_id\"]\n", + " publication_date = hit[\"_source\"][\"publish_date\"]\n", + " score = hit[\"_score\"]\n", + " title = hit[\"_source\"][\"title\"]\n", + " summary = hit[\"_source\"][\"summary\"]\n", + " publisher = hit[\"_source\"][\"publisher\"]\n", + " num_reviews = hit[\"_source\"][\"num_reviews\"]\n", + " authors = hit[\"_source\"][\"authors\"]\n", + " pretty_output = f\"\\nID: {id}\\nPublication date: {publication_date}\\nTitle: {title}\\nSummary: {summary}\\nPublisher: {publisher}\\nReviews: {num_reviews}\\nAuthors: {authors}\\nScore: {score}\"\n", + " print(pretty_output)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OGwvVLQMW6lA" + }, + "source": [ + "## Run queries\n", + "\n", + "Let's use our synonyms in some Elasticsearch queries. We'll start by searching for books about Javascript." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "KPvOrmTBYDet", + "outputId": "8d9f3de5-2508-4ca0-91b1-ece5e6099bea" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "KDx_V__QhIiy", - "outputId": "6d23e7f1-e129-4ee7-edf7-8e55ba1d0355" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "ID: 2dfpXIsBGHjk6-WLlqOE\n", - "Publication date: 2020-04-06\n", - "Title: Artificial Intelligence: A Modern Approach\n", - "Summary: Comprehensive introduction to the theory and practice of artificial intelligence\n", - "Publisher: pearson\n", - "Reviews: 39\n", - "Authors: ['stuart russell', 'peter norvig']\n", - "Score: 42.500813\n" - ] - } - ], - "source": [ - "response = client.search(\n", - " index=\"book_index\",\n", - " query={\n", - " \"multi_match\": {\n", - " \"query\": \"AI\",\n", - " \"fields\": [\n", - " \"title^10\",\n", - " \"summary\",\n", - " ]\n", - " }\n", - " }\n", - ")\n", - "\n", - "pretty_search_response(response)" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "ID: 3NfpXIsBGHjk6-WLlqOE\n", + "Publication date: 2018-12-04\n", + "Title: Eloquent JavaScript\n", + "Summary: A modern introduction to programming\n", + "Publisher: no starch press\n", + "Reviews: 38\n", + "Authors: ['marijn haverbeke']\n", + "Score: 20.307524\n", + "\n", + "ID: 29fpXIsBGHjk6-WLlqOE\n", + "Publication date: 2015-03-27\n", + "Title: You Don't Know JS: Up & Going\n", + "Summary: Introduction to JavaScript and programming as a whole\n", + "Publisher: oreilly\n", + "Reviews: 36\n", + "Authors: ['kyle simpson']\n", + "Score: 19.787104\n", + "\n", + "ID: 39fpXIsBGHjk6-WLlqOE\n", + "Publication date: 2008-05-15\n", + "Title: JavaScript: The Good Parts\n", + "Summary: A deep dive into the parts of JavaScript that are essential to 
writing maintainable code\n", + "Publisher: oreilly\n", + "Reviews: 51\n", + "Authors: ['douglas crockford']\n", + "Score: 17.064087\n" + ] + } + ], + "source": [ + "response = client.search(\n", + " index=\"book_index\",\n", + " query={\n", + " \"multi_match\": {\n", + " \"query\": \"java script\",\n", + " \"fields\": [\n", + " \"title^10\",\n", + " \"summary\",\n", + " ],\n", + " }\n", + " },\n", + ")\n", + "\n", + "pretty_search_response(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9KFJaht4Yxvh" + }, + "source": [ + "Notice that even though we searched for the term \"java script\", we got results containing the terms \"JS\" and \"JavaScript\". Our synonyms are working!\n", + "\n", + "Now let's try searching for books about AI." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "oj1ynL5nZz0u", + "outputId": "f1968d2c-83a5-4b3c-f397-44b16e7ab46e" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Conclusion\n", - "\n", - "The Synonyms API allows you to dynamically create & modify the synonyms used in your search index in real time. After reading this notebook, you should have all you need to start integrating the Synonyms API into your search experience!" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Your search returned no results.\n" + ] } - ], - "metadata": { + ], + "source": [ + "response = client.search(\n", + " index=\"book_index\",\n", + " query={\n", + " \"multi_match\": {\n", + " \"query\": \"AI\",\n", + " \"fields\": [\n", + " \"title^10\",\n", + " \"summary\",\n", + " ],\n", + " }\n", + " },\n", + ")\n", + "\n", + "pretty_search_response(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RtXj_JwyZ3DZ" + }, + "source": [ + "We didn't get any results! There are some books that use the terms \"artificial intelligence\", but not \"AI\". 
Let's try using the Synonyms API to add a new synonym rule for \"AI\" so the previous query returns results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" + "base_uri": "https://localhost:8080/" + }, + "id": "8sZ4nkpzgwMy", + "outputId": "d425906a-3f6e-4dc2-89ed-ca6bbef70b0b" + }, + "outputs": [], + "source": [ + "client.synonyms.put_synonym_rule(\n", + " set_id=\"my-synonyms-set\",\n", + " rule_id=\"synonym-2\",\n", + " synonyms=\"ai, artificial intelligence\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KFgKAma1hMT_" + }, + "source": [ + "If we run the query again, we should now get some results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - "language_info": { - "name": "python" + "id": "KDx_V__QhIiy", + "outputId": "6d23e7f1-e129-4ee7-edf7-8e55ba1d0355" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "ID: 2dfpXIsBGHjk6-WLlqOE\n", + "Publication date: 2020-04-06\n", + "Title: Artificial Intelligence: A Modern Approach\n", + "Summary: Comprehensive introduction to the theory and practice of artificial intelligence\n", + "Publisher: pearson\n", + "Reviews: 39\n", + "Authors: ['stuart russell', 'peter norvig']\n", + "Score: 42.500813\n" + ] } + ], + "source": [ + "response = client.search(\n", + " index=\"book_index\",\n", + " query={\n", + " \"multi_match\": {\n", + " \"query\": \"AI\",\n", + " \"fields\": [\n", + " \"title^10\",\n", + " \"summary\",\n", + " ],\n", + " }\n", + " },\n", + ")\n", + "\n", + "pretty_search_response(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "The Synonyms API allows you to dynamically create & modify the synonyms used in your search index in real time. 
After reading this notebook, you should have all you need to start integrating the Synonyms API into your search experience!" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/notebooks/search/07-inference.ipynb b/notebooks/search/07-inference.ipynb index 182b19e8..ca59cf90 100644 --- a/notebooks/search/07-inference.ipynb +++ b/notebooks/search/07-inference.ipynb @@ -113,7 +113,7 @@ "# Create the client instance\n", "client = Elasticsearch(\n", " # For local development\n", - " # hosts=[\"http://localhost:9200\"] \n", + " # hosts=[\"http://localhost:9200\"]\n", " cloud_id=ELASTIC_CLOUD_ID,\n", " api_key=ELASTIC_API_KEY,\n", ")" @@ -166,20 +166,16 @@ "metadata": {}, "outputs": [], "source": [ - "API_KEY = getpass.getpass('OpenAI API key: ')\n", + "API_KEY = getpass.getpass(\"OpenAI API key: \")\n", "\n", "client.inference.put_model(\n", " task_type=\"text_embedding\",\n", " model_id=\"my_openai_embedding_model\",\n", " body={\n", " \"service\": \"openai\",\n", - " \"service_settings\": {\n", - " \"api_key\": API_KEY\n", - " },\n", - " \"task_settings\": {\n", - " \"model\": \"text-embedding-ada-002\"\n", - " }\n", - " }\n", + " \"service_settings\": {\"api_key\": API_KEY},\n", + " \"task_settings\": {\"model\": \"text-embedding-ada-002\"},\n", + " },\n", ")" ] }, @@ -201,19 +197,19 @@ "outputs": [], "source": [ "client.ingest.put_pipeline(\n", - " id=\"openai_embeddings_pipeline\", \n", + " id=\"openai_embeddings_pipeline\",\n", " description=\"Ingest pipeline for OpenAI inference.\",\n", " processors=[\n", - " {\n", - " \"inference\": {\n", - " \"model_id\": \"my_openai_embedding_model\",\n", - " \"input_output\": {\n", - " \"input_field\": \"plot\",\n", - " \"output_field\": \"plot_embedding\"\n", + " {\n", + " \"inference\": {\n", + " 
\"model_id\": \"my_openai_embedding_model\",\n", + " \"input_output\": {\n", + " \"input_field\": \"plot\",\n", + " \"output_field\": \"plot_embedding\",\n", + " },\n", " }\n", - " }\n", - " }\n", - " ]\n", + " }\n", + " ],\n", ")" ] }, @@ -252,24 +248,18 @@ "source": [ "client.indices.delete(index=\"openai-movie-embeddings\", ignore_unavailable=True)\n", "client.indices.create(\n", - " index=\"openai-movie-embeddings\",\n", - " settings={\n", - " \"index\": {\n", - " \"default_pipeline\": \"openai_embeddings_pipeline\"\n", - " }\n", - " },\n", - " mappings={\n", - " \"properties\": {\n", - " \"plot_embedding\": { \n", - " \"type\": \"dense_vector\", \n", - " \"dims\": 1536, \n", - " \"similarity\": \"dot_product\" \n", - " },\n", - " \"plot\": {\n", - " \"type\": \"text\"\n", + " index=\"openai-movie-embeddings\",\n", + " settings={\"index\": {\"default_pipeline\": \"openai_embeddings_pipeline\"}},\n", + " mappings={\n", + " \"properties\": {\n", + " \"plot_embedding\": {\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 1536,\n", + " \"similarity\": \"dot_product\",\n", + " },\n", + " \"plot\": {\"type\": \"text\"},\n", " }\n", - " }\n", - " }\n", + " },\n", ")" ] }, @@ -299,10 +289,12 @@ "# Prepare the documents to be indexed\n", "documents = []\n", "for doc in data_json:\n", - " documents.append({\n", - " \"_index\": \"openai-movie-embeddings\",\n", - " \"_source\": doc,\n", - " })\n", + " documents.append(\n", + " {\n", + " \"_index\": \"openai-movie-embeddings\",\n", + " \"_source\": doc,\n", + " }\n", + " )\n", "\n", "# Use helpers.bulk to index\n", "helpers.bulk(client, documents)\n", @@ -348,26 +340,26 @@ ], "source": [ "response = client.search(\n", - " index='openai-movie-embeddings', \n", + " index=\"openai-movie-embeddings\",\n", " size=3,\n", " knn={\n", " \"field\": \"plot_embedding\",\n", " \"query_vector_builder\": {\n", " \"text_embedding\": {\n", " \"model_id\": \"my_openai_embedding_model\",\n", - " \"model_text\": \"Fighting movie\"\n", + " 
\"model_text\": \"Fighting movie\",\n", " }\n", " },\n", " \"k\": 10,\n", - " \"num_candidates\": 100\n", - " }\n", + " \"num_candidates\": 100,\n", + " },\n", ")\n", "\n", - "for hit in response['hits']['hits']:\n", - " doc_id = hit['_id']\n", - " score = hit['_score']\n", - " title = hit['_source']['title']\n", - " plot = hit['_source']['plot']\n", + "for hit in response[\"hits\"][\"hits\"]:\n", + " doc_id = hit[\"_id\"]\n", + " score = hit[\"_score\"]\n", + " title = hit[\"_source\"][\"title\"]\n", + " plot = hit[\"_source\"][\"plot\"]\n", " print(f\"Score: {score}\\nTitle: {title}\\nPlot: {plot}\\n\")" ] } diff --git a/notebooks/search/08-learning-to-rank.ipynb b/notebooks/search/08-learning-to-rank.ipynb index bbf2d8b5..3b8a0005 100644 --- a/notebooks/search/08-learning-to-rank.ipynb +++ b/notebooks/search/08-learning-to-rank.ipynb @@ -746,12 +746,16 @@ "\n", " # Adding a column to the dataframe for each feature:\n", " for feature_index, feature_name in enumerate(ltr_config.feature_names):\n", - " query_judgements_group[feature_name] = numpy.array([doc_features[doc_id][feature_index] for doc_id in doc_ids])\n", + " query_judgements_group[feature_name] = numpy.array(\n", + " [doc_features[doc_id][feature_index] for doc_id in doc_ids]\n", + " )\n", "\n", " return query_judgements_group\n", "\n", "\n", - "judgments_with_features = judgments_df.groupby(\"query_id\", group_keys=False).progress_apply(_extract_query_features)\n", + "judgments_with_features = judgments_df.groupby(\n", + " \"query_id\", group_keys=False\n", + ").progress_apply(_extract_query_features)\n", "\n", "judgments_with_features" ] @@ -937,7 +941,9 @@ "groups = judgments_with_features[\"query_id\"]\n", "\n", "# Split the dataset in two parts respectively used for training and evaluation of the model.\n", - "group_preserving_splitter = GroupShuffleSplit(n_splits=1, train_size=0.7).split(X, y, groups)\n", + "group_preserving_splitter = GroupShuffleSplit(n_splits=1, train_size=0.7).split(\n", + " 
X, y, groups\n", + ")\n", "train_idx, eval_idx = next(group_preserving_splitter)\n", "\n", "train_features, eval_features = X.loc[train_idx], X.loc[eval_idx]\n", @@ -1020,7 +1026,7 @@ "source": [ "from eland.ml import MLModel\n", "\n", - "LEARNING_TO_RANK_MODEL_ID=\"ltr-model-xgboost\"\n", + "LEARNING_TO_RANK_MODEL_ID = \"ltr-model-xgboost\"\n", "\n", "MLModel.import_ltr_model(\n", " es_client=es_client,\n", @@ -1097,13 +1103,13 @@ "\n", "# First let's display the result when not using the rescorer:\n", "search_fields = [\"title\", \"overview\", \"actors\", \"director\", \"tags\", \"characters\"]\n", - "bm25_query = { \"multi_match\": { \"query\": query, \"fields\": search_fields } }\n", + "bm25_query = {\"multi_match\": {\"query\": query, \"fields\": search_fields}}\n", "\n", "bm25_search_response = es_client.search(index=MOVIE_INDEX, query=bm25_query)\n", "\n", "[\n", " (movie[\"_source\"][\"title\"], movie[\"_score\"], movie[\"_id\"])\n", - " for movie in bm25_search_response['hits']['hits']\n", + " for movie in bm25_search_response[\"hits\"][\"hits\"]\n", "]" ] }, @@ -1143,7 +1149,9 @@ " \"window_size\": 100,\n", "}\n", "\n", - "rescored_search_response = es_client.search(index=MOVIE_INDEX, query=bm25_query, rescore=ltr_rescorer)\n", + "rescored_search_response = es_client.search(\n", + " index=MOVIE_INDEX, query=bm25_query, rescore=ltr_rescorer\n", + ")\n", "\n", "[\n", " (movie[\"_source\"][\"title\"], movie[\"_score\"], movie[\"_id\"])\n", diff --git a/notebooks/search/_nbtest.setup.ipynb b/notebooks/search/_nbtest.setup.ipynb index 2d321ae7..1913718f 100644 --- a/notebooks/search/_nbtest.setup.ipynb +++ b/notebooks/search/_nbtest.setup.ipynb @@ -24,7 +24,10 @@ "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", "\n", - "client = Elasticsearch(cloud_id=ELASTIC_CLOUD_ID, api_key=ELASTIC_API_KEY,)" + "client = Elasticsearch(\n", + " cloud_id=ELASTIC_CLOUD_ID,\n", + " api_key=ELASTIC_API_KEY,\n", + ")" ] 
}, { @@ -38,27 +41,31 @@ "from urllib.request import urlopen\n", "from sentence_transformers import SentenceTransformer\n", "\n", - "if NBTEST[\"notebook\"] in ['01-keyword-querying-filtering.ipynb', '02-hybrid-search.ipynb', '06-synonyms-api.ipynb']:\n", + "if NBTEST[\"notebook\"] in [\n", + " \"01-keyword-querying-filtering.ipynb\",\n", + " \"02-hybrid-search.ipynb\",\n", + " \"06-synonyms-api.ipynb\",\n", + "]:\n", " # these tests need book_index to exist ahead of time\n", " client.indices.delete(index=\"book_index\", ignore_unavailable=True)\n", - " \n", + "\n", " mappings = {\n", " \"properties\": {\n", " \"title_vector\": {\n", " \"type\": \"dense_vector\",\n", " \"dims\": 384,\n", " \"index\": \"true\",\n", - " \"similarity\": \"cosine\"\n", + " \"similarity\": \"cosine\",\n", " }\n", " }\n", " }\n", - " client.indices.create(index='book_index', mappings=mappings)\n", + " client.indices.create(index=\"book_index\", mappings=mappings)\n", "\n", " url = \"https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/notebooks/search/data.json\"\n", " response = urlopen(url)\n", " books = json.loads(response.read())\n", "\n", - " model = SentenceTransformer('all-MiniLM-L6-v2')\n", + " model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n", " operations = []\n", " for book in books:\n", " operations.append({\"index\": {\"_index\": \"book_index\"}})\n", diff --git a/notebooks/search/_nbtest.teardown.03-ELSER.ipynb b/notebooks/search/_nbtest.teardown.03-ELSER.ipynb index 43276f1f..56bfd869 100644 --- a/notebooks/search/_nbtest.teardown.03-ELSER.ipynb +++ b/notebooks/search/_nbtest.teardown.03-ELSER.ipynb @@ -13,7 +13,10 @@ "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", "\n", - "client = Elasticsearch(cloud_id=ELASTIC_CLOUD_ID, api_key=ELASTIC_API_KEY,)\n", + "client = Elasticsearch(\n", + " cloud_id=ELASTIC_CLOUD_ID,\n", + " api_key=ELASTIC_API_KEY,\n", + ")\n", "\n", "# delete the notebook's 
index\n", "client.indices.delete(index=\"elser-example-movies\", ignore_unavailable=True)\n", diff --git a/notebooks/search/_nbtest.teardown.ipynb b/notebooks/search/_nbtest.teardown.ipynb index c66a543a..9bebe368 100644 --- a/notebooks/search/_nbtest.teardown.ipynb +++ b/notebooks/search/_nbtest.teardown.ipynb @@ -33,7 +33,10 @@ "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", "\n", - "client = Elasticsearch(cloud_id=ELASTIC_CLOUD_ID, api_key=ELASTIC_API_KEY,)" + "client = Elasticsearch(\n", + " cloud_id=ELASTIC_CLOUD_ID,\n", + " api_key=ELASTIC_API_KEY,\n", + ")" ] }, { diff --git a/supporting-blog-content/Boston-Celtics-Demo/load_data_and_write_queries.ipynb b/supporting-blog-content/Boston-Celtics-Demo/load_data_and_write_queries.ipynb index 52632039..be1238ef 100644 --- a/supporting-blog-content/Boston-Celtics-Demo/load_data_and_write_queries.ipynb +++ b/supporting-blog-content/Boston-Celtics-Demo/load_data_and_write_queries.ipynb @@ -21,8 +21,8 @@ "outputs": [], "source": [ "nba_teams = teams.get_teams()\n", - "celtics = [team for team in nba_teams if team['abbreviation'] == 'BOS'][0]\n", - "celtics_id = celtics['id']" + "celtics = [team for team in nba_teams if team[\"abbreviation\"] == \"BOS\"][0]\n", + "celtics_id = celtics[\"id\"]" ] }, { @@ -1335,7 +1335,9 @@ } ], "source": [ - "current_season = games.loc[(games['GAME_DATE'] >= '2023-10-24') & (games['GAME_DATE'] <= '2024-06-20')]\n", + "current_season = games.loc[\n", + " (games[\"GAME_DATE\"] >= \"2023-10-24\") & (games[\"GAME_DATE\"] <= \"2024-06-20\")\n", + "]\n", "current_season" ] }, @@ -1418,12 +1420,13 @@ "metadata": {}, "outputs": [], "source": [ - "timeframe = 'boston_celtics_current_season'\n", + "timeframe = \"boston_celtics_current_season\"\n", + "\n", "\n", "def doc_generator(df, timeframe):\n", " for index, document in df.iterrows():\n", " yield {\n", - " \"_index\": timeframe, \n", + " \"_index\": timeframe,\n", " \"_id\": 
f\"{document['GAME_ID']}\",\n", " \"_source\": document.to_dict(),\n", " }" @@ -1457,13 +1460,7 @@ "metadata": {}, "outputs": [], "source": [ - "search_query = {\n", - " \"query\": {\n", - " \"match\": {\n", - " \"WL\": \"W\"\n", - " }\n", - " }\n", - "}\n", + "search_query = {\"query\": {\"match\": {\"WL\": \"W\"}}}\n", "\n", "games_won = es.count(index=\"boston_celtics_current_season\", body=search_query)" ] @@ -1494,15 +1491,9 @@ "outputs": [], "source": [ "streak_query = {\n", - " \"size\": 1000, \n", - " \"sort\": [\n", - " {\n", - " \"GAME_DATE\": {\n", - " \"order\": \"asc\"\n", - " }\n", - " }\n", - " ],\n", - " \"_source\": [\"GAME_DATE\", \"WL\"]\n", + " \"size\": 1000,\n", + " \"sort\": [{\"GAME_DATE\": {\"order\": \"asc\"}}],\n", + " \"_source\": [\"GAME_DATE\", \"WL\"],\n", "}" ] }, @@ -1513,9 +1504,7 @@ "metadata": {}, "outputs": [], "source": [ - "streak_search = es.search(\n", - " index=\"boston_celtics_current_season\",\n", - " body=streak_query)" + "streak_search = es.search(index=\"boston_celtics_current_season\", body=streak_query)" ] }, { @@ -1525,7 +1514,7 @@ "metadata": {}, "outputs": [], "source": [ - "gs = [hit['_source'] for hit in streak_search['hits']['hits']]" + "gs = [hit[\"_source\"] for hit in streak_search[\"hits\"][\"hits\"]]" ] }, { @@ -1546,14 +1535,14 @@ "streaks = []\n", "current_streak = 1\n", "for i in range(1, len(gs)):\n", - " if gs[i]['WL'] == gs[i-1]['WL']:\n", + " if gs[i][\"WL\"] == gs[i - 1][\"WL\"]:\n", " current_streak += 1\n", " else:\n", - " streaks.append((gs[i-1]['WL'], current_streak))\n", + " streaks.append((gs[i - 1][\"WL\"], current_streak))\n", " current_streak = 1\n", "\n", "\n", - "streaks.append((gs[-1]['WL'], current_streak))\n", + "streaks.append((gs[-1][\"WL\"], current_streak))\n", "top_streaks = sorted(streaks, key=lambda x: x[1], reverse=True)[:5]\n", "print(top_streaks)" ] diff --git a/supporting-blog-content/ElasticDocs_GPT/elasticdocs_gpt-summarize5.py 
b/supporting-blog-content/ElasticDocs_GPT/elasticdocs_gpt-summarize5.py index 30617014..73e39f78 100644 --- a/supporting-blog-content/ElasticDocs_GPT/elasticdocs_gpt-summarize5.py +++ b/supporting-blog-content/ElasticDocs_GPT/elasticdocs_gpt-summarize5.py @@ -9,7 +9,7 @@ import random # This code is part of an Elastic Blog showing how to combine -# Elasticsearch's search relevancy power with +# Elasticsearch's search relevancy power with # OpenAI's GPT's Question Answering power @@ -19,12 +19,13 @@ # cloud_user - Elasticsearch Cluster User # cloud_pass - Elasticsearch User Password -openai.api_key = os.environ['openai_api_key'] -openai.api_type = os.environ['openai_api_type'] -openai.api_base = os.environ['openai_api_base'] -openai.api_version = os.environ['openai_api_version'] +openai.api_key = os.environ["openai_api_key"] +openai.api_type = os.environ["openai_api_type"] +openai.api_base = os.environ["openai_api_base"] +openai.api_version = os.environ["openai_api_version"] openai.verify_ssl_certs = False -engine = os.environ['openai_api_engine'] +engine = os.environ["openai_api_engine"] + # Connect to Elastic Cloud cluster def es_connect(cid, user, passwd): @@ -32,51 +33,30 @@ def es_connect(cid, user, passwd): es = Elasticsearch(cloud_id=cid, http_auth=(user, passwd)) return es + # Search ElasticSearch index and return body and URL of the result def search(query_text, size): - cid = os.environ['cloud_id'] - cp = os.environ['cloud_pass'] - cu = os.environ['cloud_user'] + cid = os.environ["cloud_id"] + cp = os.environ["cloud_pass"] + cu = os.environ["cloud_user"] es = es_connect(cid, cu, cp) # Elasticsearch query (BM25) and kNN configuration for hybrid search query = { "bool": { - "should": [{ - "match": { - "title": { - "query": query_text, - "boost": 1, - "analyzer": "stop" - } - } - }, - { - "match": { - "body_content": { - "query": query_text, - "boost": 2 - } - } - }, - { - "match": { - "product_name.stem": { - "query": query_text, - "boost": 5 + "should": [ + 
{ + "match": { + "title": {"query": query_text, "boost": 1, "analyzer": "stop"} } - } - } - + }, + {"match": {"body_content": {"query": query_text, "boost": 2}}}, + {"match": {"product_name.stem": {"query": query_text, "boost": 5}}}, ], - "filter": [{ - "exists": { - "field": "title-vector" - } - }] + "filter": [{"exists": {"field": "title-vector"}}], } } - + knn = { "field": "title-vector", "k": 1, @@ -84,122 +64,155 @@ def search(query_text, size): "query_vector_builder": { "text_embedding": { "model_id": "sentence-transformers__all-distilroberta-v1", - "model_text": query_text + "model_text": query_text, } }, - "boost": 1 + "boost": 1, } - #compile list of filters, depending on checkboxes in UI + # compile list of filters, depending on checkboxes in UI productFilters = [] - if st.session_state['checkboxes'] != [None] * 10: - for filter in st.session_state['checkboxes']: - if filter['state']: - productFilters.append(filter['name']) - + if st.session_state["checkboxes"] != [None] * 10: + for filter in st.session_state["checkboxes"]: + if filter["state"]: + productFilters.append(filter["name"]) + if productFilters != []: # add terms filter to query - query['bool']['filter'].append({ - "terms": { - "product_name.enum": productFilters - } - }) + query["bool"]["filter"].append( + {"terms": {"product_name.enum": productFilters}} + ) # add terms filter to knn - knn['filter'] = { - "terms": { - "product_name.enum": productFilters - } + knn["filter"] = {"terms": {"product_name.enum": productFilters}} + + agg = { + "all_products": { + "global": {}, + "aggs": { + "filtered": { + "filter": { + "bool": { + "must": [ + { + "match": { + "title": { + "query": "how", + "boost": 1, + "analyzer": "stop", + } + } + } + ], + "filter": [{"exists": {"field": "title-vector"}}], + } + }, + "aggs": { + "products": { + "terms": {"field": "product_name.enum", "size": 10} + } + }, } - - agg = { - "all_products": { - "global": {}, - "aggs": { - "filtered": { - "filter": { - "bool": { - 
"must": [ - { - "match": { - "title": { - "query": "how", - "boost": 1, - "analyzer": "stop" - } - } - } - ], - "filter": [ - { - "exists": { - "field": "title-vector" - } - } - ] - } - }, - "aggs": { - "products": { - "terms": { - "field": "product_name.enum", - "size": 10 - } - } - } + }, } - } } - } fields = ["title", "body_content", "url", "product_name"] - index = 'search-elastic-docs,search-elastic-docs-2' - resp = es.search(index=index,query=query,knn=knn,fields=fields,size=size,source=False, aggs=agg) + index = "search-elastic-docs,search-elastic-docs-2" + resp = es.search( + index=index, + query=query, + knn=knn, + fields=fields, + size=size, + source=False, + aggs=agg, + ) return resp + # limit the prompt to the max tokens allowed def truncate_text(text, max_tokens): tokens = text.split() if len(tokens) <= max_tokens: return text - return ' '.join(tokens[:max_tokens]) + return " ".join(tokens[:max_tokens]) # Generate a response from ChatGPT based on the given prompt -def chat_gpt(prompt, result, index, traceparent, apm, model="gpt-3.5-turbo", max_tokens=1024, max_context_tokens=4000, safety_margin=1000): +def chat_gpt( + prompt, + result, + index, + traceparent, + apm, + model="gpt-3.5-turbo", + max_tokens=1024, + max_context_tokens=4000, + safety_margin=1000, +): # Truncate the prompt content to fit within the model's context length parent = elasticapm.trace_parent_from_string(traceparent) - apm.begin_transaction('openai', trace_parent=parent) + apm.begin_transaction("openai", trace_parent=parent) print("request to openai") - truncated_prompt = truncate_text(prompt, max_context_tokens - max_tokens - safety_margin) + truncated_prompt = truncate_text( + prompt, max_context_tokens - max_tokens - safety_margin + ) + + response = openai.ChatCompletion.create( + engine=engine, + temperature=0, + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": truncated_prompt}, + ], + ) - response = 
openai.ChatCompletion.create(engine=engine, - temperature=0, - messages=[{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": truncated_prompt}]) - result[index] = response - apm.end_transaction('openai', 'success') + apm.end_transaction("openai", "success") + # Generate a response from ChatGPT based on the given prompt, async version -async def achat_gpt(prompt, result, index, element, model="gpt-3.5-turbo", max_tokens=1024, max_context_tokens=4000, safety_margin=1000): +async def achat_gpt( + prompt, + result, + index, + element, + model="gpt-3.5-turbo", + max_tokens=1024, + max_context_tokens=4000, + safety_margin=1000, +): # Truncate the prompt content to fit within the model's context length - - truncated_prompt = truncate_text(prompt, max_context_tokens - max_tokens - safety_margin) - + + truncated_prompt = truncate_text( + prompt, max_context_tokens - max_tokens - safety_margin + ) + tries = 0 while tries < 5: - try: - print("request to openai for task number: " + str(index) + " attempt: " + str(tries)) + try: + print( + "request to openai for task number: " + + str(index) + + " attempt: " + + str(tries) + ) output = "" counter = 0 element.empty() - async with elasticapm.async_capture_span('openaiChatCompletion', span_type='openai'): + async with elasticapm.async_capture_span( + "openaiChatCompletion", span_type="openai" + ): async for chunk in await openai.ChatCompletion.acreate( - engine=engine, - messages=[{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": truncated_prompt}], + engine=engine, + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": truncated_prompt}, + ], stream=True, temperature=0, presence_penalty=-1, frequency_penalty=-1, - ): + ): content = chunk["choices"][0].get("delta", {}).get("content") # the counter tracks the number of tokens we received # this is not required, but it's a good way to track the cost 
later @@ -211,7 +224,7 @@ async def achat_gpt(prompt, result, index, element, model="gpt-3.5-turbo", max_t # concatenate the output to the previous one, so have the full response at the end output += content # with every token we get, we update the element - #time.sleep(0.01) + # time.sleep(0.01) element.markdown(output) break except Exception as e: @@ -223,93 +236,112 @@ async def achat_gpt(prompt, result, index, element, model="gpt-3.5-turbo", max_t element.error("Error: " + str(e)) else: print("retrying...") - - + # add the output to the result dictionary, so we can access the full response later - result[index] = {'usage': {"total_tokens": counter }, "choices": [{"message": {"content": output}}]} + result[index] = { + "usage": {"total_tokens": counter}, + "choices": [{"message": {"content": output}}], + } # update the completed tasks counter, so the UI can show the progress if multiple requests are done in parallel - st.session_state['completed'] = st.session_state['completed'] + 1 - + st.session_state["completed"] = st.session_state["completed"] + 1 + # update the progress bar, with a workaround to limit the progress to 100% - progress = st.session_state['completed']/(numberOfResults) + progress = st.session_state["completed"] / (numberOfResults) if progress > 1: progress = 1 - st.session_state['topResult'].progress(progress, text=f"loading individual results...{st.session_state['completed']}/{numberOfResults}") + st.session_state["topResult"].progress( + progress, + text=f"loading individual results...{st.session_state['completed']}/{numberOfResults}", + ) print("##################finished request to openai for task number: " + str(index)) + + # exception handling for the async tasks, so we can show the error in the UI def handle_exception(loop, context): - client = elasticapm.get_client() + client = elasticapm.get_client() # context["message"] will always be there; but context["exception"] may not msg = context.get("exception", context["message"]) exception 
= context.get("exception", context["message"]) - with st.session_state['topResult']: + with st.session_state["topResult"]: st.error(msg) - #apmClient.ca() - client = elasticapm.get_client() + # apmClient.ca() + client = elasticapm.get_client() elasticapm.set_transaction_outcome("failure") client.capture_exception(exc_info=("OpenAiError", exception, None)) apmClient.end_transaction("user-query") print("Caught exception: #####################") print(msg) print(exception) - #asyncio.create_task(shutdown(loop)) + # asyncio.create_task(shutdown(loop)) st.set_page_config(layout="wide") st.title("ElasticDocs GPT") + @st.cache_resource def initAPM(): # the APM Agent is initialized apmClient = elasticapm.Client(service_name="elasticdocs-gpt-v2-streaming") # the default instrumentation is applied # this will instrument the most common libraries - elasticapm.instrument() + elasticapm.instrument() return apmClient + apmClient = initAPM() # start building the UI, adding a title and a sidebar with st.sidebar: - st.sidebar.title("Options") - st.session_state['summarizeResults'] = ({ 'name': 'summarization', 'state': st.checkbox('summarization', value=True)}) - st.session_state['hideirrelevant'] = ({ 'name': 'hideirrelevant', 'state': st.checkbox('hide irrelevant', value=False)}) - st.sidebar.title("Filters") + st.sidebar.title("Options") + st.session_state["summarizeResults"] = { + "name": "summarization", + "state": st.checkbox("summarization", value=True), + } + st.session_state["hideirrelevant"] = { + "name": "hideirrelevant", + "state": st.checkbox("hide irrelevant", value=False), + } + st.sidebar.title("Filters") # this number controls the number of results to show # setting this higher will increase the cost, as more requests are made to the model numberOfResults = 5 # initialize the session state -if 'resp' not in st.session_state: - st.session_state['resp'] = None - st.session_state['summary'] = None - st.session_state['openai_tokens'] = 0 - 
st.session_state['runtime_es'] = 0 - st.session_state['runtime_openai'] = 0 - st.session_state['openai_tokens'] = 0 - st.session_state['openai_current_tokens'] = 0 - st.session_state['results'] = [None] * numberOfResults - st.session_state['checkboxes'] = [None] * 10 - st.session_state['topResult'] = None - st.session_state['completed'] = 0 +if "resp" not in st.session_state: + st.session_state["resp"] = None + st.session_state["summary"] = None + st.session_state["openai_tokens"] = 0 + st.session_state["runtime_es"] = 0 + st.session_state["runtime_openai"] = 0 + st.session_state["openai_tokens"] = 0 + st.session_state["openai_current_tokens"] = 0 + st.session_state["results"] = [None] * numberOfResults + st.session_state["checkboxes"] = [None] * 10 + st.session_state["topResult"] = None + st.session_state["completed"] = 0 # Main chat form -with st.form("chat_form", ): +with st.form( + "chat_form", +): query = st.text_input("You: ") - submit_button = st.form_submit_button("Search", ) + submit_button = st.form_submit_button( + "Search", + ) # placeholder for the top result that we can fill later - st.session_state['topResult'] = st.empty() + st.session_state["topResult"] = st.empty() # build a placeholder structure consisting of rows and columns rows = [None] * numberOfResults -col0h, col1h, col2h = st.columns([1,3,3]) +col0h, col1h, col2h = st.columns([1, 3, 3]) col1h.markdown(f"#### ChatGPT Results") col2h.markdown(f"#### Docs Results") @@ -318,10 +350,13 @@ def initAPM(): # the user submitted a query if submit_button: - st.session_state['summary'] = None - st.session_state['completed'] = 0 + st.session_state["summary"] = None + st.session_state["completed"] = 0 # show a progress bar - st.session_state['topResult'].progress(st.session_state['completed']/(numberOfResults), text=f"loading individual results...") + st.session_state["topResult"].progress( + st.session_state["completed"] / (numberOfResults), + text=f"loading individual results...", + ) # start the APM 
transaction apmClient.begin_transaction("user-query") @@ -330,10 +365,10 @@ def initAPM(): elasticapm.label(query=query) start = time.time() # query Elasticsearch for the top N results - resp = search(query, numberOfResults ) + resp = search(query, numberOfResults) end = time.time() - st.session_state['runtime_es'] = end - start - st.session_state['resp'] = resp + st.session_state["runtime_es"] = end - start + st.session_state["resp"] = resp counter = 0 threads = [] @@ -345,91 +380,122 @@ def initAPM(): loop.set_exception_handler(handle_exception) asyncio.set_event_loop(loop) tasks = [] - - # prepare the facets/filtering - st.session_state['checkboxes'] = [None] * len(resp['aggregations']['all_products']['filtered']['products']['buckets']) + # prepare the facets/filtering + st.session_state["checkboxes"] = [None] * len( + resp["aggregations"]["all_products"]["filtered"]["products"]["buckets"] + ) - #for hit in resp['hits']['hits']: + # for hit in resp['hits']['hits']: # body = hit['fields']['body_content'][0] # url = hit['fields']['url'][0] # counter += 1 counter = 0 with elasticapm.capture_span("individual-results", "openai"): - for hit in resp['hits']['hits']: - body = hit['fields']['body_content'][0] - url = hit['fields']['url'][0] + for hit in resp["hits"]["hits"]: + body = hit["fields"]["body_content"][0] + url = hit["fields"]["url"][0] prompt = f"Answer this question: {query}\n. Don’t give information not mentioned in the CONTEXT INFORMATION. If the CONTEXT INFORMATION contains code or API requests, your response should include code snippets to use in Kibana DevTools Console. 
If the context does not contain relevant information, answer 'The provided page does not answer the question': \n {body}" counter += 1 - + try: - with rows[counter-1]: + with rows[counter - 1]: with st.container(): - #col0, col1, col2 = rows[counter-1] - col0, col1, col2 = st.columns([1,3,3]) + # col0, col1, col2 = rows[counter-1] + col0, col1, col2 = st.columns([1, 3, 3]) col0.markdown(f"***") - col0.write(f"**Result {counter}:** ", unsafe_allow_html=False) - col0.write(f"**{resp['hits']['hits'][counter-1]['fields']['product_name'][0]}** ", unsafe_allow_html=False) + col0.write( + f"**Result {counter}:** ", unsafe_allow_html=False + ) + col0.write( + f"**{resp['hits']['hits'][counter-1]['fields']['product_name'][0]}** ", + unsafe_allow_html=False, + ) col1.markdown(f"***") - col1.markdown(f"**{resp['hits']['hits'][counter-1]['fields']['title'][0].strip()}**") - element = col1.markdown('') - - tasks.append(loop.create_task(achat_gpt(prompt, results, counter -1, element))) + col1.markdown( + f"**{resp['hits']['hits'][counter-1]['fields']['title'][0].strip()}**" + ) + element = col1.markdown("") + + tasks.append( + loop.create_task( + achat_gpt(prompt, results, counter - 1, element) + ) + ) col2.markdown(f"***") - col2.markdown(f"**{resp['hits']['hits'][counter-1]['fields']['title'][0].strip() }**") - - content = resp['hits']['hits'][counter-1]['fields']['body_content'][0] - # limit content length to 200 chars + col2.markdown( + f"**{resp['hits']['hits'][counter-1]['fields']['title'][0].strip() }**" + ) + + content = resp["hits"]["hits"][counter - 1]["fields"][ + "body_content" + ][0] + # limit content length to 200 chars if len(content) > 200: content = content[:1000] + "..." 
col2.write(f"{content}", unsafe_allow_html=False) - col2.write(f"**Docs**: {resp['hits']['hits'][counter-1]['fields']['url']}", unsafe_allow_html=False) - col2.write(f"score: {resp['hits']['hits'][counter-1]['_score']}", unsafe_allow_html=False) - rows[counter-1] = col0, col1, col2 + col2.write( + f"**Docs**: {resp['hits']['hits'][counter-1]['fields']['url']}", + unsafe_allow_html=False, + ) + col2.write( + f"score: {resp['hits']['hits'][counter-1]['_score']}", + unsafe_allow_html=False, + ) + rows[counter - 1] = col0, col1, col2 except: pass - + # t = Thread(target=chat_gpt, args=(prompt, results, counter, elasticapm.get_trace_parent_header(), apmClient)) - #element = st.empty() - #element.markdown('') - #tasks.append(loop.create_task(achat_gpt(prompt, results, counter, elasticapm.get_trace_parent_header(), apmClient, element))) - - #threads.append(t) + # element = st.empty() + # element.markdown('') + # tasks.append(loop.create_task(achat_gpt(prompt, results, counter, elasticapm.get_trace_parent_header(), apmClient, element))) - #for thread in threads: + # threads.append(t) + + # for thread in threads: # thread.start() # Wait for all of them to finish start = time.time() - #for x in threads: + # for x in threads: # x.join() print("waiting for openai tasks to finish") loop.run_until_complete(asyncio.wait(tasks)) - st.session_state['results'] = results - + st.session_state["results"] = results + end = time.time() print("openai tasks done") - st.session_state['runtime_openai'] = end - start + st.session_state["runtime_openai"] = end - start counter = 0 - st.session_state['openai_current_tokens'] = 0 + st.session_state["openai_current_tokens"] = 0 for i, resultObject in enumerate(results): - st.session_state['openai_current_tokens'] = st.session_state['openai_current_tokens'] + resultObject['usage']["total_tokens"] + len(resp['hits']['hits'][i]['fields']['body_content'][0]) / 4 - st.session_state['openai_tokens'] = st.session_state['openai_tokens'] + 
st.session_state['openai_current_tokens'] + st.session_state["openai_current_tokens"] = ( + st.session_state["openai_current_tokens"] + + resultObject["usage"]["total_tokens"] + + len(resp["hits"]["hits"][i]["fields"]["body_content"][0]) / 4 + ) + st.session_state["openai_tokens"] = ( + st.session_state["openai_tokens"] + + st.session_state["openai_current_tokens"] + ) concatResult = "" - for resultObject in results: - - if not resultObject['choices'][0]["message"]["content"].startswith("The provided page does not answer the question"): - concatResult += resultObject['choices'][0]["message"]["content"] - #st.session_state['topResult'].progress(st.session_state['completed']/(numberOfResults), text=f"loading individual results...") - if st.session_state['summarizeResults']['state']: + for resultObject in results: + + if not resultObject["choices"][0]["message"]["content"].startswith( + "The provided page does not answer the question" + ): + concatResult += resultObject["choices"][0]["message"]["content"] + # st.session_state['topResult'].progress(st.session_state['completed']/(numberOfResults), text=f"loading individual results...") + if st.session_state["summarizeResults"]["state"]: results = [None] * 1 tasks = [] prompt = f"I will give you {numberOfResults} answers to this question.: \"{query}\"\n. They are ordered by their likelyhood to be correct. Come up with the best answer to the original question, using only the context I will provide you here. If the provided context contains code snippets or API requests, half of your response must be code snippets or API requests. 
If the context does not contain relevant information, answer 'The provided page does not answer the question': \n {concatResult}" element = None - with st.session_state['topResult']: + with st.session_state["topResult"]: with st.container(): st.markdown(f"**Summary of all results:**") element = st.empty() @@ -439,109 +505,151 @@ def initAPM(): tasks.append(task) loop.set_exception_handler(handle_exception) loop.run_until_complete(asyncio.wait(tasks)) - st.session_state['summary'] = results[0]['choices'][0]["message"]["content"] - st.session_state['openai_current_tokens'] += results[0]['usage']["total_tokens"] + len(concatResult) / 4 - st.session_state['openai_tokens'] += st.session_state['openai_current_tokens'] + st.session_state["summary"] = results[0]["choices"][0]["message"]["content"] + st.session_state["openai_current_tokens"] += ( + results[0]["usage"]["total_tokens"] + len(concatResult) / 4 + ) + st.session_state["openai_tokens"] += st.session_state[ + "openai_current_tokens" + ] loop.close() - else: + else: time.sleep(0.5) - st.session_state['topResult'].empty() + st.session_state["topResult"].empty() - elasticapm.label(openapi_tokens = st.session_state['openai_current_tokens']) - elasticapm.label(openapi_cost = st.session_state['openai_current_tokens'] / 1000 * 0.002) + elasticapm.label(openapi_tokens=st.session_state["openai_current_tokens"]) + elasticapm.label( + openapi_cost=st.session_state["openai_current_tokens"] / 1000 * 0.002 + ) elasticapm.set_transaction_outcome("success") apmClient.end_transaction("user-query") - + except Exception as e: - apmClient.capture_exception() - elasticapm.set_transaction_outcome("failure") - apmClient.end_transaction("user-query") - print(e) - st.error(e) - -if st.session_state['resp'] != None: + apmClient.capture_exception() + elasticapm.set_transaction_outcome("failure") + apmClient.end_transaction("user-query") + print(e) + st.error(e) + +if st.session_state["resp"] != None: try: - resp = st.session_state['resp'] + 
resp = st.session_state["resp"] counter = 0 threads = [] - st.session_state['checkboxes'] = [None] * (len(resp['aggregations']['all_products']['filtered']['products']['buckets'])) - + st.session_state["checkboxes"] = [None] * ( + len(resp["aggregations"]["all_products"]["filtered"]["products"]["buckets"]) + ) - for i, product in enumerate(resp['aggregations']['all_products']['filtered']['products']['buckets']): + for i, product in enumerate( + resp["aggregations"]["all_products"]["filtered"]["products"]["buckets"] + ): with st.sidebar: - value = product['key'] + ' (' + str(product['doc_count']) + ')' - st.session_state['checkboxes'][i] = ({ 'name': product['key'], 'state': st.checkbox(value, value=False)}) + value = product["key"] + " (" + str(product["doc_count"]) + ")" + st.session_state["checkboxes"][i] = { + "name": product["key"], + "state": st.checkbox(value, value=False), + } - for hit in resp['hits']['hits']: - body = hit['fields']['body_content'][0] - url = hit['fields']['url'][0] + for hit in resp["hits"]["hits"]: + body = hit["fields"]["body_content"][0] + url = hit["fields"]["url"][0] counter += 1 counter = 0 with col1h: with st.expander("See cost and runtime details"): - st.markdown(f"OpenAi request cost: ${round(st.session_state['openai_current_tokens'] / 1000 * 0.002, 3)}") - st.markdown(f"OpenAi running cost: ${round(st.session_state['openai_tokens'] / 1000 * 0.002, 3)}") - st.markdown(f"OpenAi request duration: {round(st.session_state['runtime_openai'], 3)} seconds") + st.markdown( + f"OpenAi request cost: ${round(st.session_state['openai_current_tokens'] / 1000 * 0.002, 3)}" + ) + st.markdown( + f"OpenAi running cost: ${round(st.session_state['openai_tokens'] / 1000 * 0.002, 3)}" + ) + st.markdown( + f"OpenAi request duration: {round(st.session_state['runtime_openai'], 3)} seconds" + ) with col2h: with st.expander("See runtime details"): st.markdown("") st.markdown("") - st.markdown(f"Elasticsearch query duration: 
{round(st.session_state['runtime_es'], 3)} seconds") - for resultObject in st.session_state['results']: - result = resultObject['choices'][0]["message"]["content"] + st.markdown( + f"Elasticsearch query duration: {round(st.session_state['runtime_es'], 3)} seconds" + ) + for resultObject in st.session_state["results"]: + result = resultObject["choices"][0]["message"]["content"] counter += 1 print("##################################") showResult = False - if st.session_state['hideirrelevant']['state']: + if st.session_state["hideirrelevant"]["state"]: if result != "The provided page does not answer the question.": showResult = True - else: + else: showResult = True if showResult: print("relevant") try: - with rows[counter-1]: - with st.container(): - col0, col1, col2 = st.columns([1,3,3]) - col0.markdown(f"***") - col0.write(f"**Result {counter}:** ", unsafe_allow_html=False) - col0.write(f"**{resp['hits']['hits'][counter-1]['fields']['product_name'][0]}** ", unsafe_allow_html=False) - - col1.markdown(f"***") - - col1.markdown(f"**{resp['hits']['hits'][counter-1]['fields']['title'][0].strip()}**") - col1.write(f"{result.strip()}", unsafe_allow_html=False) - - col2.markdown(f"***") - col2.markdown(f"**{resp['hits']['hits'][counter-1]['fields']['title'][0].strip() }**") - - content = resp['hits']['hits'][counter-1]['fields']['body_content'][0] - # limit content length to length of result - content = content[:len(result*2)] + "..." 
- col2.write(f"{content}", unsafe_allow_html=False) - col2.write(f"**Docs**: {resp['hits']['hits'][counter-1]['fields']['url']}", unsafe_allow_html=False) - col2.write(f"score: {resp['hits']['hits'][counter-1]['_score']}", unsafe_allow_html=False) + with rows[counter - 1]: + with st.container(): + col0, col1, col2 = st.columns([1, 3, 3]) + col0.markdown(f"***") + col0.write( + f"**Result {counter}:** ", unsafe_allow_html=False + ) + col0.write( + f"**{resp['hits']['hits'][counter-1]['fields']['product_name'][0]}** ", + unsafe_allow_html=False, + ) + + col1.markdown(f"***") + + col1.markdown( + f"**{resp['hits']['hits'][counter-1]['fields']['title'][0].strip()}**" + ) + col1.write(f"{result.strip()}", unsafe_allow_html=False) + + col2.markdown(f"***") + col2.markdown( + f"**{resp['hits']['hits'][counter-1]['fields']['title'][0].strip() }**" + ) + + content = resp["hits"]["hits"][counter - 1]["fields"][ + "body_content" + ][0] + # limit content length to length of result + content = content[: len(result * 2)] + "..." + col2.write(f"{content}", unsafe_allow_html=False) + col2.write( + f"**Docs**: {resp['hits']['hits'][counter-1]['fields']['url']}", + unsafe_allow_html=False, + ) + col2.write( + f"score: {resp['hits']['hits'][counter-1]['_score']}", + unsafe_allow_html=False, + ) except: continue - else: + else: print("irrelevant") # display the top result if enabled - if st.session_state['summarizeResults']['state']: - with st.session_state['topResult']: + if st.session_state["summarizeResults"]["state"]: + with st.session_state["topResult"]: with st.container(): # write the top result - st.write(st.session_state['summary']) + st.write(st.session_state["summary"]) - # iterate over hits and build a markdown string with the title and link to the documentation as a hyperlink, + # iterate over hits and build a markdown string with the title and link to the documentation as a hyperlink, # if the response is not "The provided page does not answer the question." 
st.write(f"**Find more information in our documentation**:") markdownString = "" - for i, hit in enumerate(resp['hits']['hits']): - if st.session_state['results'][i]['choices'][0]["message"]["content"] != "The provided page does not answer the question.": + for i, hit in enumerate(resp["hits"]["hits"]): + if ( + st.session_state["results"][i]["choices"][0]["message"][ + "content" + ] + != "The provided page does not answer the question." + ): markdownString += f"[{hit['fields']['title'][0]}]({hit['fields']['url'][0]})\n\n" st.write(markdownString) @@ -552,4 +660,3 @@ def initAPM(): elasticapm.set_transaction_outcome("failure") apmClient.end_transaction("user-query") st.error(e) - diff --git a/supporting-blog-content/ElasticDocs_GPT/elasticdocs_gpt.py b/supporting-blog-content/ElasticDocs_GPT/elasticdocs_gpt.py index c65c808e..75068da4 100644 --- a/supporting-blog-content/ElasticDocs_GPT/elasticdocs_gpt.py +++ b/supporting-blog-content/ElasticDocs_GPT/elasticdocs_gpt.py @@ -4,7 +4,7 @@ from elasticsearch import Elasticsearch # This code is part of an Elastic Blog showing how to combine -# Elasticsearch's search relevancy power with +# Elasticsearch's search relevancy power with # OpenAI's GPT's Question Answering power # https://www.elastic.co/blog/chatgpt-elasticsearch-openai-meets-private-data @@ -18,37 +18,28 @@ # cloud_user - Elasticsearch Cluster User # cloud_pass - Elasticsearch User Password -openai.api_key = os.environ['openai_api'] +openai.api_key = os.environ["openai_api"] model = "gpt-3.5-turbo-0301" + # Connect to Elastic Cloud cluster def es_connect(cid, user, passwd): es = Elasticsearch(cloud_id=cid, http_auth=(user, passwd)) return es + # Search ElasticSearch index and return body and URL of the result def search(query_text): - cid = os.environ['cloud_id'] - cp = os.environ['cloud_pass'] - cu = os.environ['cloud_user'] + cid = os.environ["cloud_id"] + cp = os.environ["cloud_pass"] + cu = os.environ["cloud_user"] es = es_connect(cid, cu, cp) # 
Elasticsearch query (BM25) and kNN configuration for hybrid search query = { "bool": { - "must": [{ - "match": { - "title": { - "query": query_text, - "boost": 1 - } - } - }], - "filter": [{ - "exists": { - "field": "title-vector" - } - }] + "must": [{"match": {"title": {"query": query_text, "boost": 1}}}], + "filter": [{"exists": {"field": "title-vector"}}], } } @@ -59,40 +50,52 @@ def search(query_text): "query_vector_builder": { "text_embedding": { "model_id": "sentence-transformers__all-distilroberta-v1", - "model_text": query_text + "model_text": query_text, } }, - "boost": 24 + "boost": 24, } fields = ["title", "body_content", "url"] - index = 'search-elastic-docs' - resp = es.search(index=index, - query=query, - knn=knn, - fields=fields, - size=1, - source=False) + index = "search-elastic-docs" + resp = es.search( + index=index, query=query, knn=knn, fields=fields, size=1, source=False + ) - body = resp['hits']['hits'][0]['fields']['body_content'][0] - url = resp['hits']['hits'][0]['fields']['url'][0] + body = resp["hits"]["hits"][0]["fields"]["body_content"][0] + url = resp["hits"]["hits"][0]["fields"]["url"][0] return body, url + def truncate_text(text, max_tokens): tokens = text.split() if len(tokens) <= max_tokens: return text - return ' '.join(tokens[:max_tokens]) + return " ".join(tokens[:max_tokens]) + # Generate a response from ChatGPT based on the given prompt -def chat_gpt(prompt, model="gpt-3.5-turbo", max_tokens=1024, max_context_tokens=4000, safety_margin=5): +def chat_gpt( + prompt, + model="gpt-3.5-turbo", + max_tokens=1024, + max_context_tokens=4000, + safety_margin=5, +): # Truncate the prompt content to fit within the model's context length - truncated_prompt = truncate_text(prompt, max_context_tokens - max_tokens - safety_margin) - - response = openai.ChatCompletion.create(model=model, - messages=[{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": truncated_prompt}]) + truncated_prompt = 
truncate_text( + prompt, max_context_tokens - max_tokens - safety_margin + ) + + response = openai.ChatCompletion.create( + model=model, + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": truncated_prompt}, + ], + ) return response["choices"][0]["message"]["content"] @@ -110,7 +113,7 @@ def chat_gpt(prompt, model="gpt-3.5-turbo", max_tokens=1024, max_context_tokens= resp, url = search(query) prompt = f"Answer this question: {query}\nUsing only the information from this Elastic Doc: {resp}\nIf the answer is not contained in the supplied doc reply '{negResponse}' and nothing else" answer = chat_gpt(prompt) - + if negResponse in answer: st.write(f"ChatGPT: {answer.strip()}") else: diff --git a/supporting-blog-content/ElasticDocs_GPT/load_embedding_model.ipynb b/supporting-blog-content/ElasticDocs_GPT/load_embedding_model.ipynb index 6120b524..7669ff46 100644 --- a/supporting-blog-content/ElasticDocs_GPT/load_embedding_model.ipynb +++ b/supporting-blog-content/ElasticDocs_GPT/load_embedding_model.ipynb @@ -1,266 +1,266 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "include_colab_link": true }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "source": [ - "# ElasticDocs GPT Blog\n", - "# Loading an embedding from Hugging Face into Elasticsearch\n", - "\n", - "This code will show you how to load a supported embedding model from Hugging Face into an elasticsearch cluster in [Elastic Cloud](https://cloud.elastic.co/)\n", - "\n", - "[Blog - ChatGPT and Elasticsearch: OpenAI meets private 
data](https://www.elastic.co/blog/chatgpt-elasticsearch-openai-meets-private-data)" - ], - "metadata": { - "id": "6xoLDtS_6Df1" - } - }, - { - "cell_type": "markdown", - "source": [ - "# Setup\n" - ], - "metadata": { - "id": "DgxCKQS7mCZw" - } - }, - { - "cell_type": "markdown", - "source": [ - "## Install and import required python libraries" - ], - "metadata": { - "id": "Ly1f1P-l9ri8" - } - }, - { - "cell_type": "markdown", - "source": [ - "Elastic uses the [eland python library](https://github.com/elastic/eland) to download modesl from Hugging Face hub and load them into elasticsearch" - ], - "metadata": { - "id": "MJAb_8zlPFhQ" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rUedSzQW9FIF" - }, - "outputs": [], - "source": [ - "pip -q install eland elasticsearch sentence_transformers transformers torch==1.11" - ] - }, - { - "cell_type": "code", - "source": [ - "from pathlib import Path\n", - "from eland.ml.pytorch import PyTorchModel\n", - "from eland.ml.pytorch.transformers import TransformerModel\n", - "from elasticsearch import Elasticsearch\n", - "from elasticsearch.client import MlClient\n", - "\n", - "import getpass" - ], - "metadata": { - "id": "wyUZXUi4RWWL" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Configure elasticsearch authentication. \n", - "The recommended authentication approach is using the [Elastic Cloud ID](https://www.elastic.co/guide/en/cloud/current/ec-cloud-id.html) and a [cluster level API key](https://www.elastic.co/guide/en/kibana/current/api-keys.html)\n", - "\n", - "You can use any method you wish to set the required credentials. We are using getpass in this example to prompt for credentials to avoide storing them in github." 
- ], - "metadata": { - "id": "r7nMIbHke37Q" - } - }, - { - "cell_type": "code", - "source": [ - "es_cloud_id = getpass.getpass('Enter Elastic Cloud ID: ')\n", - "es_user = getpass.getpass('Enter cluster username: ') \n", - "es_pass = getpass.getpass('Enter cluster password: ') \n", - "\n", - "#es_api_id = getpass.getpass('Enter cluster API key ID: ') \n", - "#es_api_key = getpass.getpass('Enter cluster API key: ')" - ], - "metadata": { - "id": "SSGgYHome69o" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Connect to Elastic Cloud" - ], - "metadata": { - "id": "jL4VDnVp96lf" - } - }, - { - "cell_type": "code", - "source": [ - "#es = Elasticsearch(cloud_id=es_cloud_id, \n", - "# api_key=(es_api_id, es_api_key)\n", - "# )\n", - "es = Elasticsearch(cloud_id=es_cloud_id, \n", - " basic_auth=(es_user, es_pass)\n", - " )\n", - "es.info() # should return cluster info" - ], - "metadata": { - "id": "I8mVJkKmetXo" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Load the model From Hugging Face into Elasticsearch\n", - "Here we specify the model id from Hugging Face. The easiest way to get this id is clicking the copy the model name icon next to the name on the model page. \n", - "\n", - "When calling `TransformerModel` you specify the HF model id and the task type. You can try specifying `auto` and eland will attempt to determine the correct type from info in the model config. 
This is not always possible so a list of specific `task_type` values can be viewed in the following code: \n", - "[Supported values](https://github.com/elastic/eland/blob/15a300728876022b206161d71055c67b500a0192/eland/ml/pytorch/transformers.py#*L41*)" - ], - "metadata": { - "id": "uBMWHj-ZmtvE" - } - }, - { - "cell_type": "code", - "source": [ - "# Set the model name from Hugging Face and task type\n", - "hf_model_id='sentence-transformers/all-distilroberta-v1'\n", - "tm = TransformerModel(hf_model_id, \"text_embedding\")\n", - "\n", - "#set the modelID as it is named in Elasticsearch\n", - "es_model_id = tm.elasticsearch_model_id()\n", - "\n", - "# Download the model from Hugging Face\n", - "tmp_path = \"models\"\n", - "Path(tmp_path).mkdir(parents=True, exist_ok=True)\n", - "model_path, config, vocab_path = tm.save(tmp_path)\n", - "\n", - "# Load the model into Elasticsearch\n", - "ptm = PyTorchModel(es, es_model_id)\n", - "ptm.import_model(model_path=model_path, config_path=None, vocab_path=vocab_path, config=config) \n" - ], - "metadata": { - "id": "zPV3oFsKiYFL" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Starting the Model" - ], - "metadata": { - "id": "4UYSzFp3vHdB" - } - }, - { - "cell_type": "markdown", - "source": [ - "## View information about the model\n", - "This is not required but can be handy to get a model overivew" - ], - "metadata": { - "id": "wQwfozwznK4Y" - } - }, - { - "cell_type": "code", - "source": [ - "# List the in elasticsearch\n", - "m = MlClient.get_trained_models(es, model_id=es_model_id)\n", - "m.body" - ], - "metadata": { - "id": "b4Wv8EJvpfZI" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Deploy the model\n", - "This will load the model on the ML nodes and start the process(es) making it available for the NLP task" - ], - "metadata": { - "id": "oMGw3sk-pbaN" - } - }, - { - "cell_type": "code", - "source": [ - "s = 
MlClient.start_trained_model_deployment(es, model_id=es_model_id)\n", - "s.body" - ], - "metadata": { - "id": "w5muJ1rLqvUW" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Verify the model started without issue\n", - "Should output -> {'routing_state': 'started'}" - ], - "metadata": { - "id": "ZytlELrsnn_O" - } - }, - { - "cell_type": "code", - "source": [ - "stats = MlClient.get_trained_models_stats(es, model_id=es_model_id)\n", - "stats.body['trained_model_stats'][0]['deployment_stats']['nodes'][0]['routing_state']" - ], - "metadata": { - "id": "ZaQUUWe0Hxwz" - }, - "execution_count": null, - "outputs": [] - } - ] + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# ElasticDocs GPT Blog\n", + "# Loading an embedding from Hugging Face into Elasticsearch\n", + "\n", + "This code will show you how to load a supported embedding model from Hugging Face into an elasticsearch cluster in [Elastic Cloud](https://cloud.elastic.co/)\n", + "\n", + "[Blog - ChatGPT and Elasticsearch: OpenAI meets private data](https://www.elastic.co/blog/chatgpt-elasticsearch-openai-meets-private-data)" + ], + "metadata": { + "id": "6xoLDtS_6Df1" + } + }, + { + "cell_type": "markdown", + "source": [ + "# Setup\n" + ], + "metadata": { + "id": "DgxCKQS7mCZw" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Install and import required python libraries" + ], + "metadata": { + "id": "Ly1f1P-l9ri8" + } + }, + { + "cell_type": "markdown", + "source": [ + "Elastic uses the [eland python library](https://github.com/elastic/eland) to download modesl from Hugging Face hub and load them into elasticsearch" + ], + "metadata": { + "id": "MJAb_8zlPFhQ" + } + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rUedSzQW9FIF" + }, + "outputs": [], + "source": [ + "pip -q install eland elasticsearch sentence_transformers transformers torch==1.11" + ] + }, + { + "cell_type": "code", + "source": [ + "from pathlib import Path\n", + "from eland.ml.pytorch import PyTorchModel\n", + "from eland.ml.pytorch.transformers import TransformerModel\n", + "from elasticsearch import Elasticsearch\n", + "from elasticsearch.client import MlClient\n", + "\n", + "import getpass" + ], + "metadata": { + "id": "wyUZXUi4RWWL" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Configure elasticsearch authentication. \n", + "The recommended authentication approach is using the [Elastic Cloud ID](https://www.elastic.co/guide/en/cloud/current/ec-cloud-id.html) and a [cluster level API key](https://www.elastic.co/guide/en/kibana/current/api-keys.html)\n", + "\n", + "You can use any method you wish to set the required credentials. We are using getpass in this example to prompt for credentials to avoide storing them in github." 
+ ], + "metadata": { + "id": "r7nMIbHke37Q" + } + }, + { + "cell_type": "code", + "source": [ + "es_cloud_id = getpass.getpass(\"Enter Elastic Cloud ID: \")\n", + "es_user = getpass.getpass(\"Enter cluster username: \")\n", + "es_pass = getpass.getpass(\"Enter cluster password: \")\n", + "\n", + "# es_api_id = getpass.getpass('Enter cluster API key ID: ')\n", + "# es_api_key = getpass.getpass('Enter cluster API key: ')" + ], + "metadata": { + "id": "SSGgYHome69o" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Connect to Elastic Cloud" + ], + "metadata": { + "id": "jL4VDnVp96lf" + } + }, + { + "cell_type": "code", + "source": [ + "# es = Elasticsearch(cloud_id=es_cloud_id,\n", + "# api_key=(es_api_id, es_api_key)\n", + "# )\n", + "es = Elasticsearch(cloud_id=es_cloud_id, basic_auth=(es_user, es_pass))\n", + "es.info() # should return cluster info" + ], + "metadata": { + "id": "I8mVJkKmetXo" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Load the model From Hugging Face into Elasticsearch\n", + "Here we specify the model id from Hugging Face. The easiest way to get this id is clicking the copy the model name icon next to the name on the model page. \n", + "\n", + "When calling `TransformerModel` you specify the HF model id and the task type. You can try specifying `auto` and eland will attempt to determine the correct type from info in the model config. 
This is not always possible so a list of specific `task_type` values can be viewed in the following code: \n", + "[Supported values](https://github.com/elastic/eland/blob/15a300728876022b206161d71055c67b500a0192/eland/ml/pytorch/transformers.py#*L41*)" + ], + "metadata": { + "id": "uBMWHj-ZmtvE" + } + }, + { + "cell_type": "code", + "source": [ + "# Set the model name from Hugging Face and task type\n", + "hf_model_id = \"sentence-transformers/all-distilroberta-v1\"\n", + "tm = TransformerModel(hf_model_id, \"text_embedding\")\n", + "\n", + "# set the modelID as it is named in Elasticsearch\n", + "es_model_id = tm.elasticsearch_model_id()\n", + "\n", + "# Download the model from Hugging Face\n", + "tmp_path = \"models\"\n", + "Path(tmp_path).mkdir(parents=True, exist_ok=True)\n", + "model_path, config, vocab_path = tm.save(tmp_path)\n", + "\n", + "# Load the model into Elasticsearch\n", + "ptm = PyTorchModel(es, es_model_id)\n", + "ptm.import_model(\n", + " model_path=model_path, config_path=None, vocab_path=vocab_path, config=config\n", + ")" + ], + "metadata": { + "id": "zPV3oFsKiYFL" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Starting the Model" + ], + "metadata": { + "id": "4UYSzFp3vHdB" + } + }, + { + "cell_type": "markdown", + "source": [ + "## View information about the model\n", + "This is not required but can be handy to get a model overivew" + ], + "metadata": { + "id": "wQwfozwznK4Y" + } + }, + { + "cell_type": "code", + "source": [ + "# List the in elasticsearch\n", + "m = MlClient.get_trained_models(es, model_id=es_model_id)\n", + "m.body" + ], + "metadata": { + "id": "b4Wv8EJvpfZI" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Deploy the model\n", + "This will load the model on the ML nodes and start the process(es) making it available for the NLP task" + ], + "metadata": { + "id": "oMGw3sk-pbaN" + } + }, + { + "cell_type": "code", + 
"source": [ + "s = MlClient.start_trained_model_deployment(es, model_id=es_model_id)\n", + "s.body" + ], + "metadata": { + "id": "w5muJ1rLqvUW" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Verify the model started without issue\n", + "Should output -> {'routing_state': 'started'}" + ], + "metadata": { + "id": "ZytlELrsnn_O" + } + }, + { + "cell_type": "code", + "source": [ + "stats = MlClient.get_trained_models_stats(es, model_id=es_model_id)\n", + "stats.body[\"trained_model_stats\"][0][\"deployment_stats\"][\"nodes\"][0][\"routing_state\"]" + ], + "metadata": { + "id": "ZaQUUWe0Hxwz" + }, + "execution_count": null, + "outputs": [] + } + ] } diff --git a/supporting-blog-content/ElasticGPT_Plugin/app.py b/supporting-blog-content/ElasticGPT_Plugin/app.py index e367e5c0..67fadbd5 100644 --- a/supporting-blog-content/ElasticGPT_Plugin/app.py +++ b/supporting-blog-content/ElasticGPT_Plugin/app.py @@ -17,7 +17,6 @@ under the License. 
""" - import json import requests import urllib.parse @@ -32,134 +31,123 @@ app = quart_cors.cors(quart.Quart(__name__), allow_origin="*") -openai.api_key = os.environ['openai_api'] +openai.api_key = os.environ["openai_api"] model = "gpt-3.5-turbo-0301" + # Connect to Elastic Cloud cluster def es_connect(cid, user, passwd): - es = Elasticsearch(cloud_id=cid, http_auth=(user, passwd)) - return es + es = Elasticsearch(cloud_id=cid, http_auth=(user, passwd)) + return es + # Search ElasticSearch index and return body and URL of the result def ESSearch(query_text): - cid = os.environ['cloud_id'] - cp = os.environ['cloud_pass'] - cu = os.environ['cloud_user'] - es = es_connect(cid, cu, cp) - - # Elasticsearch query (BM25) and kNN configuration for hybrid search - query = { - "bool": { - "must": [{ - "match": { - "title": { - "query": query_text, - "boost": 1 - } + cid = os.environ["cloud_id"] + cp = os.environ["cloud_pass"] + cu = os.environ["cloud_user"] + es = es_connect(cid, cu, cp) + + # Elasticsearch query (BM25) and kNN configuration for hybrid search + query = { + "bool": { + "must": [{"match": {"title": {"query": query_text, "boost": 1}}}], + "filter": [{"exists": {"field": "title-vector"}}], } - }], - "filter": [{ - "exists": { - "field": "title-vector" - } - }] } - } - - knn = { - "field": "title-vector", - "k": 1, - "num_candidates": 20, - "query_vector_builder": { - "text_embedding": { - "model_id": "sentence-transformers__all-distilroberta-v1", - "model_text": query_text - } - }, - "boost": 24 - } - - fields = ["title", "body_content", "url"] - index = 'search-elastic-docs' - resp = es.search(index=index, - query=query, - knn=knn, - fields=fields, - size=1, - source=False) - - body = resp['hits']['hits'][0]['fields']['body_content'][0] - url = resp['hits']['hits'][0]['fields']['url'][0] - - return body, url + + knn = { + "field": "title-vector", + "k": 1, + "num_candidates": 20, + "query_vector_builder": { + "text_embedding": { + "model_id": 
"sentence-transformers__all-distilroberta-v1", + "model_text": query_text, + } + }, + "boost": 24, + } + + fields = ["title", "body_content", "url"] + index = "search-elastic-docs" + resp = es.search( + index=index, query=query, knn=knn, fields=fields, size=1, source=False + ) + + body = resp["hits"]["hits"][0]["fields"]["body_content"][0] + url = resp["hits"]["hits"][0]["fields"]["url"][0] + + return body, url def truncate_text(text, max_tokens): - tokens = text.split() - if len(tokens) <= max_tokens: - return text + tokens = text.split() + if len(tokens) <= max_tokens: + return text - return ' '.join(tokens[:max_tokens]) + return " ".join(tokens[:max_tokens]) # Generate a response from ChatGPT based on the given prompt -def chat_gpt(prompt, - model="gpt-3.5-turbo", - max_tokens=1024, - max_context_tokens=4000, - safety_margin=5): - # Truncate the prompt content to fit within the model's context length - truncated_prompt = truncate_text( - prompt, max_context_tokens - max_tokens - safety_margin) - - response = openai.ChatCompletion.create(model=model, - messages=[{ - "role": - "system", - "content": - "You are a helpful assistant." 
- }, { - "role": "user", - "content": truncated_prompt - }]) - - return response["choices"][0]["message"]["content"] +def chat_gpt( + prompt, + model="gpt-3.5-turbo", + max_tokens=1024, + max_context_tokens=4000, + safety_margin=5, +): + # Truncate the prompt content to fit within the model's context length + truncated_prompt = truncate_text( + prompt, max_context_tokens - max_tokens - safety_margin + ) + + response = openai.ChatCompletion.create( + model=model, + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": truncated_prompt}, + ], + ) + + return response["choices"][0]["message"]["content"] @app.get("/search") async def search(): - query = request.args.get("query") - resp, url = ESSearch(query) - return quart.Response(response=resp + '\n\n' + resp) + query = request.args.get("query") + resp, url = ESSearch(query) + return quart.Response(response=resp + "\n\n" + resp) @app.get("/logo.png") async def plugin_logo(): - filename = 'logo.png' - return await quart.send_file(filename, mimetype='image/png') + filename = "logo.png" + return await quart.send_file(filename, mimetype="image/png") + @app.get("/.well-known/ai-plugin.json") async def plugin_manifest(): - host = request.headers['Host'] - with open("./.well-known/ai-plugin.json") as f: - text = f.read() - text = text.replace("PLUGIN_HOSTNAME", f"https://{host}") - return quart.Response(text, mimetype="text/json") + host = request.headers["Host"] + with open("./.well-known/ai-plugin.json") as f: + text = f.read() + text = text.replace("PLUGIN_HOSTNAME", f"https://{host}") + return quart.Response(text, mimetype="text/json") @app.get("/openapi.yaml") async def openapi_spec(): - host = request.headers['Host'] - with open("openapi.yaml") as f: - text = f.read() - text = text.replace("PLUGIN_HOSTNAME", f"https://{host}") - return quart.Response(text, mimetype="text/yaml") + host = request.headers["Host"] + with open("openapi.yaml") as f: + text = f.read() + text 
= text.replace("PLUGIN_HOSTNAME", f"https://{host}") + return quart.Response(text, mimetype="text/yaml") def main(): - port = int(os.environ.get("PORT", 5001)) - app.run(debug=True, host="0.0.0.0", port=port) + port = int(os.environ.get("PORT", 5001)) + app.run(debug=True, host="0.0.0.0", port=port) if __name__ == "__main__": - main() + main() diff --git a/supporting-blog-content/elasticsearch_llm_cache/elasticRAG_with_cache.py b/supporting-blog-content/elasticsearch_llm_cache/elasticRAG_with_cache.py index 60f2ecdf..08aabdcb 100644 --- a/supporting-blog-content/elasticsearch_llm_cache/elasticRAG_with_cache.py +++ b/supporting-blog-content/elasticsearch_llm_cache/elasticRAG_with_cache.py @@ -11,160 +11,146 @@ ) ## Configure OpenAI client -#openai.api_key = os.environ['OPENAI_API_KEY'] -#openai.api_base = os.environ['OPENAI_API_BASE'] -#openai.default_model = os.environ['OPENAI_API_ENGINE'] -#openai.verify_ssl_certs = False - -#Below is for Azure OpenAI -openai.api_type = os.environ['OPENAI_API_TYPE'] -openai.api_base = os.environ['OPENAI_API_BASE'] -openai.api_version = os.environ['OPENAI_API_VERSION'] +# openai.api_key = os.environ['OPENAI_API_KEY'] +# openai.api_base = os.environ['OPENAI_API_BASE'] +# openai.default_model = os.environ['OPENAI_API_ENGINE'] +# openai.verify_ssl_certs = False + +# Below is for Azure OpenAI +openai.api_type = os.environ["OPENAI_API_TYPE"] +openai.api_base = os.environ["OPENAI_API_BASE"] +openai.api_version = os.environ["OPENAI_API_VERSION"] openai.verify_ssl_certs = False -engine = os.environ['OPENAI_API_ENGINE'] +engine = os.environ["OPENAI_API_ENGINE"] # Configure APM and Elasticsearch clients @st.cache_resource def initElastic(): - #os.environ['ELASTIC_APM_SERVICE_NAME'] = "elasticsearch_llm_cache_demo" + # os.environ['ELASTIC_APM_SERVICE_NAME'] = "elasticsearch_llm_cache_demo" apmclient = elasticapm.Client() elasticapm.instrument() es = Elasticsearch( - cloud_id=os.environ['ELASTIC_CLOUD_ID'].strip("="), - 
basic_auth=(os.environ['ELASTIC_USER'], os.environ['ELASTIC_PASSWORD']), - request_timeout=30 + cloud_id=os.environ["ELASTIC_CLOUD_ID"].strip("="), + basic_auth=(os.environ["ELASTIC_USER"], os.environ["ELASTIC_PASSWORD"]), + request_timeout=30, ) return apmclient, es + apmclient, es = initElastic() # Set our data index -index = os.environ['ELASTIC_INDEX_DOCS'] +index = os.environ["ELASTIC_INDEX_DOCS"] + # Run an Elasticsearch query using hybrid RRF scoring of KNN and BM25 @elasticapm.capture_span("knn_search") def search_knn(query_text, es): query = { "bool": { - "must": [{ - "match": { - "body_content": { - "query": query_text - } - } - }], - "filter": [{ - "term": { - "url_path_dir3": "elasticsearch" - } - }] + "must": [{"match": {"body_content": {"query": query_text}}}], + "filter": [{"term": {"url_path_dir3": "elasticsearch"}}], } } knn = [ - { - "field": "chunk-vector", - "k": 10, - "num_candidates": 10, - "filter": { - "bool": { - "filter": [ - { - "range": { - "chunklength": { - "gte": 0 + { + "field": "chunk-vector", + "k": 10, + "num_candidates": 10, + "filter": { + "bool": { + "filter": [ + {"range": {"chunklength": {"gte": 0}}}, + {"term": {"url_path_dir3": "elasticsearch"}}, + ] + } + }, + "query_vector_builder": { + "text_embedding": { + "model_id": "sentence-transformers__msmarco-minilm-l-12-v3", + "model_text": query_text, } - } }, - { - "term": { - "url_path_dir3": "elasticsearch" - } - } - ] - } - }, - "query_vector_builder": { - "text_embedding": { - "model_id": "sentence-transformers__msmarco-minilm-l-12-v3", - "model_text": query_text } - } - }] + ] - rank = { - "rrf": { - } - } + rank = {"rrf": {}} + + fields = ["title", "url", "position", "url_path_dir3", "body_content"] + + resp = es.search( + index=index, + query=query, + knn=knn, + rank=rank, + fields=fields, + size=10, + source=False, + ) - fields= [ - "title", - "url", - "position", - "url_path_dir3", - "body_content" - ] - - resp = es.search(index=index, - query=query, - knn=knn, - 
rank=rank, - fields=fields, - size=10, - source=False) - - body = resp['hits']['hits'][0]['fields']['body_content'][0] - url = resp['hits']['hits'][0]['fields']['url'][0] + body = resp["hits"]["hits"][0]["fields"]["body_content"][0] + url = resp["hits"]["hits"][0]["fields"]["url"][0] return body, url + def truncate_text(text, max_tokens): tokens = text.split() if len(tokens) <= max_tokens: return text - return ' '.join(tokens[:max_tokens]) + return " ".join(tokens[:max_tokens]) + - # Generate a response from ChatGPT based on the given prompt -def genAI(prompt, - model="gpt-3.5-turbo", - max_tokens=1024, - max_context_tokens=4000, - safety_margin=5, - sys_content=None - ): - - # Truncate the prompt content to fit within the model's context length - truncated_prompt = truncate_text(prompt, max_context_tokens - max_tokens - safety_margin) +def genAI( + prompt, + model="gpt-3.5-turbo", + max_tokens=1024, + max_context_tokens=4000, + safety_margin=5, + sys_content=None, +): - response = openai.ChatCompletion.create(engine=engine, - temperature=0, - messages=[{"role": "system", "content": sys_content}, - {"role": "user", "content": truncated_prompt}] - ) + # Truncate the prompt content to fit within the model's context length + truncated_prompt = truncate_text( + prompt, max_context_tokens - max_tokens - safety_margin + ) + response = openai.ChatCompletion.create( + engine=engine, + temperature=0, + messages=[ + {"role": "system", "content": sys_content}, + {"role": "user", "content": truncated_prompt}, + ], + ) # APM: add metadata labels of data we want to capture - elasticapm.label(model = model) - elasticapm.label(prompt = prompt) - elasticapm.label(total_tokens = response["usage"]["total_tokens"]) - elasticapm.label(prompt_tokens = response["usage"]["prompt_tokens"]) - elasticapm.label(response_tokens = response["usage"]["completion_tokens"]) - if 'USER_HASH' in os.environ: elasticapm.label(user = os.environ['USER_HASH']) + elasticapm.label(model=model) + 
elasticapm.label(prompt=prompt) + elasticapm.label(total_tokens=response["usage"]["total_tokens"]) + elasticapm.label(prompt_tokens=response["usage"]["prompt_tokens"]) + elasticapm.label(response_tokens=response["usage"]["completion_tokens"]) + if "USER_HASH" in os.environ: + elasticapm.label(user=os.environ["USER_HASH"]) return response["choices"][0]["message"]["content"] + def toLLM(resp, url, usr_prompt, sys_prompt, neg_resp, show_prompt, engine): prompt_template = Template(usr_prompt) - prompt_formatted = prompt_template.substitute(query=query, resp=resp, negResponse=negResponse) + prompt_formatted = prompt_template.substitute( + query=query, resp=resp, negResponse=negResponse + ) answer = genAI(prompt_formatted, engine, sys_content=sys_prompt) # Display response from LLM - st.header('Response from LLM') + st.header("Response from LLM") st.markdown(answer.strip()) # We don't need to return a reference URL if it wasn't useful @@ -174,7 +160,7 @@ def toLLM(resp, url, usr_prompt, sys_prompt, neg_resp, show_prompt, engine): # Display full prompt if checkbox was selected if show_prompt: st.divider() - st.subheader('Full prompt sent to LLM') + st.subheader("Full prompt sent to LLM") prompt_formatted return answer @@ -184,31 +170,33 @@ def toLLM(resp, url, usr_prompt, sys_prompt, neg_resp, show_prompt, engine): def cache_query(cache, prompt_text): return cache.query(prompt_text=query) + @elasticapm.capture_span("add_to_cache") def add_to_cache(cache, prompt, response): return cache.add(prompt=prompt, response=response) -#sidebar setup +# sidebar setup st.sidebar.header("Elasticsearch LLM Cache Info") ### MAIN # Init Elasticsearch Cache -cache = ElasticsearchLLMCache(es_client=es, - index_name="llm_cache_test", - create_index=False # setting only because of Streamlit behavor - ) -st.sidebar.markdown(f'_creating Elasticsearch Cache_') +cache = ElasticsearchLLMCache( + es_client=es, + index_name="llm_cache_test", + create_index=False, # setting only because of Streamlit 
behavor +) +st.sidebar.markdown(f"_creating Elasticsearch Cache_") # Only want to attempt to create the index on first run if "index_created" not in st.session_state: - st.sidebar.markdown('_running create_index_') + st.sidebar.markdown("_running create_index_") cache.create_index(768) # Set the flag so it doesn't run every time st.session_state.index_created = True else: - st.sidebar.markdown('_index already created, skipping_') + st.sidebar.markdown("_index already created, skipping_") # Prompt Defaults @@ -217,7 +205,7 @@ def add_to_cache(cache, prompt, response): Format the answer in complete markdown code format If the answer is not contained in the supplied doc reply '$negResponse' and nothing else""" -system_default = 'You are a helpful assistant.' +system_default = "You are a helpful assistant." neg_default = "I'm unable to answer the question based on the information I have from Elastic Docs." @@ -225,55 +213,63 @@ def add_to_cache(cache, prompt, response): with st.form("chat_form"): - query = st.text_input("Ask the Elastic Documentation a question: ", placeholder='I want to secure my elastic cluster') + query = st.text_input( + "Ask the Elastic Documentation a question: ", + placeholder="I want to secure my elastic cluster", + ) with st.expander("Show Prompt Override Inputs"): # Inputs for system and User prompt override - sys_prompt = st.text_area("create an alernative system prompt", placeholder=system_default, value=system_default) - usr_prompt = st.text_area("create an alternative user prompt required -> \$query, \$resp, \$negResponse", - placeholder=prompt_default, value=prompt_default ) + sys_prompt = st.text_area( + "create an alernative system prompt", + placeholder=system_default, + value=system_default, + ) + usr_prompt = st.text_area( + "create an alternative user prompt required -> \$query, \$resp, \$negResponse", + placeholder=prompt_default, + value=prompt_default, + ) # Default Response when criteria are not met - negResponse = 
st.text_area("Create an alternative negative response", placeholder = neg_default, value=neg_default) - - show_full_prompt = st.checkbox('Show Full Prompt Sent to LLM') + negResponse = st.text_area( + "Create an alternative negative response", + placeholder=neg_default, + value=neg_default, + ) + show_full_prompt = st.checkbox("Show Full Prompt Sent to LLM") col1, col2 = st.columns(2) with col1: query_button = st.form_submit_button("Run With Cache Check") with col2: refresh_button = st.form_submit_button("Refresh Cache with new call to LLM") - + if query_button: apmclient.begin_transaction("query") - elasticapm.label(search_method = "knn") - elasticapm.label(query = query) + elasticapm.label(search_method="knn") + elasticapm.label(query=query) # Start timing start_time = time.time() # check the llm cache first query_check = cache_query(cache, prompt_text=query) - + if query_check: - st.sidebar.markdown('_cache match, using cached results_') - st.subheader('Response from Cache') - st.markdown(query_check['response'][0]) -# st.button('rerun without cache') + st.sidebar.markdown("_cache match, using cached results_") + st.subheader("Response from Cache") + st.markdown(query_check["response"][0]) + # st.button('rerun without cache') else: - st.sidebar.markdown('_no cache match, querying es and sending to LLM_') - resp, url = search_knn(query, es) # run kNN hybrid query - llmAnswer = toLLM(resp, - url, - usr_prompt, - sys_prompt, - negResponse, - show_full_prompt, - engine - ) - - st.sidebar.markdown('_adding prompt and response to cache_') + st.sidebar.markdown("_no cache match, querying es and sending to LLM_") + resp, url = search_knn(query, es) # run kNN hybrid query + llmAnswer = toLLM( + resp, url, usr_prompt, sys_prompt, negResponse, show_full_prompt, engine + ) + + st.sidebar.markdown("_adding prompt and response to cache_") add_to_cache(cache, query, llmAnswer) # End timing and print the elapsed time @@ -285,32 +281,27 @@ def add_to_cache(cache, prompt, 
response): if refresh_button: apmclient.begin_transaction("refresh_cache") - st.sidebar.markdown('_refreshing cache_') + st.sidebar.markdown("_refreshing cache_") - ''' + """ Cache Refresh idea: set an 'invalidated' flag in the already cached document and then call the LLM - ''' + """ - elasticapm.label(search_method = "knn") - elasticapm.label(query = query) + elasticapm.label(search_method="knn") + elasticapm.label(query=query) # Start timing start_time = time.time() - st.sidebar.markdown('_skipping cache check - sending to LLM_') + st.sidebar.markdown("_skipping cache check - sending to LLM_") - resp, url = search_knn(query, es) # run kNN hybrid query - llmAnswer = toLLM(resp, - url, - usr_prompt, - sys_prompt, - negResponse, - show_full_prompt, - engine - ) + resp, url = search_knn(query, es) # run kNN hybrid query + llmAnswer = toLLM( + resp, url, usr_prompt, sys_prompt, negResponse, show_full_prompt, engine + ) - st.sidebar.markdown('_adding prompt and response to cache_') + st.sidebar.markdown("_adding prompt and response to cache_") add_to_cache(cache, query, llmAnswer) # End timing and print the elapsed time @@ -320,5 +311,5 @@ def add_to_cache(cache, prompt, response): apmclient.end_transaction("query", "success") - st.sidebar.markdown('_cache refreshed_') - apmclient.end_transaction("refresh_cache", "success") \ No newline at end of file + st.sidebar.markdown("_cache refreshed_") + apmclient.end_transaction("refresh_cache", "success") diff --git a/supporting-blog-content/elasticsearch_llm_cache/elasticsearch_llm_cache.py b/supporting-blog-content/elasticsearch_llm_cache/elasticsearch_llm_cache.py index 67285185..3fc2c60c 100644 --- a/supporting-blog-content/elasticsearch_llm_cache/elasticsearch_llm_cache.py +++ b/supporting-blog-content/elasticsearch_llm_cache/elasticsearch_llm_cache.py @@ -62,12 +62,13 @@ class ElasticsearchLLMCache: - def __init__(self, - es_client: Elasticsearch, - index_name: Optional[str] = None, - es_model_id: Optional[str] = 
'sentence-transformers__all-distilroberta-v1', - create_index=True - ): + def __init__( + self, + es_client: Elasticsearch, + index_name: Optional[str] = None, + es_model_id: Optional[str] = "sentence-transformers__all-distilroberta-v1", + create_index=True, + ): """ Initialize the ElasticsearchLLMCache instance. @@ -77,14 +78,12 @@ def __init__(self, :param create_index: Boolean to determine whether to create a new index; defaults to True. """ self.es = es_client - self.index_name = index_name or 'llm_cache' + self.index_name = index_name or "llm_cache" self.es_model_id = es_model_id if create_index: self.create_index() - def create_index(self, - dims: Optional[int] = 768 - ) -> Dict: + def create_index(self, dims: Optional[int] = 768) -> Dict: """ Create the index if it does not already exist. @@ -98,11 +97,12 @@ def create_index(self, "response": {"type": "text"}, "create_date": {"type": "date"}, "last_hit_date": {"type": "date"}, - "prompt_vector": {"type": "dense_vector", - "dims": dims, - "index": True, - "similarity": "dot_product" - } + "prompt_vector": { + "type": "dense_vector", + "dims": dims, + "index": True, + "similarity": "dot_product", + }, } } } @@ -110,10 +110,10 @@ def create_index(self, self.es.indices.create(index=self.index_name, body=mappings, ignore=400) logger.info(f"Index {self.index_name} created.") - return {'cache_index': self.index_name, 'created_new': True} + return {"cache_index": self.index_name, "created_new": True} else: logger.info(f"Index {self.index_name} already exists.") - return {'cache_index': self.index_name, 'created_new': False} + return {"cache_index": self.index_name, "created_new": False} def update_last_hit_date(self, doc_id: str): """ @@ -121,19 +121,16 @@ def update_last_hit_date(self, doc_id: str): :param doc_id: The ID of the document to update. 
""" - update_body = { - "doc": { - "last_hit_date": datetime.now() - } - } + update_body = {"doc": {"last_hit_date": datetime.now()}} self.es.update(index=self.index_name, id=doc_id, body=update_body) - def query(self, - prompt_text: str, - similarity_threshold: Optional[float] = 0.5, - num_candidates: Optional[int] = 1000, - create_date_gte: Optional[str] = "now-1y/y" - ) -> dict: + def query( + self, + prompt_text: str, + similarity_threshold: Optional[float] = 0.5, + num_candidates: Optional[int] = 1000, + create_date_gte: Optional[str] = "now-1y/y", + ) -> dict: """ Query the index to find similar prompts and update the `last_hit_date` for that document if a hit is found. @@ -152,64 +149,40 @@ def query(self, "query_vector_builder": { "text_embedding": { "model_id": self.es_model_id, - "model_text": prompt_text + "model_text": prompt_text, } }, - "filter": { - "range": { - "create_date": { - "gte": create_date_gte - - } - } - } + "filter": {"range": {"create_date": {"gte": create_date_gte}}}, } ] - fields = [ - "prompt", - "response" - ] + fields = ["prompt", "response"] - resp = self.es.search(index=self.index_name, - knn=knn, - fields=fields, - size=1, - source=False - ) + resp = self.es.search( + index=self.index_name, knn=knn, fields=fields, size=1, source=False + ) - if resp['hits']['total']['value'] == 0: + if resp["hits"]["total"]["value"] == 0: return {} else: - doc_id = resp['hits']['hits'][0]['_id'] + doc_id = resp["hits"]["hits"][0]["_id"] self.update_last_hit_date(doc_id) - return resp['hits']['hits'][0]['fields'] + return resp["hits"]["hits"][0]["fields"] - def _generate_vector(self, - prompt: str - ) -> List[float]: + def _generate_vector(self, prompt: str) -> List[float]: """ Generate a vector for a given prompt using Elasticsearch's text embedding. :param prompt: The text prompt to generate a vector for. :return: A list of floats representing the vector. 
""" - docs = [ - { - "text_field": prompt - } - ] + docs = [{"text_field": prompt}] - embedding = self.es.ml.infer_trained_model(model_id=self.es_model_id, - docs=docs - ) + embedding = self.es.ml.infer_trained_model(model_id=self.es_model_id, docs=docs) - return embedding['inference_results'][0]['predicted_value'] + return embedding["inference_results"][0]["predicted_value"] - def add(self, prompt: str, - response: str, - source: Optional[str] = None - ) -> Dict: + def add(self, prompt: str, response: str, source: Optional[str] = None) -> Dict: """ Add a new document to the index. @@ -226,12 +199,11 @@ def add(self, prompt: str, "create_date": datetime.now(), "last_hit_date": datetime.now(), "prompt_vector": prompt_vector, - "source": source # Optional + "source": source, # Optional } try: self.es.index(index=self.index_name, document=doc) - return {'success': True} + return {"success": True} except Exception as e: logger.error(e) - return {'success': False, - 'error': e} + return {"success": False, "error": e} diff --git a/supporting-blog-content/elasticsearch_llm_cache/test_elasticsearch_llm_cache.py b/supporting-blog-content/elasticsearch_llm_cache/test_elasticsearch_llm_cache.py index 0458a3d9..f8e4a3c3 100644 --- a/supporting-blog-content/elasticsearch_llm_cache/test_elasticsearch_llm_cache.py +++ b/supporting-blog-content/elasticsearch_llm_cache/test_elasticsearch_llm_cache.py @@ -1,7 +1,8 @@ import os import time -#print(os.environ['ELASTIC_CLOUD_ID']) -#time.sleep(10) + +# print(os.environ['ELASTIC_CLOUD_ID']) +# time.sleep(10) from elasticsearch import Elasticsearch from elasticsearch_llm_cache import ( @@ -13,26 +14,27 @@ # Initialize Elasticsearch client es_client = Elasticsearch( - cloud_id=os.environ['ELASTIC_CLOUD_ID'], - basic_auth=(os.environ['ELASTIC_USER'], os.environ['ELASTIC_PASSWORD']), - request_timeout=30) + cloud_id=os.environ["ELASTIC_CLOUD_ID"], + basic_auth=(os.environ["ELASTIC_USER"], os.environ["ELASTIC_PASSWORD"]), + 
request_timeout=30, +) # Initialize your caching class cache = ElasticsearchLLMCache(es_client=es_client, index_name="llm_cache_test") -#print(cache) +# print(cache) # Example prompt, response, and prompt vector example_prompt = "What is the capital of France?" example_response = "The capital of France is Paris." -print(f'example prompt: {example_prompt}') -print(f'example LLM Response: {example_response}') +print(f"example prompt: {example_prompt}") +print(f"example LLM Response: {example_response}") # -print ('first pass attempt') -q1= 'What is the capital of France?' -print(f'query 1: {q1}') +print("first pass attempt") +q1 = "What is the capital of France?" +print(f"query 1: {q1}") query_result_1 = cache.query(prompt_text=q1) -print("Query result 1:") +print("Query result 1:") pprint(query_result_1) print() @@ -46,19 +48,19 @@ # Query the cache (simulating later, separate operations) print() -print ('Second pass attempt') +print("Second pass attempt") print("Testing cached similar results\n") print() -q1= 'What is the capital of France?' -print(f'query 1: {q1}') +q1 = "What is the capital of France?" +print(f"query 1: {q1}") query_result_1 = cache.query(prompt_text=q1) -print("Query result 1:") +print("Query result 1:") pprint(query_result_1) print() -q2= 'What is the currency of the UK?' -print(f'query 1: {q2}') +q2 = "What is the currency of the UK?" 
+print(f"query 1: {q2}") query_result_2 = cache.query(prompt_text=q2) -print("Query result 2:") +print("Query result 2:") pprint(query_result_2) diff --git a/supporting-blog-content/homecraft-vertex/homecraft_home.py b/supporting-blog-content/homecraft-vertex/homecraft_home.py index 7b8f1482..13d7711b 100644 --- a/supporting-blog-content/homecraft-vertex/homecraft_home.py +++ b/supporting-blog-content/homecraft-vertex/homecraft_home.py @@ -18,46 +18,37 @@ # cloud_user - Elasticsearch Cluster User # cloud_pass - Elasticsearch User Password -projid = os.environ['gcp_project_id'] -cid = os.environ['cloud_id'] -cp = os.environ['cloud_pass'] -cu = os.environ['cloud_user'] +projid = os.environ["gcp_project_id"] +cid = os.environ["cloud_id"] +cp = os.environ["cloud_pass"] +cu = os.environ["cloud_user"] parameters = { - "temperature": 0.4, # 0 - 1. The higher the temp the more creative and less on point answers become - "max_output_tokens": 606, #modify this number (1 - 1024) for short/longer answers + "temperature": 0.4, # 0 - 1. 
The higher the temp the more creative and less on point answers become + "max_output_tokens": 606, # modify this number (1 - 1024) for short/longer answers "top_p": 0.8, - "top_k": 40 + "top_k": 40, } vertexai.init(project=projid, location="us-central1") model = TextGenerationModel.from_pretrained("text-bison@001") + # Connect to Elastic Cloud cluster def es_connect(cid, user, passwd): es = Elasticsearch(cloud_id=cid, http_auth=(user, passwd)) return es + # Search ElasticSearch index and return details on relevant products def search_products(query_text): # Elasticsearch query (BM25) and kNN configuration for hybrid search query = { "bool": { - "must": [{ - "match": { - "title": { - "query": query_text, - "boost": 1 - } - } - }], - "filter": [{ - "exists": { - "field": "title-vector" - } - }] + "must": [{"match": {"title": {"query": query_text, "boost": 1}}}], + "filter": [{"exists": {"field": "title-vector"}}], } } @@ -68,50 +59,44 @@ def search_products(query_text): "query_vector_builder": { "text_embedding": { "model_id": "sentence-transformers__all-distilroberta-v1", - "model_text": query_text + "model_text": query_text, } }, - "boost": 24 + "boost": 24, } - fields = ["title", "description", "url", "availability", "price", "brand", "product_id"] - index = 'home-depot-product-catalog-vector' - resp = es.search(index=index, - query=query, - knn=knn, - fields=fields, - size=5, - source=False) - - doc_list = resp['hits']['hits'] - body = resp['hits']['hits'] - url = '' + fields = [ + "title", + "description", + "url", + "availability", + "price", + "brand", + "product_id", + ] + index = "home-depot-product-catalog-vector" + resp = es.search( + index=index, query=query, knn=knn, fields=fields, size=5, source=False + ) + + doc_list = resp["hits"]["hits"] + body = resp["hits"]["hits"] + url = "" for doc in doc_list: - #body = body + doc['fields']['description'][0] - url = url + "\n\n" + doc['fields']['url'][0] + # body = body + doc['fields']['description'][0] + url = 
url + "\n\n" + doc["fields"]["url"][0] return body, url + # Search ElasticSearch index and return body and URL for crawled docs def search_docs(query_text): - # Elasticsearch query (BM25) and kNN configuration for hybrid search query = { "bool": { - "must": [{ - "match": { - "title": { - "query": query_text, - "boost": 1 - } - } - }], - "filter": [{ - "exists": { - "field": "title-vector" - } - }] + "must": [{"match": {"title": {"query": query_text, "boost": 1}}}], + "filter": [{"exists": {"field": "title-vector"}}], } } @@ -122,74 +107,67 @@ def search_docs(query_text): "query_vector_builder": { "text_embedding": { "model_id": "sentence-transformers__all-distilroberta-v1", - "model_text": query_text + "model_text": query_text, } }, - "boost": 24 + "boost": 24, } fields = ["title", "body_content", "url"] - index = 'search-homecraft-ikea' - resp = es.search(index=index, - query=query, - knn=knn, - fields=fields, - size=1, - source=False) + index = "search-homecraft-ikea" + resp = es.search( + index=index, query=query, knn=knn, fields=fields, size=1, source=False + ) - body = resp['hits']['hits'][0]['fields']['body_content'][0] - url = resp['hits']['hits'][0]['fields']['url'][0] + body = resp["hits"]["hits"][0]["fields"]["body_content"][0] + url = resp["hits"]["hits"][0]["fields"]["url"][0] return body, url + # Search ElasticSearch index for user's order history def search_orders(user): # Use only text-search - query = { - "bool": { - "must": [{ - "match": { - "user_id": { - "query": user, - "boost": 1 - } - } - }] - } - } - - fields = ["id", "order_id", "user_id", "product_id" "status", "created_at", "shipped_at", "delivered_at", "returned_at", "sale_price"] - index = 'bigquery-thelook-order-items' - resp = es.search(index=index, - query=query, - fields=fields, - size=10, - source=False) - - order_items_list = resp['hits']['hits'] + query = {"bool": {"must": [{"match": {"user_id": {"query": user, "boost": 1}}}]}} + + fields = [ + "id", + "order_id", + "user_id", + 
"product_id" "status", + "created_at", + "shipped_at", + "delivered_at", + "returned_at", + "sale_price", + ] + index = "bigquery-thelook-order-items" + resp = es.search(index=index, query=query, fields=fields, size=10, source=False) + + order_items_list = resp["hits"]["hits"] return order_items_list + def truncate_text(text, max_tokens): tokens = text.split() if len(tokens) <= max_tokens: return text - return ' '.join(tokens[:max_tokens]) + return " ".join(tokens[:max_tokens]) + # Generate a response from Text-Bison based on the given prompt def vertexAI(prompt): # Truncate the prompt content to fit within the model's context length - #truncated_prompt = truncate_text(prompt, max_context_tokens - max_tokens - safety_margin) - response = model.predict( - prompt, - **parameters - ) + # truncated_prompt = truncate_text(prompt, max_context_tokens - max_tokens - safety_margin) + response = model.predict(prompt, **parameters) return response.text -#image = Image.open('homecraft_logo.jpg') + +# image = Image.open('homecraft_logo.jpg') st.image("https://i.imgur.com/cdjafe0.png", caption=None) st.title("HomeCraft Search Bar") @@ -204,12 +182,13 @@ def vertexAI(prompt): es = es_connect(cid, cu, cp) resp_products, url_products = search_products(query) resp_docs, url_docs = search_docs(query) - resp_order_items = search_orders(1) # 1 is the hardcoded userid, to simplify this scenario. You should take user_id by user session + resp_order_items = search_orders( + 1 + ) # 1 is the hardcoded userid, to simplify this scenario. You should take user_id by user session prompt = f"Answer this question: {query}.\n If product information is requested use the information provided in this JSON: {resp_products} listing the identified products in bullet points with this format: Product name, product key features, price, web url. \n For other questions use the documentation provided in these docs: {resp_docs} and your own knowledge. 
\n If the question contains requests for user past orders consider the following order list: {resp_order_items}" answer = vertexAI(prompt) - if answer.strip() == '': + if answer.strip() == "": st.write(f"Search Assistant: \n\n{answer.strip()}") else: st.write(f"Search Assistant: \n\n{answer.strip()}\n\n") - diff --git a/supporting-blog-content/homecraft-vertex/load_embedding_model.ipynb b/supporting-blog-content/homecraft-vertex/load_embedding_model.ipynb index 926a31c6..196293cb 100644 --- a/supporting-blog-content/homecraft-vertex/load_embedding_model.ipynb +++ b/supporting-blog-content/homecraft-vertex/load_embedding_model.ipynb @@ -1,276 +1,276 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "view-in-github" - }, - "source": [ - "\"Open" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "6xoLDtS_6Df1" - }, - "source": [ - "# ElasticDocs GPT Blog\n", - "# Loading an embedding from Hugging Face into Elasticsearch\n", - "\n", - "This code will show you how to load a supported embedding model from Hugging Face into an elasticsearch cluster in [Elastic Cloud](https://cloud.elastic.co/)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "DgxCKQS7mCZw" - }, - "source": [ - "# Setup\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "Ly1f1P-l9ri8" - }, - "source": [ - "## Install and import required python libraries" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "MJAb_8zlPFhQ" - }, - "source": [ - "Elastic uses the [eland python library](https://github.com/elastic/eland) to download modesl from Hugging Face hub and load them into elasticsearch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rUedSzQW9FIF" - }, - "outputs": [], - "source": [ - "pip -q install eland elasticsearch sentence_transformers transformers 
torch==1.11" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wyUZXUi4RWWL" - }, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "from eland.ml.pytorch import PyTorchModel\n", - "from eland.ml.pytorch.transformers import TransformerModel\n", - "from elasticsearch import Elasticsearch\n", - "from elasticsearch.client import MlClient\n", - "\n", - "import getpass" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "r7nMIbHke37Q" - }, - "source": [ - "## Configure elasticsearch authentication. \n", - "The recommended authentication approach is using the [Elastic Cloud ID](https://www.elastic.co/guide/en/cloud/current/ec-cloud-id.html) and a [cluster level API key](https://www.elastic.co/guide/en/kibana/current/api-keys.html)\n", - "\n", - "You can use any method you wish to set the required credentials. We are using getpass in this example to prompt for credentials to avoide storing them in github." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SSGgYHome69o" - }, - "outputs": [], - "source": [ - "es_cloud_id = getpass.getpass('Enter Elastic Cloud ID: ')\n", - "es_user = getpass.getpass('Enter cluster username: ') \n", - "es_pass = getpass.getpass('Enter cluster password: ') \n", - "\n", - "#es_api_id = getpass.getpass('Enter cluster API key ID: ') \n", - "#es_api_key = getpass.getpass('Enter cluster API key: ')" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "jL4VDnVp96lf" - }, - "source": [ - "## Connect to Elastic Cloud" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "I8mVJkKmetXo" - }, - "outputs": [], - "source": [ - "#es = Elasticsearch(cloud_id=es_cloud_id, \n", - "# api_key=(es_api_id, es_api_key)\n", - "# )\n", - "es = Elasticsearch(cloud_id=es_cloud_id, \n", - " basic_auth=(es_user, es_pass)\n", - " )\n", - "es.info() # should return cluster info" - ] - 
}, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "uBMWHj-ZmtvE" - }, - "source": [ - "# Load the model From Hugging Face into Elasticsearch\n", - "Here we specify the model id from Hugging Face. The easiest way to get this id is clicking the copy the model name icon next to the name on the model page. \n", - "\n", - "When calling `TransformerModel` you specify the HF model id and the task type. You can try specifying `auto` and eland will attempt to determine the correct type from info in the model config. This is not always possible so a list of specific `task_type` values can be viewed in the following code: \n", - "[Supported values](https://github.com/elastic/eland/blob/15a300728876022b206161d71055c67b500a0192/eland/ml/pytorch/transformers.py#*L41*)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zPV3oFsKiYFL" - }, - "outputs": [], - "source": [ - "# Set the model name from Hugging Face and task type\n", - "hf_model_id='sentence-transformers/all-distilroberta-v1'\n", - "tm = TransformerModel(hf_model_id, \"text_embedding\")\n", - "\n", - "#set the modelID as it is named in Elasticsearch\n", - "es_model_id = tm.elasticsearch_model_id()\n", - "\n", - "# Download the model from Hugging Face\n", - "tmp_path = \"models\"\n", - "Path(tmp_path).mkdir(parents=True, exist_ok=True)\n", - "model_path, config, vocab_path = tm.save(tmp_path)\n", - "\n", - "# Load the model into Elasticsearch\n", - "ptm = PyTorchModel(es, es_model_id)\n", - "ptm.import_model(model_path=model_path, config_path=None, vocab_path=vocab_path, config=config) \n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "4UYSzFp3vHdB" - }, - "source": [ - "# Starting the Model" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "wQwfozwznK4Y" - }, - "source": [ - "## View information about the model\n", - "This is not required but can be handy to get a model overivew" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "b4Wv8EJvpfZI" - }, - "outputs": [], - "source": [ - "# List the in elasticsearch\n", - "m = MlClient.get_trained_models(es, model_id=es_model_id)\n", - "m.body" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "oMGw3sk-pbaN" - }, - "source": [ - "## Deploy the model\n", - "This will load the model on the ML nodes and start the process(es) making it available for the NLP task" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "w5muJ1rLqvUW" - }, - "outputs": [], - "source": [ - "s = MlClient.start_trained_model_deployment(es, model_id=es_model_id)\n", - "s.body" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "ZytlELrsnn_O" - }, - "source": [ - "## Verify the model started without issue\n", - "Should output -> {'routing_state': 'started'}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZaQUUWe0Hxwz" - }, - "outputs": [], - "source": [ - "stats = MlClient.get_trained_models_stats(es, model_id=es_model_id)\n", - "stats.body['trained_model_stats'][0]['deployment_stats']['nodes'][0]['routing_state']" - ] - } - ], - "metadata": { - "colab": { - "include_colab_link": true, - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] }, - "nbformat": 4, - "nbformat_minor": 0 + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "6xoLDtS_6Df1" + }, + "source": [ + "# ElasticDocs GPT Blog\n", + "# Loading an embedding from Hugging Face into Elasticsearch\n", + "\n", + "This code will show you how to load a supported embedding model from Hugging Face into an elasticsearch cluster 
in [Elastic Cloud](https://cloud.elastic.co/)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "DgxCKQS7mCZw" + }, + "source": [ + "# Setup\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "Ly1f1P-l9ri8" + }, + "source": [ + "## Install and import required python libraries" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "MJAb_8zlPFhQ" + }, + "source": [ + "Elastic uses the [eland python library](https://github.com/elastic/eland) to download modesl from Hugging Face hub and load them into elasticsearch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rUedSzQW9FIF" + }, + "outputs": [], + "source": [ + "pip -q install eland elasticsearch sentence_transformers transformers torch==1.11" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wyUZXUi4RWWL" + }, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "from eland.ml.pytorch import PyTorchModel\n", + "from eland.ml.pytorch.transformers import TransformerModel\n", + "from elasticsearch import Elasticsearch\n", + "from elasticsearch.client import MlClient\n", + "\n", + "import getpass" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "r7nMIbHke37Q" + }, + "source": [ + "## Configure elasticsearch authentication. \n", + "The recommended authentication approach is using the [Elastic Cloud ID](https://www.elastic.co/guide/en/cloud/current/ec-cloud-id.html) and a [cluster level API key](https://www.elastic.co/guide/en/kibana/current/api-keys.html)\n", + "\n", + "You can use any method you wish to set the required credentials. We are using getpass in this example to prompt for credentials to avoide storing them in github." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SSGgYHome69o" + }, + "outputs": [], + "source": [ + "es_cloud_id = getpass.getpass(\"Enter Elastic Cloud ID: \")\n", + "es_user = getpass.getpass(\"Enter cluster username: \")\n", + "es_pass = getpass.getpass(\"Enter cluster password: \")\n", + "\n", + "# es_api_id = getpass.getpass('Enter cluster API key ID: ')\n", + "# es_api_key = getpass.getpass('Enter cluster API key: ')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "jL4VDnVp96lf" + }, + "source": [ + "## Connect to Elastic Cloud" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "I8mVJkKmetXo" + }, + "outputs": [], + "source": [ + "# es = Elasticsearch(cloud_id=es_cloud_id,\n", + "# api_key=(es_api_id, es_api_key)\n", + "# )\n", + "es = Elasticsearch(cloud_id=es_cloud_id, basic_auth=(es_user, es_pass))\n", + "es.info() # should return cluster info" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "uBMWHj-ZmtvE" + }, + "source": [ + "# Load the model From Hugging Face into Elasticsearch\n", + "Here we specify the model id from Hugging Face. The easiest way to get this id is clicking the copy the model name icon next to the name on the model page. \n", + "\n", + "When calling `TransformerModel` you specify the HF model id and the task type. You can try specifying `auto` and eland will attempt to determine the correct type from info in the model config. 
This is not always possible so a list of specific `task_type` values can be viewed in the following code: \n", + "[Supported values](https://github.com/elastic/eland/blob/15a300728876022b206161d71055c67b500a0192/eland/ml/pytorch/transformers.py#*L41*)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zPV3oFsKiYFL" + }, + "outputs": [], + "source": [ + "# Set the model name from Hugging Face and task type\n", + "hf_model_id = \"sentence-transformers/all-distilroberta-v1\"\n", + "tm = TransformerModel(hf_model_id, \"text_embedding\")\n", + "\n", + "# set the modelID as it is named in Elasticsearch\n", + "es_model_id = tm.elasticsearch_model_id()\n", + "\n", + "# Download the model from Hugging Face\n", + "tmp_path = \"models\"\n", + "Path(tmp_path).mkdir(parents=True, exist_ok=True)\n", + "model_path, config, vocab_path = tm.save(tmp_path)\n", + "\n", + "# Load the model into Elasticsearch\n", + "ptm = PyTorchModel(es, es_model_id)\n", + "ptm.import_model(\n", + " model_path=model_path, config_path=None, vocab_path=vocab_path, config=config\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "4UYSzFp3vHdB" + }, + "source": [ + "# Starting the Model" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "wQwfozwznK4Y" + }, + "source": [ + "## View information about the model\n", + "This is not required but can be handy to get a model overivew" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "b4Wv8EJvpfZI" + }, + "outputs": [], + "source": [ + "# List the in elasticsearch\n", + "m = MlClient.get_trained_models(es, model_id=es_model_id)\n", + "m.body" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "oMGw3sk-pbaN" + }, + "source": [ + "## Deploy the model\n", + "This will load the model on the ML nodes and start the process(es) making it available for the NLP task" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "id": "w5muJ1rLqvUW" + }, + "outputs": [], + "source": [ + "s = MlClient.start_trained_model_deployment(es, model_id=es_model_id)\n", + "s.body" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "ZytlELrsnn_O" + }, + "source": [ + "## Verify the model started without issue\n", + "Should output -> {'routing_state': 'started'}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZaQUUWe0Hxwz" + }, + "outputs": [], + "source": [ + "stats = MlClient.get_trained_models_stats(es, model_id=es_model_id)\n", + "stats.body[\"trained_model_stats\"][0][\"deployment_stats\"][\"nodes\"][0][\"routing_state\"]" + ] + } + ], + "metadata": { + "colab": { + "include_colab_link": true, + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/supporting-blog-content/homecraft-vertex/pages/homecraft_finetuned.py b/supporting-blog-content/homecraft-vertex/pages/homecraft_finetuned.py index 03ae97ae..cbfae573 100644 --- a/supporting-blog-content/homecraft-vertex/pages/homecraft_finetuned.py +++ b/supporting-blog-content/homecraft-vertex/pages/homecraft_finetuned.py @@ -4,7 +4,7 @@ import vertexai from vertexai.preview.language_models import TextGenerationModel -#WATCHOUT!!! For fine-tuning feature you need to import vertexai.preview instead of just vertexAI +# WATCHOUT!!! 
For fine-tuning feature you need to import vertexai.preview instead of just vertexAI # This page shows the integration with a fine-tuned text-bison model via VertexAI @@ -18,23 +18,20 @@ # cloud_user - Elasticsearch Cluster User # cloud_pass - Elasticsearch User Password -projid = os.environ['gcp_project_id'] -cid = os.environ['cloud_id'] -cp = os.environ['cloud_pass'] -cu = os.environ['cloud_user'] +projid = os.environ["gcp_project_id"] +cid = os.environ["cloud_id"] +cp = os.environ["cloud_pass"] +cu = os.environ["cloud_user"] -parameters = { - "temperature": 0.5, - "max_output_tokens": 606, - "top_p": 0.8, - "top_k": 40 - } +parameters = {"temperature": 0.5, "max_output_tokens": 606, "top_p": 0.8, "top_k": 40} vertexai.init(project="1059491012611", location="us-central1") - -#we are here referencing our custom fine-tuned model + +# we are here referencing our custom fine-tuned model model = TextGenerationModel.from_pretrained("text-bison@001") -model = model.get_tuned_model("projects/1059491012611/locations/us-central1/models/5745671733780676608") +model = model.get_tuned_model( + "projects/1059491012611/locations/us-central1/models/5745671733780676608" +) # Connect to Elastic Cloud cluster @@ -42,25 +39,15 @@ def es_connect(cid, user, passwd): es = Elasticsearch(cloud_id=cid, http_auth=(user, passwd)) return es + # Search ElasticSearch index and return details on relevant products def search_products(query_text): # Elasticsearch query (BM25) and kNN configuration for hybrid search query = { "bool": { - "must": [{ - "match": { - "title": { - "query": query_text, - "boost": 1 - } - } - }], - "filter": [{ - "exists": { - "field": "title-vector" - } - }] + "must": [{"match": {"title": {"query": query_text, "boost": 1}}}], + "filter": [{"exists": {"field": "title-vector"}}], } } @@ -71,50 +58,44 @@ def search_products(query_text): "query_vector_builder": { "text_embedding": { "model_id": "sentence-transformers__all-distilroberta-v1", - "model_text": query_text + 
"model_text": query_text, } }, - "boost": 24 + "boost": 24, } - fields = ["title", "description", "url", "availability", "price", "brand", "product_id"] - index = 'home-depot-product-catalog-vector' - resp = es.search(index=index, - query=query, - knn=knn, - fields=fields, - size=5, - source=False) - - doc_list = resp['hits']['hits'] - body = resp['hits']['hits'] - url = '' + fields = [ + "title", + "description", + "url", + "availability", + "price", + "brand", + "product_id", + ] + index = "home-depot-product-catalog-vector" + resp = es.search( + index=index, query=query, knn=knn, fields=fields, size=5, source=False + ) + + doc_list = resp["hits"]["hits"] + body = resp["hits"]["hits"] + url = "" for doc in doc_list: - #body = body + doc['fields']['description'][0] - url = url + "\n\n" + doc['fields']['url'][0] + # body = body + doc['fields']['description'][0] + url = url + "\n\n" + doc["fields"]["url"][0] return body, url + # Search ElasticSearch index and return body and URL for crawled docs def search_docs(query_text): - # Elasticsearch query (BM25) and kNN configuration for hybrid search query = { "bool": { - "must": [{ - "match": { - "title": { - "query": query_text, - "boost": 1 - } - } - }], - "filter": [{ - "exists": { - "field": "title-vector" - } - }] + "must": [{"match": {"title": {"query": query_text, "boost": 1}}}], + "filter": [{"exists": {"field": "title-vector"}}], } } @@ -125,45 +106,42 @@ def search_docs(query_text): "query_vector_builder": { "text_embedding": { "model_id": "sentence-transformers__all-distilroberta-v1", - "model_text": query_text + "model_text": query_text, } }, - "boost": 24 + "boost": 24, } fields = ["title", "body_content", "url"] - index = 'search-homecraft-ikea' - resp = es.search(index=index, - query=query, - knn=knn, - fields=fields, - size=1, - source=False) + index = "search-homecraft-ikea" + resp = es.search( + index=index, query=query, knn=knn, fields=fields, size=1, source=False + ) - body = 
resp['hits']['hits'][0]['fields']['body_content'][0] - url = resp['hits']['hits'][0]['fields']['url'][0] + body = resp["hits"]["hits"][0]["fields"]["body_content"][0] + url = resp["hits"]["hits"][0]["fields"]["url"][0] return body, url + def truncate_text(text, max_tokens): tokens = text.split() if len(tokens) <= max_tokens: return text - return ' '.join(tokens[:max_tokens]) + return " ".join(tokens[:max_tokens]) + # Generate a response from ChatGPT based on the given prompt def vertexAI(prompt): # Truncate the prompt content to fit within the model's context length - #truncated_prompt = truncate_text(prompt, max_context_tokens - max_tokens - safety_margin) - response = model.predict( - prompt, - **parameters - ) + # truncated_prompt = truncate_text(prompt, max_context_tokens - max_tokens - safety_margin) + response = model.predict(prompt, **parameters) return response.text -#image = Image.open('homecraft_logo.jpg') + +# image = Image.open('homecraft_logo.jpg') st.image("https://i.imgur.com/cdjafe0.png", caption=None) st.title("HomeCraft Search Bar") @@ -180,9 +158,8 @@ def vertexAI(prompt): resp_docs, url_docs = search_docs(query) prompt = f"question: {query}" answer = vertexAI(prompt) - + if negResponse in answer: st.write(f"Search Assistant: \n\n{answer.strip()}") else: st.write(f"Search Assistant: {answer.strip()}\n\n") - diff --git a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/ecommerce_dense_sparse_project.ipynb b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/ecommerce_dense_sparse_project.ipynb index 453a1241..3d515f3f 100644 --- a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/ecommerce_dense_sparse_project.ipynb +++ b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/ecommerce_dense_sparse_project.ipynb @@ -1,834 +1,816 @@ { - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# **Lexical and Semantic Search with Elasticsearch**\n", - "\n", - "In this 
example, you will explore various approaches to retrieving information using Elasticsearch, focusing specifically on text, lexical and semantic search.\n", - "\n", - "To accomplish this, this example demonstrate various search scenarios on a dataset generated to simulate e-commerce product information.\n", - "\n", - "This dataset contains over 2,500 products, each with a description. These products are categorized into 76 distinct product categories, with each category containing a varying number of products.\n", - "\n", - "## **🧰 Requirements**\n", - "\n", - "For this example, you will need:\n", - "\n", - "- Python 3.6 or later\n", - "- The Elastic Python client\n", - "- Elastic 8.8 deployment or later, with 8GB memory machine learning node\n", - "- The Elastic Learned Sparse EncodeR model that comes pre-loaded into Elastic installed and started on your deployment\n", - "\n", - "We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html), a [free trial](https://cloud.elastic.co/registration?utm_source=github&utm_content=elasticsearch-labs-notebook) is available." 
- ], - "metadata": { - "id": "r8OKk3QOGBXl" - }, - "id": "r8OKk3QOGBXl" - }, - { - "cell_type": "markdown", - "source": [ - "## Setup Elasticsearch environment:\n", - "\n", - "To get started, we'll need to connect to our Elastic deployment using the Python client.\n", - "\n", - "Because we're using an Elastic Cloud deployment, we'll use the **Cloud ID** to identify our deployment.\n" - ], - "metadata": { - "id": "hmMWo2e-IkTB" - }, - "id": "hmMWo2e-IkTB" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e8d24cd8-a437-4bd2-a1f0-93e535ccf8a9", - "metadata": { - "id": "e8d24cd8-a437-4bd2-a1f0-93e535ccf8a9" - }, - "outputs": [], - "source": [ - "!pip install elasticsearch==8.8 #Elasticsearch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8c36e9b5-8f2b-4734-9213-1350caa7f837", - "metadata": { - "id": "8c36e9b5-8f2b-4734-9213-1350caa7f837" - }, - "outputs": [], - "source": [ - "pip -q install eland elasticsearch sentence_transformers transformers torch==1.11 # Eland Python Client" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eaf90bc8-647e-4ada-9aa9-5cb9e60762b7", - "metadata": { - "id": "eaf90bc8-647e-4ada-9aa9-5cb9e60762b7" - }, - "outputs": [], - "source": [ - "from elasticsearch import Elasticsearch, helpers # Import the Elasticsearch client and helpers module\n", - "from urllib.request import urlopen # library for opening URLs\n", - "import json # module for handling JSON data\n", - "from pathlib import Path # module for working with file paths\n", - "# Python client and toolkit for machine learning in Elasticsearch\n", - "from eland.ml.pytorch import PyTorchModel\n", - "from eland.ml.pytorch.transformers import TransformerModel\n", - "from elasticsearch.client import MlClient # Elastic module for ml\n", - "import getpass # handling password input" - ] - }, - { - "cell_type": "markdown", - "source": [ - "Now we can instantiate the Python Elasticsearch client.\n", - "\n", - "First we prompt the 
user for their password and Cloud ID.\n", - "\n", - "🔐 NOTE: `getpass` enables us to securely prompt the user for credentials without echoing them to the terminal, or storing it in memory.\n", - "\n", - "Then we create a `client` object that instantiates an instance of the `Elasticsearch` class." - ], - "metadata": { - "id": "ea1VkDBXJIQR" - }, - "id": "ea1VkDBXJIQR" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6907a2bf-4927-428e-9ca8-9df3dd35a2cc", - "metadata": { - "id": "6907a2bf-4927-428e-9ca8-9df3dd35a2cc" - }, - "outputs": [], - "source": [ - "# Found in the 'Manage Deployment' page\n", - "CLOUD_ID = getpass.getpass('Enter Elastic Cloud ID: ')\n", - "\n", - "# Password for the 'elastic' user generated by Elasticsearch\n", - "ELASTIC_PASSWORD = getpass.getpass('Enter Elastic password: ')\n", - "\n", - "# Create the client instance\n", - "client = Elasticsearch(\n", - " cloud_id=CLOUD_ID,\n", - " basic_auth=(\"elastic\", ELASTIC_PASSWORD),\n", - " request_timeout=3600\n", - ")" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Setup emebdding model\n", - "\n", - "Next we upload the all-mpnet-base-v2 embedding model into Elasticsearch and create an ingest pipeline with inference processors for text embedding and text expansion, using the description field for both. This field contains the description of each product." 
- ], - "metadata": { - "id": "BH-N6epTJarM" - }, - "id": "BH-N6epTJarM" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7f6f3f5a-2b93-4a0c-93c8-c887ca80f687", - "metadata": { - "id": "7f6f3f5a-2b93-4a0c-93c8-c887ca80f687" - }, - "outputs": [], - "source": [ - "# Set the model name from Hugging Face and task type\n", - "# sentence-transformers model\n", - "hf_model_id='sentence-transformers/all-mpnet-base-v2'\n", - "tm = TransformerModel(hf_model_id, \"text_embedding\")\n", - "\n", - "#set the modelID as it is named in Elasticsearch\n", - "es_model_id = tm.elasticsearch_model_id()\n", - "\n", - "# Download the model from Hugging Face\n", - "tmp_path = \"models\"\n", - "Path(tmp_path).mkdir(parents=True, exist_ok=True)\n", - "model_path, config, vocab_path = tm.save(tmp_path)\n", - "\n", - "# Load the model into Elasticsearch\n", - "ptm = PyTorchModel(client, es_model_id)\n", - "ptm.import_model(model_path=model_path, config_path=None, vocab_path=vocab_path, config=config)\n", - "\n", - "# Start the model\n", - "s = MlClient.start_trained_model_deployment(client, model_id=es_model_id)\n", - "s.body" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6739f55b-6983-4b48-9349-6e0111b313fe", - "metadata": { - "id": "6739f55b-6983-4b48-9349-6e0111b313fe" - }, - "outputs": [], - "source": [ - "# Creating an ingest pipeline with inference processors to use ELSER (sparse) and all-mpnet-base-v2 (dense) to infer against data that will be ingested in the pipeline.\n", - "\n", - "client.ingest.put_pipeline(\n", - " id=\"ecommerce-pipeline\",\n", - " processors = [\n", - " {\n", - " \"inference\": {\n", - " \"model_id\": \"elser_model\",\n", - " \"target_field\": \"ml\",\n", - " \"field_map\": {\n", - " \"description\": \"text_field\"\n", - " },\n", - " \"inference_config\": {\n", - " \"text_expansion\": { # text_expansion inference type (ELSER)\n", - " \"results_field\": \"tokens\"\n", - " }\n", - " }\n", - " }\n", - " },\n", - " {\n", - 
" \"inference\": {\n", - " \"model_id\": \"sentence-transformers__all-mpnet-base-v2\",\n", - " \"target_field\": \"description_vector\", # Target field for the inference results\n", - " \"field_map\": {\n", - " \"description\": \"text_field\" # Field matching our configured trained model input. Typically for NLP models, the field name is text_field.\n", - " }\n", - " }\n", - " }\n", - " ]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Index documents\n", - "\n", - "Then, we create a source index to load `products-ecommerce.json`, this will be the `ecommerce` index and a destination index to extract the documents from the source and index these documents into the destination `ecommerce-search`.\n", - "\n", - "For the `ecommerce-search` index we add a field to support dense vector storage and search `description_vector.predicted_value`, this is the target field for inference results. The field type in this case is `dense_vector`, the `all-mpnet-base-v2` model has embedding_size of 768, so dims is set to 768. We also add a `rank_features` field type to support the text expansion output." 
- ], - "metadata": { - "id": "QUQ1nCaiKIQr" - }, - "id": "QUQ1nCaiKIQr" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6e115bd0-e758-44db-b5b9-96217af472c1", - "metadata": { - "id": "6e115bd0-e758-44db-b5b9-96217af472c1" - }, - "outputs": [], - "source": [ - "#Index to load products-ecommerce.json docs\n", - "\n", - "client.indices.create(\n", - " index=\"ecommerce\",\n", - " mappings= {\n", - " \"properties\": {\n", - " \"product\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\n", - " \"keyword\": {\n", - " \"type\": \"keyword\",\n", - " \"ignore_above\": 256\n", - " }\n", - " }\n", - " },\n", - " \"description\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\n", - " \"keyword\": {\n", - " \"type\": \"keyword\",\n", - " \"ignore_above\": 256\n", - " }\n", - " }\n", - " },\n", - " \"category\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\n", - " \"keyword\": {\n", - " \"type\": \"keyword\",\n", - " \"ignore_above\": 256\n", - " }\n", - " }\n", - " }\n", - " }\n", - "})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9b53b39e-d74e-4fa8-a364-e2c3caf37418", - "metadata": { - "id": "9b53b39e-d74e-4fa8-a364-e2c3caf37418" - }, - "outputs": [], - "source": [ - "# Reindex dest index\n", - "\n", - "INDEX = 'ecommerce-search'\n", - "client.indices.create(\n", - " index=INDEX,\n", - " settings={\n", - " \"index\": {\n", - " \"number_of_shards\": 1,\n", - " \"number_of_replicas\": 1\n", - " }\n", - " },\n", - " mappings={\n", - "# Saving disk space by excluding the ELSER tokens and the dense_vector field from document source.\n", - "# Note: That should only be applied if you are certain that reindexing will not be required in the future.\n", - " \"_source\" : {\n", - " \"excludes\": [\"ml.tokens\",\"description_vector.predicted_value\"]\n", - " },\n", - " \"properties\": {\n", - " \"product\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\n", - " \"keyword\": {\n", - " \"type\": \"keyword\",\n", - " 
\"ignore_above\": 256\n", - " }\n", - " }\n", - " },\n", - " \"description\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\n", - " \"keyword\": {\n", - " \"type\": \"keyword\",\n", - " \"ignore_above\": 256\n", - " }\n", - " }\n", - " },\n", - " \"category\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\n", - " \"keyword\": {\n", - " \"type\": \"keyword\",\n", - " \"ignore_above\": 256\n", - " }\n", - " }\n", - " },\n", - " \"ml.tokens\": { # The name of the field to contain the generated tokens.\n", - " \"type\": \"rank_features\" # ELSER output must be ingested into a field with the rank_features field type.\n", - " },\n", - " \"description_vector.predicted_value\": { # Inference results field, target_field.predicted_value\n", - " \"type\": \"dense_vector\",\n", - " \"dims\": 768, # The all-mpnet-base-v2 model has embedding_size of 768, so dims is set to 768.\n", - " \"index\": \"true\",\n", - " \"similarity\": \"dot_product\" # When indexing vectors for approximate kNN search, you need to specify the similarity function for comparing the vectors.\n", - " }\n", - " }\n", - "\n", - "}\n", - ")" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Load documents\n", - "\n", - "Then we load `products-ecommerce.json` into the `ecommerce` index." 
- ], - "metadata": { - "id": "Vo-LKu8TOT5j" - }, - "id": "Vo-LKu8TOT5j" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3cfdc3b7-7e4f-4111-997b-c333ac8938ba", - "metadata": { - "id": "3cfdc3b7-7e4f-4111-997b-c333ac8938ba" - }, - "outputs": [], - "source": [ - "# dataset\n", - "\n", - "url = \"https://raw.githubusercontent.com/elastic/elasticsearch-labs/02c01b3450e8ddc72ccec85d559eee5280c185ac/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/products-ecommerce.json\" # json raw file - update the link here\n", - "\n", - "response = urlopen(url)\n", - "\n", - "# Load the response data into a JSON object\n", - "data_json = json.loads(response.read())\n", - "\n", - "def create_index_body(doc):\n", - " \"\"\" Generate the body for an Elasticsearch document. \"\"\"\n", - " return {\n", - " \"_index\": \"ecommerce\",\n", - " \"_source\": doc,\n", - " }\n", - "\n", - "# Prepare the documents to be indexed\n", - "documents = [create_index_body(doc) for doc in data_json]\n", - "\n", - "# Use helpers.bulk to index\n", - "helpers.bulk(client, documents)\n", - "\n", - "print(\"Done indexing documents into `ecommerce` index\")" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Reindex\n", - "\n", - "Now we can reindex data from the `source` index `ecommerce` to the `dest` index `ecommerce-search` with the ingest pipeline `ecommerce-pipeline` we created.\n", - "\n", - "After this step our `dest` index will have the fields we need to perform Semantic Search." 
- ], - "metadata": { - "id": "3dShN9W4Opl8" - }, - "id": "3dShN9W4Opl8" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4297cb0b-ae2e-44f9-811d-27a41c43a858", - "metadata": { - "id": "4297cb0b-ae2e-44f9-811d-27a41c43a858" - }, - "outputs": [], - "source": [ - "# Reindex data from one index 'source' to another 'dest' with the 'ecommerce-pipeline' pipeline.\n", - "\n", - "client.reindex(wait_for_completion=True,\n", - " source={\n", - " \"index\": \"ecommerce\"\n", - " },\n", - " dest= {\n", - " \"index\": \"ecommerce-search\",\n", - " \"pipeline\": \"ecommerce-pipeline\"\n", - " }\n", - ")" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Text Analysis with Standard Analyzer" - ], - "metadata": { - "id": "-qUXNuOvPDsI" - }, - "id": "-qUXNuOvPDsI" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "829ae6e8-807d-4f0d-ada6-fee86748b91a", - "metadata": { - "id": "829ae6e8-807d-4f0d-ada6-fee86748b91a" - }, - "outputs": [], - "source": [ - "#Performs text analysis on a string and returns the resulting tokens.\n", - "\n", - "# Define the text to be analyzed\n", - "text = \"Comfortable furniture for a large balcony\"\n", - "\n", - "# Define the analyze request\n", - "request_body = {\n", - " \"analyzer\": \"standard\", # Standard Analyzer\n", - " \"text\": text\n", - "}\n", - "\n", - "# Perform the analyze request\n", - "response = client.indices.analyze(analyzer=request_body[\"analyzer\"], text=request_body[\"text\"])\n", - "\n", - "# Extract and display the analyzed tokens\n", - "tokens = [token[\"token\"] for token in response[\"tokens\"]]\n", - "print(\"Analyzed Tokens:\", tokens)" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Text Analysis with Stop Analyzer" - ], - "metadata": { - "id": "12u70NLmPyNV" - }, - "id": "12u70NLmPyNV" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "55b602d1-f1e4-4b70-9273-5fc701ac9039", - "metadata": { - "id": "55b602d1-f1e4-4b70-9273-5fc701ac9039" - }, - 
"outputs": [], - "source": [ - "#Performs text analysis on a string and returns the resulting tokens.\n", - "\n", - "# Define the text to be analyzed\n", - "text = \"Comfortable furniture for a large balcony\"\n", - "\n", - "# Define the analyze request\n", - "request_body = {\n", - " \"analyzer\": \"stop\", # Stop Analyzer\n", - " \"text\": text\n", - "}\n", - "\n", - "# Perform the analyze request\n", - "response = client.indices.analyze(analyzer=request_body[\"analyzer\"], text=request_body[\"text\"])\n", - "\n", - "# Extract and display the analyzed tokens\n", - "tokens = [token[\"token\"] for token in response[\"tokens\"]]\n", - "print(\"Analyzed Tokens:\", tokens)" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Lexical Search" - ], - "metadata": { - "id": "8G8MKcUvP0zs" - }, - "id": "8G8MKcUvP0zs" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f4984f6c-ceec-46a4-b64c-f749e6b1b04f", - "metadata": { - "id": "f4984f6c-ceec-46a4-b64c-f749e6b1b04f" - }, - "outputs": [], - "source": [ - "# BM25\n", - "\n", - "response = client.search(size=2,\n", - " index=\"ecommerce-search\",\n", - " query= {\n", - " \"match\": {\n", - " \"description\" : {\n", - " \"query\": \"Comfortable furniture for a large balcony\",\n", - " \"analyzer\": \"stop\"\n", - " }\n", - " }\n", - " }\n", - ")\n", - "hits = response['hits']['hits']\n", - "\n", - "if not hits:\n", - " print(\"No matches found\")\n", - "else:\n", - " for hit in hits:\n", - " score = hit['_score']\n", - " product = hit['_source']['product']\n", - " category = hit['_source']['category']\n", - " description = hit['_source']['description']\n", - " print(f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\")\n" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Semantic Search with Dense Vector" - ], - "metadata": { - "id": "xiywcf_-P39a" - }, - "id": "xiywcf_-P39a" - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"72187c9a-14c1-4084-a080-4e5c1e614f22", - "metadata": { - "id": "72187c9a-14c1-4084-a080-4e5c1e614f22" - }, - "outputs": [], - "source": [ - "# KNN\n", - "\n", - "response = client.search(index='ecommerce-search', size=2,\n", - " knn={\n", - " \"field\": \"description_vector.predicted_value\",\n", - " \"k\": 50, # Number of nearest neighbors to return as top hits.\n", - " \"num_candidates\": 500, # Number of nearest neighbor candidates to consider per shard. Increasing num_candidates tends to improve the accuracy of the final k results.\n", - " \"query_vector_builder\": { # Object indicating how to build a query_vector. kNN search enables you to perform semantic search by using a previously deployed text embedding model.\n", - " \"text_embedding\": {\n", - " \"model_id\": \"sentence-transformers__all-mpnet-base-v2\", # Text embedding model id\n", - " \"model_text\": \"Comfortable furniture for a large balcony\" # Query\n", - " }\n", - " }\n", - " }\n", - ")\n", - "\n", - "for hit in response['hits']['hits']:\n", - "\n", - " score = hit['_score']\n", - " product = hit['_source']['product']\n", - " category = hit['_source']['category']\n", - " description = hit['_source']['description']\n", - " print(f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\")" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Semantic Search with Sparse Vector" - ], - "metadata": { - "id": "QlWFdngRQFbv" - }, - "id": "QlWFdngRQFbv" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2c0bf5fc-ab32-4f33-8f26-904ff10635a5", - "metadata": { - "id": "2c0bf5fc-ab32-4f33-8f26-904ff10635a5" - }, - "outputs": [], - "source": [ - "# Elastic Learned Sparse Encoder - ELSER\n", - "\n", - "response = client.search(index='ecommerce-search', size=2,\n", - " query={\n", - " \"text_expansion\": {\n", - " \"ml.tokens\": {\n", - " \"model_id\":\"elser_model\",\n", - " \"model_text\":\"Comfortable furniture for a large balcony\"\n", - " 
}\n", - " }\n", - "}\n", - ")\n", - "\n", - "for hit in response['hits']['hits']:\n", - "\n", - " score = hit['_score']\n", - " product = hit['_source']['product']\n", - " category = hit['_source']['category']\n", - " description = hit['_source']['description']\n", - " print(f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\")" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Hybrid Search - BM25+KNN linear combination" - ], - "metadata": { - "id": "kz9deDBYQJxr" - }, - "id": "kz9deDBYQJxr" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f84aa16b-49c5-4abf-a049-d556c225542e", - "metadata": { - "id": "f84aa16b-49c5-4abf-a049-d556c225542e" - }, - "outputs": [], - "source": [ - "# BM25 + KNN (Linear Combination)\n", - "\n", - "response = client.search(index='ecommerce-search', size=2,\n", - " query={\n", - " \"bool\": {\n", - " \"should\": [\n", - " {\n", - " \"match\": {\n", - " \"description\": {\n", - " \"query\": \"A dining table and comfortable chairs for a large balcony\",\n", - " \"boost\": 1 # You can adjust the boost value\n", - " }\n", - " }\n", - " }\n", - " ]\n", - " }\n", - " },\n", - " knn={\n", - " \"field\": \"description_vector.predicted_value\",\n", - " \"k\": 50,\n", - " \"num_candidates\": 500,\n", - " \"boost\": 1, # You can adjust the boost value\n", - " \"query_vector_builder\": {\n", - " \"text_embedding\": {\n", - " \"model_id\": \"sentence-transformers__all-mpnet-base-v2\",\n", - " \"model_text\": \"A dining table and comfortable chairs for a large balcony\"\n", - " }\n", - " }\n", - " }\n", - ")\n", - "\n", - "for hit in response['hits']['hits']:\n", - "\n", - " score = hit['_score']\n", - " product = hit['_source']['product']\n", - " category = hit['_source']['category']\n", - " description = hit['_source']['description']\n", - " print(f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\")\n" - ] - }, - { - "cell_type": 
"markdown", - "source": [ - "## Hybrid Search - BM25+KNN RRF" - ], - "metadata": { - "id": "cybkWjmpQV8g" - }, - "id": "cybkWjmpQV8g" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aa2e072d-37bb-43fd-a83f-e1cb55a24861", - "metadata": { - "id": "aa2e072d-37bb-43fd-a83f-e1cb55a24861" - }, - "outputs": [], - "source": [ - "# BM25 + KNN (RRF)\n", - "# RRF functionality is in technical preview and may be changed or removed in a future release. The syntax will likely change before GA.\n", - "\n", - "response = client.search(index='ecommerce-search', size=2,\n", - " query={\n", - " \"bool\": {\n", - " \"should\": [\n", - " {\n", - " \"match\": {\n", - " \"description\": {\n", - " \"query\": \"A dining table and comfortable chairs for a large balcony\"\n", - " }\n", - " }\n", - " }\n", - " ]\n", - " }\n", - " },\n", - " knn={\n", - " \"field\": \"description_vector.predicted_value\",\n", - " \"k\": 50,\n", - " \"num_candidates\": 500,\n", - " \"query_vector_builder\": {\n", - " \"text_embedding\": {\n", - " \"model_id\": \"sentence-transformers__all-mpnet-base-v2\",\n", - " \"model_text\": \"A dining table and comfortable chairs for a large balcony\"\n", - " }\n", - " }\n", - " },\n", - " rank={\n", - " \"rrf\": { # Reciprocal rank fusion\n", - " \"window_size\": 50, # This value determines the size of the individual result sets per query.\n", - " \"rank_constant\": 20 # This value determines how much influence documents in individual result sets per query have over the final ranked result set.\n", - " }\n", - " }\n", - ")\n", - "\n", - "for hit in response['hits']['hits']:\n", - "\n", - " rank = hit['_rank']\n", - " category = hit['_source']['category']\n", - " product = hit['_source']['product']\n", - " description = hit['_source']['description']\n", - " print(f\"\\nRank: {rank}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\")\n" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Hybrid Search - BM25+ELSER 
linear combination" - ], - "metadata": { - "id": "LyKI2Z-XQbI6" - }, - "id": "LyKI2Z-XQbI6" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bd842732-b20a-4c7a-b735-e1f558a9b922", - "metadata": { - "id": "bd842732-b20a-4c7a-b735-e1f558a9b922" - }, - "outputs": [], - "source": [ - "# BM25 + Elastic Learned Sparse Encoder (Linear Combination)\n", - "\n", - "response = client.search(index='ecommerce-search', size=2,\n", - "\n", - " query= {\n", - " \"bool\": {\n", - " \"should\": [\n", - " {\n", - " \"match\": {\n", - " \"description\" : {\n", - " \"query\": \"A dining table and comfortable chairs for a large balcony\",\n", - " \"boost\": 1 # You can adjust the boost value\n", - " }\n", - " }\n", - " },\n", - " {\n", - " \"text_expansion\": {\n", - " \"ml.tokens\": {\n", - " \"model_id\": \"elser_model\",\n", - " \"model_text\": \"A dining table and comfortable chairs for a large balcony\",\n", - " \"boost\": 1 # You can adjust the boost value\n", - " }\n", - " }\n", - " }\n", - " ]\n", - " }\n", - " }\n", - "\n", - ")\n", - "\n", - "for hit in response['hits']['hits']:\n", - "\n", - " score = hit['_score']\n", - " product = hit['_source']['product']\n", - " category = hit['_source']['category']\n", - " description = hit['_source']['description']\n", - " print(f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.10" - }, - "colab": { - "provenance": [] - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# **Lexical and Semantic Search with Elasticsearch**\n", + "\n", 
+ "In this example, you will explore various approaches to retrieving information using Elasticsearch, focusing specifically on text, lexical and semantic search.\n", + "\n", + "To accomplish this, this example demonstrates various search scenarios on a dataset generated to simulate e-commerce product information.\n", + "\n", + "This dataset contains over 2,500 products, each with a description. These products are categorized into 76 distinct product categories, with each category containing a varying number of products.\n", + "\n", + "## **🧰 Requirements**\n", + "\n", + "For this example, you will need:\n", + "\n", + "- Python 3.6 or later\n", + "- The Elastic Python client\n", + "- Elastic 8.8 deployment or later, with 8GB memory machine learning node\n", + "- The Elastic Learned Sparse EncodeR model that comes pre-loaded into Elastic installed and started on your deployment\n", + "\n", + "We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html), a [free trial](https://cloud.elastic.co/registration?utm_source=github&utm_content=elasticsearch-labs-notebook) is available."
+ ], + "metadata": { + "id": "r8OKk3QOGBXl" + }, + "id": "r8OKk3QOGBXl" + }, + { + "cell_type": "markdown", + "source": [ + "## Setup Elasticsearch environment:\n", + "\n", + "To get started, we'll need to connect to our Elastic deployment using the Python client.\n", + "\n", + "Because we're using an Elastic Cloud deployment, we'll use the **Cloud ID** to identify our deployment.\n" + ], + "metadata": { + "id": "hmMWo2e-IkTB" + }, + "id": "hmMWo2e-IkTB" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8d24cd8-a437-4bd2-a1f0-93e535ccf8a9", + "metadata": { + "id": "e8d24cd8-a437-4bd2-a1f0-93e535ccf8a9" + }, + "outputs": [], + "source": [ + "!pip install elasticsearch==8.8 #Elasticsearch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c36e9b5-8f2b-4734-9213-1350caa7f837", + "metadata": { + "id": "8c36e9b5-8f2b-4734-9213-1350caa7f837" + }, + "outputs": [], + "source": [ + "pip -q install eland elasticsearch sentence_transformers transformers torch==1.11 # Eland Python Client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eaf90bc8-647e-4ada-9aa9-5cb9e60762b7", + "metadata": { + "id": "eaf90bc8-647e-4ada-9aa9-5cb9e60762b7" + }, + "outputs": [], + "source": [ + "from elasticsearch import (\n", + " Elasticsearch,\n", + " helpers,\n", + ") # Import the Elasticsearch client and helpers module\n", + "from urllib.request import urlopen # library for opening URLs\n", + "import json # module for handling JSON data\n", + "from pathlib import Path # module for working with file paths\n", + "\n", + "# Python client and toolkit for machine learning in Elasticsearch\n", + "from eland.ml.pytorch import PyTorchModel\n", + "from eland.ml.pytorch.transformers import TransformerModel\n", + "from elasticsearch.client import MlClient # Elastic module for ml\n", + "import getpass # handling password input" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Now we can instantiate the Python Elasticsearch 
client.\n", + "\n", + "First we prompt the user for their password and Cloud ID.\n", + "\n", + "🔐 NOTE: `getpass` enables us to securely prompt the user for credentials without echoing them to the terminal or hard-coding them in the notebook.\n", + "\n", + "Then we create a `client` object that instantiates an instance of the `Elasticsearch` class." + ], + "metadata": { + "id": "ea1VkDBXJIQR" + }, + "id": "ea1VkDBXJIQR" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6907a2bf-4927-428e-9ca8-9df3dd35a2cc", + "metadata": { + "id": "6907a2bf-4927-428e-9ca8-9df3dd35a2cc" + }, + "outputs": [], + "source": [ + "# Found in the 'Manage Deployment' page\n", + "CLOUD_ID = getpass.getpass(\"Enter Elastic Cloud ID: \")\n", + "\n", + "# Password for the 'elastic' user generated by Elasticsearch\n", + "ELASTIC_PASSWORD = getpass.getpass(\"Enter Elastic password: \")\n", + "\n", + "# Create the client instance\n", + "client = Elasticsearch(\n", + " cloud_id=CLOUD_ID, basic_auth=(\"elastic\", ELASTIC_PASSWORD), request_timeout=3600\n", + ")" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Setup embedding model\n", + "\n", + "Next we upload the all-mpnet-base-v2 embedding model into Elasticsearch and create an ingest pipeline with inference processors for text embedding and text expansion, using the description field for both. This field contains the description of each product."
+ ], + "metadata": { + "id": "BH-N6epTJarM" + }, + "id": "BH-N6epTJarM" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f6f3f5a-2b93-4a0c-93c8-c887ca80f687", + "metadata": { + "id": "7f6f3f5a-2b93-4a0c-93c8-c887ca80f687" + }, + "outputs": [], + "source": [ + "# Set the model name from Hugging Face and task type\n", + "# sentence-transformers model\n", + "hf_model_id = \"sentence-transformers/all-mpnet-base-v2\"\n", + "tm = TransformerModel(hf_model_id, \"text_embedding\")\n", + "\n", + "# set the modelID as it is named in Elasticsearch\n", + "es_model_id = tm.elasticsearch_model_id()\n", + "\n", + "# Download the model from Hugging Face\n", + "tmp_path = \"models\"\n", + "Path(tmp_path).mkdir(parents=True, exist_ok=True)\n", + "model_path, config, vocab_path = tm.save(tmp_path)\n", + "\n", + "# Load the model into Elasticsearch\n", + "ptm = PyTorchModel(client, es_model_id)\n", + "ptm.import_model(\n", + " model_path=model_path, config_path=None, vocab_path=vocab_path, config=config\n", + ")\n", + "\n", + "# Start the model\n", + "s = MlClient.start_trained_model_deployment(client, model_id=es_model_id)\n", + "s.body" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6739f55b-6983-4b48-9349-6e0111b313fe", + "metadata": { + "id": "6739f55b-6983-4b48-9349-6e0111b313fe" + }, + "outputs": [], + "source": [ + "# Creating an ingest pipeline with inference processors to use ELSER (sparse) and all-mpnet-base-v2 (dense) to infer against data that will be ingested in the pipeline.\n", + "\n", + "client.ingest.put_pipeline(\n", + " id=\"ecommerce-pipeline\",\n", + " processors=[\n", + " {\n", + " \"inference\": {\n", + " \"model_id\": \"elser_model\",\n", + " \"target_field\": \"ml\",\n", + " \"field_map\": {\"description\": \"text_field\"},\n", + " \"inference_config\": {\n", + " \"text_expansion\": { # text_expansion inference type (ELSER)\n", + " \"results_field\": \"tokens\"\n", + " }\n", + " },\n", + " }\n", + " },\n", + " {\n", 
+ " \"inference\": {\n", + " \"model_id\": \"sentence-transformers__all-mpnet-base-v2\",\n", + " \"target_field\": \"description_vector\", # Target field for the inference results\n", + " \"field_map\": {\n", + " \"description\": \"text_field\" # Field matching our configured trained model input. Typically for NLP models, the field name is text_field.\n", + " },\n", + " }\n", + " },\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Index documents\n", + "\n", + "Then, we create a source index to load `products-ecommerce.json`, this will be the `ecommerce` index and a destination index to extract the documents from the source and index these documents into the destination `ecommerce-search`.\n", + "\n", + "For the `ecommerce-search` index we add a field to support dense vector storage and search `description_vector.predicted_value`, this is the target field for inference results. The field type in this case is `dense_vector`, the `all-mpnet-base-v2` model has embedding_size of 768, so dims is set to 768. We also add a `rank_features` field type to support the text expansion output." 
+ ], + "metadata": { + "id": "QUQ1nCaiKIQr" + }, + "id": "QUQ1nCaiKIQr" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e115bd0-e758-44db-b5b9-96217af472c1", + "metadata": { + "id": "6e115bd0-e758-44db-b5b9-96217af472c1" + }, + "outputs": [], + "source": [ + "# Index to load products-ecommerce.json docs\n", + "\n", + "client.indices.create(\n", + " index=\"ecommerce\",\n", + " mappings={\n", + " \"properties\": {\n", + " \"product\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", + " },\n", + " \"description\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", + " },\n", + " \"category\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", + " },\n", + " }\n", + " },\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b53b39e-d74e-4fa8-a364-e2c3caf37418", + "metadata": { + "id": "9b53b39e-d74e-4fa8-a364-e2c3caf37418" + }, + "outputs": [], + "source": [ + "# Reindex dest index\n", + "\n", + "INDEX = \"ecommerce-search\"\n", + "client.indices.create(\n", + " index=INDEX,\n", + " settings={\"index\": {\"number_of_shards\": 1, \"number_of_replicas\": 1}},\n", + " mappings={\n", + " # Saving disk space by excluding the ELSER tokens and the dense_vector field from document source.\n", + " # Note: That should only be applied if you are certain that reindexing will not be required in the future.\n", + " \"_source\": {\"excludes\": [\"ml.tokens\", \"description_vector.predicted_value\"]},\n", + " \"properties\": {\n", + " \"product\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", + " },\n", + " \"description\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", + " },\n", + " \"category\": 
{\n", + " \"type\": \"text\",\n", + " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", + " },\n", + " \"ml.tokens\": { # The name of the field to contain the generated tokens.\n", + " \"type\": \"rank_features\" # ELSER output must be ingested into a field with the rank_features field type.\n", + " },\n", + " \"description_vector.predicted_value\": { # Inference results field, target_field.predicted_value\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 768, # The all-mpnet-base-v2 model has embedding_size of 768, so dims is set to 768.\n", + " \"index\": \"true\",\n", + " \"similarity\": \"dot_product\", # When indexing vectors for approximate kNN search, you need to specify the similarity function for comparing the vectors.\n", + " },\n", + " },\n", + " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Load documents\n", + "\n", + "Then we load `products-ecommerce.json` into the `ecommerce` index." + ], + "metadata": { + "id": "Vo-LKu8TOT5j" + }, + "id": "Vo-LKu8TOT5j" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cfdc3b7-7e4f-4111-997b-c333ac8938ba", + "metadata": { + "id": "3cfdc3b7-7e4f-4111-997b-c333ac8938ba" + }, + "outputs": [], + "source": [ + "# dataset\n", + "\n", + "url = \"https://raw.githubusercontent.com/elastic/elasticsearch-labs/02c01b3450e8ddc72ccec85d559eee5280c185ac/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/products-ecommerce.json\" # json raw file - update the link here\n", + "\n", + "response = urlopen(url)\n", + "\n", + "# Load the response data into a JSON object\n", + "data_json = json.loads(response.read())\n", + "\n", + "\n", + "def create_index_body(doc):\n", + " \"\"\"Generate the body for an Elasticsearch document.\"\"\"\n", + " return {\n", + " \"_index\": \"ecommerce\",\n", + " \"_source\": doc,\n", + " }\n", + "\n", + "\n", + "# Prepare the documents to be indexed\n", + "documents = [create_index_body(doc) for doc in 
data_json]\n", + "\n", + "# Use helpers.bulk to index\n", + "helpers.bulk(client, documents)\n", + "\n", + "print(\"Done indexing documents into `ecommerce` index\")" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Reindex\n", + "\n", + "Now we can reindex data from the `source` index `ecommerce` to the `dest` index `ecommerce-search` with the ingest pipeline `ecommerce-pipeline` we created.\n", + "\n", + "After this step our `dest` index will have the fields we need to perform Semantic Search." + ], + "metadata": { + "id": "3dShN9W4Opl8" + }, + "id": "3dShN9W4Opl8" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4297cb0b-ae2e-44f9-811d-27a41c43a858", + "metadata": { + "id": "4297cb0b-ae2e-44f9-811d-27a41c43a858" + }, + "outputs": [], + "source": [ + "# Reindex data from one index 'source' to another 'dest' with the 'ecommerce-pipeline' pipeline.\n", + "\n", + "client.reindex(\n", + " wait_for_completion=True,\n", + " source={\"index\": \"ecommerce\"},\n", + " dest={\"index\": \"ecommerce-search\", \"pipeline\": \"ecommerce-pipeline\"},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Text Analysis with Standard Analyzer" + ], + "metadata": { + "id": "-qUXNuOvPDsI" + }, + "id": "-qUXNuOvPDsI" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "829ae6e8-807d-4f0d-ada6-fee86748b91a", + "metadata": { + "id": "829ae6e8-807d-4f0d-ada6-fee86748b91a" + }, + "outputs": [], + "source": [ + "# Performs text analysis on a string and returns the resulting tokens.\n", + "\n", + "# Define the text to be analyzed\n", + "text = \"Comfortable furniture for a large balcony\"\n", + "\n", + "# Define the analyze request\n", + "request_body = {\"analyzer\": \"standard\", \"text\": text} # Standard Analyzer\n", + "\n", + "# Perform the analyze request\n", + "response = client.indices.analyze(\n", + " analyzer=request_body[\"analyzer\"], text=request_body[\"text\"]\n", + ")\n", + "\n", + "# Extract and display the 
analyzed tokens\n", + "tokens = [token[\"token\"] for token in response[\"tokens\"]]\n", + "print(\"Analyzed Tokens:\", tokens)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Text Analysis with Stop Analyzer" + ], + "metadata": { + "id": "12u70NLmPyNV" + }, + "id": "12u70NLmPyNV" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55b602d1-f1e4-4b70-9273-5fc701ac9039", + "metadata": { + "id": "55b602d1-f1e4-4b70-9273-5fc701ac9039" + }, + "outputs": [], + "source": [ + "# Performs text analysis on a string and returns the resulting tokens.\n", + "\n", + "# Define the text to be analyzed\n", + "text = \"Comfortable furniture for a large balcony\"\n", + "\n", + "# Define the analyze request\n", + "request_body = {\"analyzer\": \"stop\", \"text\": text} # Stop Analyzer\n", + "\n", + "# Perform the analyze request\n", + "response = client.indices.analyze(\n", + " analyzer=request_body[\"analyzer\"], text=request_body[\"text\"]\n", + ")\n", + "\n", + "# Extract and display the analyzed tokens\n", + "tokens = [token[\"token\"] for token in response[\"tokens\"]]\n", + "print(\"Analyzed Tokens:\", tokens)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Lexical Search" + ], + "metadata": { + "id": "8G8MKcUvP0zs" + }, + "id": "8G8MKcUvP0zs" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4984f6c-ceec-46a4-b64c-f749e6b1b04f", + "metadata": { + "id": "f4984f6c-ceec-46a4-b64c-f749e6b1b04f" + }, + "outputs": [], + "source": [ + "# BM25\n", + "\n", + "response = client.search(\n", + " size=2,\n", + " index=\"ecommerce-search\",\n", + " query={\n", + " \"match\": {\n", + " \"description\": {\n", + " \"query\": \"Comfortable furniture for a large balcony\",\n", + " \"analyzer\": \"stop\",\n", + " }\n", + " }\n", + " },\n", + ")\n", + "hits = response[\"hits\"][\"hits\"]\n", + "\n", + "if not hits:\n", + " print(\"No matches found\")\n", + "else:\n", + " for hit in hits:\n", + " score = hit[\"_score\"]\n", + " product = 
hit[\"_source\"][\"product\"]\n", + " category = hit[\"_source\"][\"category\"]\n", + " description = hit[\"_source\"][\"description\"]\n", + " print(\n", + " f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Semantic Search with Dense Vector" + ], + "metadata": { + "id": "xiywcf_-P39a" + }, + "id": "xiywcf_-P39a" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72187c9a-14c1-4084-a080-4e5c1e614f22", + "metadata": { + "id": "72187c9a-14c1-4084-a080-4e5c1e614f22" + }, + "outputs": [], + "source": [ + "# KNN\n", + "\n", + "response = client.search(\n", + " index=\"ecommerce-search\",\n", + " size=2,\n", + " knn={\n", + " \"field\": \"description_vector.predicted_value\",\n", + " \"k\": 50, # Number of nearest neighbors to return as top hits.\n", + " \"num_candidates\": 500, # Number of nearest neighbor candidates to consider per shard. Increasing num_candidates tends to improve the accuracy of the final k results.\n", + " \"query_vector_builder\": { # Object indicating how to build a query_vector. 
kNN search enables you to perform semantic search by using a previously deployed text embedding model.\n", + " \"text_embedding\": {\n", + " \"model_id\": \"sentence-transformers__all-mpnet-base-v2\", # Text embedding model id\n", + " \"model_text\": \"Comfortable furniture for a large balcony\", # Query\n", + " }\n", + " },\n", + " },\n", + ")\n", + "\n", + "for hit in response[\"hits\"][\"hits\"]:\n", + "\n", + " score = hit[\"_score\"]\n", + " product = hit[\"_source\"][\"product\"]\n", + " category = hit[\"_source\"][\"category\"]\n", + " description = hit[\"_source\"][\"description\"]\n", + " print(\n", + " f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Semantic Search with Sparse Vector" + ], + "metadata": { + "id": "QlWFdngRQFbv" + }, + "id": "QlWFdngRQFbv" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c0bf5fc-ab32-4f33-8f26-904ff10635a5", + "metadata": { + "id": "2c0bf5fc-ab32-4f33-8f26-904ff10635a5" + }, + "outputs": [], + "source": [ + "# Elastic Learned Sparse Encoder - ELSER\n", + "\n", + "response = client.search(\n", + " index=\"ecommerce-search\",\n", + " size=2,\n", + " query={\n", + " \"text_expansion\": {\n", + " \"ml.tokens\": {\n", + " \"model_id\": \"elser_model\",\n", + " \"model_text\": \"Comfortable furniture for a large balcony\",\n", + " }\n", + " }\n", + " },\n", + ")\n", + "\n", + "for hit in response[\"hits\"][\"hits\"]:\n", + "\n", + " score = hit[\"_score\"]\n", + " product = hit[\"_source\"][\"product\"]\n", + " category = hit[\"_source\"][\"category\"]\n", + " description = hit[\"_source\"][\"description\"]\n", + " print(\n", + " f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Hybrid Search - BM25+KNN linear combination" + ], + "metadata": { + "id": "kz9deDBYQJxr" + }, 
+ "id": "kz9deDBYQJxr" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f84aa16b-49c5-4abf-a049-d556c225542e", + "metadata": { + "id": "f84aa16b-49c5-4abf-a049-d556c225542e" + }, + "outputs": [], + "source": [ + "# BM25 + KNN (Linear Combination)\n", + "\n", + "response = client.search(\n", + " index=\"ecommerce-search\",\n", + " size=2,\n", + " query={\n", + " \"bool\": {\n", + " \"should\": [\n", + " {\n", + " \"match\": {\n", + " \"description\": {\n", + " \"query\": \"A dining table and comfortable chairs for a large balcony\",\n", + " \"boost\": 1, # You can adjust the boost value\n", + " }\n", + " }\n", + " }\n", + " ]\n", + " }\n", + " },\n", + " knn={\n", + " \"field\": \"description_vector.predicted_value\",\n", + " \"k\": 50,\n", + " \"num_candidates\": 500,\n", + " \"boost\": 1, # You can adjust the boost value\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \"sentence-transformers__all-mpnet-base-v2\",\n", + " \"model_text\": \"A dining table and comfortable chairs for a large balcony\",\n", + " }\n", + " },\n", + " },\n", + ")\n", + "\n", + "for hit in response[\"hits\"][\"hits\"]:\n", + "\n", + " score = hit[\"_score\"]\n", + " product = hit[\"_source\"][\"product\"]\n", + " category = hit[\"_source\"][\"category\"]\n", + " description = hit[\"_source\"][\"description\"]\n", + " print(\n", + " f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Hybrid Search - BM25+KNN RRF" + ], + "metadata": { + "id": "cybkWjmpQV8g" + }, + "id": "cybkWjmpQV8g" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa2e072d-37bb-43fd-a83f-e1cb55a24861", + "metadata": { + "id": "aa2e072d-37bb-43fd-a83f-e1cb55a24861" + }, + "outputs": [], + "source": [ + "# BM25 + KNN (RRF)\n", + "# RRF functionality is in technical preview and may be changed or removed in a future release. 
The syntax will likely change before GA.\n", + "\n", + "response = client.search(\n", + " index=\"ecommerce-search\",\n", + " size=2,\n", + " query={\n", + " \"bool\": {\n", + " \"should\": [\n", + " {\n", + " \"match\": {\n", + " \"description\": {\n", + " \"query\": \"A dining table and comfortable chairs for a large balcony\"\n", + " }\n", + " }\n", + " }\n", + " ]\n", + " }\n", + " },\n", + " knn={\n", + " \"field\": \"description_vector.predicted_value\",\n", + " \"k\": 50,\n", + " \"num_candidates\": 500,\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \"sentence-transformers__all-mpnet-base-v2\",\n", + " \"model_text\": \"A dining table and comfortable chairs for a large balcony\",\n", + " }\n", + " },\n", + " },\n", + " rank={\n", + " \"rrf\": { # Reciprocal rank fusion\n", + " \"window_size\": 50, # This value determines the size of the individual result sets per query.\n", + " \"rank_constant\": 20, # This value determines how much influence documents in individual result sets per query have over the final ranked result set.\n", + " }\n", + " },\n", + ")\n", + "\n", + "for hit in response[\"hits\"][\"hits\"]:\n", + "\n", + " rank = hit[\"_rank\"]\n", + " category = hit[\"_source\"][\"category\"]\n", + " product = hit[\"_source\"][\"product\"]\n", + " description = hit[\"_source\"][\"description\"]\n", + " print(\n", + " f\"\\nRank: {rank}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Hybrid Search - BM25+ELSER linear combination" + ], + "metadata": { + "id": "LyKI2Z-XQbI6" + }, + "id": "LyKI2Z-XQbI6" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd842732-b20a-4c7a-b735-e1f558a9b922", + "metadata": { + "id": "bd842732-b20a-4c7a-b735-e1f558a9b922" + }, + "outputs": [], + "source": [ + "# BM25 + Elastic Learned Sparse Encoder (Linear Combination)\n", + "\n", + "response = client.search(\n", + " 
index=\"ecommerce-search\",\n", + " size=2,\n", + " query={\n", + " \"bool\": {\n", + " \"should\": [\n", + " {\n", + " \"match\": {\n", + " \"description\": {\n", + " \"query\": \"A dining table and comfortable chairs for a large balcony\",\n", + " \"boost\": 1, # You can adjust the boost value\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"text_expansion\": {\n", + " \"ml.tokens\": {\n", + " \"model_id\": \"elser_model\",\n", + " \"model_text\": \"A dining table and comfortable chairs for a large balcony\",\n", + " \"boost\": 1, # You can adjust the boost value\n", + " }\n", + " }\n", + " },\n", + " ]\n", + " }\n", + " },\n", + ")\n", + "\n", + "for hit in response[\"hits\"][\"hits\"]:\n", + "\n", + " score = hit[\"_score\"]\n", + " product = hit[\"_source\"][\"product\"]\n", + " category = hit[\"_source\"][\"category\"]\n", + " description = hit[\"_source\"][\"description\"]\n", + " print(\n", + " f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + }, + "colab": { + "provenance": [] + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/supporting-blog-content/multilingual-e5/multilingual-e5.ipynb b/supporting-blog-content/multilingual-e5/multilingual-e5.ipynb index 0fbbae31..21187d95 100644 --- a/supporting-blog-content/multilingual-e5/multilingual-e5.ipynb +++ b/supporting-blog-content/multilingual-e5/multilingual-e5.ipynb @@ -1,491 +1,491 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "s49gpkvZ7q53" - }, - "source": [ - "# Multilingual vector search with E5 embedding models\n", - 
"\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/main/supporting-blog-content/multilingual-e5/multilingual-e5.ipynb)\n", - "\n", - "In this example we'll use a multilingual embedding model\n", - "[multilingual-e5-base](https://huggingface.co/intfloat/multilingual-e5-base) to perform search on a toy dataset of mixed\n", - "language documents. The examples in this notebook follow the blog post of the same title: Multilingual vector search with E5 embedding models." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Y01AXpELkygt" - }, - "source": [ - "# 🧰 Requirements\n", - "\n", - "For this example, you will need:\n", - "\n", - "- An Elastic Cloud deployment with an ML node (min. 8 GB memory)\n", - " - We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html) for this example (available with a [free trial](https://cloud.elastic.co/registration?utm_source=github&utm_content=elasticsearch-labs-notebook))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "N4pI1-eIvWrI" - }, - "source": [ - "## Create Elastic Cloud deployment\n", - "\n", - "If you don't have an Elastic Cloud deployment, sign up [here](https://cloud.elastic.co/registration?utm_source=github&utm_content=elasticsearch-labs-notebook) for a free trial.\n", - "\n", - "- Go to the [Create deployment](https://cloud.elastic.co/deployments/create) page\n", - " - Select **Create deployment**\n", - " - Use the default node types for Elasticsearch and Kibana\n", - " - Add an ML node with **8 GB memory** (the multilingual E5 base model is larger than most)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gaTFHLJC-Mgi" - }, - "source": [ - "# Setup Elasticsearch environment\n", - "\n", - "To get started, we'll need to connect to our Elastic deployment using the Python client.\n", - "Because we're using an Elastic Cloud 
deployment, we'll use the **Cloud ID** to identify our deployment.\n", - "\n", - "First we need to `pip` install the packages we need for this example." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "K9Q1p2C9-wce", - "outputId": "9745cf6b-d8ae-4c85-9992-3b096645e52c" - }, - "outputs": [], - "source": [ - "!pip install elasticsearch eland[pytorch]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gEzq2Z1wBs3M" - }, - "source": [ - "Next we need to import the `elasticsearch` module and the `getpass` module.\n", - "`getpass` is part of the Python standard library and is used to securely prompt for credentials." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "uP_GTVRi-d96" - }, - "outputs": [], - "source": [ - "import getpass\n", - "\n", - "from elasticsearch import Elasticsearch" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AMSePFiZCRqX" - }, - "source": [ - "Now we can instantiate the Python Elasticsearch client.\n", - "First we prompt the user for their password and Cloud ID.\n", - "\n", - "🔐 NOTE: `getpass` enables us to securely prompt the user for credentials without echoing them to the terminal, or storing it in memory.\n", - "\n", - "Then we create a `client` object that instantiates an instance of the `Elasticsearch` class." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "h0MdAZ53CdKL", - "outputId": "eac211ff-8172-45af-898c-993f7389d557" - }, - "outputs": [], - "source": [ - "# Found in the \"Manage Deployment\" page\n", - "CLOUD_ID = getpass.getpass(\"Enter Elastic Cloud ID: \")\n", - "\n", - "# Password for the \"elastic\" user generated by Elasticsearch\n", - "ELASTIC_PASSWORD = getpass.getpass(\"Enter Elastic password: \")\n", - "\n", - "# Create the client instance\n", - "client = Elasticsearch(\n", - " cloud_id=CLOUD_ID,\n", - " basic_auth=(\"elastic\", ELASTIC_PASSWORD)\n", - ")\n", - "\n", - "client.info()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rw80JKiZVrPE" - }, - "source": [ - "# Setup emebdding model\n", - "\n", - "Next we upload the E5 multilingual embedding model into Elasticsearch and create an ingest pipeline to automatically create embeddings when ingesting documents. 
For more details on this process, please see the blog post: [How to deploy NLP: Text Embeddings and Vector Search](https://www.elastic.co/blog/how-to-deploy-nlp-text-embeddings-and-vector-search)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "I2MQhlQKVrPF", - "outputId": "fd2e7798-2be6-4e36-9999-a7a29bd1c537" - }, - "outputs": [], - "source": [ - "MODEL_ID = \"multilingual-e5-base\"\n", - "\n", - "!eland_import_hub_model \\\n", - " --cloud-id $CLOUD_ID \\\n", - " --es-username elastic \\\n", - " --es-password $ELASTIC_PASSWORD \\\n", - " --hub-model-id intfloat/$MODEL_ID \\\n", - " --es-model-id $MODEL_ID \\\n", - " --task-type text_embedding \\\n", - " --start" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "bMil5myQVrPF" - }, - "outputs": [], - "source": [ - "client.ingest.put_pipeline(id=\"pipeline\", processors=[{\n", - " \"inference\": {\n", - " \"model_id\": MODEL_ID,\n", - " \"field_map\": {\n", - " \"passage\": \"text_field\" # field to embed: passage\n", - " },\n", - " \"target_field\": \"passage_embedding\" # embedded field: passage_embedding\n", - " }\n", - "}])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TF_wxIAhD07a" - }, - "source": [ - "# Index documents\n", - "\n", - "We need to add a field to support dense vector storage and search.\n", - "Note the `passage_embedding.predicted_value` field below, which is used to store the dense vector representation of the `passage` field, and will be automatically populated by the inference processor in the pipeline created above. The `passage_embedding` field will also store metadata from the inference process." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cvYECABJJs_2" - }, - "outputs": [], - "source": [ - "# Define the mapping and settings\n", - "mapping = {\n", - " \"properties\": {\n", - " \"id\": { \"type\": \"keyword\" },\n", - " \"language\": { \"type\": \"keyword\" },\n", - " \"passage\": { \"type\": \"text\" },\n", - " \"passage_embedding.predicted_value\": {\n", - " \"type\": \"dense_vector\",\n", - " \"dims\": 768,\n", - " \"index\": \"true\",\n", - " \"similarity\": \"cosine\"\n", - " }\n", - " },\n", - " \"_source\": {\n", - " \"excludes\": [\n", - " \"passage_embedding.predicted_value\"\n", - " ]\n", - " }\n", - "}\n", - "\n", - "settings = {\n", - " \"index\": {\n", - " \"number_of_replicas\": \"1\",\n", - " \"number_of_shards\": \"1\",\n", - " \"default_pipeline\": \"pipeline\"\n", - " }\n", - "}\n", - "\n", - "# Create the index (deleting any existing index)\n", - "client.indices.delete(index=\"passages\", ignore_unavailable=True)\n", - "client.indices.create(index=\"passages\", mappings=mapping, settings=settings)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_QUZr2gwVrPF" - }, - "source": [ - "Now that we have the pipeline and mappings ready, we can index our documents. This is of course just a demo so we only index the few toy examples from the blog post." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "48GUdoH1VrPF" - }, - "outputs": [], - "source": [ - "passages = [\n", - " {\n", - " \"id\": \"doc1\",\n", - " \"language\": \"en\",\n", - " \"passage\": \"\"\"I sat on the bank of the river today.\"\"\"\n", - " },\n", - " {\n", - " \"id\": \"doc2\",\n", - " \"language\": \"de\",\n", - " \"passage\": \"\"\"Ich bin heute zum Flussufer gegangen.\"\"\"\n", - " },\n", - " {\n", - " \"id\": \"doc3\",\n", - " \"language\": \"en\",\n", - " \"passage\": \"\"\"I walked to the bank today to deposit money.\"\"\"\n", - " },\n", - " {\n", - " \"id\": \"doc4\",\n", - " \"language\": \"de\",\n", - " \"passage\": \"\"\"Ich saß heute bei der Bank und wartete auf mein Geld.\"\"\"\n", - " }\n", - "]\n", - "\n", - "# Index passages, adding first the \"passage: \" instruction for E5\n", - "for doc in passages:\n", - " doc[\"passage\"] = f\"\"\"passage: {doc[\"passage\"]}\"\"\"\n", - " client.index(index=\"passages\", document=doc)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MrBCHdH1u8Wd" - }, - "source": [ - "# Multilingual semantic search" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "s49gpkvZ7q53" + }, + "source": [ + "# Multilingual vector search with E5 embedding models\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/main/supporting-blog-content/multilingual-e5/multilingual-e5.ipynb)\n", + "\n", + "In this example we'll use a multilingual embedding model\n", + "[multilingual-e5-base](https://huggingface.co/intfloat/multilingual-e5-base) to perform search on a toy dataset of mixed\n", + "language documents. The examples in this notebook follow the blog post of the same title: Multilingual vector search with E5 embedding models." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Y01AXpELkygt" + }, + "source": [ + "# 🧰 Requirements\n", + "\n", + "For this example, you will need:\n", + "\n", + "- An Elastic Cloud deployment with an ML node (min. 8 GB memory)\n", + " - We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html) for this example (available with a [free trial](https://cloud.elastic.co/registration?utm_source=github&utm_content=elasticsearch-labs-notebook))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "N4pI1-eIvWrI" + }, + "source": [ + "## Create Elastic Cloud deployment\n", + "\n", + "If you don't have an Elastic Cloud deployment, sign up [here](https://cloud.elastic.co/registration?utm_source=github&utm_content=elasticsearch-labs-notebook) for a free trial.\n", + "\n", + "- Go to the [Create deployment](https://cloud.elastic.co/deployments/create) page\n", + " - Select **Create deployment**\n", + " - Use the default node types for Elasticsearch and Kibana\n", + " - Add an ML node with **8 GB memory** (the multilingual E5 base model is larger than most)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gaTFHLJC-Mgi" + }, + "source": [ + "# Setup Elasticsearch environment\n", + "\n", + "To get started, we'll need to connect to our Elastic deployment using the Python client.\n", + "Because we're using an Elastic Cloud deployment, we'll use the **Cloud ID** to identify our deployment.\n", + "\n", + "First we need to `pip` install the packages we need for this example." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "f2fHLYxnVrPF" - }, - "outputs": [], - "source": [ - "def query(q):\n", - " \"\"\"Query with embeddings, adding first the \"query: \" instruction for E5.\"\"\"\n", - "\n", - " return client.search(index=\"passages\", knn={\n", - " \"field\": \"passage_embedding.predicted_value\",\n", - " \"query_vector_builder\": {\n", - " \"text_embedding\": {\n", - " \"model_id\": MODEL_ID,\n", - " \"model_text\": f\"query: {q}\",\n", - " }\n", - " },\n", - " \"k\": 2, # for the demo, we're always just searching for pairs of passages\n", - " \"num_candidates\": 5\n", - " })\n", - "\n", - "def pretty_response(response):\n", - " \"\"\"Pretty print search responses.\"\"\"\n", - "\n", - " for hit in response[\"hits\"][\"hits\"]:\n", - " score = hit[\"_score\"]\n", - " id = hit[\"_source\"][\"id\"]\n", - " language = hit[\"_source\"][\"language\"]\n", - " passage = hit[\"_source\"][\"passage\"]\n", - " print()\n", - " print(f\"ID: {id}\")\n", - " print(f\"Language: {language}\")\n", - " print(f\"Passage: {passage}\")\n", - " print(f\"Score: {score}\")" - ] + "id": "K9Q1p2C9-wce", + "outputId": "9745cf6b-d8ae-4c85-9992-3b096645e52c" + }, + "outputs": [], + "source": [ + "!pip install elasticsearch eland[pytorch]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gEzq2Z1wBs3M" + }, + "source": [ + "Next we need to import the `elasticsearch` module and the `getpass` module.\n", + "`getpass` is part of the Python standard library and is used to securely prompt for credentials." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "uP_GTVRi-d96" + }, + "outputs": [], + "source": [ + "import getpass\n", + "\n", + "from elasticsearch import Elasticsearch" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AMSePFiZCRqX" + }, + "source": [ + "Now we can instantiate the Python Elasticsearch client.\n", + "First we prompt the user for their password and Cloud ID.\n", + "\n", + "🔐 NOTE: `getpass` enables us to securely prompt the user for credentials without echoing them to the terminal, or storing it in memory.\n", + "\n", + "Then we create a `client` object that instantiates an instance of the `Elasticsearch` class." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "e6ssY6dfVrPG", - "outputId": "01625e8c-bef8-485e-e5b8-118f7386d79a" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "ID: doc1\n", - "Language: en\n", - "Passage: passage: I sat on the bank of the river today.\n", - "Score: 0.88001645\n", - "\n", - "ID: doc2\n", - "Language: de\n", - "Passage: passage: Ich bin heute zum Flussufer gegangen.\n", - "Score: 0.87662137\n" - ] - } - ], - "source": [ - "# Example 1\n", - "pretty_response(query(\"riverside\"))" - ] + "id": "h0MdAZ53CdKL", + "outputId": "eac211ff-8172-45af-898c-993f7389d557" + }, + "outputs": [], + "source": [ + "# Found in the \"Manage Deployment\" page\n", + "CLOUD_ID = getpass.getpass(\"Enter Elastic Cloud ID: \")\n", + "\n", + "# Password for the \"elastic\" user generated by Elasticsearch\n", + "ELASTIC_PASSWORD = getpass.getpass(\"Enter Elastic password: \")\n", + "\n", + "# Create the client instance\n", + "client = Elasticsearch(cloud_id=CLOUD_ID, basic_auth=(\"elastic\", ELASTIC_PASSWORD))\n", + "\n", + "client.info()" + ] + }, + { + "cell_type": "markdown", + 
"metadata": { + "id": "rw80JKiZVrPE" + }, + "source": [ + "# Setup emebdding model\n", + "\n", + "Next we upload the E5 multilingual embedding model into Elasticsearch and create an ingest pipeline to automatically create embeddings when ingesting documents. For more details on this process, please see the blog post: [How to deploy NLP: Text Embeddings and Vector Search](https://www.elastic.co/blog/how-to-deploy-nlp-text-embeddings-and-vector-search)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "I2MQhlQKVrPF", + "outputId": "fd2e7798-2be6-4e36-9999-a7a29bd1c537" + }, + "outputs": [], + "source": [ + "MODEL_ID = \"multilingual-e5-base\"\n", + "\n", + "!eland_import_hub_model \\\n", + " --cloud-id $CLOUD_ID \\\n", + " --es-username elastic \\\n", + " --es-password $ELASTIC_PASSWORD \\\n", + " --hub-model-id intfloat/$MODEL_ID \\\n", + " --es-model-id $MODEL_ID \\\n", + " --task-type text_embedding \\\n", + " --start" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bMil5myQVrPF" + }, + "outputs": [], + "source": [ + "client.ingest.put_pipeline(\n", + " id=\"pipeline\",\n", + " processors=[\n", + " {\n", + " \"inference\": {\n", + " \"model_id\": MODEL_ID,\n", + " \"field_map\": {\"passage\": \"text_field\"}, # field to embed: passage\n", + " \"target_field\": \"passage_embedding\", # embedded field: passage_embedding\n", + " }\n", + " }\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TF_wxIAhD07a" + }, + "source": [ + "# Index documents\n", + "\n", + "We need to add a field to support dense vector storage and search.\n", + "Note the `passage_embedding.predicted_value` field below, which is used to store the dense vector representation of the `passage` field, and will be automatically populated by the inference processor in the pipeline created above. 
The `passage_embedding` field will also store metadata from the inference process." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cvYECABJJs_2" + }, + "outputs": [], + "source": [ + "# Define the mapping and settings\n", + "mapping = {\n", + " \"properties\": {\n", + " \"id\": {\"type\": \"keyword\"},\n", + " \"language\": {\"type\": \"keyword\"},\n", + " \"passage\": {\"type\": \"text\"},\n", + " \"passage_embedding.predicted_value\": {\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 768,\n", + " \"index\": \"true\",\n", + " \"similarity\": \"cosine\",\n", + " },\n", + " },\n", + " \"_source\": {\"excludes\": [\"passage_embedding.predicted_value\"]},\n", + "}\n", + "\n", + "settings = {\n", + " \"index\": {\n", + " \"number_of_replicas\": \"1\",\n", + " \"number_of_shards\": \"1\",\n", + " \"default_pipeline\": \"pipeline\",\n", + " }\n", + "}\n", + "\n", + "# Create the index (deleting any existing index)\n", + "client.indices.delete(index=\"passages\", ignore_unavailable=True)\n", + "client.indices.create(index=\"passages\", mappings=mapping, settings=settings)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_QUZr2gwVrPF" + }, + "source": [ + "Now that we have the pipeline and mappings ready, we can index our documents. This is of course just a demo so we only index the few toy examples from the blog post." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "48GUdoH1VrPF" + }, + "outputs": [], + "source": [ + "passages = [\n", + " {\n", + " \"id\": \"doc1\",\n", + " \"language\": \"en\",\n", + " \"passage\": \"\"\"I sat on the bank of the river today.\"\"\",\n", + " },\n", + " {\n", + " \"id\": \"doc2\",\n", + " \"language\": \"de\",\n", + " \"passage\": \"\"\"Ich bin heute zum Flussufer gegangen.\"\"\",\n", + " },\n", + " {\n", + " \"id\": \"doc3\",\n", + " \"language\": \"en\",\n", + " \"passage\": \"\"\"I walked to the bank today to deposit money.\"\"\",\n", + " },\n", + " {\n", + " \"id\": \"doc4\",\n", + " \"language\": \"de\",\n", + " \"passage\": \"\"\"Ich saß heute bei der Bank und wartete auf mein Geld.\"\"\",\n", + " },\n", + "]\n", + "\n", + "# Index passages, adding first the \"passage: \" instruction for E5\n", + "for doc in passages:\n", + " doc[\"passage\"] = f\"\"\"passage: {doc[\"passage\"]}\"\"\"\n", + " client.index(index=\"passages\", document=doc)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MrBCHdH1u8Wd" + }, + "source": [ + "# Multilingual semantic search" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "f2fHLYxnVrPF" + }, + "outputs": [], + "source": [ + "def query(q):\n", + " \"\"\"Query with embeddings, adding first the \"query: \" instruction for E5.\"\"\"\n", + "\n", + " return client.search(\n", + " index=\"passages\",\n", + " knn={\n", + " \"field\": \"passage_embedding.predicted_value\",\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": MODEL_ID,\n", + " \"model_text\": f\"query: {q}\",\n", + " }\n", + " },\n", + " \"k\": 2, # for the demo, we're always just searching for pairs of passages\n", + " \"num_candidates\": 5,\n", + " },\n", + " )\n", + "\n", + "\n", + "def pretty_response(response):\n", + " \"\"\"Pretty print search responses.\"\"\"\n", + "\n", + " for hit in response[\"hits\"][\"hits\"]:\n", + 
" score = hit[\"_score\"]\n", + " id = hit[\"_source\"][\"id\"]\n", + " language = hit[\"_source\"][\"language\"]\n", + " passage = hit[\"_source\"][\"passage\"]\n", + " print()\n", + " print(f\"ID: {id}\")\n", + " print(f\"Language: {language}\")\n", + " print(f\"Passage: {passage}\")\n", + " print(f\"Score: {score}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "e6ssY6dfVrPG", + "outputId": "01625e8c-bef8-485e-e5b8-118f7386d79a" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ifFbSGkSVrPG", - "outputId": "a14f0ff6-62a2-4ed2-cec3-2dc2bb22e969" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "ID: doc4\n", - "Language: de\n", - "Passage: passage: Ich saß heute bei der Bank und wartete auf mein Geld.\n", - "Score: 0.8967148\n", - "\n", - "ID: doc3\n", - "Language: en\n", - "Passage: passage: I walked to the bank today to deposit money.\n", - "Score: 0.8863925\n" - ] - } - ], - "source": [ - "# Example 2\n", - "pretty_response(query(\"Geldautomat\"))" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "ID: doc1\n", + "Language: en\n", + "Passage: passage: I sat on the bank of the river today.\n", + "Score: 0.88001645\n", + "\n", + "ID: doc2\n", + "Language: de\n", + "Passage: passage: Ich bin heute zum Flussufer gegangen.\n", + "Score: 0.87662137\n" + ] + } + ], + "source": [ + "# Example 1\n", + "pretty_response(query(\"riverside\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ifFbSGkSVrPG", + "outputId": "a14f0ff6-62a2-4ed2-cec3-2dc2bb22e969" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "YqcLx5fSVrPH", - "outputId": "5a0e2b19-24dd-4ee6-c887-fad26cb24538" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "ID: doc3\n", - "Language: en\n", - "Passage: passage: I walked 
to the bank today to deposit money.\n", - "Score: 0.87475425\n", - "\n", - "ID: doc2\n", - "Language: de\n", - "Passage: passage: Ich bin heute zum Flussufer gegangen.\n", - "Score: 0.8741033\n" - ] - } - ], - "source": [ - "# Example 3a\n", - "pretty_response(query(\"movement\"))" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "ID: doc4\n", + "Language: de\n", + "Passage: passage: Ich saß heute bei der Bank und wartete auf mein Geld.\n", + "Score: 0.8967148\n", + "\n", + "ID: doc3\n", + "Language: en\n", + "Passage: passage: I walked to the bank today to deposit money.\n", + "Score: 0.8863925\n" + ] + } + ], + "source": [ + "# Example 2\n", + "pretty_response(query(\"Geldautomat\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YqcLx5fSVrPH", + "outputId": "5a0e2b19-24dd-4ee6-c887-fad26cb24538" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SoIXRY-jVrPH", - "outputId": "2285cd2f-7d79-4553-dbea-dc8844841622" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "ID: doc4\n", - "Language: de\n", - "Passage: passage: Ich saß heute bei der Bank und wartete auf mein Geld.\n", - "Score: 0.85991657\n", - "\n", - "ID: doc1\n", - "Language: en\n", - "Passage: passage: I sat on the bank of the river today.\n", - "Score: 0.8561436\n" - ] - } - ], - "source": [ - "# Example 3b\n", - "pretty_response(query(\"stillness\"))" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "ID: doc3\n", + "Language: en\n", + "Passage: passage: I walked to the bank today to deposit money.\n", + "Score: 0.87475425\n", + "\n", + "ID: doc2\n", + "Language: de\n", + "Passage: passage: Ich bin heute zum Flussufer gegangen.\n", + "Score: 0.8741033\n" + ] } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3.11.4 64-bit", - "language": "python", - "name": "python3" - 
}, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - }, - "vscode": { - "interpreter": { - "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" - } + ], + "source": [ + "# Example 3a\n", + "pretty_response(query(\"movement\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SoIXRY-jVrPH", + "outputId": "2285cd2f-7d79-4553-dbea-dc8844841622" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "ID: doc4\n", + "Language: de\n", + "Passage: passage: Ich saß heute bei der Bank und wartete auf mein Geld.\n", + "Score: 0.85991657\n", + "\n", + "ID: doc1\n", + "Language: en\n", + "Passage: passage: I sat on the bank of the river today.\n", + "Score: 0.8561436\n" + ] } + ], + "source": [ + "# Example 3b\n", + "pretty_response(query(\"stillness\"))" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3.11.4 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" }, - "nbformat": 4, - "nbformat_minor": 0 + "vscode": { + "interpreter": { + "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/supporting-blog-content/music-search/elastic_music_search.ipynb b/supporting-blog-content/music-search/elastic_music_search.ipynb index 5373e570..5c8ed02c 100644 --- a/supporting-blog-content/music-search/elastic_music_search.ipynb +++ 
b/supporting-blog-content/music-search/elastic_music_search.ipynb @@ -1,736 +1,722 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "n6jBQq5ePGYS" - }, - "source": [ - "# Humming search\n", - "# Author: Alex Salgado\n", - "## Obs: Code from blog 'Searching by Music: Leveraging Vector Search for Music Information Retrieval'" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WVB15gN4cvUP" - }, - "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/salgado/music-search/blob/main/elastic_music_search.ipynb)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "pZwFi6xBcvUP", - "outputId": "cdc5030f-974f-42f9-f6fd-633b225b5716", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Cloning into 'music-search'...\n", - "remote: Enumerating objects: 40, done.\u001b[K\n", - "remote: Counting objects: 100% (40/40), done.\u001b[K\n", - "remote: Compressing objects: 100% (36/36), done.\u001b[K\n", - "remote: Total 40 (delta 10), reused 27 (delta 4), pack-reused 0\u001b[K\n", - "Receiving objects: 100% (40/40), 8.38 MiB | 27.51 MiB/s, done.\n", - "Resolving deltas: 100% (10/10), done.\n" - ] - } - ], - "source": [ - "!git clone https://github.com/salgado/music-search.git" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "_kHUvq5UBjsa", - "outputId": "27bdfccf-9b4e-4bc5-d1d7-ce7fd585474d" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting elasticsearch\n", - " Downloading elasticsearch-8.9.0-py3-none-any.whl (395 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m395.5/395.5 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - 
"\u001b[?25hCollecting elastic-transport<9,>=8 (from elasticsearch)\n", - " Downloading elastic_transport-8.4.0-py3-none-any.whl (59 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.5/59.5 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting urllib3<2,>=1.26.2 (from elastic-transport<9,>=8->elasticsearch)\n", - " Downloading urllib3-1.26.16-py2.py3-none-any.whl (143 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.1/143.1 kB\u001b[0m \u001b[31m15.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8->elasticsearch) (2023.7.22)\n", - "Installing collected packages: urllib3, elastic-transport, elasticsearch\n", - " Attempting uninstall: urllib3\n", - " Found existing installation: urllib3 2.0.4\n", - " Uninstalling urllib3-2.0.4:\n", - " Successfully uninstalled urllib3-2.0.4\n", - "Successfully installed elastic-transport-8.4.0 elasticsearch-8.9.0 urllib3-1.26.16\n" - ] - } - ], - "source": [ - "!pip install elasticsearch" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "SBB742d6FbqL", - "outputId": "02d87089-906b-472e-b756-19080b37e2c9" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Enter Elastic Cloud ID: ··········\n", - "Enter cluster username: ··········\n", - "Enter cluster password: ··········\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "ObjectApiResponse({'name': 'instance-0000000007', 'cluster_name': 'df2380a9e6b0425f9d4bc01639e59cf5', 'cluster_uuid': 'FvCTlZHYQqasErU8cbn4_A', 'version': {'number': '8.8.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'f8edfccba429b6477927a7c1ce1bc6729521305e', 'build_date': 
'2023-06-05T21:32:25.188464208Z', 'build_snapshot': False, 'lucene_version': '9.6.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})" - ] - }, - "metadata": {}, - "execution_count": 3 - } - ], - "source": [ - "#index data in elasticsearch\n", - "from elasticsearch import Elasticsearch\n", - "import getpass\n", - "\n", - "es_cloud_id = getpass.getpass('Enter Elastic Cloud ID: ')\n", - "es_user = getpass.getpass('Enter cluster username: ')\n", - "es_pass = getpass.getpass('Enter cluster password: ')\n", - "es = Elasticsearch(cloud_id=es_cloud_id,\n", - " basic_auth=(es_user, es_pass)\n", - " )\n", - "es.info() # should return cluster info" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "e55hlxlCFePb" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CpDhPsFsFhY4" - }, - "source": [ - "## create index" - ] - }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "n6jBQq5ePGYS" + }, + "source": [ + "# Humming search\n", + "# Author: Alex Salgado\n", + "## Obs: Code from blog 'Searching by Music: Leveraging Vector Search for Music Information Retrieval'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WVB15gN4cvUP" + }, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/salgado/music-search/blob/main/elastic_music_search.ipynb)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "pZwFi6xBcvUP", + "outputId": "cdc5030f-974f-42f9-f6fd-633b225b5716", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "JBQgk_q6cvUR" - }, - "outputs": [], - "source": [ - "index_name = \"my-audio-index\"\n" - ] + "output_type": "stream", + "name": "stdout", + "text": [ + 
"Cloning into 'music-search'...\n", + "remote: Enumerating objects: 40, done.\u001b[K\n", + "remote: Counting objects: 100% (40/40), done.\u001b[K\n", + "remote: Compressing objects: 100% (36/36), done.\u001b[K\n", + "remote: Total 40 (delta 10), reused 27 (delta 4), pack-reused 0\u001b[K\n", + "Receiving objects: 100% (40/40), 8.38 MiB | 27.51 MiB/s, done.\n", + "Resolving deltas: 100% (10/10), done.\n" + ] + } + ], + "source": [ + "!git clone https://github.com/salgado/music-search.git" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "_kHUvq5UBjsa", + "outputId": "27bdfccf-9b4e-4bc5-d1d7-ce7fd585474d" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "inRCoHOMBe7r", - "outputId": "f5481309-8b10-4780-a0b0-e71fd89c71ad" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "index created: {'acknowledged': True, 'shards_acknowledged': True, 'index': 'my-audio-index'}\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - ":41: DeprecationWarning: Passing transport options in the API method is deprecated. Use 'Elasticsearch.options()' instead.\n", - " index_creation = es.indices.create(index=index_name, ignore=400, body=index_config)\n", - ":41: DeprecationWarning: The 'body' parameter is deprecated and will be removed in a future version. 
Instead use individual parameters.\n", - " index_creation = es.indices.create(index=index_name, ignore=400, body=index_config)\n" - ] - } - ], - "source": [ - "from elasticsearch import Elasticsearch\n", - "\n", - "\n", - "# Specify index configuration\n", - "index_config = {\n", - " \"mappings\": {\n", - " \"_source\": {\n", - " \"excludes\": [\"audio-embedding\"]\n", - " },\n", - " \"properties\": {\n", - " \"audio-embedding\": {\n", - " \"type\": \"dense_vector\",\n", - " \"dims\": 2048,\n", - " \"index\": True,\n", - " \"similarity\": \"cosine\"\n", - " },\n", - " \"path\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\n", - " \"keyword\": {\n", - " \"type\": \"keyword\",\n", - " \"ignore_above\": 256\n", - " }\n", - " }\n", - " },\n", - " \"timestamp\": {\n", - " \"type\": \"date\"\n", - " },\n", - " \"title\": {\n", - " \"type\": \"text\"\n", - " },\n", - " \"genre\": {\n", - " \"type\": \"text\"\n", - " }\n", - " }\n", - " }\n", - "}\n", - "\n", - "# Create index\n", - "if not es.indices.exists(index=index_name):\n", - " index_creation = es.indices.create(index=index_name, ignore=400, body=index_config)\n", - " print(\"index created: \", index_creation)\n", - "else:\n", - " print(\"Index already exists.\")\n" - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting elasticsearch\n", + " Downloading elasticsearch-8.9.0-py3-none-any.whl (395 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m395.5/395.5 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting elastic-transport<9,>=8 (from elasticsearch)\n", + " Downloading elastic_transport-8.4.0-py3-none-any.whl (59 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.5/59.5 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting urllib3<2,>=1.26.2 (from elastic-transport<9,>=8->elasticsearch)\n", + " Downloading 
urllib3-1.26.16-py2.py3-none-any.whl (143 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.1/143.1 kB\u001b[0m \u001b[31m15.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8->elasticsearch) (2023.7.22)\n", + "Installing collected packages: urllib3, elastic-transport, elasticsearch\n", + " Attempting uninstall: urllib3\n", + " Found existing installation: urllib3 2.0.4\n", + " Uninstalling urllib3-2.0.4:\n", + " Successfully uninstalled urllib3-2.0.4\n", + "Successfully installed elastic-transport-8.4.0 elasticsearch-8.9.0 urllib3-1.26.16\n" + ] + } + ], + "source": [ + "!pip install elasticsearch" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "SBB742d6FbqL", + "outputId": "02d87089-906b-472e-b756-19080b37e2c9" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "oxJdxHLFF1QX", - "outputId": "0cabd7eb-ff9b-43e9-9dfa-3cd6f951a044" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "mozart_symphony25_guitar-solo\n", - "mozart_symphony25_string-quartet\n", - "mozart_symphony25_jazz-with-saxophone\n", - "bella_ciao_jazz-with-saxophone\n", - "bella_ciao_guitar-solo\n", - "mozart_symphony25_opera-singer\n", - "bella_ciao_string-quartet\n", - "mozart_symphony25_electronic-synth-lead\n", - "a-cappella-chorus\n", - "bella_ciao_piano-solo\n", - "bella_ciao_electronic-synth-lead\n", - "bella_ciao_humming\n", - "mozart_symphony25_tribal-drums-and-flute\n", - "bella_ciao_opera-singer\n", - "mozart_symphony25_piano-solo\n", - "bella_ciao_a-cappella-chorus\n", - "mozart_symphony25_prompt\n", - "bella_ciao_tribal-drums-and-flute\n" - ] - } - ], - "source": [ - "import os\n", - "\n", - "def 
list_audio_files(directory):\n", - " # The list to store the names of .wav files\n", - " audio_files = []\n", - "\n", - " # Check if the path exists\n", - " if os.path.exists(directory):\n", - " # Walk the directory\n", - " for root, dirs, files in os.walk(directory):\n", - " for file in files:\n", - " # Check if the file is a .wav file\n", - " if file.endswith('.wav'):\n", - " # Extract the filename from the path\n", - " filename = os.path.splitext(file)[0]\n", - " print(filename)\n", - "\n", - " # Add the file to the list\n", - " audio_files.append(file)\n", - " else:\n", - " print(f\"The directory '{directory}' does not exist.\")\n", - "\n", - " # Return the list of .mp3 files\n", - " return audio_files\n", - "\n", - "# Use the function\n", - "audio_path = \"/content/music-search/dataset/\"\n", - "audio_files = list_audio_files(audio_path)\n", - "\n", - "\n" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Enter Elastic Cloud ID: ··········\n", + "Enter cluster username: ··········\n", + "Enter cluster password: ··········\n" + ] }, { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "fCQYEOQDAXVm" - }, - "outputs": [], - "source": [ - "!pip install -qU panns-inference librosa" + "output_type": "execute_result", + "data": { + "text/plain": [ + "ObjectApiResponse({'name': 'instance-0000000007', 'cluster_name': 'df2380a9e6b0425f9d4bc01639e59cf5', 'cluster_uuid': 'FvCTlZHYQqasErU8cbn4_A', 'version': {'number': '8.8.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'f8edfccba429b6477927a7c1ce1bc6729521305e', 'build_date': '2023-06-05T21:32:25.188464208Z', 'build_snapshot': False, 'lucene_version': '9.6.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})" ] + }, + "metadata": {}, + "execution_count": 3 + } + ], + "source": [ + "# index data in elasticsearch\n", + "from elasticsearch import Elasticsearch\n", + "import 
getpass\n", + "\n", + "es_cloud_id = getpass.getpass(\"Enter Elastic Cloud ID: \")\n", + "es_user = getpass.getpass(\"Enter cluster username: \")\n", + "es_pass = getpass.getpass(\"Enter cluster password: \")\n", + "es = Elasticsearch(cloud_id=es_cloud_id, basic_auth=(es_user, es_pass))\n", + "es.info() # should return cluster info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "e55hlxlCFePb" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CpDhPsFsFhY4" + }, + "source": [ + "## create index" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "JBQgk_q6cvUR" + }, + "outputs": [], + "source": [ + "index_name = \"my-audio-index\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "inRCoHOMBe7r", + "outputId": "f5481309-8b10-4780-a0b0-e71fd89c71ad" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "sCI7uIqXCmsZ", - "outputId": "9aa4f3e2-c0bd-49da-9697-fa06bd909c61" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Checkpoint path: /root/panns_data/Cnn14_mAP=0.431.pth\n", - "Using CPU.\n" - ] - } - ], - "source": [ - "from panns_inference import AudioTagging\n", - "\n", - "# load the default model into the gpu.\n", - "model = AudioTagging(checkpoint_path=None, device='cuda') # change device to cpu if a gpu is not available" - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "index created: {'acknowledged': True, 'shards_acknowledged': True, 'index': 'my-audio-index'}\n" + ] }, { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "IiUwmLueCqIf" - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import librosa\n", - "\n", - "# Function to normalize a vector. 
Normalizing a vector means adjusting the values measured in different scales to a common scale.\n", - "def normalize(v):\n", - " # np.linalg.norm computes the vector's norm (magnitude). The norm is the total length of all vectors in a space.\n", - " norm = np.linalg.norm(v)\n", - " if norm == 0:\n", - " return v\n", - "\n", - " # Return the normalized vector.\n", - " return v / norm\n", - "\n", - "# Function to get an embedding of an audio file. An embedding is a reduced-dimensionality representation of the file.\n", - "def get_embedding (audio_file):\n", - "\n", - " # Load the audio file using librosa's load function, which returns an audio time series and its corresponding sample rate.\n", - " a, _ = librosa.load(audio_file, sr=44100)\n", - "\n", - " # Reshape the audio time series to have an extra dimension, which is required by the model's inference function.\n", - " query_audio = a[None, :]\n", - "\n", - " # Perform inference on the reshaped audio using the model. This returns an embedding of the audio.\n", - " _, emb = model.inference(query_audio)\n", - "\n", - " # Normalize the embedding. This scales the embedding to have a length (magnitude) of 1, while maintaining its direction.\n", - " normalized_v = normalize(emb[0])\n", - "\n", - " # Return the normalized embedding required for dot_product elastic similarity dense vector\n", - " return normalized_v" - ] + "output_type": "stream", + "name": "stderr", + "text": [ + ":41: DeprecationWarning: Passing transport options in the API method is deprecated. Use 'Elasticsearch.options()' instead.\n", + " index_creation = es.indices.create(index=index_name, ignore=400, body=index_config)\n", + ":41: DeprecationWarning: The 'body' parameter is deprecated and will be removed in a future version. 
Instead use individual parameters.\n", + " index_creation = es.indices.create(index=index_name, ignore=400, body=index_config)\n" + ] + } + ], + "source": [ + "from elasticsearch import Elasticsearch\n", + "\n", + "\n", + "# Specify index configuration\n", + "index_config = {\n", + " \"mappings\": {\n", + " \"_source\": {\"excludes\": [\"audio-embedding\"]},\n", + " \"properties\": {\n", + " \"audio-embedding\": {\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 2048,\n", + " \"index\": True,\n", + " \"similarity\": \"cosine\",\n", + " },\n", + " \"path\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", + " },\n", + " \"timestamp\": {\"type\": \"date\"},\n", + " \"title\": {\"type\": \"text\"},\n", + " \"genre\": {\"type\": \"text\"},\n", + " },\n", + " }\n", + "}\n", + "\n", + "# Create index\n", + "if not es.indices.exists(index=index_name):\n", + " index_creation = es.indices.create(index=index_name, ignore=400, body=index_config)\n", + " print(\"index created: \", index_creation)\n", + "else:\n", + " print(\"Index already exists.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "oxJdxHLFF1QX", + "outputId": "0cabd7eb-ff9b-43e9-9dfa-3cd6f951a044" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "PvUiFaqIJPE2" - }, - "outputs": [], - "source": [ - "from datetime import datetime\n", - "\n", - "#Storing Songs in Elasticsearch with Vector Embeddings:\n", - "def store_in_elasticsearch(song, embedding, path, index_name, genre, vec_field):\n", - " body = {\n", - " 'audio-embedding' : embedding,\n", - " 'title': song,\n", - " 'timestamp': datetime.now(),\n", - " 'path' : path,\n", - " 'genre' : genre\n", - "\n", - " }\n", - "\n", - " es.index(index=index_name, document=body)\n", - " print (\"stored...\",song, embedding, path, genre, index_name)\n", - "\n" - ] 
+ "output_type": "stream", + "name": "stdout", + "text": [ + "mozart_symphony25_guitar-solo\n", + "mozart_symphony25_string-quartet\n", + "mozart_symphony25_jazz-with-saxophone\n", + "bella_ciao_jazz-with-saxophone\n", + "bella_ciao_guitar-solo\n", + "mozart_symphony25_opera-singer\n", + "bella_ciao_string-quartet\n", + "mozart_symphony25_electronic-synth-lead\n", + "a-cappella-chorus\n", + "bella_ciao_piano-solo\n", + "bella_ciao_electronic-synth-lead\n", + "bella_ciao_humming\n", + "mozart_symphony25_tribal-drums-and-flute\n", + "bella_ciao_opera-singer\n", + "mozart_symphony25_piano-solo\n", + "bella_ciao_a-cappella-chorus\n", + "mozart_symphony25_prompt\n", + "bella_ciao_tribal-drums-and-flute\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "\n", + "def list_audio_files(directory):\n", + " # The list to store the names of .wav files\n", + " audio_files = []\n", + "\n", + " # Check if the path exists\n", + " if os.path.exists(directory):\n", + " # Walk the directory\n", + " for root, dirs, files in os.walk(directory):\n", + " for file in files:\n", + " # Check if the file is a .wav file\n", + " if file.endswith(\".wav\"):\n", + " # Extract the filename from the path\n", + " filename = os.path.splitext(file)[0]\n", + " print(filename)\n", + "\n", + " # Add the file to the list\n", + " audio_files.append(file)\n", + " else:\n", + " print(f\"The directory '{directory}' does not exist.\")\n", + "\n", + " # Return the list of .mp3 files\n", + " return audio_files\n", + "\n", + "\n", + "# Use the function\n", + "audio_path = \"/content/music-search/dataset/\"\n", + "audio_files = list_audio_files(audio_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "fCQYEOQDAXVm" + }, + "outputs": [], + "source": [ + "!pip install -qU panns-inference librosa" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "sCI7uIqXCmsZ", + "outputId": 
"9aa4f3e2-c0bd-49da-9697-fa06bd909c61" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "N2m1ViF3Cw7t", - "outputId": "72e69712-60af-4a5e-c4b3-21b9e754401b" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "stored... mozart_symphony25_guitar-solo.wav [0. 0.03304292 0. ... 0.00757442 0. 0. ] /content/music-search/dataset//mozart_symphony25_guitar-solo.wav guitar my-audio-index\n", - "stored... mozart_symphony25_string-quartet.wav [0. 0. 0. ... 0.02552068 0. 0. ] /content/music-search/dataset//mozart_symphony25_string-quartet.wav string my-audio-index\n", - "stored... mozart_symphony25_jazz-with-saxophone.wav [0. 0. 0. ... 0.0032542 0. 0. ] /content/music-search/dataset//mozart_symphony25_jazz-with-saxophone.wav jazz my-audio-index\n", - "stored... bella_ciao_jazz-with-saxophone.wav [0. 0.00063776 0. ... 0.00766881 0.00809752 0. ] /content/music-search/dataset//bella_ciao_jazz-with-saxophone.wav jazz my-audio-index\n", - "stored... bella_ciao_guitar-solo.wav [0. 0.0221919 0. ... 0.00023983 0. 0. ] /content/music-search/dataset//bella_ciao_guitar-solo.wav guitar my-audio-index\n", - "stored... mozart_symphony25_opera-singer.wav [0. 0. 0. ... 0.02008585 0. 0. ] /content/music-search/dataset//mozart_symphony25_opera-singer.wav opera my-audio-index\n", - "stored... bella_ciao_string-quartet.wav [0. 0. 0. ... 0.02983136 0. 0. ] /content/music-search/dataset//bella_ciao_string-quartet.wav string my-audio-index\n", - "stored... mozart_symphony25_electronic-synth-lead.wav [0. 0.00024901 0. ... 0.04454654 0.0603435 0. ] /content/music-search/dataset//mozart_symphony25_electronic-synth-lead.wav generic my-audio-index\n", - "stored... a-cappella-chorus.wav [0. 0.02836778 0. ... 0.02595067 0.02352854 0. ] /content/music-search/dataset//a-cappella-chorus.wav generic my-audio-index\n", - "stored... bella_ciao_piano-solo.wav [0. 0. 0. ... 
0.01568016 0.04890013 0. ] /content/music-search/dataset//bella_ciao_piano-solo.wav piano my-audio-index\n", - "stored... bella_ciao_electronic-synth-lead.wav [0. 0. 0. ... 0.02660546 0.03412616 0. ] /content/music-search/dataset//bella_ciao_electronic-synth-lead.wav generic my-audio-index\n", - "stored... bella_ciao_humming.wav [0. 0.01029461 0. ... 0.07024138 0.00545542 0. ] /content/music-search/dataset//bella_ciao_humming.wav humming my-audio-index\n", - "stored... mozart_symphony25_tribal-drums-and-flute.wav [0. 0. 0. ... 0.03785052 0.03278063 0. ] /content/music-search/dataset//mozart_symphony25_tribal-drums-and-flute.wav generic my-audio-index\n", - "stored... bella_ciao_opera-singer.wav [0. 0. 0. ... 0.03254001 0. 0. ] /content/music-search/dataset//bella_ciao_opera-singer.wav opera my-audio-index\n", - "stored... mozart_symphony25_piano-solo.wav [0. 0.00863423 0. ... 0.00270792 0.02372581 0. ] /content/music-search/dataset//mozart_symphony25_piano-solo.wav piano my-audio-index\n", - "stored... bella_ciao_a-cappella-chorus.wav [0. 0.03191019 0. ... 0.03001107 0.00014891 0. ] /content/music-search/dataset//bella_ciao_a-cappella-chorus.wav generic my-audio-index\n", - "stored... mozart_symphony25_prompt.wav [0. 0. 0. ... 0.03432674 0.01389641 0. ] /content/music-search/dataset//mozart_symphony25_prompt.wav prompt my-audio-index\n", - "stored... bella_ciao_tribal-drums-and-flute.wav [0. 0.01298603 0. ... 0.03869031 0.02441213 0. 
] /content/music-search/dataset//bella_ciao_tribal-drums-and-flute.wav generic my-audio-index\n" - ] - } - ], - "source": [ - "\n", - "# Initialize a list genre for test\n", - "genre_lst = ['jazz', 'opera', 'piano','prompt', 'humming', 'string', 'capella', 'eletronic', 'guitar']\n", - "\n", - "for filename in audio_files:\n", - " audio_file = audio_path + \"/\" + filename\n", - "\n", - " emb = get_embedding(audio_file)\n", - "\n", - " song = filename.lower()\n", - "\n", - " # Compare if genre list exists inside the song\n", - " genre = next((g for g in genre_lst if g in song), \"generic\")\n", - "\n", - " store_in_elasticsearch(song, emb, audio_file, index_name, genre, 2 )\n" - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "Checkpoint path: /root/panns_data/Cnn14_mAP=0.431.pth\n", + "Using CPU.\n" + ] + } + ], + "source": [ + "from panns_inference import AudioTagging\n", + "\n", + "# load the default model into the gpu.\n", + "model = AudioTagging(\n", + " checkpoint_path=None, device=\"cuda\"\n", + ") # change device to cpu if a gpu is not available" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "IiUwmLueCqIf" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import librosa\n", + "\n", + "\n", + "# Function to normalize a vector. Normalizing a vector means adjusting the values measured in different scales to a common scale.\n", + "def normalize(v):\n", + " # np.linalg.norm computes the vector's norm (magnitude). The norm is the total length of all vectors in a space.\n", + " norm = np.linalg.norm(v)\n", + " if norm == 0:\n", + " return v\n", + "\n", + " # Return the normalized vector.\n", + " return v / norm\n", + "\n", + "\n", + "# Function to get an embedding of an audio file. 
An embedding is a reduced-dimensionality representation of the file.\n", + "def get_embedding(audio_file):\n", + "\n", + " # Load the audio file using librosa's load function, which returns an audio time series and its corresponding sample rate.\n", + " a, _ = librosa.load(audio_file, sr=44100)\n", + "\n", + " # Reshape the audio time series to have an extra dimension, which is required by the model's inference function.\n", + " query_audio = a[None, :]\n", + "\n", + " # Perform inference on the reshaped audio using the model. This returns an embedding of the audio.\n", + " _, emb = model.inference(query_audio)\n", + "\n", + " # Normalize the embedding. This scales the embedding to have a length (magnitude) of 1, while maintaining its direction.\n", + " normalized_v = normalize(emb[0])\n", + "\n", + " # Return the normalized embedding required for dot_product elastic similarity dense vector\n", + " return normalized_v" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "PvUiFaqIJPE2" + }, + "outputs": [], + "source": [ + "from datetime import datetime\n", + "\n", + "\n", + "# Storing Songs in Elasticsearch with Vector Embeddings:\n", + "def store_in_elasticsearch(song, embedding, path, index_name, genre, vec_field):\n", + " body = {\n", + " \"audio-embedding\": embedding,\n", + " \"title\": song,\n", + " \"timestamp\": datetime.now(),\n", + " \"path\": path,\n", + " \"genre\": genre,\n", + " }\n", + "\n", + " es.index(index=index_name, document=body)\n", + " print(\"stored...\", song, embedding, path, genre, index_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "N2m1ViF3Cw7t", + "outputId": "72e69712-60af-4a5e-c4b3-21b9e754401b" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "saGfzIfsDmEa" - }, - "outputs": [], - "source": [ - "# Define a function to query audio vector in 
Elasticsearch\n", - "def query_audio_vector(es, emb, field_key, index_name):\n", - " # Initialize the query structure\n", - " # It's a bool filter query that checks if the field exists\n", - " query = {\n", - " \"bool\": {\n", - " \"filter\": [{\n", - " \"exists\": {\n", - " \"field\": field_key\n", - " }\n", - " }]\n", - " }\n", - " }\n", - "\n", - " # KNN search parameters\n", - " # field is the name of the field to perform the search on\n", - " # k is the number of nearest neighbors to find\n", - " # num_candidates is the number of candidates to consider (more means slower but potentially more accurate results)\n", - " # query_vector is the vector to find nearest neighbors for\n", - " # boost is the multiplier for scores (higher means this match is considered more important)\n", - " knn = {\n", - " \"field\": field_key,\n", - " \"k\": 2,\n", - " \"num_candidates\": 100,\n", - " \"query_vector\": emb,\n", - " \"boost\": 100\n", - " }\n", - "\n", - " # The fields to retrieve from the matching documents\n", - " fields = [\"title\", \"path\", \"genre\", \"body_content\", \"url\"]\n", - "\n", - " # The name of the index to search\n", - " index = index_name\n", - "\n", - " # Perform the search\n", - " # index is the name of the index to search\n", - " # query is the query to use to find matching documents\n", - " # knn is the parameters for KNN search\n", - " # fields is the fields to retrieve from the matching documents\n", - " # size is the maximum number of matches to return\n", - " # source is whether to include the source document in the results\n", - " resp = es.search(index=index,\n", - " query=query,\n", - " knn=knn,\n", - " fields=fields,\n", - " size=5,\n", - " source=False)\n", - "\n", - " # Return the search results\n", - " return resp\n" - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "stored... mozart_symphony25_guitar-solo.wav [0. 0.03304292 0. ... 0.00757442 0. 0. 
] /content/music-search/dataset//mozart_symphony25_guitar-solo.wav guitar my-audio-index\n", + "stored... mozart_symphony25_string-quartet.wav [0. 0. 0. ... 0.02552068 0. 0. ] /content/music-search/dataset//mozart_symphony25_string-quartet.wav string my-audio-index\n", + "stored... mozart_symphony25_jazz-with-saxophone.wav [0. 0. 0. ... 0.0032542 0. 0. ] /content/music-search/dataset//mozart_symphony25_jazz-with-saxophone.wav jazz my-audio-index\n", + "stored... bella_ciao_jazz-with-saxophone.wav [0. 0.00063776 0. ... 0.00766881 0.00809752 0. ] /content/music-search/dataset//bella_ciao_jazz-with-saxophone.wav jazz my-audio-index\n", + "stored... bella_ciao_guitar-solo.wav [0. 0.0221919 0. ... 0.00023983 0. 0. ] /content/music-search/dataset//bella_ciao_guitar-solo.wav guitar my-audio-index\n", + "stored... mozart_symphony25_opera-singer.wav [0. 0. 0. ... 0.02008585 0. 0. ] /content/music-search/dataset//mozart_symphony25_opera-singer.wav opera my-audio-index\n", + "stored... bella_ciao_string-quartet.wav [0. 0. 0. ... 0.02983136 0. 0. ] /content/music-search/dataset//bella_ciao_string-quartet.wav string my-audio-index\n", + "stored... mozart_symphony25_electronic-synth-lead.wav [0. 0.00024901 0. ... 0.04454654 0.0603435 0. ] /content/music-search/dataset//mozart_symphony25_electronic-synth-lead.wav generic my-audio-index\n", + "stored... a-cappella-chorus.wav [0. 0.02836778 0. ... 0.02595067 0.02352854 0. ] /content/music-search/dataset//a-cappella-chorus.wav generic my-audio-index\n", + "stored... bella_ciao_piano-solo.wav [0. 0. 0. ... 0.01568016 0.04890013 0. ] /content/music-search/dataset//bella_ciao_piano-solo.wav piano my-audio-index\n", + "stored... bella_ciao_electronic-synth-lead.wav [0. 0. 0. ... 0.02660546 0.03412616 0. ] /content/music-search/dataset//bella_ciao_electronic-synth-lead.wav generic my-audio-index\n", + "stored... bella_ciao_humming.wav [0. 0.01029461 0. ... 0.07024138 0.00545542 0. 
] /content/music-search/dataset//bella_ciao_humming.wav humming my-audio-index\n", + "stored... mozart_symphony25_tribal-drums-and-flute.wav [0. 0. 0. ... 0.03785052 0.03278063 0. ] /content/music-search/dataset//mozart_symphony25_tribal-drums-and-flute.wav generic my-audio-index\n", + "stored... bella_ciao_opera-singer.wav [0. 0. 0. ... 0.03254001 0. 0. ] /content/music-search/dataset//bella_ciao_opera-singer.wav opera my-audio-index\n", + "stored... mozart_symphony25_piano-solo.wav [0. 0.00863423 0. ... 0.00270792 0.02372581 0. ] /content/music-search/dataset//mozart_symphony25_piano-solo.wav piano my-audio-index\n", + "stored... bella_ciao_a-cappella-chorus.wav [0. 0.03191019 0. ... 0.03001107 0.00014891 0. ] /content/music-search/dataset//bella_ciao_a-cappella-chorus.wav generic my-audio-index\n", + "stored... mozart_symphony25_prompt.wav [0. 0. 0. ... 0.03432674 0.01389641 0. ] /content/music-search/dataset//mozart_symphony25_prompt.wav prompt my-audio-index\n", + "stored... bella_ciao_tribal-drums-and-flute.wav [0. 0.01298603 0. ... 0.03869031 0.02441213 0. 
] /content/music-search/dataset//bella_ciao_tribal-drums-and-flute.wav generic my-audio-index\n" + ] + } + ], + "source": [ + "# Initialize a list genre for test\n", + "genre_lst = [\n", + " \"jazz\",\n", + " \"opera\",\n", + " \"piano\",\n", + " \"prompt\",\n", + " \"humming\",\n", + " \"string\",\n", + " \"capella\",\n", + " \"eletronic\",\n", + " \"guitar\",\n", + "]\n", + "\n", + "for filename in audio_files:\n", + " audio_file = audio_path + \"/\" + filename\n", + "\n", + " emb = get_embedding(audio_file)\n", + "\n", + " song = filename.lower()\n", + "\n", + " # Compare if genre list exists inside the song\n", + " genre = next((g for g in genre_lst if g in song), \"generic\")\n", + "\n", + " store_in_elasticsearch(song, emb, audio_file, index_name, genre, 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "saGfzIfsDmEa" + }, + "outputs": [], + "source": [ + "# Define a function to query audio vector in Elasticsearch\n", + "def query_audio_vector(es, emb, field_key, index_name):\n", + " # Initialize the query structure\n", + " # It's a bool filter query that checks if the field exists\n", + " query = {\"bool\": {\"filter\": [{\"exists\": {\"field\": field_key}}]}}\n", + "\n", + " # KNN search parameters\n", + " # field is the name of the field to perform the search on\n", + " # k is the number of nearest neighbors to find\n", + " # num_candidates is the number of candidates to consider (more means slower but potentially more accurate results)\n", + " # query_vector is the vector to find nearest neighbors for\n", + " # boost is the multiplier for scores (higher means this match is considered more important)\n", + " knn = {\n", + " \"field\": field_key,\n", + " \"k\": 2,\n", + " \"num_candidates\": 100,\n", + " \"query_vector\": emb,\n", + " \"boost\": 100,\n", + " }\n", + "\n", + " # The fields to retrieve from the matching documents\n", + " fields = [\"title\", \"path\", \"genre\", \"body_content\", \"url\"]\n", + "\n", + " 
# The name of the index to search\n", + " index = index_name\n", + "\n", + " # Perform the search\n", + " # index is the name of the index to search\n", + " # query is the query to use to find matching documents\n", + " # knn is the parameters for KNN search\n", + " # fields is the fields to retrieve from the matching documents\n", + " # size is the maximum number of matches to return\n", + " # source is whether to include the source document in the results\n", + " resp = es.search(\n", + " index=index, query=query, knn=knn, fields=fields, size=5, source=False\n", + " )\n", + "\n", + " # Return the search results\n", + " return resp" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 76 }, + "id": "MCNIaA3_EGIH", + "outputId": "3da1c5ea-e5b6-4c6c-c640-5f4d612857ef" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 76 - }, - "id": "MCNIaA3_EGIH", - "outputId": "3da1c5ea-e5b6-4c6c-c640-5f4d612857ef" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "" - ], - "text/html": [ - "\n", - " \n", - " " - ] - }, - "metadata": {}, - "execution_count": 13 - } + "output_type": "execute_result", + "data": { + "text/plain": [ + "" ], - "source": [ - "# Import necessary modules for audio display from IPython\n", - "from IPython.display import Audio, display\n", - "\n", - "# Provide the URL of the audio file\n", - "my_audio = \"/content/music-search/dataset/bella_ciao_humming.wav\"\n", - "\n", - "# Display the audio file in the notebook\n", - "Audio(my_audio)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "Vws3Aow4EV9W" - }, - "outputs": [], - "source": [ - "audio_file = \"/content/music-search/dataset/bella_ciao_humming.wav\"\n", - "# Generate the embedding vector from the provided audio file\n", - "# 
'get_embedding' is a function that presumably converts the audio file into a numerical vector\n", - "emb = get_embedding(audio_file)\n", - "\n", - "# Query the Elasticsearch instance 'es' with the embedding vector 'emb', field key 'audio-embedding',\n", - "# and index name 'my-audio-index'\n", - "# 'query_audio_vector' is a function that performs a search in Elasticsearch using a vector embedding.\n", - "# 'tolist()' method is used to convert numpy array to python list if 'emb' is a numpy array.\n", - "resp = query_audio_vector (es, emb.tolist(), \"audio-embedding\",\"my-audio-index\")\n" + "text/html": [ + "\n", + " \n", + " " ] + }, + "metadata": {}, + "execution_count": 13 + } + ], + "source": [ + "# Import necessary modules for audio display from IPython\n", + "from IPython.display import Audio, display\n", + "\n", + "# Provide the URL of the audio file\n", + "my_audio = \"/content/music-search/dataset/bella_ciao_humming.wav\"\n", + "\n", + "# Display the audio file in the notebook\n", + "Audio(my_audio)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "Vws3Aow4EV9W" + }, + "outputs": [], + "source": [ + "audio_file = \"/content/music-search/dataset/bella_ciao_humming.wav\"\n", + "# Generate the embedding vector from the provided audio file\n", + "# 'get_embedding' is a function that presumably converts the audio file into a numerical vector\n", + "emb = get_embedding(audio_file)\n", + "\n", + "# Query the Elasticsearch instance 'es' with the embedding vector 'emb', field key 'audio-embedding',\n", + "# and index name 'my-audio-index'\n", + "# 'query_audio_vector' is a function that performs a search in Elasticsearch using a vector embedding.\n", + "# 'tolist()' method is used to convert numpy array to python list if 'emb' is a numpy array.\n", + "resp = query_audio_vector(es, emb.tolist(), \"audio-embedding\", \"my-audio-index\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + 
"base_uri": "https://localhost:8080/" }, + "id": "qz5bCtu2FlKx", + "outputId": "a14b9201-4dd1-49b5-c9eb-c099ddd780f3" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "qz5bCtu2FlKx", - "outputId": "a14b9201-4dd1-49b5-c9eb-c099ddd780f3" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "{'total': {'value': 18, 'relation': 'eq'},\n", - " 'max_score': 100.0,\n", - " 'hits': [{'_index': 'my-audio-index',\n", - " '_id': 'b-FjHYoBwzxpWbqUtJsw',\n", - " '_score': 100.0,\n", - " 'fields': {'genre': ['humming'],\n", - " 'title': ['bella_ciao_humming.wav'],\n", - " 'path': ['/content/music-search/dataset//bella_ciao_humming.wav']}},\n", - " {'_index': 'my-audio-index',\n", - " '_id': 'I_5jHYoBSIrryGYbvxS_',\n", - " '_score': 86.1148,\n", - " 'fields': {'genre': ['opera'],\n", - " 'title': ['bella_ciao_opera-singer.wav'],\n", - " 'path': ['/content/music-search/dataset//bella_ciao_opera-singer.wav']}},\n", - " {'_index': 'my-audio-index',\n", - " '_id': 'auFjHYoBwzxpWbqUapvZ',\n", - " '_score': 0.0,\n", - " 'fields': {'genre': ['guitar'],\n", - " 'title': ['mozart_symphony25_guitar-solo.wav'],\n", - " 'path': ['/content/music-search/dataset//mozart_symphony25_guitar-solo.wav']}},\n", - " {'_index': 'my-audio-index',\n", - " '_id': 'Hf5jHYoBSIrryGYbcBSS',\n", - " '_score': 0.0,\n", - " 'fields': {'genre': ['string'],\n", - " 'title': ['mozart_symphony25_string-quartet.wav'],\n", - " 'path': ['/content/music-search/dataset//mozart_symphony25_string-quartet.wav']}},\n", - " {'_index': 'my-audio-index',\n", - " '_id': 'Hv5jHYoBSIrryGYbdhRb',\n", - " '_score': 0.0,\n", - " 'fields': {'genre': ['jazz'],\n", - " 'title': ['mozart_symphony25_jazz-with-saxophone.wav'],\n", - " 'path': ['/content/music-search/dataset//mozart_symphony25_jazz-with-saxophone.wav']}}]}" - ] - }, - "metadata": {}, - "execution_count": 15 - } - ], - "source": [ - 
"resp['hits']\n" + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'total': {'value': 18, 'relation': 'eq'},\n", + " 'max_score': 100.0,\n", + " 'hits': [{'_index': 'my-audio-index',\n", + " '_id': 'b-FjHYoBwzxpWbqUtJsw',\n", + " '_score': 100.0,\n", + " 'fields': {'genre': ['humming'],\n", + " 'title': ['bella_ciao_humming.wav'],\n", + " 'path': ['/content/music-search/dataset//bella_ciao_humming.wav']}},\n", + " {'_index': 'my-audio-index',\n", + " '_id': 'I_5jHYoBSIrryGYbvxS_',\n", + " '_score': 86.1148,\n", + " 'fields': {'genre': ['opera'],\n", + " 'title': ['bella_ciao_opera-singer.wav'],\n", + " 'path': ['/content/music-search/dataset//bella_ciao_opera-singer.wav']}},\n", + " {'_index': 'my-audio-index',\n", + " '_id': 'auFjHYoBwzxpWbqUapvZ',\n", + " '_score': 0.0,\n", + " 'fields': {'genre': ['guitar'],\n", + " 'title': ['mozart_symphony25_guitar-solo.wav'],\n", + " 'path': ['/content/music-search/dataset//mozart_symphony25_guitar-solo.wav']}},\n", + " {'_index': 'my-audio-index',\n", + " '_id': 'Hf5jHYoBSIrryGYbcBSS',\n", + " '_score': 0.0,\n", + " 'fields': {'genre': ['string'],\n", + " 'title': ['mozart_symphony25_string-quartet.wav'],\n", + " 'path': ['/content/music-search/dataset//mozart_symphony25_string-quartet.wav']}},\n", + " {'_index': 'my-audio-index',\n", + " '_id': 'Hv5jHYoBSIrryGYbdhRb',\n", + " '_score': 0.0,\n", + " 'fields': {'genre': ['jazz'],\n", + " 'title': ['mozart_symphony25_jazz-with-saxophone.wav'],\n", + " 'path': ['/content/music-search/dataset//mozart_symphony25_jazz-with-saxophone.wav']}}]}" ] + }, + "metadata": {}, + "execution_count": 15 + } + ], + "source": [ + "resp[\"hits\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "uEgpWT60FpOx", + "outputId": "0baea339-861a-4586-97ce-31d3450bca4d" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "colab": { - "base_uri": 
"https://localhost:8080/" - }, - "id": "uEgpWT60FpOx", - "outputId": "0baea339-861a-4586-97ce-31d3450bca4d" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "/content/music-search/dataset//bella_ciao_humming.wav\n", - "/content/music-search/dataset//bella_ciao_opera-singer.wav\n", - "/content/music-search/dataset//mozart_symphony25_guitar-solo.wav\n", - "/content/music-search/dataset//mozart_symphony25_string-quartet.wav\n", - "/content/music-search/dataset//mozart_symphony25_jazz-with-saxophone.wav\n" - ] - } - ], - "source": [ - "NUM_MUSIC = 5 # example value\n", - "\n", - "for i in range(NUM_MUSIC):\n", - " path = resp['hits']['hits'][i]['fields']['path'][0]\n", - " print(path)" - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "/content/music-search/dataset//bella_ciao_humming.wav\n", + "/content/music-search/dataset//bella_ciao_opera-singer.wav\n", + "/content/music-search/dataset//mozart_symphony25_guitar-solo.wav\n", + "/content/music-search/dataset//mozart_symphony25_string-quartet.wav\n", + "/content/music-search/dataset//mozart_symphony25_jazz-with-saxophone.wav\n" + ] + } + ], + "source": [ + "NUM_MUSIC = 5 # example value\n", + "\n", + "for i in range(NUM_MUSIC):\n", + " path = resp[\"hits\"][\"hits\"][i][\"fields\"][\"path\"][0]\n", + " print(path)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 76 }, + "id": "mBu7mqUZFtt_", + "outputId": "8615fdd6-b5c4-452f-8039-ce5e16f300d0" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 76 - }, - "id": "mBu7mqUZFtt_", - "outputId": "8615fdd6-b5c4-452f-8039-ce5e16f300d0" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "" - ], - "text/html": [ - "\n", - " \n", - " " - ] - }, - "metadata": {}, - "execution_count": 17 - } + 
"output_type": "execute_result", + "data": { + "text/plain": [ + "" ], - "source": [ - "Audio(\"/content/music-search/dataset/bella_ciao_opera-singer.wav\")" + "text/html": [ + "\n", + " \n", + " " ] + }, + "metadata": {}, + "execution_count": 17 } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } + ], + "source": [ + "Audio(\"/content/music-search/dataset/bella_ciao_opera-singer.wav\")" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } \ No newline at end of file diff --git a/supporting-blog-content/openai-rag-streamlit/openai_rag_streamlit.ipynb b/supporting-blog-content/openai-rag-streamlit/openai_rag_streamlit.ipynb index 12a0ba5a..8ad9c31d 100644 --- a/supporting-blog-content/openai-rag-streamlit/openai_rag_streamlit.ipynb +++ b/supporting-blog-content/openai-rag-streamlit/openai_rag_streamlit.ipynb @@ -1,558 +1,565 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "f96b815e" - }, - "source": [ - "# Build a Generative AI application using Elasticsearch and OpenAI" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e0f537af" - }, - "source": [ - "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/main/supporting-blog-content/openai-rag-streamlit/openai_rag_streamlit.ipynb)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "349e0e74" - }, - "source": [ - "This notebook demonstrates how to:\n", - "- Index the OpenAI Wikipedia vector dataset into Elasticsearch\n", - "- Build a simple Gen AI application with Streamlit that retrieves context using Elasticsearch and formulate answers using OpenAI\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8nGNmWM3idlK" - }, - "source": [ - "![Screenshot 2023-08-16 at 3.27.07 PM.png]()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aa9576ca" - }, - "source": [ - "## Install packages and import modules" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8c304b93" - }, - "outputs": [], - "source": [ - "# install packages\n", - "\n", - "!python3 -m pip install -qU openai pandas==1.5.3 wget elasticsearch streamlit tqdm\n", - "\n", - "# import modules\n", - "\n", - "import os\n", - "from getpass import getpass\n", - "from elasticsearch import Elasticsearch, helpers\n", - "import wget, zipfile, pandas as pd, json, openai\n", - "import streamlit as st\n", - "from tqdm.notebook import tqdm" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "de32a789" - }, - "source": [ - "## Connect to Elasticsearch\n", - "\n", - "ℹ️ We're using an Elastic Cloud deployment of Elasticsearch for this notebook.\n", - "If you don't already have an Elastic deployment, you can sign up for a free [Elastic Cloud trial](https://cloud.elastic.co/registration?utm_source=github&utm_content=elasticsearch-labs-notebook).\n", - "\n", - "To connect to Elasticsearch, you need to create a client instance with the Cloud ID and password for your deployment.\n", - "\n", - "Find the Cloud ID for your deployment by going to 
https://cloud.elastic.co/deployments and selecting your deployment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3a57b6a8" - }, - "outputs": [], - "source": [ - "os.environ['es_cloud_id'] = getpass(\"Elastic deployment Cloud ID\")\n", - "os.environ['es_password'] = getpass(\"Elastic deployment Password\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cb2ng640_o2n" - }, - "source": [ - "Test the connection with Elasticsearch." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NJkflYkGHnGL" - }, - "outputs": [], - "source": [ - "es_cloud_id = os.environ['es_cloud_id']\n", - "es_password = os.environ['es_password']\n", - "\n", - "client = Elasticsearch(\n", - " cloud_id = es_cloud_id,\n", - " basic_auth=(\"elastic\", es_password) # Alternatively use `api_key` instead of `basic_auth`\n", - ")\n", - "\n", - "# Test connection to Elasticsearch\n", - "print(client.info())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JiA-4-Kb-C3K" - }, - "source": [ - "## Configure OpenAI connection\n", - "\n", - "Our example will use OpenAI to formulate an answer, so please provide a valid OpenAI Api Key here.\n", - "\n", - "You can follow [this guide](https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key) to retrieve your API Key.\n", - "\n", - "Then test the connection with OpenAI and check the model used in this notebook is available." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "imKyf8sm-caV" - }, - "outputs": [], - "source": [ - "os.environ['openai_api_key'] = getpass(\"OpenAI Api Key\")\n", - "openai.api_key = os.environ['openai_api_key']\n", - "openai.Model.retrieve(\"text-embedding-ada-002\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "80b55952" - }, - "source": [ - "## Download the dataset\n", - "\n", - "In this step we download the OpenAI Wikipedia embeddings dataset, and extract the zip file." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "c584f15c" - }, - "outputs": [], - "source": [ - "embeddings_url = 'https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip'\n", - "wget.download(embeddings_url)\n", - "\n", - "with zipfile.ZipFile(\"vector_database_wikipedia_articles_embedded.zip\",\n", - "\"r\") as zip_ref:\n", - " zip_ref.extractall(\"data\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9654ac08" - }, - "source": [ - "## Read CSV file into a Pandas DataFrame\n", - "\n", - "Next we use the Pandas library to read the unzipped CSV file into a DataFrame. This step makes it easier to index the data into Elasticsearch in bulk." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "76347d10" - }, - "outputs": [], - "source": [ - "\n", - "wikipedia_dataframe = pd.read_csv(\"data/vector_database_wikipedia_articles_embedded.csv\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6af9f5ad" - }, - "source": [ - "## Create index with mapping\n", - "\n", - "Now we need to create an Elasticsearch index with the necessary mappings. This will enable us to index the data into Elasticsearch.\n", - "\n", - "We use the `dense_vector` field type for the `title_vector` and `content_vector` fields. 
This is a special field type that allows us to store dense vectors in Elasticsearch.\n", - "\n", - "Later, we'll need to target the `dense_vector` field for kNN search.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "681989b3" - }, - "outputs": [], - "source": [ - "index_mapping= {\n", - " \"properties\": {\n", - " \"title_vector\": {\n", - " \"type\": \"dense_vector\",\n", - " \"dims\": 1536,\n", - " \"index\": \"true\",\n", - " \"similarity\": \"cosine\"\n", - " },\n", - " \"content_vector\": {\n", - " \"type\": \"dense_vector\",\n", - " \"dims\": 1536,\n", - " \"index\": \"true\",\n", - " \"similarity\": \"cosine\"\n", - " },\n", - " \"text\": {\"type\": \"text\"},\n", - " \"title\": {\"type\": \"text\"},\n", - " \"url\": { \"type\": \"keyword\"},\n", - " \"vector_id\": {\"type\": \"long\"}\n", - "\n", - " }\n", - "}\n", - "client.indices.create(index=\"wikipedia_vector_index\", mappings=index_mapping)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "c2fb582e" - }, - "source": [ - "## Index data into Elasticsearch\n", - "\n", - "The following function generates the required bulk actions that can be passed to Elasticsearch's Bulk API, so we can index multiple documents efficiently in a single request.\n", - "\n", - "For each row in the DataFrame, the function yields a dictionary representing a single document to be indexed." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "efee9b97" - }, - "outputs": [], - "source": [ - "def dataframe_to_bulk_actions(df):\n", - " for index, row in df.iterrows():\n", - " yield {\n", - " \"_index\": 'wikipedia_vector_index',\n", - " \"_id\": row['id'],\n", - " \"_source\": {\n", - " 'url' : row[\"url\"],\n", - " 'title' : row[\"title\"],\n", - " 'text' : row[\"text\"],\n", - " 'title_vector' : json.loads(row[\"title_vector\"]),\n", - " 'content_vector' : json.loads(row[\"content_vector\"]),\n", - " 'vector_id' : row[\"vector_id\"]\n", - " }\n", - " }" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "b8164b38" - }, - "source": [ - "As the dataframe is large, we will index data in batches of `100`. We index the data into Elasticsearch using the Python client's [helpers](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/client-helpers.html#bulk-helpers) for the bulk API." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "aacb5e9c" - }, - "outputs": [], - "source": [ - "total_documents = len(wikipedia_dataframe)\n", - "\n", - "progress_bar = tqdm(total=total_documents, unit=\"documents\")\n", - "success_count = 0\n", - "\n", - "for ok, info in helpers.streaming_bulk(client, actions=dataframe_to_bulk_actions(wikipedia_dataframe), raise_on_error=False, chunk_size=100):\n", - " if ok:\n", - " success_count += 1\n", - " else:\n", - " print(f\"Unable to index {info['index']['_id']}: {info['index']['error']}\")\n", - " progress_bar.update(1)\n", - " progress_bar.set_postfix(success=success_count)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rnJbAQdgbXSm" - }, - "source": [ - "## Build application with Streamlit\n", - "\n", - "In the following section, you will build a simple interface using streamlit.\n", - "\n", - "This application will display a simple search bar where an user can ask a question. 
Elasticsearch is used to retrieve the relevant documents (context) matching the question then OpenAI formulate an answer using the context." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "I55fQHW589RP" - }, - "source": [ - "Install the dependency to access the application once running." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "LnL-wOdRct5O" - }, - "outputs": [], - "source": [ - "!npm install localtunnel" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LkEHb4VMevcc" - }, - "source": [ - "Create application" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_J7keMUAewy1" - }, - "outputs": [], - "source": [ - "%%writefile app.py\n", - "\n", - "import os\n", - "import streamlit as st\n", - "import openai\n", - "from elasticsearch import Elasticsearch\n", - "\n", - "\n", - "# Elastic Cloud\n", - "es_cloud_id = os.environ['es_cloud_id']\n", - "es_password = os.environ['es_password']\n", - "\n", - "# OpenAI\n", - "openai.api_key = os.environ['openai_api_key']\n", - "\n", - "# Define model\n", - "EMBEDDING_MODEL = \"text-embedding-ada-002\"\n", - "\n", - "# Connect to Elasticsearch\n", - "client = Elasticsearch(\n", - " cloud_id = es_cloud_id,\n", - " basic_auth=(\"elastic\", es_password) # Alternatively use `api_key` instead of `basic_auth`\n", - ")\n", - "\n", - "def openai_summarize(query, response):\n", - " context = response['hits']['hits'][0]['_source']['text']\n", - " summary = openai.ChatCompletion.create(\n", - " model=\"gpt-3.5-turbo\",\n", - " messages=[\n", - " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n", - " {\"role\": \"user\", \"content\": \"Answer the following question:\" + query + \"by using the following text: \" + context},\n", - " ]\n", - " )\n", - " return summary.choices[0].message.content\n", - "\n", - "\n", - "def search_es(query):\n", - " # Create embedding\n", - " question_embedding = 
openai.Embedding.create(input=query, model=EMBEDDING_MODEL)\n", - "\n", - " # Define Elasticsearch query\n", - " response = client.search(\n", - " index = \"wikipedia_vector_index\",\n", - " knn={\n", - " \"field\": \"content_vector\",\n", - " \"query_vector\": question_embedding[\"data\"][0][\"embedding\"],\n", - " \"k\": 10,\n", - " \"num_candidates\": 100\n", - " }\n", - " )\n", - " return response\n", - "\n", - "\n", - "def main():\n", - " st.title(\"Gen AI Application\")\n", - "\n", - " # Input for user search query\n", - " user_query = st.text_input(\"Enter your question:\")\n", - "\n", - " if st.button(\"Search\"):\n", - " if user_query:\n", - "\n", - " st.write(f\"Searching for: {user_query}\")\n", - " result = search_es(user_query)\n", - "\n", - " # print(result)\n", - " openai_summary = openai_summarize(user_query, result)\n", - " st.write(f\"OpenAI Summary: {openai_summary}\")\n", - "\n", - " # Display search results\n", - " if result['hits']['total']['value'] > 0:\n", - " st.write(\"Search Results:\")\n", - " for hit in result['hits']['hits']:\n", - " st.write(hit['_source']['title'])\n", - " st.write(hit['_source']['text'])\n", - " else:\n", - " st.write(\"No results found.\")\n", - "\n", - "if __name__ == \"__main__\":\n", - " main()\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BU1WKBVGe5ZY" - }, - "source": [ - "### Run the application\n", - "\n", - "Run the application and check your IP for the tunneling" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-oQa-VV6e40J" - }, - "outputs": [], - "source": [ - "!streamlit run app.py &> /content/app.log & curl ipv4.icanhazip.com" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZXLKpEvMe-D2" - }, - "source": [ - "### Create the tunnel to access it from anywhere\n", - "\n", - "Run the tunnel and use the link below to connect to the tunnel.\n", - "\n", - "Use the IP from the previous step to connect to the application" - ] 
- }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "background_save": true - }, - "id": "ertvvtnifAZy" - }, - "outputs": [], - "source": [ - "!npx localtunnel --port 8501" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AcvvIMxxGIun" - }, - "source": [ - "Success you build your first Gen AI Application.\n", - "\n", - "You can try it by asking question such as \"Who is Beethoven?\" or \"What is football?\" and see the answers." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WHBfQus1TfRG" - }, - "source": [ - "## Next steps\n", - "\n", - "Now you know how to quickly put together an interface that allows you to ask questions and get answer from a specific dataset, in this notebook example, wikipedia.\n", - "\n", - "You can adapt this example to use your own dataset, and use the streamlit application as a blueprint for integrating with your own application." - ] - } - ], - "metadata": { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "f96b815e" + }, + "source": [ + "# Build a Generative AI application using Elasticsearch and OpenAI" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e0f537af" + }, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/main/supporting-blog-content/openai-rag-streamlit/openai_rag_streamlit.ipynb)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "349e0e74" + }, + "source": [ + "This notebook demonstrates how to:\n", + "- Index the OpenAI Wikipedia vector dataset into Elasticsearch\n", + "- Build a simple Gen AI application with Streamlit that retrieves context using Elasticsearch and formulate answers using OpenAI\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8nGNmWM3idlK" + }, + "source": [ + "![Screenshot 2023-08-16 at 3.27.07 PM.png]()" + ] + }, + { + "cell_type": "markdown", + 
"metadata": { + "id": "aa9576ca" + }, + "source": [ + "## Install packages and import modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8c304b93" + }, + "outputs": [], + "source": [ + "# install packages\n", + "\n", + "!python3 -m pip install -qU openai pandas==1.5.3 wget elasticsearch streamlit tqdm\n", + "\n", + "# import modules\n", + "\n", + "import os\n", + "from getpass import getpass\n", + "from elasticsearch import Elasticsearch, helpers\n", + "import wget, zipfile, pandas as pd, json, openai\n", + "import streamlit as st\n", + "from tqdm.notebook import tqdm" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "de32a789" + }, + "source": [ + "## Connect to Elasticsearch\n", + "\n", + "ℹ️ We're using an Elastic Cloud deployment of Elasticsearch for this notebook.\n", + "If you don't already have an Elastic deployment, you can sign up for a free [Elastic Cloud trial](https://cloud.elastic.co/registration?utm_source=github&utm_content=elasticsearch-labs-notebook).\n", + "\n", + "To connect to Elasticsearch, you need to create a client instance with the Cloud ID and password for your deployment.\n", + "\n", + "Find the Cloud ID for your deployment by going to https://cloud.elastic.co/deployments and selecting your deployment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3a57b6a8" + }, + "outputs": [], + "source": [ + "os.environ[\"es_cloud_id\"] = getpass(\"Elastic deployment Cloud ID\")\n", + "os.environ[\"es_password\"] = getpass(\"Elastic deployment Password\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cb2ng640_o2n" + }, + "source": [ + "Test the connection with Elasticsearch." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NJkflYkGHnGL" + }, + "outputs": [], + "source": [ + "es_cloud_id = os.environ[\"es_cloud_id\"]\n", + "es_password = os.environ[\"es_password\"]\n", + "\n", + "client = Elasticsearch(\n", + " cloud_id=es_cloud_id,\n", + " basic_auth=(\n", + " \"elastic\",\n", + " es_password,\n", + " ), # Alternatively use `api_key` instead of `basic_auth`\n", + ")\n", + "\n", + "# Test connection to Elasticsearch\n", + "print(client.info())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JiA-4-Kb-C3K" + }, + "source": [ + "## Configure OpenAI connection\n", + "\n", + "Our example will use OpenAI to formulate an answer, so please provide a valid OpenAI Api Key here.\n", + "\n", + "You can follow [this guide](https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key) to retrieve your API Key.\n", + "\n", + "Then test the connection with OpenAI and check the model used in this notebook is available." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "imKyf8sm-caV" + }, + "outputs": [], + "source": [ + "os.environ[\"openai_api_key\"] = getpass(\"OpenAI Api Key\")\n", + "openai.api_key = os.environ[\"openai_api_key\"]\n", + "openai.Model.retrieve(\"text-embedding-ada-002\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "80b55952" + }, + "source": [ + "## Download the dataset\n", + "\n", + "In this step we download the OpenAI Wikipedia embeddings dataset, and extract the zip file." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c584f15c" + }, + "outputs": [], + "source": [ + "embeddings_url = \"https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip\"\n", + "wget.download(embeddings_url)\n", + "\n", + "with zipfile.ZipFile(\"vector_database_wikipedia_articles_embedded.zip\", \"r\") as zip_ref:\n", + " zip_ref.extractall(\"data\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9654ac08" + }, + "source": [ + "## Read CSV file into a Pandas DataFrame\n", + "\n", + "Next we use the Pandas library to read the unzipped CSV file into a DataFrame. This step makes it easier to index the data into Elasticsearch in bulk." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "76347d10" + }, + "outputs": [], + "source": [ + "wikipedia_dataframe = pd.read_csv(\n", + " \"data/vector_database_wikipedia_articles_embedded.csv\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6af9f5ad" + }, + "source": [ + "## Create index with mapping\n", + "\n", + "Now we need to create an Elasticsearch index with the necessary mappings. This will enable us to index the data into Elasticsearch.\n", + "\n", + "We use the `dense_vector` field type for the `title_vector` and `content_vector` fields. 
This is a special field type that allows us to store dense vectors in Elasticsearch.\n", + "\n", + "Later, we'll need to target the `dense_vector` field for kNN search.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "681989b3" + }, + "outputs": [], + "source": [ + "index_mapping = {\n", + " \"properties\": {\n", + " \"title_vector\": {\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 1536,\n", + " \"index\": \"true\",\n", + " \"similarity\": \"cosine\",\n", + " },\n", + " \"content_vector\": {\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 1536,\n", + " \"index\": \"true\",\n", + " \"similarity\": \"cosine\",\n", + " },\n", + " \"text\": {\"type\": \"text\"},\n", + " \"title\": {\"type\": \"text\"},\n", + " \"url\": {\"type\": \"keyword\"},\n", + " \"vector_id\": {\"type\": \"long\"},\n", + " }\n", + "}\n", + "client.indices.create(index=\"wikipedia_vector_index\", mappings=index_mapping)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "c2fb582e" + }, + "source": [ + "## Index data into Elasticsearch\n", + "\n", + "The following function generates the required bulk actions that can be passed to Elasticsearch's Bulk API, so we can index multiple documents efficiently in a single request.\n", + "\n", + "For each row in the DataFrame, the function yields a dictionary representing a single document to be indexed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "efee9b97" + }, + "outputs": [], + "source": [ + "def dataframe_to_bulk_actions(df):\n", + " for index, row in df.iterrows():\n", + " yield {\n", + " \"_index\": \"wikipedia_vector_index\",\n", + " \"_id\": row[\"id\"],\n", + " \"_source\": {\n", + " \"url\": row[\"url\"],\n", + " \"title\": row[\"title\"],\n", + " \"text\": row[\"text\"],\n", + " \"title_vector\": json.loads(row[\"title_vector\"]),\n", + " \"content_vector\": json.loads(row[\"content_vector\"]),\n", + " \"vector_id\": row[\"vector_id\"],\n", + " },\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "b8164b38" + }, + "source": [ + "As the dataframe is large, we will index data in batches of `100`. We index the data into Elasticsearch using the Python client's [helpers](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/client-helpers.html#bulk-helpers) for the bulk API." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "aacb5e9c" + }, + "outputs": [], + "source": [ + "total_documents = len(wikipedia_dataframe)\n", + "\n", + "progress_bar = tqdm(total=total_documents, unit=\"documents\")\n", + "success_count = 0\n", + "\n", + "for ok, info in helpers.streaming_bulk(\n", + " client,\n", + " actions=dataframe_to_bulk_actions(wikipedia_dataframe),\n", + " raise_on_error=False,\n", + " chunk_size=100,\n", + "):\n", + " if ok:\n", + " success_count += 1\n", + " else:\n", + " print(f\"Unable to index {info['index']['_id']}: {info['index']['error']}\")\n", + " progress_bar.update(1)\n", + " progress_bar.set_postfix(success=success_count)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rnJbAQdgbXSm" + }, + "source": [ + "## Build application with Streamlit\n", + "\n", + "In the following section, you will build a simple interface using streamlit.\n", + "\n", + "This application will display a simple search bar where an 
user can ask a question. Elasticsearch is used to retrieve the relevant documents (context) matching the question then OpenAI formulate an answer using the context." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "I55fQHW589RP" + }, + "source": [ + "Install the dependency to access the application once running." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LnL-wOdRct5O" + }, + "outputs": [], + "source": [ + "!npm install localtunnel" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LkEHb4VMevcc" + }, + "source": [ + "Create application" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_J7keMUAewy1" + }, + "outputs": [], + "source": [ + "%%writefile app.py\n", + "\n", + "import os\n", + "import streamlit as st\n", + "import openai\n", + "from elasticsearch import Elasticsearch\n", + "\n", + "\n", + "# Elastic Cloud\n", + "es_cloud_id = os.environ['es_cloud_id']\n", + "es_password = os.environ['es_password']\n", + "\n", + "# OpenAI\n", + "openai.api_key = os.environ['openai_api_key']\n", + "\n", + "# Define model\n", + "EMBEDDING_MODEL = \"text-embedding-ada-002\"\n", + "\n", + "# Connect to Elasticsearch\n", + "client = Elasticsearch(\n", + " cloud_id = es_cloud_id,\n", + " basic_auth=(\"elastic\", es_password) # Alternatively use `api_key` instead of `basic_auth`\n", + ")\n", + "\n", + "def openai_summarize(query, response):\n", + " context = response['hits']['hits'][0]['_source']['text']\n", + " summary = openai.ChatCompletion.create(\n", + " model=\"gpt-3.5-turbo\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n", + " {\"role\": \"user\", \"content\": \"Answer the following question:\" + query + \"by using the following text: \" + context},\n", + " ]\n", + " )\n", + " return summary.choices[0].message.content\n", + "\n", + "\n", + "def search_es(query):\n", + " # Create embedding\n", + " 
question_embedding = openai.Embedding.create(input=query, model=EMBEDDING_MODEL)\n", + "\n", + " # Define Elasticsearch query\n", + " response = client.search(\n", + " index = \"wikipedia_vector_index\",\n", + " knn={\n", + " \"field\": \"content_vector\",\n", + " \"query_vector\": question_embedding[\"data\"][0][\"embedding\"],\n", + " \"k\": 10,\n", + " \"num_candidates\": 100\n", + " }\n", + " )\n", + " return response\n", + "\n", + "\n", + "def main():\n", + " st.title(\"Gen AI Application\")\n", + "\n", + " # Input for user search query\n", + " user_query = st.text_input(\"Enter your question:\")\n", + "\n", + " if st.button(\"Search\"):\n", + " if user_query:\n", + "\n", + " st.write(f\"Searching for: {user_query}\")\n", + " result = search_es(user_query)\n", + "\n", + " # print(result)\n", + " openai_summary = openai_summarize(user_query, result)\n", + " st.write(f\"OpenAI Summary: {openai_summary}\")\n", + "\n", + " # Display search results\n", + " if result['hits']['total']['value'] > 0:\n", + " st.write(\"Search Results:\")\n", + " for hit in result['hits']['hits']:\n", + " st.write(hit['_source']['title'])\n", + " st.write(hit['_source']['text'])\n", + " else:\n", + " st.write(\"No results found.\")\n", + "\n", + "if __name__ == \"__main__\":\n", + " main()\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BU1WKBVGe5ZY" + }, + "source": [ + "### Run the application\n", + "\n", + "Run the application and check your IP for the tunneling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-oQa-VV6e40J" + }, + "outputs": [], + "source": [ + "!streamlit run app.py &> /content/app.log & curl ipv4.icanhazip.com" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZXLKpEvMe-D2" + }, + "source": [ + "### Create the tunnel to access it from anywhere\n", + "\n", + "Run the tunnel and use the link below to connect to the tunnel.\n", + "\n", + "Use the IP from the previous step to connect to 
the application" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3.11.3 64-bit", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.3" - }, - "vscode": { - "interpreter": { - "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" - } - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "background_save": true + }, + "id": "ertvvtnifAZy" + }, + "outputs": [], + "source": [ + "!npx localtunnel --port 8501" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AcvvIMxxGIun" + }, + "source": [ + "Success! You have built your first Gen AI Application.\n", + "\n", + "You can try it by asking questions such as \"Who is Beethoven?\" or \"What is football?\" and see the answers." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WHBfQus1TfRG" + }, + "source": [ + "## Next steps\n", + "\n", + "Now you know how to quickly put together an interface that allows you to ask questions and get answers from a specific dataset, in this notebook example, Wikipedia.\n", + "\n", + "You can adapt this example to use your own dataset, and use the streamlit application as a blueprint for integrating with your own application."
+ ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3.11.3 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + }, + "vscode": { + "interpreter": { + "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/supporting-blog-content/plagiarism-detection-with-elasticsearch/plagiarism_detection_es.ipynb b/supporting-blog-content/plagiarism-detection-with-elasticsearch/plagiarism_detection_es.ipynb index 0018e5ff..c34872fa 100644 --- a/supporting-blog-content/plagiarism-detection-with-elasticsearch/plagiarism_detection_es.ipynb +++ b/supporting-blog-content/plagiarism-detection-with-elasticsearch/plagiarism_detection_es.ipynb @@ -1,470 +1,434 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "**Blog: Plagiarism detection with Elasticsearch**" - ], - "metadata": { - "id": "kmMkWI9MH7SG" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Q9cqVF6lJtYw" - }, - "outputs": [], - "source": [ - "!pip install elasticsearch==8.11 #Elasticsearch" - ] - }, - { - "cell_type": "code", - "source": [ - "pip -q install eland elasticsearch sentence_transformers transformers torch==2.1.0" - ], - "metadata": { - "id": "wwi3NpszKa_U" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "from elasticsearch 
import Elasticsearch, helpers\n", - "from elasticsearch.client import MlClient\n", - "from eland.ml.pytorch import PyTorchModel\n", - "from eland.ml.pytorch.transformers import TransformerModel\n", - "from urllib.request import urlopen\n", - "import json\n", - "from pathlib import Path\n", - "import getpass" - ], - "metadata": { - "id": "8JSAt-uUKcix" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Found in the 'Manage Deployment' page\n", - "CLOUD_ID = getpass.getpass('Enter Elastic Cloud ID: ')\n", - "\n", - "# Password for the 'elastic' user generated by Elasticsearch\n", - "ELASTIC_PASSWORD = getpass.getpass('Enter Elastic password: ')\n", - "\n", - "# Create the client instance\n", - "client = Elasticsearch(\n", - " cloud_id=CLOUD_ID,\n", - " basic_auth=(\"elastic\", ELASTIC_PASSWORD),\n", - " request_timeout=3600\n", - ")" - ], - "metadata": { - "id": "ctmF7sNwKd5o" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Set the model name from Hugging Face and task type\n", - "# open ai detector model - developed by open ai https://github.com/openai/gpt-2-output-dataset/tree/master/detector\n", - "hf_model_id='roberta-base-openai-detector'\n", - "tm = TransformerModel(model_id=hf_model_id, task_type=\"text_classification\")\n", - "\n", - "#set the modelID as it is named in Elasticsearch\n", - "es_model_id = tm.elasticsearch_model_id()\n", - "\n", - "# Download the model from Hugging Face\n", - "tmp_path = \"models\"\n", - "Path(tmp_path).mkdir(parents=True, exist_ok=True)\n", - "model_path, config, vocab_path = tm.save(tmp_path)\n", - "\n", - "# Load the model into Elasticsearch\n", - "ptm = PyTorchModel(client, es_model_id)\n", - "ptm.import_model(model_path=model_path, config_path=None, vocab_path=vocab_path, config=config)\n", - "\n", - "#Start the model\n", - "s = MlClient.start_trained_model_deployment(client, model_id=es_model_id)\n", - "s.body" - ], - 
"metadata": { - "id": "AXeDnvJWKfll" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Set the model name from Hugging Face and task type\n", - "# sentence-transformers model\n", - "hf_model_id='sentence-transformers/all-mpnet-base-v2'\n", - "tm = TransformerModel(model_id=hf_model_id, task_type=\"text_embedding\")\n", - "\n", - "#set the modelID as it is named in Elasticsearch\n", - "es_model_id = tm.elasticsearch_model_id()\n", - "\n", - "# Download the model from Hugging Face\n", - "tmp_path = \"models\"\n", - "Path(tmp_path).mkdir(parents=True, exist_ok=True)\n", - "model_path, config, vocab_path = tm.save(tmp_path)\n", - "\n", - "# Load the model into Elasticsearch\n", - "ptm = PyTorchModel(client, es_model_id)\n", - "ptm.import_model(model_path=model_path, config_path=None, vocab_path=vocab_path, config=config)\n", - "\n", - "# Start the model\n", - "s = MlClient.start_trained_model_deployment(client, model_id=es_model_id)\n", - "s.body" - ], - "metadata": { - "id": "wFiJAVpBKjkP" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "#source index\n", - "client.indices.create(\n", - "index=\"plagiarism-docs\",\n", - "mappings= {\n", - " \"properties\": {\n", - " \"title\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\n", - " \"keyword\": {\n", - " \"type\": \"keyword\"\n", - " }\n", - " }\n", - " },\n", - " \"abstract\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\n", - " \"keyword\": {\n", - " \"type\": \"keyword\"\n", - " }\n", - " }\n", - " },\n", - " \"url\": {\n", - " \"type\": \"keyword\"\n", - " },\n", - " \"venue\": {\n", - " \"type\": \"keyword\"\n", - " },\n", - " \"year\": {\n", - " \"type\": \"keyword\"\n", - " }\n", - " }\n", - "})" - ], - "metadata": { - "id": "S-SNKitkKmHC" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "#ingest pipeline\n", - "\n", - "client.ingest.put_pipeline(\n", - " 
id=\"plagiarism-checker-pipeline\",\n", - " processors = [\n", - " {\n", - " \"inference\": { #for ml models - to infer against the data that is being ingested in the pipeline\n", - " \"model_id\": \"roberta-base-openai-detector\", #text classification model id\n", - " \"target_field\": \"openai-detector\", # Target field for the inference results\n", - " \"field_map\": { #Maps the document field names to the known field names of the model.\n", - " \"abstract\": \"text_field\" # Field matching our configured trained model input. Typically for NLP models, the field name is text_field.\n", - " }\n", - " }\n", - " },\n", - " {\n", - " \"inference\": {\n", - " \"model_id\": \"sentence-transformers__all-mpnet-base-v2\", #text embedding model model id\n", - " \"target_field\": \"abstract_vector\", # Target field for the inference results\n", - " \"field_map\": { #Maps the document field names to the known field names of the model.\n", - " \"abstract\": \"text_field\" # Field matching our configured trained model input. 
Typically for NLP models, the field name is text_field.\n", - " }\n", - " }\n", - " }\n", - "\n", - " ]\n", - ")" - ], - "metadata": { - "id": "XdxP1bJ2KocF" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "client.indices.create(\n", - "index=\"plagiarism-checker\",\n", - "mappings={\n", - "\"properties\": {\n", - " \"title\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\n", - " \"keyword\": {\n", - " \"type\": \"keyword\"\n", - " }\n", - " }\n", - " },\n", - " \"abstract\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\n", - " \"keyword\": {\n", - " \"type\": \"keyword\"\n", - " }\n", - " }\n", - " },\n", - " \"url\": {\n", - " \"type\": \"keyword\"\n", - " },\n", - " \"venue\": {\n", - " \"type\": \"keyword\"\n", - " },\n", - " \"year\": {\n", - " \"type\": \"keyword\"\n", - " },\n", - " \"abstract_vector.predicted_value\": { # Inference results field, target_field.predicted_value\n", - " \"type\": \"dense_vector\",\n", - " \"dims\": 768, # embedding_size\n", - " \"index\": \"true\",\n", - " \"similarity\": \"dot_product\" # When indexing vectors for approximate kNN search, you need to specify the similarity function for comparing the vectors.\n", - " }\n", - " }\n", - "}\n", - ")" - ], - "metadata": { - "id": "cN4KjsXKKyTu" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "url = \"https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/emnlp2016-2018.json\"\n", - "\n", - "# Send a request to the URL and get the response\n", - "response = urlopen(url)\n", - "\n", - "# Load the response data into a JSON object\n", - "data_json = json.loads(response.read())\n", - "\n", - "def create_index_body(doc):\n", - " \"\"\" Generate the body for an Elasticsearch document. 
\"\"\"\n", - " return {\n", - " \"_index\": \"plagiarism-docs\",\n", - " \"_source\": doc,\n", - " }\n", - "\n", - "# Prepare the documents to be indexed\n", - "documents = [create_index_body(doc) for doc in data_json]\n", - "\n", - "# Use helpers.bulk to index\n", - "helpers.bulk(client, documents)\n", - "\n", - "print(\"Done indexing documents into `plagiarism-docs` source index\")" - ], - "metadata": { - "id": "svjGh_hUK136" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "#reindex with ingest pipeline\n", - "\n", - "client.reindex(wait_for_completion=True,\n", - " source={\n", - " \"index\": \"plagiarism-docs\"\n", - " },\n", - " dest= {\n", - " \"index\": \"plagiarism-checker\",\n", - " \"pipeline\": \"plagiarism-checker-pipeline\"\n", - " }\n", - ")" - ], - "metadata": { - "id": "_lHg7p6SK5Ws" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "#duplicated text - direct plagiarism\n", - "\n", - "model_text = 'Understanding and reasoning about cooking recipes is a fruitful research direction towards enabling machines to interpret procedural text. In this work, we introduce RecipeQA, a dataset for multimodal comprehension of cooking recipes. It comprises of approximately 20K instructional recipes with multiple modalities such as titles, descriptions and aligned set of images. With over 36K automatically generated question-answer pairs, we design a set of comprehension and reasoning tasks that require joint understanding of images and text, capturing the temporal flow of events and making sense of procedural knowledge. Our preliminary results indicate that RecipeQA will serve as a challenging test bed and an ideal benchmark for evaluating machine comprehension systems. 
The data and leaderboard are available at http://hucvl.github.io/recipeqa.'\n", - "\n", - "response = client.search(index='plagiarism-checker', size=1,\n", - " knn={\n", - " \"field\": \"abstract_vector.predicted_value\",\n", - " \"k\": 9,\n", - " \"num_candidates\": 974,\n", - " \"query_vector_builder\": {\n", - " \"text_embedding\": {\n", - " \"model_id\": \"sentence-transformers__all-mpnet-base-v2\",\n", - " \"model_text\": model_text\n", - " }\n", - " }\n", - " }\n", - ")\n", - "\n", - "for hit in response['hits']['hits']:\n", - " score = hit['_score']\n", - " title = hit['_source']['title']\n", - " abstract = hit['_source']['abstract']\n", - " openai = hit['_source']['openai-detector']['predicted_value']\n", - " url = hit['_source']['url']\n", - "\n", - " if score > 0.9:\n", - " print(f\"\\nHigh similarity detected! This might be plagiarism.\")\n", - " print(f\"\\nMost similar document: '{title}'\\n\\nAbstract: {abstract}\\n\\nurl: {url}\\n\\nScore:{score}\\n\")\n", - "\n", - " if openai == 'Fake':\n", - " print(\"This document may have been created by AI.\\n\")\n", - "\n", - " elif score < 0.7:\n", - " print(f\"\\nLow similarity detected. 
This might not be plagiarism.\")\n", - "\n", - " if openai == 'Fake':\n", - " print(\"This document may have been created by AI.\\n\")\n", - "\n", - " else:\n", - " print(f\"\\nModerate similarity detected.\")\n", - " print(f\"\\nMost similar document: '{title}'\\n\\nAbstract: {abstract}\\n\\nurl: {url}\\n\\nScore:{score}\\n\")\n", - "\n", - " if openai == 'Fake':\n", - " print(\"This document may have been created by AI.\\n\")\n", - "\n", - "ml_client = MlClient(client)\n", - "\n", - "model_id = 'roberta-base-openai-detector' #open ai text classification model\n", - "\n", - "document = [\n", - " {\n", - " \"text_field\": model_text\n", - " }\n", - "]\n", - "\n", - "ml_response = ml_client.infer_trained_model(model_id=model_id, docs=document)\n", - "\n", - "predicted_value = ml_response['inference_results'][0]['predicted_value']\n", - "\n", - "if predicted_value == 'Fake':\n", - " print(\"Note: The text query you entered may have been generated by AI.\\n\")\n" - ], - "metadata": { - "id": "51Tjohr8K-tW" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "#similar text - paraphrase plagiarism\n", - "\n", - "model_text = 'Comprehending and deducing information from culinary instructions represents a promising avenue for research aimed at empowering artificial intelligence to decipher step-by-step text. In this study, we present CuisineInquiry, a database for the multifaceted understanding of cooking guidelines. It encompasses a substantial number of informative recipes featuring various elements such as headings, explanations, and a matched assortment of visuals. Utilizing an extensive set of automatically crafted question-answer pairings, we formulate a series of tasks focusing on understanding and logic that necessitate a combined interpretation of visuals and written content. This involves capturing the sequential progression of events and extracting meaning from procedural expertise. 
Our initial findings suggest that CuisineInquiry is poised to function as a demanding experimental platform.'\n", - "\n", - "response = client.search(index='plagiarism-checker', size=1,\n", - " knn={\n", - " \"field\": \"abstract_vector.predicted_value\",\n", - " \"k\": 9,\n", - " \"num_candidates\": 974,\n", - " \"query_vector_builder\": {\n", - " \"text_embedding\": {\n", - " \"model_id\": \"sentence-transformers__all-mpnet-base-v2\",\n", - " \"model_text\": model_text\n", - " }\n", - " }\n", - " }\n", - ")\n", - "\n", - "for hit in response['hits']['hits']:\n", - " score = hit['_score']\n", - " title = hit['_source']['title']\n", - " abstract = hit['_source']['abstract']\n", - " openai = hit['_source']['openai-detector']['predicted_value']\n", - " url = hit['_source']['url']\n", - "\n", - " if score > 0.9:\n", - " print(f\"\\nHigh similarity detected! This might be plagiarism.\")\n", - " print(f\"\\nMost similar document: '{title}'\\n\\nAbstract: {abstract}\\n\\nurl: {url}\\n\\nScore:{score}\\n\")\n", - "\n", - " if openai == 'Fake':\n", - " print(\"This document may have been created by AI.\\n\")\n", - "\n", - " elif score < 0.7:\n", - " print(f\"\\nLow similarity detected. 
This might not be plagiarism.\")\n", - "\n", - " if openai == 'Fake':\n", - " print(\"This document may have been created by AI.\\n\")\n", - "\n", - " else:\n", - " print(f\"\\nModerate similarity detected.\")\n", - " print(f\"\\nMost similar document: '{title}'\\n\\nAbstract: {abstract}\\n\\nurl: {url}\\n\\nScore:{score}\\n\")\n", - "\n", - " if openai == 'Fake':\n", - " print(\"This document may have been created by AI.\\n\")\n", - "\n", - "ml_client = MlClient(client)\n", - "\n", - "model_id = 'roberta-base-openai-detector' #open ai text classification model\n", - "\n", - "document = [\n", - " {\n", - " \"text_field\": model_text\n", - " }\n", - "]\n", - "\n", - "ml_response = ml_client.infer_trained_model(model_id=model_id, docs=document)\n", - "\n", - "predicted_value = ml_response['inference_results'][0]['predicted_value']\n", - "\n", - "if predicted_value == 'Fake':\n", - " print(\"Note: The text query you entered may have been generated by AI.\\n\")\n" - ], - "metadata": { - "id": "XcYCPXM0LAT3" - }, - "execution_count": null, - "outputs": [] - } - ] + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "**Blog: Plagiarism detection with Elasticsearch**" + ], + "metadata": { + "id": "kmMkWI9MH7SG" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Q9cqVF6lJtYw" + }, + "outputs": [], + "source": [ + "!pip install elasticsearch==8.11 #Elasticsearch" + ] + }, + { + "cell_type": "code", + "source": [ + "pip -q install eland elasticsearch sentence_transformers transformers torch==2.1.0" + ], + "metadata": { + "id": "wwi3NpszKa_U" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from elasticsearch import Elasticsearch, helpers\n", + "from elasticsearch.client import MlClient\n", + "from eland.ml.pytorch import PyTorchModel\n", + "from 
eland.ml.pytorch.transformers import TransformerModel\n", + "from urllib.request import urlopen\n", + "import json\n", + "from pathlib import Path\n", + "import getpass" + ], + "metadata": { + "id": "8JSAt-uUKcix" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Found in the 'Manage Deployment' page\n", + "CLOUD_ID = getpass.getpass(\"Enter Elastic Cloud ID: \")\n", + "\n", + "# Password for the 'elastic' user generated by Elasticsearch\n", + "ELASTIC_PASSWORD = getpass.getpass(\"Enter Elastic password: \")\n", + "\n", + "# Create the client instance\n", + "client = Elasticsearch(\n", + " cloud_id=CLOUD_ID, basic_auth=(\"elastic\", ELASTIC_PASSWORD), request_timeout=3600\n", + ")" + ], + "metadata": { + "id": "ctmF7sNwKd5o" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Set the model name from Hugging Face and task type\n", + "# open ai detector model - developed by open ai https://github.com/openai/gpt-2-output-dataset/tree/master/detector\n", + "hf_model_id = \"roberta-base-openai-detector\"\n", + "tm = TransformerModel(model_id=hf_model_id, task_type=\"text_classification\")\n", + "\n", + "# set the modelID as it is named in Elasticsearch\n", + "es_model_id = tm.elasticsearch_model_id()\n", + "\n", + "# Download the model from Hugging Face\n", + "tmp_path = \"models\"\n", + "Path(tmp_path).mkdir(parents=True, exist_ok=True)\n", + "model_path, config, vocab_path = tm.save(tmp_path)\n", + "\n", + "# Load the model into Elasticsearch\n", + "ptm = PyTorchModel(client, es_model_id)\n", + "ptm.import_model(\n", + " model_path=model_path, config_path=None, vocab_path=vocab_path, config=config\n", + ")\n", + "\n", + "# Start the model\n", + "s = MlClient.start_trained_model_deployment(client, model_id=es_model_id)\n", + "s.body" + ], + "metadata": { + "id": "AXeDnvJWKfll" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# 
Set the model name from Hugging Face and task type\n", + "# sentence-transformers model\n", + "hf_model_id = \"sentence-transformers/all-mpnet-base-v2\"\n", + "tm = TransformerModel(model_id=hf_model_id, task_type=\"text_embedding\")\n", + "\n", + "# set the modelID as it is named in Elasticsearch\n", + "es_model_id = tm.elasticsearch_model_id()\n", + "\n", + "# Download the model from Hugging Face\n", + "tmp_path = \"models\"\n", + "Path(tmp_path).mkdir(parents=True, exist_ok=True)\n", + "model_path, config, vocab_path = tm.save(tmp_path)\n", + "\n", + "# Load the model into Elasticsearch\n", + "ptm = PyTorchModel(client, es_model_id)\n", + "ptm.import_model(\n", + " model_path=model_path, config_path=None, vocab_path=vocab_path, config=config\n", + ")\n", + "\n", + "# Start the model\n", + "s = MlClient.start_trained_model_deployment(client, model_id=es_model_id)\n", + "s.body" + ], + "metadata": { + "id": "wFiJAVpBKjkP" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# source index\n", + "client.indices.create(\n", + " index=\"plagiarism-docs\",\n", + " mappings={\n", + " \"properties\": {\n", + " \"title\": {\"type\": \"text\", \"fields\": {\"keyword\": {\"type\": \"keyword\"}}},\n", + " \"abstract\": {\"type\": \"text\", \"fields\": {\"keyword\": {\"type\": \"keyword\"}}},\n", + " \"url\": {\"type\": \"keyword\"},\n", + " \"venue\": {\"type\": \"keyword\"},\n", + " \"year\": {\"type\": \"keyword\"},\n", + " }\n", + " },\n", + ")" + ], + "metadata": { + "id": "S-SNKitkKmHC" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# ingest pipeline\n", + "\n", + "client.ingest.put_pipeline(\n", + " id=\"plagiarism-checker-pipeline\",\n", + " processors=[\n", + " {\n", + " \"inference\": { # for ml models - to infer against the data that is being ingested in the pipeline\n", + " \"model_id\": \"roberta-base-openai-detector\", # text classification model id\n", + " 
\"target_field\": \"openai-detector\", # Target field for the inference results\n", + " \"field_map\": { # Maps the document field names to the known field names of the model.\n", + " \"abstract\": \"text_field\" # Field matching our configured trained model input. Typically for NLP models, the field name is text_field.\n", + " },\n", + " }\n", + " },\n", + " {\n", + " \"inference\": {\n", + " \"model_id\": \"sentence-transformers__all-mpnet-base-v2\", # text embedding model model id\n", + " \"target_field\": \"abstract_vector\", # Target field for the inference results\n", + " \"field_map\": { # Maps the document field names to the known field names of the model.\n", + " \"abstract\": \"text_field\" # Field matching our configured trained model input. Typically for NLP models, the field name is text_field.\n", + " },\n", + " }\n", + " },\n", + " ],\n", + ")" + ], + "metadata": { + "id": "XdxP1bJ2KocF" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "client.indices.create(\n", + " index=\"plagiarism-checker\",\n", + " mappings={\n", + " \"properties\": {\n", + " \"title\": {\"type\": \"text\", \"fields\": {\"keyword\": {\"type\": \"keyword\"}}},\n", + " \"abstract\": {\"type\": \"text\", \"fields\": {\"keyword\": {\"type\": \"keyword\"}}},\n", + " \"url\": {\"type\": \"keyword\"},\n", + " \"venue\": {\"type\": \"keyword\"},\n", + " \"year\": {\"type\": \"keyword\"},\n", + " \"abstract_vector.predicted_value\": { # Inference results field, target_field.predicted_value\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 768, # embedding_size\n", + " \"index\": \"true\",\n", + " \"similarity\": \"dot_product\", # When indexing vectors for approximate kNN search, you need to specify the similarity function for comparing the vectors.\n", + " },\n", + " }\n", + " },\n", + ")" + ], + "metadata": { + "id": "cN4KjsXKKyTu" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "url = 
\"https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/emnlp2016-2018.json\"\n", + "\n", + "# Send a request to the URL and get the response\n", + "response = urlopen(url)\n", + "\n", + "# Load the response data into a JSON object\n", + "data_json = json.loads(response.read())\n", + "\n", + "\n", + "def create_index_body(doc):\n", + " \"\"\"Generate the body for an Elasticsearch document.\"\"\"\n", + " return {\n", + " \"_index\": \"plagiarism-docs\",\n", + " \"_source\": doc,\n", + " }\n", + "\n", + "\n", + "# Prepare the documents to be indexed\n", + "documents = [create_index_body(doc) for doc in data_json]\n", + "\n", + "# Use helpers.bulk to index\n", + "helpers.bulk(client, documents)\n", + "\n", + "print(\"Done indexing documents into `plagiarism-docs` source index\")" + ], + "metadata": { + "id": "svjGh_hUK136" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# reindex with ingest pipeline\n", + "\n", + "client.reindex(\n", + " wait_for_completion=True,\n", + " source={\"index\": \"plagiarism-docs\"},\n", + " dest={\"index\": \"plagiarism-checker\", \"pipeline\": \"plagiarism-checker-pipeline\"},\n", + ")" + ], + "metadata": { + "id": "_lHg7p6SK5Ws" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# duplicated text - direct plagiarism\n", + "\n", + "model_text = \"Understanding and reasoning about cooking recipes is a fruitful research direction towards enabling machines to interpret procedural text. In this work, we introduce RecipeQA, a dataset for multimodal comprehension of cooking recipes. It comprises of approximately 20K instructional recipes with multiple modalities such as titles, descriptions and aligned set of images. 
With over 36K automatically generated question-answer pairs, we design a set of comprehension and reasoning tasks that require joint understanding of images and text, capturing the temporal flow of events and making sense of procedural knowledge. Our preliminary results indicate that RecipeQA will serve as a challenging test bed and an ideal benchmark for evaluating machine comprehension systems. The data and leaderboard are available at http://hucvl.github.io/recipeqa.\"\n", + "\n", + "response = client.search(\n", + " index=\"plagiarism-checker\",\n", + " size=1,\n", + " knn={\n", + " \"field\": \"abstract_vector.predicted_value\",\n", + " \"k\": 9,\n", + " \"num_candidates\": 974,\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \"sentence-transformers__all-mpnet-base-v2\",\n", + " \"model_text\": model_text,\n", + " }\n", + " },\n", + " },\n", + ")\n", + "\n", + "for hit in response[\"hits\"][\"hits\"]:\n", + " score = hit[\"_score\"]\n", + " title = hit[\"_source\"][\"title\"]\n", + " abstract = hit[\"_source\"][\"abstract\"]\n", + " openai = hit[\"_source\"][\"openai-detector\"][\"predicted_value\"]\n", + " url = hit[\"_source\"][\"url\"]\n", + "\n", + " if score > 0.9:\n", + " print(f\"\\nHigh similarity detected! This might be plagiarism.\")\n", + " print(\n", + " f\"\\nMost similar document: '{title}'\\n\\nAbstract: {abstract}\\n\\nurl: {url}\\n\\nScore:{score}\\n\"\n", + " )\n", + "\n", + " if openai == \"Fake\":\n", + " print(\"This document may have been created by AI.\\n\")\n", + "\n", + " elif score < 0.7:\n", + " print(f\"\\nLow similarity detected. 
This might not be plagiarism.\")\n", + "\n", + " if openai == \"Fake\":\n", + " print(\"This document may have been created by AI.\\n\")\n", + "\n", + " else:\n", + " print(f\"\\nModerate similarity detected.\")\n", + " print(\n", + " f\"\\nMost similar document: '{title}'\\n\\nAbstract: {abstract}\\n\\nurl: {url}\\n\\nScore:{score}\\n\"\n", + " )\n", + "\n", + " if openai == \"Fake\":\n", + " print(\"This document may have been created by AI.\\n\")\n", + "\n", + "ml_client = MlClient(client)\n", + "\n", + "model_id = \"roberta-base-openai-detector\" # open ai text classification model\n", + "\n", + "document = [{\"text_field\": model_text}]\n", + "\n", + "ml_response = ml_client.infer_trained_model(model_id=model_id, docs=document)\n", + "\n", + "predicted_value = ml_response[\"inference_results\"][0][\"predicted_value\"]\n", + "\n", + "if predicted_value == \"Fake\":\n", + " print(\"Note: The text query you entered may have been generated by AI.\\n\")" + ], + "metadata": { + "id": "51Tjohr8K-tW" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# similar text - paraphrase plagiarism\n", + "\n", + "model_text = \"Comprehending and deducing information from culinary instructions represents a promising avenue for research aimed at empowering artificial intelligence to decipher step-by-step text. In this study, we present CuisineInquiry, a database for the multifaceted understanding of cooking guidelines. It encompasses a substantial number of informative recipes featuring various elements such as headings, explanations, and a matched assortment of visuals. Utilizing an extensive set of automatically crafted question-answer pairings, we formulate a series of tasks focusing on understanding and logic that necessitate a combined interpretation of visuals and written content. This involves capturing the sequential progression of events and extracting meaning from procedural expertise. 
Our initial findings suggest that CuisineInquiry is poised to function as a demanding experimental platform.\"\n", + "\n", + "response = client.search(\n", + " index=\"plagiarism-checker\",\n", + " size=1,\n", + " knn={\n", + " \"field\": \"abstract_vector.predicted_value\",\n", + " \"k\": 9,\n", + " \"num_candidates\": 974,\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \"sentence-transformers__all-mpnet-base-v2\",\n", + " \"model_text\": model_text,\n", + " }\n", + " },\n", + " },\n", + ")\n", + "\n", + "for hit in response[\"hits\"][\"hits\"]:\n", + " score = hit[\"_score\"]\n", + " title = hit[\"_source\"][\"title\"]\n", + " abstract = hit[\"_source\"][\"abstract\"]\n", + " openai = hit[\"_source\"][\"openai-detector\"][\"predicted_value\"]\n", + " url = hit[\"_source\"][\"url\"]\n", + "\n", + " if score > 0.9:\n", + " print(f\"\\nHigh similarity detected! This might be plagiarism.\")\n", + " print(\n", + " f\"\\nMost similar document: '{title}'\\n\\nAbstract: {abstract}\\n\\nurl: {url}\\n\\nScore:{score}\\n\"\n", + " )\n", + "\n", + " if openai == \"Fake\":\n", + " print(\"This document may have been created by AI.\\n\")\n", + "\n", + " elif score < 0.7:\n", + " print(f\"\\nLow similarity detected. 
This might not be plagiarism.\")\n", + "\n", + " if openai == \"Fake\":\n", + " print(\"This document may have been created by AI.\\n\")\n", + "\n", + " else:\n", + " print(f\"\\nModerate similarity detected.\")\n", + " print(\n", + " f\"\\nMost similar document: '{title}'\\n\\nAbstract: {abstract}\\n\\nurl: {url}\\n\\nScore:{score}\\n\"\n", + " )\n", + "\n", + " if openai == \"Fake\":\n", + " print(\"This document may have been created by AI.\\n\")\n", + "\n", + "ml_client = MlClient(client)\n", + "\n", + "model_id = \"roberta-base-openai-detector\" # open ai text classification model\n", + "\n", + "document = [{\"text_field\": model_text}]\n", + "\n", + "ml_response = ml_client.infer_trained_model(model_id=model_id, docs=document)\n", + "\n", + "predicted_value = ml_response[\"inference_results\"][0][\"predicted_value\"]\n", + "\n", + "if predicted_value == \"Fake\":\n", + " print(\"Note: The text query you entered may have been generated by AI.\\n\")" + ], + "metadata": { + "id": "XcYCPXM0LAT3" + }, + "execution_count": null, + "outputs": [] + } + ] } diff --git a/supporting-blog-content/vector-search-implementation-guide-api/Load_Hugging_Face_Model.ipynb b/supporting-blog-content/vector-search-implementation-guide-api/Load_Hugging_Face_Model.ipynb index f1358957..6e447487 100644 --- a/supporting-blog-content/vector-search-implementation-guide-api/Load_Hugging_Face_Model.ipynb +++ b/supporting-blog-content/vector-search-implementation-guide-api/Load_Hugging_Face_Model.ipynb @@ -1,110 +1,110 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# Load Huggingface Model\n", - "\n", - "This code will use 
[Eland](https://) to load an embedding model into Elasticsearch\n", - "\n", - "\n", - "Blog Vector Search (kNN) Implementation Guide - API Edition\n", - "\n" - ], - "metadata": { - "id": "sUmu4TerK9OG" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "eaGnibZA-78Z" - }, - "outputs": [], - "source": [ - "!pip install -q elasticsearch eland[pytorch]" - ] - }, - { - "cell_type": "code", - "source": [ - "import getpass" - ], - "metadata": { - "id": "Fxius9__-9Mb" - }, - "execution_count": 2, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Set Connection and Model information\n", - "\n", - "\n", - "* Cloud ID can be retrieved from the Elastic Cloud console\n", - "* Elastic API Key can be generated in Elasticsearch Kibana under Stack Management\n", - "* Model ID is the Model Card Name from Hugging Face\n", - "\n" - ], - "metadata": { - "id": "J3XJeDaJMMk9" - } - }, - { - "cell_type": "code", - "source": [ - "CLOUD_ID = getpass.getpass(\"Enter Elastic Cloud ID: \")\n", - "ELASTIC_API = getpass.getpass(\"Enter Elastic API: \")\n", - "MODEL_ID = getpass.getpass(\"Enter Model ID from Hugging Face Model Card \")" - ], - "metadata": { - "id": "e_zGEcyr_Ce-" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Load the model\n", - "If you want to also start the model, uncomment the `--start` line" - ], - "metadata": { - "id": "ZG6B0greNAQ7" - } - }, - { - "cell_type": "code", - "source": [ - "!eland_import_hub_model \\\n", - " --cloud-id $CLOUD_ID \\\n", - " --es-api-key $ELASTIC_API \\\n", - " --hub-model-id $MODEL_ID \\\n", - " --task-type text_embedding\n", - " # --start" - ], - "metadata": { - "id": "oPKVhWeT_Cc4" - }, - "execution_count": null, - "outputs": [] - } - ] + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Load 
Huggingface Model\n", + "\n", + "This code will use [Eland](https://) to load an embedding model into Elasticsearch\n", + "\n", + "\n", + "Blog Vector Search (kNN) Implementation Guide - API Edition\n", + "\n" + ], + "metadata": { + "id": "sUmu4TerK9OG" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eaGnibZA-78Z" + }, + "outputs": [], + "source": [ + "!pip install -q elasticsearch eland[pytorch]" + ] + }, + { + "cell_type": "code", + "source": [ + "import getpass" + ], + "metadata": { + "id": "Fxius9__-9Mb" + }, + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Set Connection and Model information\n", + "\n", + "\n", + "* Cloud ID can be retrieved from the Elastic Cloud console\n", + "* Elastic API Key can be generated in Elasticsearch Kibana under Stack Management\n", + "* Model ID is the Model Card Name from Hugging Face\n", + "\n" + ], + "metadata": { + "id": "J3XJeDaJMMk9" + } + }, + { + "cell_type": "code", + "source": [ + "CLOUD_ID = getpass.getpass(\"Enter Elastic Cloud ID: \")\n", + "ELASTIC_API = getpass.getpass(\"Enter Elastic API: \")\n", + "MODEL_ID = getpass.getpass(\"Enter Model ID from Hugging Face Model Card \")" + ], + "metadata": { + "id": "e_zGEcyr_Ce-" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Load the model\n", + "If you want to also start the model, uncomment the `--start` line" + ], + "metadata": { + "id": "ZG6B0greNAQ7" + } + }, + { + "cell_type": "code", + "source": [ + "!eland_import_hub_model \\\n", + " --cloud-id $CLOUD_ID \\\n", + " --es-api-key $ELASTIC_API \\\n", + " --hub-model-id $MODEL_ID \\\n", + " --task-type text_embedding\n", + " # --start" + ], + "metadata": { + "id": "oPKVhWeT_Cc4" + }, + "execution_count": null, + "outputs": [] + } + ] } \ No newline at end of file diff --git 
a/supporting-blog-content/vector-search-implementation-guide-api/vector_search_implementation_guide_api.ipynb b/supporting-blog-content/vector-search-implementation-guide-api/vector_search_implementation_guide_api.ipynb index 2293138e..f7fde521 100644 --- a/supporting-blog-content/vector-search-implementation-guide-api/vector_search_implementation_guide_api.ipynb +++ b/supporting-blog-content/vector-search-implementation-guide-api/vector_search_implementation_guide_api.ipynb @@ -1,619 +1,568 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# Simplified Vector Search (kNN) Implementation Guide\n" - ], - "metadata": { - "id": "XU4UjiHpYdDT" - } - }, - { - "cell_type": "markdown", - "source": [ - "# Loading the Embedding Model\n", - "Loading embedding model: [sentence-transformers/all-distilroberta-v1](https://huggingface.co/sentence-transformers/all-distilroberta-v1)\n", - "\n", - "Loading code borrowed from [elasticsearch-labs](https://www.elastic.co/search-labs) NLP text search [example notebook](https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/main/notebooks/integrations/hugging-face/loading-model-from-hugging-face.ipynb)\n" - ], - "metadata": { - "id": "5lV5UN90l4YN" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Z0TiDltHkebY" - }, - "outputs": [], - "source": [ - "# install packages\n", - "!pip install -qU eland elasticsearch transformers sentence-transformers torch==1.13\n" - ] - }, - { - "cell_type": "code", - "source": [ - "# import modules\n", - "import pandas as pd, json\n", - "from elasticsearch import Elasticsearch\n", - "from 
elasticsearch.helpers import bulk\n", - "from getpass import getpass\n", - "from urllib.request import urlopen\n", - "from pprint import pprint" - ], - "metadata": { - "id": "Riwvd3CHO9qU" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "API_KEY = getpass(\"Elastic deployment API Key\")\n", - "CLOUD_ID = getpass(\"Elastic deployment Cloud ID\")\n", - "HUB_MODEL_ID = getpass(\"Hugging Face Model Hub ID\") #eg sentence-transformers/all-distilroberta-v1\n", - "\n", - "es = Elasticsearch(cloud_id=CLOUD_ID,\n", - " api_key=API_KEY\n", - " )\n", - "es.info() # should return cluster info" - ], - "metadata": { - "id": "So9bJJDVNzgF" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "!eland_import_hub_model --cloud-id $CLOUD_ID --hub-model-id $HUB_MODEL_ID --task-type text_embedding --es-api-key $API_KEY --start" - ], - "metadata": { - "id": "dsFsmzZwpujb" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Ingest pipeline setup" - ], - "metadata": { - "id": "71wNrH0vl4zi" - } - }, - { - "cell_type": "code", - "source": [ - "pipeline = {\n", - " \"processors\": [\n", - " {\n", - " \"inference\": {\n", - " \"field_map\": {\n", - " \"my_text\": \"text_field\"\n", - " },\n", - " \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n", - " \"target_field\": \"ml.inference.my_vector\",\n", - " \"on_failure\": [\n", - " {\n", - " \"append\": {\n", - " \"field\": \"_source._ingest.inference_errors\",\n", - " \"value\": [\n", - " {\n", - " \"message\": \"Processor 'inference' in pipeline 'ml-inference-title-vector' failed with message '{{ _ingest.on_failure_message }}'\",\n", - " \"pipeline\": \"ml-inference-title-vector\",\n", - " \"timestamp\": \"{{{ _ingest.timestamp }}}\"\n", - " }\n", - " ]\n", - " }\n", - " }\n", - " ]\n", - " }\n", - " },\n", - " {\n", - " \"set\": {\n", - " \"field\": \"my_vector\",\n", - " \"if\": 
\"ctx?.ml?.inference != null && ctx.ml.inference['my_vector'] != null\",\n", - " \"copy_from\": \"ml.inference.my_vector.predicted_value\",\n", - " \"description\": \"Copy the predicted_value to 'my_vector'\"\n", - " }\n", - " },\n", - " {\n", - " \"remove\": {\n", - " \"field\": \"ml.inference.my_vector\",\n", - " \"ignore_missing\": True\n", - " }\n", - " }\n", - " ]\n", - "}\n", - "\n", - "pipeline_id = 'vector_embedding_demo'\n", - "response = es.ingest.put_pipeline(id=pipeline_id, body=pipeline)\n", - "\n", - "# Print the response\n", - "print(response)" - ], - "metadata": { - "id": "SL47BJNyl3-r", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "43588d08-9dfb-4b13-9c42-58e071cf3526" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "{'acknowledged': True}\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - ":44: DeprecationWarning: The 'body' parameter is deprecated and will be removed in a future version. 
Instead use individual parameters.\n", - " response = es.ingest.put_pipeline(id=pipeline_id, body=pipeline)\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Index Mapping / Template setup" - ], - "metadata": { - "id": "TgBeEw_Ql5I5" - } - }, - { - "cell_type": "code", - "source": [ - "index_patterns = [\n", - " \"my_vector_index-*\"\n", - " ]\n", - "\n", - "order = 1\n", - "\n", - "settings = {\n", - " \"number_of_shards\": 1,\n", - " \"number_of_replicas\": 1,\n", - " \"index.default_pipeline\": pipeline_id\n", - " }\n", - "\n", - "mappings = {\n", - " \"properties\": {\n", - " \"my_vector\": {\n", - " \"type\": \"dense_vector\",\n", - " \"dims\": 768,\n", - " \"index\": True,\n", - " \"similarity\": \"dot_product\"\n", - " },\n", - " \"my_text\": {\n", - " \"type\": \"text\"\n", - " }\n", - " },\n", - " \"_source\": {\n", - " \"excludes\": [\n", - " \"my_vector\"\n", - " ]\n", - " }\n", - " }\n", - "\n", - "\n", - "# Create the index template\n", - "response = es.indices.put_template(name=\"my_vector_index\",\n", - " index_patterns=index_patterns,\n", - " order=order,\n", - " settings=settings,\n", - " mappings=mappings\n", - " )\n", - "\n", - "\n", - "# Print the response\n", - "print(response)\n" - ], - "metadata": { - "id": "zNqjEiPZl36N", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "55130ac4-042f-4d65-bc4b-08c6527d85d4" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "{'acknowledged': True}\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - ":34: ElasticsearchWarning: Legacy index templates are deprecated in favor of composable templates.\n", - " response = es.indices.put_template(name=\"my_vector_index\",\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Indexing Data\n" - ], - "metadata": { - "id": "bztQcxbll5cs" - } - }, - { - "cell_type": "code", - "source": [ - "index_name = 'my_vector_index-01'" - 
], - "metadata": { - "id": "XbapSs1c-hkd" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "data = [\n", - " (\"Hey, careful, man, there's a beverage here!\", \"The Dude\"),\n", - " (\"I’m The Dude. So, that’s what you call me. You know, that or, uh, His Dudeness, or, uh, Duder, or El Duderino, if you’re not into the whole brevity thing\", \"The Dude\"),\n", - " (\"You don't go out looking for a job dressed like that? On a weekday?\", \"The Big Lebowski\"),\n", - " (\"What do you mean brought it bowling, Dude?\", \"Walter Sobchak\"),\n", - " (\"Donny was a good bowler, and a good man. He was one of us. He was a man who loved the outdoors... and bowling, and as a surfer he explored the beaches of Southern California, from La Jolla to Leo Carrillo and... up to... Pismo\", \"Walter Sobchak\")\n", - "]\n", - "\n", - "actions = [\n", - " {\n", - " \"_op_type\": \"index\",\n", - " \"_index\": \"my_vector_index-01\",\n", - " \"_source\": {\n", - " \"my_text\": text,\n", - " \"my_metadata\": metadata\n", - " }\n", - " } for text, metadata in data\n", - "]\n", - "\n", - "bulk(es, actions)\n", - "\n", - "# Refresh the index to make sure all data is searchable\n", - "es.indices.refresh(index=\"my_vector_index-01\")\n" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "bSIJ-AngVmUi", - "outputId": "49074d6e-1d30-44e1-d565-edac0251eae1" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "ObjectApiResponse({'_shards': {'total': 2, 'successful': 1, 'failed': 0}})" - ] - }, - "metadata": {}, - "execution_count": 14 - } - ] + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Simplified Vector Search (kNN) Implementation Guide\n" + ], + "metadata": { + "id": "XU4UjiHpYdDT" + } + }, + { + "cell_type": 
"markdown", + "source": [ + "# Loading the Embedding Model\n", + "Loading embedding model: [sentence-transformers/all-distilroberta-v1](https://huggingface.co/sentence-transformers/all-distilroberta-v1)\n", + "\n", + "Loading code borrowed from [elasticsearch-labs](https://www.elastic.co/search-labs) NLP text search [example notebook](https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/main/notebooks/integrations/hugging-face/loading-model-from-hugging-face.ipynb)\n" + ], + "metadata": { + "id": "5lV5UN90l4YN" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Z0TiDltHkebY" + }, + "outputs": [], + "source": [ + "# install packages\n", + "!pip install -qU eland elasticsearch transformers sentence-transformers torch==1.13" + ] + }, + { + "cell_type": "code", + "source": [ + "# import modules\n", + "import pandas as pd, json\n", + "from elasticsearch import Elasticsearch\n", + "from elasticsearch.helpers import bulk\n", + "from getpass import getpass\n", + "from urllib.request import urlopen\n", + "from pprint import pprint" + ], + "metadata": { + "id": "Riwvd3CHO9qU" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "API_KEY = getpass(\"Elastic deployment API Key\")\n", + "CLOUD_ID = getpass(\"Elastic deployment Cloud ID\")\n", + "HUB_MODEL_ID = getpass(\n", + " \"Hugging Face Model Hub ID\"\n", + ") # eg sentence-transformers/all-distilroberta-v1\n", + "\n", + "es = Elasticsearch(cloud_id=CLOUD_ID, api_key=API_KEY)\n", + "es.info() # should return cluster info" + ], + "metadata": { + "id": "So9bJJDVNzgF" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!eland_import_hub_model --cloud-id $CLOUD_ID --hub-model-id $HUB_MODEL_ID --task-type text_embedding --es-api-key $API_KEY --start" + ], + "metadata": { + "id": "dsFsmzZwpujb" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + 
"source": [ + "# Ingest pipeline setup" + ], + "metadata": { + "id": "71wNrH0vl4zi" + } + }, + { + "cell_type": "code", + "source": [ + "pipeline = {\n", + " \"processors\": [\n", + " {\n", + " \"inference\": {\n", + " \"field_map\": {\"my_text\": \"text_field\"},\n", + " \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n", + " \"target_field\": \"ml.inference.my_vector\",\n", + " \"on_failure\": [\n", + " {\n", + " \"append\": {\n", + " \"field\": \"_source._ingest.inference_errors\",\n", + " \"value\": [\n", + " {\n", + " \"message\": \"Processor 'inference' in pipeline 'ml-inference-title-vector' failed with message '{{ _ingest.on_failure_message }}'\",\n", + " \"pipeline\": \"ml-inference-title-vector\",\n", + " \"timestamp\": \"{{{ _ingest.timestamp }}}\",\n", + " }\n", + " ],\n", + " }\n", + " }\n", + " ],\n", + " }\n", + " },\n", + " {\n", + " \"set\": {\n", + " \"field\": \"my_vector\",\n", + " \"if\": \"ctx?.ml?.inference != null && ctx.ml.inference['my_vector'] != null\",\n", + " \"copy_from\": \"ml.inference.my_vector.predicted_value\",\n", + " \"description\": \"Copy the predicted_value to 'my_vector'\",\n", + " }\n", + " },\n", + " {\"remove\": {\"field\": \"ml.inference.my_vector\", \"ignore_missing\": True}},\n", + " ]\n", + "}\n", + "\n", + "pipeline_id = \"vector_embedding_demo\"\n", + "response = es.ingest.put_pipeline(id=pipeline_id, body=pipeline)\n", + "\n", + "# Print the response\n", + "print(response)" + ], + "metadata": { + "id": "SL47BJNyl3-r", + "colab": { + "base_uri": "https://localhost:8080/" }, + "outputId": "43588d08-9dfb-4b13-9c42-58e071cf3526" + }, + "execution_count": null, + "outputs": [ { - "cell_type": "markdown", - "source": [ - "# Querying Data\n" - ], - "metadata": { - "id": "ENlZ3Ndjl5yl" - } + "output_type": "stream", + "name": "stdout", + "text": [ + "{'acknowledged': True}\n" + ] }, { - "cell_type": "markdown", - "source": [ - "Approximate k-nearest neighbor (kNN)" - ], - "metadata": { - "id": 
"Xk4CBDpimfDH" - } + "output_type": "stream", + "name": "stderr", + "text": [ + ":44: DeprecationWarning: The 'body' parameter is deprecated and will be removed in a future version. Instead use individual parameters.\n", + " response = es.ingest.put_pipeline(id=pipeline_id, body=pipeline)\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Index Mapping / Template setup" + ], + "metadata": { + "id": "TgBeEw_Ql5I5" + } + }, + { + "cell_type": "code", + "source": [ + "index_patterns = [\"my_vector_index-*\"]\n", + "\n", + "order = 1\n", + "\n", + "settings = {\n", + " \"number_of_shards\": 1,\n", + " \"number_of_replicas\": 1,\n", + " \"index.default_pipeline\": pipeline_id,\n", + "}\n", + "\n", + "mappings = {\n", + " \"properties\": {\n", + " \"my_vector\": {\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 768,\n", + " \"index\": True,\n", + " \"similarity\": \"dot_product\",\n", + " },\n", + " \"my_text\": {\"type\": \"text\"},\n", + " },\n", + " \"_source\": {\"excludes\": [\"my_vector\"]},\n", + "}\n", + "\n", + "\n", + "# Create the index template\n", + "response = es.indices.put_template(\n", + " name=\"my_vector_index\",\n", + " index_patterns=index_patterns,\n", + " order=order,\n", + " settings=settings,\n", + " mappings=mappings,\n", + ")\n", + "\n", + "\n", + "# Print the response\n", + "print(response)" + ], + "metadata": { + "id": "zNqjEiPZl36N", + "colab": { + "base_uri": "https://localhost:8080/" }, + "outputId": "55130ac4-042f-4d65-bc4b-08c6527d85d4" + }, + "execution_count": null, + "outputs": [ { - "cell_type": "code", - "source": [ - "knn = {\n", - " \"field\": \"my_vector\",\n", - " \"k\": 1,\n", - " \"num_candidates\": 5,\n", - " \"query_vector_builder\": {\n", - " \"text_embedding\": {\n", - " \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n", - " \"model_text\": \"Watchout I have a drink\"\n", - " }\n", - " }\n", - " }\n", - "\n", - "response = es.search(\n", - " index=index_name,\n", - " knn=knn,\n", - 
" source=True)\n", - "\n", - "pprint(response['hits']['hits'])" - ], - "metadata": { - "id": "xl76_rM4l3iC", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "9a796cf1-4beb-4405-91b9-c323db756d36" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[{'_id': 'UO5Y3IoB3ljSe18vZY6D',\n", - " '_index': 'my_vector_index-01',\n", - " '_score': 0.78170115,\n", - " '_source': {'ml': {'inference': {}},\n", - " 'my_metadata': 'The Dude',\n", - " 'my_text': \"Hey, careful, man, there's a beverage here!\"}}]\n" - ] - } - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "{'acknowledged': True}\n" + ] }, { - "cell_type": "markdown", - "source": [ - "## Hybrid Searching (kNN + BM25) with RRF" - ], - "metadata": { - "id": "vhefCRd-mjk8" - } + "output_type": "stream", + "name": "stderr", + "text": [ + ":34: ElasticsearchWarning: Legacy index templates are deprecated in favor of composable templates.\n", + " response = es.indices.put_template(name=\"my_vector_index\",\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Indexing Data\n" + ], + "metadata": { + "id": "bztQcxbll5cs" + } + }, + { + "cell_type": "code", + "source": [ + "index_name = \"my_vector_index-01\"" + ], + "metadata": { + "id": "XbapSs1c-hkd" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "data = [\n", + " (\"Hey, careful, man, there's a beverage here!\", \"The Dude\"),\n", + " (\n", + " \"I’m The Dude. So, that’s what you call me. You know, that or, uh, His Dudeness, or, uh, Duder, or El Duderino, if you’re not into the whole brevity thing\",\n", + " \"The Dude\",\n", + " ),\n", + " (\n", + " \"You don't go out looking for a job dressed like that? On a weekday?\",\n", + " \"The Big Lebowski\",\n", + " ),\n", + " (\"What do you mean brought it bowling, Dude?\", \"Walter Sobchak\"),\n", + " (\n", + " \"Donny was a good bowler, and a good man. 
He was one of us. He was a man who loved the outdoors... and bowling, and as a surfer he explored the beaches of Southern California, from La Jolla to Leo Carrillo and... up to... Pismo\",\n", + " \"Walter Sobchak\",\n", + " ),\n", + "]\n", + "\n", + "actions = [\n", + " {\n", + " \"_op_type\": \"index\",\n", + " \"_index\": \"my_vector_index-01\",\n", + " \"_source\": {\"my_text\": text, \"my_metadata\": metadata},\n", + " }\n", + " for text, metadata in data\n", + "]\n", + "\n", + "bulk(es, actions)\n", + "\n", + "# Refresh the index to make sure all data is searchable\n", + "es.indices.refresh(index=\"my_vector_index-01\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "bSIJ-AngVmUi", + "outputId": "49074d6e-1d30-44e1-d565-edac0251eae1" + }, + "execution_count": null, + "outputs": [ { - "cell_type": "code", - "source": [ - "query = {\n", - " \"match\": {\n", - " \"my_text\": \"bowling\"\n", - " }\n", - " }\n", - "\n", - "knn ={\n", - " \"field\": \"my_vector\",\n", - " \"k\": 3,\n", - " \"num_candidates\": 5,\n", - " \"query_vector_builder\": {\n", - " \"text_embedding\": {\n", - " \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n", - " \"model_text\": \"He enjoyed the game\"\n", - " }\n", - " }\n", - " }\n", - "\n", - "rank: {\n", - " \"rrf\": {}\n", - " }\n", - "\n", - "fields = [\n", - " \"my_text\",\n", - " \"my_metadata\"\n", - " ]\n", - "\n", - "\n", - "response = es.search(\n", - " index=index_name,\n", - " fields=fields,\n", - " knn=knn,\n", - " query=query,\n", - " size=2,\n", - " source=False\n", - " )\n", - "\n", - "pprint(response['hits']['hits'])" - ], - "metadata": { - "id": "wLY8Q6tEmk06", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "dc4dd649-3a66-4084-cba1-2e0e51984037" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[{'_id': 'U-5Y3IoB3ljSe18vZY6D',\n", - " '_index': 'my_vector_index-01',\n", - " 
'_score': 1.8080788,\n", - " 'fields': {'my_metadata': ['Walter Sobchak'],\n", - " 'my_text': ['What do you mean brought it bowling, Dude?']}},\n", - " {'_id': 'VO5Y3IoB3ljSe18vZY6D',\n", - " '_index': 'my_vector_index-01',\n", - " '_score': 1.2358729,\n", - " 'fields': {'my_metadata': ['Walter Sobchak'],\n", - " 'my_text': ['Donny was a good bowler, and a good man. He was one '\n", - " 'of us. He was a man who loved the outdoors... and '\n", - " 'bowling, and as a surfer he explored the beaches of '\n", - " 'Southern California, from La Jolla to Leo Carrillo '\n", - " 'and... up to... Pismo']}}]\n" - ] - } + "output_type": "execute_result", + "data": { + "text/plain": [ + "ObjectApiResponse({'_shards': {'total': 2, 'successful': 1, 'failed': 0}})" ] + }, + "metadata": {}, + "execution_count": 14 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Querying Data\n" + ], + "metadata": { + "id": "ENlZ3Ndjl5yl" + } + }, + { + "cell_type": "markdown", + "source": [ + "Approximate k-nearest neighbor (kNN)" + ], + "metadata": { + "id": "Xk4CBDpimfDH" + } + }, + { + "cell_type": "code", + "source": [ + "knn = {\n", + " \"field\": \"my_vector\",\n", + " \"k\": 1,\n", + " \"num_candidates\": 5,\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n", + " \"model_text\": \"Watchout I have a drink\",\n", + " }\n", + " },\n", + "}\n", + "\n", + "response = es.search(index=index_name, knn=knn, source=True)\n", + "\n", + "pprint(response[\"hits\"][\"hits\"])" + ], + "metadata": { + "id": "xl76_rM4l3iC", + "colab": { + "base_uri": "https://localhost:8080/" }, + "outputId": "9a796cf1-4beb-4405-91b9-c323db756d36" + }, + "execution_count": null, + "outputs": [ { - "cell_type": "markdown", - "source": [ - "## Filtering" - ], - "metadata": { - "id": "HDBHn_kamlIL" - } + "output_type": "stream", + "name": "stdout", + "text": [ + "[{'_id': 'UO5Y3IoB3ljSe18vZY6D',\n", + " '_index': 
'my_vector_index-01',\n", + " '_score': 0.78170115,\n", + " '_source': {'ml': {'inference': {}},\n", + " 'my_metadata': 'The Dude',\n", + " 'my_text': \"Hey, careful, man, there's a beverage here!\"}}]\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Hybrid Searching (kNN + BM25) with RRF" + ], + "metadata": { + "id": "vhefCRd-mjk8" + } + }, + { + "cell_type": "code", + "source": [ + "query = {\"match\": {\"my_text\": \"bowling\"}}\n", + "\n", + "knn = {\n", + " \"field\": \"my_vector\",\n", + " \"k\": 3,\n", + " \"num_candidates\": 5,\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n", + " \"model_text\": \"He enjoyed the game\",\n", + " }\n", + " },\n", + "}\n", + "\n", + "rank: {\"rrf\": {}}\n", + "\n", + "fields = [\"my_text\", \"my_metadata\"]\n", + "\n", + "\n", + "response = es.search(\n", + " index=index_name, fields=fields, knn=knn, query=query, size=2, source=False\n", + ")\n", + "\n", + "pprint(response[\"hits\"][\"hits\"])" + ], + "metadata": { + "id": "wLY8Q6tEmk06", + "colab": { + "base_uri": "https://localhost:8080/" }, + "outputId": "dc4dd649-3a66-4084-cba1-2e0e51984037" + }, + "execution_count": null, + "outputs": [ { - "cell_type": "code", - "source": [ - "knn ={\n", - " \"field\": \"my_vector\",\n", - " \"k\": 1,\n", - " \"num_candidates\": 5,\n", - " \"query_vector_builder\": {\n", - " \"text_embedding\": {\n", - " \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n", - " \"model_text\": \"Did you bring the dog?\"\n", - " }\n", - " },\n", - " \"filter\": {\n", - " \"term\": {\n", - " \"my_metadata\": \"The Dude\"\n", - " }\n", - " }\n", - " }\n", - "\n", - "fields = [\n", - " \"my_text\",\n", - " \"my_metadata\"\n", - " ]\n", - "\n", - "response = es.search(\n", - " index=index_name,\n", - " fields=fields,\n", - " knn=knn,\n", - " source=False\n", - " )\n", - "\n", - "pprint(response['hits']['hits'])" - ], - "metadata": { - 
"id": "yVDMHuM3mla7", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "ebd848da-8ecc-4683-cb81-719f5a12f815" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[{'_id': 'UO5Y3IoB3ljSe18vZY6D',\n", - " '_index': 'my_vector_index-01',\n", - " '_score': 0.59285694,\n", - " 'fields': {'my_metadata': ['The Dude'],\n", - " 'my_text': [\"Hey, careful, man, there's a beverage here!\"]}}]\n" - ] - } - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "[{'_id': 'U-5Y3IoB3ljSe18vZY6D',\n", + " '_index': 'my_vector_index-01',\n", + " '_score': 1.8080788,\n", + " 'fields': {'my_metadata': ['Walter Sobchak'],\n", + " 'my_text': ['What do you mean brought it bowling, Dude?']}},\n", + " {'_id': 'VO5Y3IoB3ljSe18vZY6D',\n", + " '_index': 'my_vector_index-01',\n", + " '_score': 1.2358729,\n", + " 'fields': {'my_metadata': ['Walter Sobchak'],\n", + " 'my_text': ['Donny was a good bowler, and a good man. He was one '\n", + " 'of us. He was a man who loved the outdoors... and '\n", + " 'bowling, and as a surfer he explored the beaches of '\n", + " 'Southern California, from La Jolla to Leo Carrillo '\n", + " 'and... up to... 
Pismo']}}]\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Filtering" + ], + "metadata": { + "id": "HDBHn_kamlIL" + } + }, + { + "cell_type": "code", + "source": [ + "knn = {\n", + " \"field\": \"my_vector\",\n", + " \"k\": 1,\n", + " \"num_candidates\": 5,\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n", + " \"model_text\": \"Did you bring the dog?\",\n", + " }\n", + " },\n", + " \"filter\": {\"term\": {\"my_metadata\": \"The Dude\"}},\n", + "}\n", + "\n", + "fields = [\"my_text\", \"my_metadata\"]\n", + "\n", + "response = es.search(index=index_name, fields=fields, knn=knn, source=False)\n", + "\n", + "pprint(response[\"hits\"][\"hits\"])" + ], + "metadata": { + "id": "yVDMHuM3mla7", + "colab": { + "base_uri": "https://localhost:8080/" }, + "outputId": "ebd848da-8ecc-4683-cb81-719f5a12f815" + }, + "execution_count": null, + "outputs": [ { - "cell_type": "markdown", - "source": [ - "# Aggregrations\n", - "and Select fields returned" - ], - "metadata": { - "id": "N_Msyv4-m5ow" - } + "output_type": "stream", + "name": "stdout", + "text": [ + "[{'_id': 'UO5Y3IoB3ljSe18vZY6D',\n", + " '_index': 'my_vector_index-01',\n", + " '_score': 0.59285694,\n", + " 'fields': {'my_metadata': ['The Dude'],\n", + " 'my_text': [\"Hey, careful, man, there's a beverage here!\"]}}]\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Aggregrations\n", + "and Select fields returned" + ], + "metadata": { + "id": "N_Msyv4-m5ow" + } + }, + { + "cell_type": "code", + "source": [ + "knn = {\n", + " \"field\": \"my_vector\",\n", + " \"k\": 2,\n", + " \"num_candidates\": 5,\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n", + " \"model_text\": \"did you bring it?\",\n", + " }\n", + " },\n", + "}\n", + "\n", + "aggs = {\"metadata\": {\"terms\": {\"field\": 
\"my_metadata\"}}}\n", + "\n", + "fields = [\"my_text\", \"my_metadata\"]\n", + "\n", + "response = es.search(index=index_name, fields=fields, aggs=aggs, knn=knn, source=False)\n", + "\n", + "pprint(response[\"hits\"][\"hits\"])" + ], + "metadata": { + "id": "jbwinE0fm5-I", + "colab": { + "base_uri": "https://localhost:8080/" }, + "outputId": "e8b02f4b-8a89-417f-a892-2e676a812a2d" + }, + "execution_count": null, + "outputs": [ { - "cell_type": "code", - "source": [ - "knn = {\n", - " \"field\": \"my_vector\",\n", - " \"k\": 2,\n", - " \"num_candidates\": 5,\n", - " \"query_vector_builder\": {\n", - " \"text_embedding\": {\n", - " \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n", - " \"model_text\": \"did you bring it?\"\n", - " }\n", - " }\n", - " }\n", - "\n", - "aggs = {\n", - " \"metadata\": {\n", - " \"terms\": {\n", - " \"field\": \"my_metadata\"\n", - " }\n", - " }\n", - " }\n", - "\n", - "fields = [\n", - " \"my_text\",\n", - " \"my_metadata\"\n", - " ]\n", - "\n", - "response = es.search(\n", - " index=index_name,\n", - " fields=fields,\n", - " aggs=aggs,\n", - " knn=knn,\n", - " source=False\n", - " )\n", - "\n", - "pprint(response['hits']['hits'])" - ], - "metadata": { - "id": "jbwinE0fm5-I", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "e8b02f4b-8a89-417f-a892-2e676a812a2d" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[{'_id': 'U-5Y3IoB3ljSe18vZY6D',\n", - " '_index': 'my_vector_index-01',\n", - " '_score': 0.74352247,\n", - " 'fields': {'my_metadata': ['Walter Sobchak'],\n", - " 'my_text': ['What do you mean brought it bowling, Dude?']}},\n", - " {'_id': 'UO5Y3IoB3ljSe18vZY6D',\n", - " '_index': 'my_vector_index-01',\n", - " '_score': 0.6010935,\n", - " 'fields': {'my_metadata': ['The Dude'],\n", - " 'my_text': [\"Hey, careful, man, there's a beverage here!\"]}}]\n" - ] - } - ] + "output_type": "stream", + "name": "stdout", + "text": [ + 
"[{'_id': 'U-5Y3IoB3ljSe18vZY6D',\n", + " '_index': 'my_vector_index-01',\n", + " '_score': 0.74352247,\n", + " 'fields': {'my_metadata': ['Walter Sobchak'],\n", + " 'my_text': ['What do you mean brought it bowling, Dude?']}},\n", + " {'_id': 'UO5Y3IoB3ljSe18vZY6D',\n", + " '_index': 'my_vector_index-01',\n", + " '_score': 0.6010935,\n", + " 'fields': {'my_metadata': ['The Dude'],\n", + " 'my_text': [\"Hey, careful, man, there's a beverage here!\"]}}]\n" + ] } - ] + ] + } + ] }