Commit
add extra dimension support and cleaned notebooks
Sophie Chen committed May 23, 2024
1 parent 5c01f9c commit 24d18de
Showing 6 changed files with 435 additions and 16 deletions.
1 change: 1 addition & 0 deletions scripts/.env.sample
@@ -1,6 +1,7 @@
# resource switch
FLAG_EMBEDDING_MODEL = "AOAI" # "AOAI" or "COHERE"
FLAG_COHERE = "ENGLISH" # "MULTILINGUAL" or "ENGLISH" options for Cohere embedding models
+ FLAG_AOAI = "V3" # "V2" or "V3" options for AOAI embedding models

# update vector dimension based on model chosen
VECTOR_DIMENSION = 1536 # change to the desired value, e.g., 1536 for AOAI ada-002, 1024 for Cohere
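For example, to target a V3 AOAI model at a reduced output dimension, the two switches travel together (illustrative values; text-embedding-3 models accept a configurable output dimension, while ada-002 is fixed at 1536):

```
FLAG_EMBEDDING_MODEL = "AOAI"
FLAG_AOAI = "V3"
VECTOR_DIMENSION = 1024
```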
26 changes: 13 additions & 13 deletions scripts/config.json
@@ -1,14 +1,14 @@
[
{
"data_path": "<path to data>",
"location": "<azure region, e.g. 'westus2'>",
"subscription_id": "<subscription id>",
"resource_group": "<resource group name>",
"search_service_name": "<search service name to use or create>",
"index_name": "<index name to use or create>",
"chunk_size": 1024,
"token_overlap": 128,
"semantic_config_name": "default",
"language": "<Language to support for example use 'en' for English. Checked supported languages here under lucene - https://learn.microsoft.com/en-us/azure/search/index-add-language-analyzers"
}
]
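A filled-in entry might look like this (all values illustrative, not taken from the repository):

```json
[
    {
        "data_path": "/data/docs",
        "location": "westus2",
        "subscription_id": "00000000-0000-0000-0000-000000000000",
        "resource_group": "my-rg",
        "search_service_name": "my-search-service",
        "index_name": "my-index",
        "chunk_size": 1024,
        "token_overlap": 128,
        "semantic_config_name": "default",
        "language": "en"
    }
]
```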
2 changes: 1 addition & 1 deletion scripts/data_preparation.py
@@ -232,7 +232,7 @@ def create_or_update_search_index(
"type": "Collection(Edm.Single)",
"searchable": True,
"retrievable": True,
"dimensions": os.getenv("VECTOR_DIMENSION", 1536),
"dimensions": int(os.getenv("VECTOR_DIMENSION", 1536)),
"vectorSearchConfiguration": vector_config_name
})
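The cast fixes a real bug: `os.getenv` returns a string whenever the variable is set, so the index schema would otherwise receive `"1024"` rather than 1024. A minimal illustration:

```python
import os

os.environ["VECTOR_DIMENSION"] = "1024"
os.getenv("VECTOR_DIMENSION", 1536)       # -> "1024" (str, not int)
int(os.getenv("VECTOR_DIMENSION", 1536))  # -> 1024
```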

10 changes: 8 additions & 2 deletions scripts/data_utils.py
@@ -651,6 +651,7 @@ def get_embedding(text, embedding_model_endpoint=None, embedding_model_key=None,

FLAG_EMBEDDING_MODEL = os.getenv("FLAG_EMBEDDING_MODEL", "AOAI")
FLAG_COHERE = os.getenv("FLAG_COHERE", "ENGLISH")
+ FLAG_AOAI = os.getenv("FLAG_AOAI", "V3")

if azure_credential is None and (endpoint is None or key is None):
raise Exception("EMBEDDING_MODEL_ENDPOINT and EMBEDDING_MODEL_KEY are required for embedding")
@@ -666,8 +667,13 @@ def get_embedding(text, embedding_model_endpoint=None, embedding_model_key=None,
else:
api_key = embedding_model_key if embedding_model_key else os.getenv("AZURE_OPENAI_API_KEY")

- client = AzureOpenAI(api_version=api_version, azure_endpoint=base_url, azure_ad_token=api_key)
- embeddings = client.embeddings.create(model=deployment_id, input=text)
+ client = AzureOpenAI(api_version=api_version, azure_endpoint=base_url, api_key=api_key)
+ if FLAG_AOAI == "V2":
+     embeddings = client.embeddings.create(model=deployment_id, input=text)
+ elif FLAG_AOAI == "V3":
+     embeddings = client.embeddings.create(model=deployment_id,
+                                           input=text,
+                                           dimensions=int(os.getenv("VECTOR_DIMENSION", 1536)))

return embeddings.dict()['data'][0]['embedding']
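Beyond the V2/V3 branch, this hunk also passes the key as `api_key` (the previous code misfiled it as `azure_ad_token`). A minimal sketch of the V3 path, with the endpoint, key, deployment name, and API version as assumed placeholders:

```python
from openai import AzureOpenAI

client = AzureOpenAI(
    api_version="2024-02-01",  # assumed; pick a version that supports `dimensions`
    azure_endpoint="https://<your-resource>.openai.azure.com",
    api_key="<your-key>",
)
resp = client.embeddings.create(
    model="<your-embedding-deployment>",  # a text-embedding-3 deployment
    input="hello world",
    dimensions=1024,  # only text-embedding-3 models accept this parameter
)
print(len(resp.data[0].embedding))  # 1024
```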

3 changes: 3 additions & 0 deletions scripts/readme.md
@@ -36,6 +36,9 @@ Disclaimer: Make sure there are no duplicate pages in your data. That could impa

`python data_preparation.py --config config.json --njobs=4`

+ ### Batch creation of index
+ Refer to the notebook run_batch_create_index.ipynb to create multiple indexes using one script, as sketched below.
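A hedged sketch of what batch creation could look like, assuming each index gets its own config.json-style entry; the names and loop here are illustrative, not the notebook's actual code:

```python
import json
import subprocess
import tempfile

# One entry per index to create; values are placeholders.
batch_configs = [
    {"data_path": "/data/docs-a", "index_name": "index-a", "chunk_size": 1024,
     "token_overlap": 128, "semantic_config_name": "default", "language": "en"},
    {"data_path": "/data/docs-b", "index_name": "index-b", "chunk_size": 1024,
     "token_overlap": 128, "semantic_config_name": "default", "language": "en"},
]

for cfg in batch_configs:
    # data_preparation.py expects a JSON list, so wrap each entry.
    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
        json.dump([cfg], f)
    subprocess.run(
        ["python", "data_preparation.py", "--config", f.name, "--njobs=4"],
        check=True,
    )
```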

## Optional: Use URL prefix
Each document can be associated with a URL that is stored with each document chunk in the Azure Cognitive Search index in the `url` field. If your documents were downloaded from the web, you can specify a URL prefix to use to construct the document URLs when ingesting your data. Your config file should have an additional `url_prefix` parameter like so:
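An illustrative entry with placeholder values (not the repository's exact example):

```json
[
    {
        "data_path": "/data/docs",
        "index_name": "my-index",
        "url_prefix": "https://contoso.com/docs/"
    }
]
```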

