Commit
add extra dimension support and cleaned notebooks
Sophie Chen committed May 23, 2024
1 parent 5c01f9c commit 24d18de
Showing 6 changed files with 435 additions and 16 deletions.
1 change: 1 addition & 0 deletions scripts/.env.sample
@@ -1,6 +1,7 @@
# resource switch
FLAG_EMBEDDING_MODEL = "AOAI" # "AOAI" or "COHERE"
FLAG_COHERE = "ENGLISH" # "MULTILINGUAL" or "ENGLISH" options for Cohere embedding models
+ FLAG_AOAI = "V3" # "V2" or "V3" options for AOAI embedding models

# update vector dimension based on model chosen
VECTOR_DIMENSION = 1536 # change to the desired value, e.g., 1536 for AOAI ada-002, 1024 for Cohere
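For example, to target a V3 AOAI model at a reduced output dimension, the two switches travel together (illustrative values; text-embedding-3 models accept a configurable output dimension, while ada-002 is fixed at 1536):

```
FLAG_EMBEDDING_MODEL = "AOAI"
FLAG_AOAI = "V3"
VECTOR_DIMENSION = 1024
```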
26 changes: 13 additions & 13 deletions scripts/config.json
@@ -1,14 +1,14 @@
[
{
"data_path": "<path to data>",
"location": "<azure region, e.g. 'westus2'>",
"subscription_id": "<subscription id>",
"resource_group": "<resource group name>",
"search_service_name": "<search service name to use or create>",
"index_name": "<index name to use or create>",
"chunk_size": 1024,
"token_overlap": 128,
"semantic_config_name": "default",
"language": "<Language to support for example use 'en' for English. Checked supported languages here under lucene - https://learn.microsoft.com/en-us/azure/search/index-add-language-analyzers"
}
]
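A filled-in entry might look like this (all values illustrative, not taken from the repository):

```json
[
    {
        "data_path": "/data/docs",
        "location": "westus2",
        "subscription_id": "00000000-0000-0000-0000-000000000000",
        "resource_group": "my-rg",
        "search_service_name": "my-search-service",
        "index_name": "my-index",
        "chunk_size": 1024,
        "token_overlap": 128,
        "semantic_config_name": "default",
        "language": "en"
    }
]
```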
2 changes: 1 addition & 1 deletion scripts/data_preparation.py
@@ -232,7 +232,7 @@ def create_or_update_search_index(
"type": "Collection(Edm.Single)",
"searchable": True,
"retrievable": True,
"dimensions": os.getenv("VECTOR_DIMENSION", 1536),
"dimensions": int(os.getenv("VECTOR_DIMENSION", 1536)),
"vectorSearchConfiguration": vector_config_name
})
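The cast fixes a real bug: `os.getenv` returns a string whenever the variable is set, so the index schema would otherwise receive `"1024"` rather than 1024. A minimal illustration:

```python
import os

os.environ["VECTOR_DIMENSION"] = "1024"
os.getenv("VECTOR_DIMENSION", 1536)       # -> "1024" (str, not int)
int(os.getenv("VECTOR_DIMENSION", 1536))  # -> 1024
```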

10 changes: 8 additions & 2 deletions scripts/data_utils.py
@@ -651,6 +651,7 @@ def get_embedding(text, embedding_model_endpoint=None, embedding_model_key=None,

FLAG_EMBEDDING_MODEL = os.getenv("FLAG_EMBEDDING_MODEL", "AOAI")
FLAG_COHERE = os.getenv("FLAG_COHERE", "ENGLISH")
+ FLAG_AOAI = os.getenv("FLAG_AOAI", "V3")

if azure_credential is None and (endpoint is None or key is None):
raise Exception("EMBEDDING_MODEL_ENDPOINT and EMBEDDING_MODEL_KEY are required for embedding")
@@ -666,8 +667,13 @@ def get_embedding(text, embedding_model_endpoint=None, embedding_model_key=None,
else:
api_key = embedding_model_key if embedding_model_key else os.getenv("AZURE_OPENAI_API_KEY")

- client = AzureOpenAI(api_version=api_version, azure_endpoint=base_url, azure_ad_token=api_key)
- embeddings = client.embeddings.create(model=deployment_id, input=text)
+ client = AzureOpenAI(api_version=api_version, azure_endpoint=base_url, api_key=api_key)
+ if FLAG_AOAI == "V2":
+     embeddings = client.embeddings.create(model=deployment_id, input=text)
+ elif FLAG_AOAI == "V3":
+     embeddings = client.embeddings.create(model=deployment_id,
+                                           input=text,
+                                           dimensions=int(os.getenv("VECTOR_DIMENSION", 1536)))

return embeddings.dict()['data'][0]['embedding']
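Beyond the V2/V3 branch, this hunk also passes the key as `api_key` (the previous code misfiled it as `azure_ad_token`). A minimal sketch of the V3 path, with the endpoint, key, deployment name, and API version as assumed placeholders:

```python
from openai import AzureOpenAI

client = AzureOpenAI(
    api_version="2024-02-01",  # assumed; pick a version that supports `dimensions`
    azure_endpoint="https://<your-resource>.openai.azure.com",
    api_key="<your-key>",
)
resp = client.embeddings.create(
    model="<your-embedding-deployment>",  # a text-embedding-3 deployment
    input="hello world",
    dimensions=1024,  # only text-embedding-3 models accept this parameter
)
print(len(resp.data[0].embedding))  # 1024
```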

3 changes: 3 additions & 0 deletions scripts/readme.md
@@ -36,6 +36,9 @@ Disclaimer: Make sure there are no duplicate pages in your data. That could impa

`python data_preparation.py --config config.json --njobs=4`

+ ### Batch creation of index
+ Refer to the notebook run_batch_create_index.ipynb to create multiple indexes using one script, as sketched below.
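A hedged sketch of what batch creation could look like, assuming each index gets its own config.json-style entry; the names and loop here are illustrative, not the notebook's actual code:

```python
import json
import subprocess
import tempfile

# One entry per index to create; values are placeholders.
batch_configs = [
    {"data_path": "/data/docs-a", "index_name": "index-a", "chunk_size": 1024,
     "token_overlap": 128, "semantic_config_name": "default", "language": "en"},
    {"data_path": "/data/docs-b", "index_name": "index-b", "chunk_size": 1024,
     "token_overlap": 128, "semantic_config_name": "default", "language": "en"},
]

for cfg in batch_configs:
    # data_preparation.py expects a JSON list, so wrap each entry.
    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
        json.dump([cfg], f)
    subprocess.run(
        ["python", "data_preparation.py", "--config", f.name, "--njobs=4"],
        check=True,
    )
```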

## Optional: Use URL prefix
Each document can be associated with a URL that is stored with each document chunk in the Azure Cognitive Search index in the `url` field. If your documents were downloaded from the web, you can specify a URL prefix to use to construct the document URLs when ingesting your data. Your config file should have an additional `url_prefix` parameter like so:
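An illustrative entry with placeholder values (not the repository's exact example):

```json
[
    {
        "data_path": "/data/docs",
        "index_name": "my-index",
        "url_prefix": "https://contoso.com/docs/"
    }
]
```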

