Commit 4bcb5a2

Merge pull request #241 from nitya/aitour-2025-refresh
Aitour 2025 refresh
nitya authored Jan 24, 2025
2 parents 8c1c3dc + 136a287 commit 4bcb5a2
Showing 60 changed files with 1,544 additions and 647 deletions.
10 changes: 3 additions & 7 deletions .devcontainer/devcontainer.json
@@ -7,17 +7,13 @@
"context": ".."
},
"features": {
"ghcr.io/devcontainers/features/azure-cli:1": {
"installBicep": true,
"extensions": "ml"
},
"ghcr.io/devcontainers/features/azure-cli:1": {},
"ghcr.io/devcontainers/features/git:1": {},
"ghcr.io/azure/azure-dev/azd:latest": {},
"ghcr.io/devcontainers/features/docker-in-docker:2": {},
"ghcr.io/devcontainers/features/github-cli:1": {},
"ghcr.io/devcontainers/features/node:1": {
"version": "22.8.0"
}
"ghcr.io/devcontainers/features/node:1": {},
"ghcr.io/azure/azure-dev/azd:0": {}
},
"customizations": {
"vscode": {
3 changes: 2 additions & 1 deletion .vscode/settings.json
@@ -1,3 +1,4 @@
 {
-    "python.terminal.activateEnvironment": true
+    "python.terminal.activateEnvironment": true,
+    "prompty.currentModelConfiguration": "default"
 }
4 changes: 2 additions & 2 deletions azure.yaml
@@ -12,12 +12,12 @@ hooks:
   postprovision:
     posix:
       shell: sh
-      continueOnError: false
+      continueOnError: true
       interactive: true
       run: infra/hooks/postprovision.sh
     windows:
       shell: pwsh
-      continueOnError: false
+      continueOnError: true
       interactive: true
       run: infra/hooks/postprovision.ps1
 infra:
210 changes: 210 additions & 0 deletions data/scripts/create-azure-search.py
@@ -0,0 +1,210 @@
#!/usr/bin/env python
# coding: utf-8

# # Generating your product search index
# This notebook is designed to automatically create the product search index for you. It uses the [product catalog](products.csv) file to create the index. In order to do so, it needs the names and keys for the following services:
#
# - Azure Search Service
# - Azure OpenAI Service
#
# You can find the names and keys in the Azure Portal. These need to be entered in a `.env` file in the root of this repository. The `.env` file is not checked in to source control. You can use the [`.env.sample`](../../.env.sample) file as a template.
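#
# A minimal sketch of the relevant `.env` entries (placeholder values, not real
# endpoints; the variable names are the ones this script reads via os.environ):
#
#     AZURE_SEARCH_ENDPOINT=https://<your-search-service>.search.windows.net
#     AZURE_OPENAI_ENDPOINT=https://<your-openai-resource>.openai.azure.com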

# In[1]:


import os
import pandas as pd
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    HnswParameters,
    HnswAlgorithmConfiguration,
    SemanticPrioritizedFields,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SemanticSearch,
    SemanticConfiguration,
    SemanticField,
    SimpleField,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchAlgorithmMetric,
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    VectorSearchProfile,
)
from typing import Any, Dict, List
from openai import AzureOpenAI
from dotenv import load_dotenv

from pathlib import Path

load_dotenv()


# In[2]:


def delete_index(search_index_client: SearchIndexClient, search_index: str):
print(f"deleting index {search_index}")
search_index_client.delete_index(search_index)


# In[3]:


def create_index_definition(name: str) -> SearchIndex:
"""
Returns an Azure Cognitive Search index with the given name.
"""
# The fields we want to index. The "embedding" field is a vector field that will
# be used for vector search.
fields = [
SimpleField(name="id", type=SearchFieldDataType.String, key=True),
SearchableField(name="content", type=SearchFieldDataType.String),
SimpleField(name="filepath", type=SearchFieldDataType.String),
SearchableField(name="title", type=SearchFieldDataType.String),
SimpleField(name="url", type=SearchFieldDataType.String),
SearchField(
name="contentVector",
type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
searchable=True,
# Size of the vector created by the text-embedding-ada-002 model.
vector_search_dimensions=1536,
vector_search_profile_name="myHnswProfile",
),
]

# The "content" field should be prioritized for semantic ranking.
semantic_config = SemanticConfiguration(
name="default",
prioritized_fields=SemanticPrioritizedFields(
title_field=SemanticField(field_name="title"),
keywords_fields=[],
content_fields=[SemanticField(field_name="content")],
),
)

# For vector search, we want to use the HNSW (Hierarchical Navigable Small World)
# algorithm (a type of approximate nearest neighbor search algorithm) with cosine
# distance.
vector_search = VectorSearch(
algorithms=[
HnswAlgorithmConfiguration(
name="myHnsw",
kind=VectorSearchAlgorithmKind.HNSW,
parameters=HnswParameters(
m=4,
ef_construction=400,
ef_search=500,
metric=VectorSearchAlgorithmMetric.COSINE,
),
),
ExhaustiveKnnAlgorithmConfiguration(
name="myExhaustiveKnn",
kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
parameters=ExhaustiveKnnParameters(
metric=VectorSearchAlgorithmMetric.COSINE
),
),
],
profiles=[
VectorSearchProfile(
name="myHnswProfile",
algorithm_configuration_name="myHnsw",
),
VectorSearchProfile(
name="myExhaustiveKnnProfile",
algorithm_configuration_name="myExhaustiveKnn",
),
],
)

# Create the semantic settings with the configuration
semantic_search = SemanticSearch(configurations=[semantic_config])

# Create the search index.
index = SearchIndex(
name=name,
fields=fields,
semantic_search=semantic_search,
vector_search=vector_search,
)

return index


# In[4]:


def gen_contoso_products(
    path: str,
) -> List[Dict[str, Any]]:
    openai_service_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
    openai_deployment = "text-embedding-ada-002"

    token_provider = get_bearer_token_provider(
        DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default"
    )
    # openai.Embedding.create() -> client.embeddings.create()
    client = AzureOpenAI(
        api_version="2023-07-01-preview",
        azure_endpoint=openai_service_endpoint,
        azure_deployment=openai_deployment,
        azure_ad_token_provider=token_provider,
    )

    products = pd.read_csv(path)
    items = []
    for product in products.to_dict("records"):
        content = product["description"]
        id = str(product["id"])
        title = product["name"]
        url = f"/products/{title.lower().replace(' ', '-')}"
        emb = client.embeddings.create(input=content, model=openai_deployment)
        rec = {
            "id": id,
            "content": content,
            "filepath": f"{title.lower().replace(' ', '-')}",
            "title": title,
            "url": url,
            "contentVector": emb.data[0].embedding,
        }
        items.append(rec)

    return items


# In[5]:


contoso_search = os.environ["AZURE_SEARCH_ENDPOINT"]
index_name = "contoso-products"

search_index_client = SearchIndexClient(
    contoso_search, DefaultAzureCredential()
)

delete_index(search_index_client, index_name)
index = create_index_definition(index_name)
print(f"creating index {index_name}")
search_index_client.create_or_update_index(index)
print(f"index {index_name} created")


# In[6]:


print(f"indexing documents")
docs = gen_contoso_products("../product_info/products.csv")
# Upload our data to the index.
search_client = SearchClient(
endpoint=contoso_search,
index_name=index_name,
credential=DefaultAzureCredential(),
)
print(f"uploading {len(docs)} documents to index {index_name}")
ds = search_client.upload_documents(docs)
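
# A quick sanity check (a sketch, not part of the original script): run a plain
# keyword query against the freshly populated index. The query string "tent" is
# an arbitrary example value.
results = search_client.search(search_text="tent")
for result in results:
    # Each result is a dict-like document keyed by the index fields defined above.
    print(result["id"], result["title"])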

72 changes: 72 additions & 0 deletions data/scripts/create-cosmos-db.py
@@ -0,0 +1,72 @@
#!/usr/bin/env python
# coding: utf-8

# In[1]:


from azure.cosmos import CosmosClient, exceptions, PartitionKey
from azure.identity import DefaultAzureCredential
import os
from dotenv import load_dotenv

load_dotenv()


# In[2]:


# from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

# try:
#     credential = DefaultAzureCredential()
#     # Check if the given credential can get a token successfully.
#     credential.get_token("https://management.azure.com/.default")
# except Exception as ex:
#     # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential does not work.
#     # This will open a browser page for sign-in.
#     credential = InteractiveBrowserCredential()


# In[3]:


# Set the Cosmos DB endpoint in the .env file. The endpoint can be found on the resource created in the portal; this script authenticates with DefaultAzureCredential rather than an account key.
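# Example .env entry (placeholder value, not a real account):
#   COSMOS_ENDPOINT=https://<your-cosmos-account>.documents.azure.com:443/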
COSMOS_ENDPOINT = os.environ["COSMOS_ENDPOINT"]
client = CosmosClient(COSMOS_ENDPOINT, credential=DefaultAzureCredential())
DATABASE_NAME = 'contoso-outdoor'
CONTAINER_NAME = 'customers'


# In[4]:


# Get the database and container created by Bicep
database = client.get_database_client(DATABASE_NAME)
container = database.get_container_client(CONTAINER_NAME)

print(database)


# In[5]:


# Loop through each json file in data/customer_info and insert into container
import json
import glob

path = './../customer_info'
for filename in glob.glob(os.path.join(path, '*.json')):
    with open(filename) as file:
        customer = json.load(file)
        container.upsert_item(customer)
        print('Upserted item with id {0}'.format(customer['id']))


# In[6]:


# Get items from container to validate they were inserted
print('Get all items in container')
items = list(container.read_all_items(max_item_count=10))
print(items)
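
# A hedged extra check (a sketch, not in the original script): a cross-partition
# SQL query avoids needing the container's partition key value, which this diff
# does not show.
for item in container.query_items(
    query='SELECT c.id FROM c',
    enable_cross_partition_query=True,
):
    print(item['id'])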

