Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: update notebook to use latest scrapi code #238

Merged
merged 1 commit into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
218 changes: 46 additions & 172 deletions examples/vertex_ai_conversation/check_documents_in_datastore.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,11 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"metadata": {},
"outputs": [],
"source": [
"# Dependencies\n",
"!pip install google-cloud-discoveryengine --quiet\n",
"!pip install dfcx-scrapi --quiet\n",
"\n",
"from google.colab import auth\n",
"from google.auth import default\n",
Expand All @@ -67,169 +63,53 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# DiscoveryEngine Client and Helper Functions\n"
"# USER INPUTS\n",
"You can find your `datastore_id` by using the `get_data_stores_map` method in SCRAPI."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"from google.cloud import discoveryengine_v1\n",
"from google.api_core import operations_v1, grpc_helpers\n",
"from google.longrunning import operations_pb2\n",
"from typing import List, Optional\n",
"from google.api_core.client_options import ClientOptions\n",
"\n",
"\n",
"def list_documents(\n",
"    project_id: str, location: str, datastore_id: str, rate_limit: int = 1):\n",
"    \"\"\"Collect the documents of a Discovery Engine data store, page by page.\n",
"\n",
"    Args:\n",
"      project_id: Project ID used in the data store resource path.\n",
"      location: Data store location; any value other than \"global\" selects\n",
"        the `{location}-discoveryengine.googleapis.com` endpoint.\n",
"      datastore_id: ID of the data store whose branch `0` is listed.\n",
"      rate_limit: Seconds to sleep before each follow-up page request.\n",
"\n",
"    Returns:\n",
"      The `documents` collection of the first response, extended with the\n",
"      documents of every subsequent page.\n",
"    \"\"\"\n",
"    # The same branch path is used for the first and all follow-up requests.\n",
"    branch_path = (\n",
"        f'projects/{project_id}/locations/{location}'\n",
"        f'/collections/default_collection/dataStores/{datastore_id}/branches/0')\n",
"\n",
"    # Non-global data stores are addressed through a location-prefixed endpoint.\n",
"    endpoint_options = None\n",
"    if location != \"global\":\n",
"        endpoint_options = ClientOptions(\n",
"            api_endpoint=f\"{location}-discoveryengine.googleapis.com\")\n",
"    client = discoveryengine_v1.DocumentServiceClient(\n",
"        client_options=endpoint_options)\n",
"\n",
"    res = client.list_documents(\n",
"        request=discoveryengine_v1.ListDocumentsRequest(\n",
"            parent=branch_path, page_size=1000))\n",
"\n",
"    # Start from the first batch and append each later page to it.\n",
"    docs = res.documents\n",
"\n",
"    while res.next_page_token:\n",
"        # Pause between pages to prevent quota exhaustion.\n",
"        time.sleep(rate_limit)\n",
"\n",
"        res = client.list_documents(\n",
"            request=discoveryengine_v1.ListDocumentsRequest(\n",
"                parent=branch_path,\n",
"                page_size=1000,\n",
"                page_token=res.next_page_token))\n",
"        docs.extend(res.documents)\n",
"\n",
"    return docs\n",
"\n",
"def list_indexed_urls(\n",
" project_id: str,\n",
" location: str,\n",
" datastore_id: str,\n",
" docs: Optional[List[discoveryengine_v1.Document]] = None):\n",
" \"\"\"Get the list of docs in datastore, then parse to only urls.\"\"\"\n",
" if not docs:\n",
" docs = list_documents(project_id, location, datastore_id)\n",
" urls = [doc.content.uri for doc in docs]\n",
"from dfcx_scrapi.core.data_stores import DataStores\n",
"from dfcx_scrapi.core.search import Search\n",
"\n",
" return urls\n",
"PROJECT_ID = \"\" #@param{type: 'string'}\n",
"\n",
"def search_url(urls: List[str], url: str):\n",
"    \"\"\"Print every entry of `urls` that contains `url` as a substring.\n",
"\n",
"    Args:\n",
"      urls: URLs to scan, in the order they should be reported.\n",
"      url: Text to look for inside each entry.\n",
"    \"\"\"\n",
"    # Lazy filter keeps the original scan-and-print order.\n",
"    hits = (candidate for candidate in urls if url in candidate)\n",
"    for hit in hits:\n",
"        print(hit)\n",
"s = Search()\n",
"ds = DataStores(project_id=PROJECT_ID)\n",
"\n",
"def search_doc_id(\n",
"    doc_id: str, docs: Optional[List[discoveryengine_v1.Document]] = None):\n",
"    \"\"\"Print every document whose `parent_document_id` equals `doc_id`.\n",
"\n",
"    Args:\n",
"      doc_id: Document ID to look for.\n",
"      docs: Documents to search. When omitted or empty, the documents are\n",
"        fetched with `list_documents` using the notebook-level `project_id`,\n",
"        `location` and `datastore_id` variables, which must already be set.\n",
"\n",
"    Prints a not-found message when no document matches.\n",
"    \"\"\"\n",
"    if not docs:\n",
"        docs = list_documents(project_id, location, datastore_id)\n",
"\n",
"    # Compare against the `doc_id` argument, not a notebook-level\n",
"    # `document_id` variable, so the function honours what the caller passed.\n",
"    matches = [doc for doc in docs if doc.parent_document_id == doc_id]\n",
"    for doc in matches:\n",
"        print(doc)\n",
"\n",
"    if not matches:\n",
"        print(f\"Document not found for provided Doc ID: `{doc_id}`\")\n",
"\n",
"\n",
"def get_operations_status(operation_id: str):\n",
"    \"\"\"Fetch an operation from the Discovery Engine operations service.\n",
"\n",
"    Args:\n",
"      operation_id: Identifier of the operation, passed unchanged to\n",
"        `OperationsClient.get_operation`.\n",
"\n",
"    Returns:\n",
"      The response of `get_operation` for `operation_id`.\n",
"    \"\"\"\n",
"    # The channel always targets the default Discovery Engine host.\n",
"    operations_client = operations_v1.OperationsClient(\n",
"        grpc_helpers.create_channel(\"discoveryengine.googleapis.com\"))\n",
"\n",
"    return operations_client.get_operation(operation_id)\n",
"\n",
"PENDING_MESSAGE = \"\"\"\n",
"No docs found.\\n\\nIt\\'s likely one of two issues: \\n [1] Your data store is not finished indexing. \\n [2] Your data store failed indexing.\\n\n",
"If you just added your data store, it can take up to 4 hours before it will become available.\n",
"\"\"\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# USER INPUTS\n",
"You can find your `datastore_id` by following these steps:\n",
"1. Click on Gen App Builder\n",
"2. Select your App / Engine\n",
"3. Select your Available Data Store\n",
"4. Find your Data Store ID"
"ds_map = ds.get_data_stores_map(reverse=True, location=\"global\")\n",
"ds_map"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"metadata": {},
"outputs": [],
"source": [
"project_id = '' #@param{type: 'string'}\n",
"location = 'global' #@param{type: 'string'}\n",
"datastore_id = '' #@param{type: 'string'}"
"# Access your datastore_id from the ds_map by using the human readable display name\n",
"datastore_id = ds_map[\"my-cool-datastore\"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Check Data Store Index Status\n",
"Use the `list_documents` method to check whether the data store has finished indexing."
"Use the `check_datastore_index_status` method to check whether the data store has finished indexing."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"metadata": {},
"outputs": [],
"source": [
"docs = list_documents(project_id, location, datastore_id)\n",
"\n",
"if len(docs) == 0:\n",
" print(PENDING_MESSAGE)\n",
"else:\n",
" SUCCESS_MESSAGE = f\"\"\"\n",
" Success! 🎉\\n\n",
" Your indexing is complete.\\n\n",
" Your index contains {len(docs)} documents.\n",
" \"\"\"\n",
" print(SUCCESS_MESSAGE)"
"s.check_datastore_index_status(datastore_id)"
]
},
{
Expand All @@ -243,14 +123,10 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"metadata": {},
"outputs": [],
"source": [
"docs = list_documents(project_id, location, datastore_id)\n",
"docs = s.list_documents(datastore_id)\n",
"docs[0]"
]
},
Expand All @@ -264,14 +140,10 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"metadata": {},
"outputs": [],
"source": [
"urls = list_indexed_urls(project_id, location, datastore_id, docs)\n",
"urls = s.list_indexed_urls(datastore_id, docs)\n",
"urls[0]"
]
},
Expand All @@ -285,11 +157,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"metadata": {},
"outputs": [],
"source": [
"urls"
Expand All @@ -305,11 +173,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"metadata": {},
"outputs": [],
"source": [
"import json\n",
Expand All @@ -328,14 +192,10 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"metadata": {},
"outputs": [],
"source": [
"search_url(urls, 'tundra-250')"
"s.search_url(urls, 'tundra-250')"
]
},
{
Expand All @@ -351,16 +211,30 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"metadata": {},
"outputs": [],
"source": [
"document_id = 'a71d802406f2f0e546b621245e1cbc6a'\n",
"\n",
"s.search_doc_id(document_id=document_id, docs=docs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"List docs and search document ID all at once."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"document_id = 'a71d802406f2f0e546b621245e1cbc6a'\n",
"\n",
"search_doc_id(document_id, docs)"
"s.search_doc_id(document_id=document_id, datastore_id=datastore_id)"
]
}
],
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
install_requires=[
'google-cloud-dialogflow-cx',
'google-cloud-aiplatform',
'google-cloud-discoveryengine',
'rouge-score'
]
)
Loading