From bc1b5e30552071be1b99174fe5a82afae3e72e5b Mon Sep 17 00:00:00 2001
From: Saravana
Date: Wed, 25 Sep 2024 01:57:39 +0530
Subject: [PATCH 01/10] Create csvparser.py

Add the CSV parser Python code. It works with basic CSV files.
---
 app/backend/prepdocslib/csvparser.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 app/backend/prepdocslib/csvparser.py

diff --git a/app/backend/prepdocslib/csvparser.py b/app/backend/prepdocslib/csvparser.py
new file mode 100644
index 0000000000..05f0c92d56
--- /dev/null
+++ b/app/backend/prepdocslib/csvparser.py
@@ -0,0 +1,20 @@
+import csv
+from typing import IO, AsyncGenerator
+from .page import Page
+from .parser import Parser
+
+
+class CsvParser(Parser):
+    """
+    Concrete parser that can parse CSV into Page objects. Each row becomes a Page object.
+    """
+
+    async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
+        # Ensure the file is read in text mode
+        text_content = content.read().decode('utf-8')  # Decode bytes to string if opened in binary mode
+        reader = csv.reader(text_content.splitlines())  # Create CSV reader from text lines
+        offset = 0
+        for i, row in enumerate(reader):
+            page_text = ",".join(row)  # Combine CSV row elements back to a string
+            yield Page(i, offset, page_text)
+            offset += len(page_text) + 1  # Add 1 for the newline character or comma

From 15307b8532ff018f378697224545e13a18fd942d Mon Sep 17 00:00:00 2001
From: Saravana
Date: Wed, 25 Sep 2024 01:59:40 +0530
Subject: [PATCH 02/10] Update prepdocs.py

Import the CsvParser class and register it for .csv files in the file processors.
---
 app/backend/prepdocs.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py
index deea428139..f649d5c8b5 100644
--- a/app/backend/prepdocs.py
+++ b/app/backend/prepdocs.py
@@ -20,6 +20,7 @@
     IntegratedVectorizerStrategy,
 )
 from prepdocslib.jsonparser import JsonParser
+from prepdocslib.csvparser import CsvParser
 from prepdocslib.listfilestrategy import (
     ADLSGen2ListFileStrategy,
     ListFileStrategy,
@@ -183,6 +184,7 @@ def setup_file_processors(
         ".docx": FileProcessor(doc_int_parser, sentence_text_splitter),
         ".pptx": FileProcessor(doc_int_parser, sentence_text_splitter),
         ".xlsx": FileProcessor(doc_int_parser, sentence_text_splitter),
+        ".csv": FileProcessor(CsvParser(), sentence_text_splitter),
         ".png": FileProcessor(doc_int_parser, sentence_text_splitter),
         ".jpg": FileProcessor(doc_int_parser, sentence_text_splitter),
         ".jpeg": FileProcessor(doc_int_parser, sentence_text_splitter),

From e4f0dc6467be2b8d8690abc16aae38671f21a111 Mon Sep 17 00:00:00 2001
From: Saravana
Date: Thu, 26 Sep 2024 22:03:08 +0530
Subject: [PATCH 03/10] Create test_csvparser.py

Add the CSV parser test file.
---
 tests/test_csvparser.py | 54 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 tests/test_csvparser.py

diff --git a/tests/test_csvparser.py b/tests/test_csvparser.py
new file mode 100644
index 0000000000..910fdf92f6
--- /dev/null
+++ b/tests/test_csvparser.py
@@ -0,0 +1,54 @@
+import io
+import pytest
+from prepdocslib.csvparser import CsvParser  # Adjust import to the correct module
+
+@pytest.mark.asyncio
+async def test_csvparser_single_row():
+    # Mock CSV content with a single row in binary format
+    file = io.BytesIO(b"col1,col2,col3\nvalue1,value2,value3")
+    file.name = "test.csv"
+    csvparser = CsvParser()
+
+    # Parse the file
+    pages = [page async for page in csvparser.parse(file)]
+
+    # Assertions
+    assert len(pages) == 1
+    assert pages[0].page_num == 0
+
assert pages[0].offset == 0
+    assert pages[0].text == "value1,value2,value3"
+
+
+@pytest.mark.asyncio
+async def test_csvparser_multiple_rows():
+    # Mock CSV content with multiple rows in binary format
+    file = io.BytesIO(b"col1,col2,col3\nvalue1,value2,value3\nvalue4,value5,value6")
+    file.name = "test.csv"
+    csvparser = CsvParser()
+
+    # Parse the file
+    pages = [page async for page in csvparser.parse(file)]
+
+    # Assertions
+    assert len(pages) == 2  # Expect only data rows, skipping the header
+    assert pages[0].page_num == 0
+    assert pages[0].offset == 0
+    assert pages[0].text == "value1,value2,value3"
+
+    assert pages[1].page_num == 1
+    assert pages[1].offset == len(pages[0].text) + 1  # Length of the first row plus a newline
+    assert pages[1].text == "value4,value5,value6"
+
+
+@pytest.mark.asyncio
+async def test_csvparser_empty_file():
+    # Mock empty CSV content in binary format
+    file = io.BytesIO(b"")
+    file.name = "test.csv"
+    csvparser = CsvParser()
+
+    # Parse the file
+    pages = [page async for page in csvparser.parse(file)]
+
+    # Assertions
+    assert len(pages) == 0  # No rows should be parsed from an empty file

From 3d7c887bfeaaae6f359e5c5025f9e97ee8b517e7 Mon Sep 17 00:00:00 2001
From: Saravana
Date: Fri, 27 Sep 2024 02:28:03 +0530
Subject: [PATCH 04/10] Update test_csvparser.py

Formatted the file
---
 tests/test_csvparser.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/test_csvparser.py b/tests/test_csvparser.py
index 910fdf92f6..3db9dc13f8 100644
--- a/tests/test_csvparser.py
+++ b/tests/test_csvparser.py
@@ -1,7 +1,10 @@
 import io
+
 import pytest
+
 from prepdocslib.csvparser import CsvParser  # Adjust import to the correct module
+
 
 @pytest.mark.asyncio
 async def test_csvparser_single_row():
     # Mock CSV content with a single row in binary format
@@ -34,7 +37,7 @@ async def test_csvparser_multiple_rows():
     assert pages[0].page_num == 0
     assert pages[0].offset == 0
     assert pages[0].text == "value1,value2,value3"
-    
+
     assert pages[1].page_num == 1
     assert pages[1].offset == len(pages[0].text) + 1  # Length of the first row plus a newline
     assert pages[1].text == "value4,value5,value6"

From 02dd0f5fba513f3b7c29389ba97d16d9c3d2481b Mon Sep 17 00:00:00 2001
From: Saravana
Date: Fri, 27 Sep 2024 02:29:58 +0530
Subject: [PATCH 05/10] Update csvparser.py

Formatted the file, decoded binary input explicitly, and skipped the header row when parsing.
---
 app/backend/prepdocslib/csvparser.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/app/backend/prepdocslib/csvparser.py b/app/backend/prepdocslib/csvparser.py
index 05f0c92d56..0277f752cf 100644
--- a/app/backend/prepdocslib/csvparser.py
+++ b/app/backend/prepdocslib/csvparser.py
@@ -1,5 +1,6 @@
 import csv
 from typing import IO, AsyncGenerator
+
 from .page import Page
 from .parser import Parser
@@ -10,11 +11,20 @@ class CsvParser(Parser):
     """
 
     async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
-        # Ensure the file is read in text mode
-        text_content = content.read().decode('utf-8')  # Decode bytes to string if opened in binary mode
-        reader = csv.reader(text_content.splitlines())  # Create CSV reader from text lines
+        # Check if content is in bytes (binary file) and decode to string
+        if isinstance(content, (bytes, bytearray)):
+            content = content.decode("utf-8")
+        elif hasattr(content, "read"):  # Handle BufferedReader
+            content = content.read().decode("utf-8")
+
+        # Create a CSV reader from the text content
+        reader = csv.reader(content.splitlines())
         offset = 0
+
+        # Skip the header row
+        next(reader, None)
+
         for i, row in
enumerate(reader): - page_text = ",".join(row) # Combine CSV row elements back to a string + page_text = ",".join(row) yield Page(i, offset, page_text) - offset += len(page_text) + 1 # Add 1 for the newline character or comma + offset += len(page_text) + 1 # Account for newline character From 79007022ae90b5e9c4185b361f3db84f4c129319 Mon Sep 17 00:00:00 2001 From: Saravana Date: Fri, 27 Sep 2024 19:52:52 +0530 Subject: [PATCH 06/10] Update prepdocs.py --- app/backend/prepdocs.py | 355 ++++++++++++++++++++++++++-------------- 1 file changed, 236 insertions(+), 119 deletions(-) diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index 898ae3709f..46abbc01b5 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -1,14 +1,12 @@ import argparse import asyncio import logging -import os from typing import Optional, Union from azure.core.credentials import AzureKeyCredential from azure.core.credentials_async import AsyncTokenCredential from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider -from load_azd_env import load_azd_env from prepdocslib.blobmanager import BlobManager from prepdocslib.embeddings import ( AzureOpenAIEmbeddingService, @@ -34,7 +32,7 @@ from prepdocslib.textparser import TextParser from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter -logger = logging.getLogger("scripts") +logger = logging.getLogger("ingester") def clean_key_if_exists(key: Union[str, None]) -> Union[str, None]: @@ -45,7 +43,10 @@ def clean_key_if_exists(key: Union[str, None]) -> Union[str, None]: async def setup_search_info( - search_service: str, index_name: str, azure_credential: AsyncTokenCredential, search_key: Union[str, None] = None + search_service: str, + index_name: str, + azure_credential: AsyncTokenCredential, + search_key: Union[str, None] = None, ) -> SearchInfo: search_creds: Union[AsyncTokenCredential, AzureKeyCredential] = ( azure_credential if search_key is None else AzureKeyCredential(search_key) @@ -67,7 +68,9 @@ def setup_blob_manager( search_images: bool, storage_key: Union[str, None] = None, ): - storage_creds: Union[AsyncTokenCredential, str] = azure_credential if storage_key is None else storage_key + storage_creds: Union[AsyncTokenCredential, str] = ( + azure_credential if storage_key is None else storage_key + ) return BlobManager( endpoint=f"https://{storage_account}.blob.core.windows.net", container=storage_container, @@ -90,9 +93,15 @@ def setup_list_file_strategy( list_file_strategy: ListFileStrategy if datalake_storage_account: if datalake_filesystem is None or datalake_path is None: - raise ValueError("DataLake file system and path are required when using Azure Data Lake Gen2") - adls_gen2_creds: Union[AsyncTokenCredential, str] = azure_credential if datalake_key is None else datalake_key - logger.info("Using Data Lake Gen2 Storage Account: %s", datalake_storage_account) + raise ValueError( + "DataLake file system and path are required when using Azure Data Lake Gen2" + ) + adls_gen2_creds: Union[AsyncTokenCredential, str] = ( + azure_credential if datalake_key is None else datalake_key + ) + logger.info( + "Using Data Lake Gen2 Storage Account: %s", datalake_storage_account + ) list_file_strategy = ADLSGen2ListFileStrategy( data_lake_storage_account=datalake_storage_account, data_lake_filesystem=datalake_filesystem, @@ -103,7 +112,9 @@ def setup_list_file_strategy( logger.info("Using local files: %s", local_files) list_file_strategy = LocalListFileStrategy(path_pattern=local_files) else: - raise 
ValueError("Either local_files or datalake_storage_account must be provided.") + raise ValueError( + "Either local_files or datalake_storage_account must be provided." + ) return list_file_strategy @@ -139,7 +150,9 @@ def setup_embeddings_service( ) else: if openai_key is None: - raise ValueError("OpenAI key is required when using the non-Azure OpenAI API") + raise ValueError( + "OpenAI key is required when using the non-Azure OpenAI API" + ) return OpenAIEmbeddingService( open_ai_model_name=openai_model_name, open_ai_dimensions=openai_dimensions, @@ -157,37 +170,33 @@ def setup_file_processors( local_html_parser: bool = False, search_images: bool = False, ): - sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=search_images) + html_parser: Parser + pdf_parser: Parser + doc_int_parser: DocumentAnalysisParser - doc_int_parser: Optional[DocumentAnalysisParser] = None # check if Azure Document Intelligence credentials are provided if document_intelligence_service is not None: documentintelligence_creds: Union[AsyncTokenCredential, AzureKeyCredential] = ( - azure_credential if document_intelligence_key is None else AzureKeyCredential(document_intelligence_key) + azure_credential + if document_intelligence_key is None + else AzureKeyCredential(document_intelligence_key) ) doc_int_parser = DocumentAnalysisParser( endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/", credential=documentintelligence_creds, ) - - pdf_parser: Optional[Parser] = None if local_pdf_parser or document_intelligence_service is None: pdf_parser = LocalPdfParser() - elif document_intelligence_service is not None: - pdf_parser = doc_int_parser else: - logger.warning("No PDF parser available") - - html_parser: Optional[Parser] = None + pdf_parser = doc_int_parser if local_html_parser or document_intelligence_service is None: html_parser = LocalHTMLParser() - elif document_intelligence_service is not None: - html_parser = doc_int_parser else: - logger.warning("No HTML parser available") - - # These file formats can always be parsed: - file_processors = { + html_parser = doc_int_parser + sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=search_images) + return { + ".pdf": FileProcessor(pdf_parser, sentence_text_splitter), + ".html": FileProcessor(html_parser, sentence_text_splitter), ".json": FileProcessor(JsonParser(), SimpleTextSplitter()), ".docx": FileProcessor(doc_int_parser, sentence_text_splitter), ".pptx": FileProcessor(doc_int_parser, sentence_text_splitter), @@ -202,39 +211,24 @@ def setup_file_processors( ".md": FileProcessor(TextParser(), sentence_text_splitter), ".txt": FileProcessor(TextParser(), sentence_text_splitter), } - # These require either a Python package or Document Intelligence - if pdf_parser is not None: - file_processors.update({".pdf": FileProcessor(pdf_parser, sentence_text_splitter)}) - if html_parser is not None: - file_processors.update({".html": FileProcessor(html_parser, sentence_text_splitter)}) - # These file formats require Document Intelligence - if doc_int_parser is not None: - file_processors.update( - { - ".docx": FileProcessor(doc_int_parser, sentence_text_splitter), - ".pptx": FileProcessor(doc_int_parser, sentence_text_splitter), - ".xlsx": FileProcessor(doc_int_parser, sentence_text_splitter), - ".png": FileProcessor(doc_int_parser, sentence_text_splitter), - ".jpg": FileProcessor(doc_int_parser, sentence_text_splitter), - ".jpeg": FileProcessor(doc_int_parser, sentence_text_splitter), - ".tiff": 
FileProcessor(doc_int_parser, sentence_text_splitter), - ".bmp": FileProcessor(doc_int_parser, sentence_text_splitter), - ".heic": FileProcessor(doc_int_parser, sentence_text_splitter), - } - ) - return file_processors def setup_image_embeddings_service( - azure_credential: AsyncTokenCredential, vision_endpoint: Union[str, None], search_images: bool + azure_credential: AsyncTokenCredential, + vision_endpoint: Union[str, None], + search_images: bool, ) -> Union[ImageEmbeddings, None]: image_embeddings_service: Optional[ImageEmbeddings] = None if search_images: if vision_endpoint is None: - raise ValueError("A computer vision endpoint is required when GPT-4-vision is enabled.") + raise ValueError( + "A computer vision endpoint is required when GPT-4-vision is enabled." + ) image_embeddings_service = ImageEmbeddings( endpoint=vision_endpoint, - token_provider=get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default"), + token_provider=get_bearer_token_provider( + azure_credential, "https://cognitiveservices.azure.com/.default" + ), ) return image_embeddings_service @@ -249,18 +243,136 @@ async def main(strategy: Strategy, setup_index: bool = True): if __name__ == "__main__": parser = argparse.ArgumentParser( description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index.", - epilog="Example: prepdocs.py '.\\data\*' -v", + epilog="Example: prepdocs.py '.\\data\*' --storageaccount myaccount --container mycontainer --searchservice mysearch --index myindex -v", ) parser.add_argument("files", nargs="?", help="Files to be processed") + parser.add_argument( + "--datalakestorageaccount", + required=False, + help="Optional. Azure Data Lake Storage Gen2 Account name", + ) + parser.add_argument( + "--datalakefilesystem", + required=False, + default="gptkbcontainer", + help="Optional. Azure Data Lake Storage Gen2 filesystem name", + ) + parser.add_argument( + "--datalakepath", + required=False, + help="Optional. Azure Data Lake Storage Gen2 filesystem path containing files to index. If omitted, index the entire filesystem", + ) + parser.add_argument( + "--datalakekey", + required=False, + help="Optional. Use this key when authenticating to Azure Data Lake Gen2", + ) + parser.add_argument( + "--useacls", + action="store_true", + help="Store ACLs from Azure Data Lake Gen2 Filesystem in the search index", + ) + parser.add_argument( + "--category", + help="Value for the category field in the search index for all sections indexed in this run", + ) + parser.add_argument( + "--skipblobs", + action="store_true", + help="Skip uploading individual pages to Azure Blob Storage", + ) + parser.add_argument("--storageaccount", help="Azure Blob Storage account name") + parser.add_argument("--container", help="Azure Blob Storage container name") + parser.add_argument( + "--storageresourcegroup", help="Azure blob storage resource group" + ) + parser.add_argument( + "--storagekey", + required=False, + help="Optional. Use this Azure Blob Storage account key instead of the current user identity to login (use az login to set current user for Azure)", + ) + parser.add_argument( + "--tenantid", + required=False, + help="Optional. Use this to define the Azure directory where to authenticate)", + ) + parser.add_argument( + "--subscriptionid", + required=False, + help="Optional. 
Use this to define managed identity connection string in integrated vectorization", + ) + parser.add_argument( + "--searchservice", + help="Name of the Azure AI Search service where content should be indexed (must exist already)", + ) + parser.add_argument( + "--searchserviceassignedid", + required=False, + help="Search service system assigned Identity (Managed identity) (used for integrated vectorization)", + ) + parser.add_argument( + "--index", + help="Name of the Azure AI Search index where content should be indexed (will be created if it doesn't exist)", + ) + parser.add_argument( + "--searchkey", + required=False, + help="Optional. Use this Azure AI Search account key instead of the current user identity to login (use az login to set current user for Azure)", + ) + parser.add_argument( + "--searchanalyzername", + required=False, + default="en.microsoft", + help="Optional. Name of the Azure AI Search analyzer to use for the content field in the index", + ) + parser.add_argument( + "--openaihost", + help="Host of the API used to compute embeddings ('azure' or 'openai')", + ) + parser.add_argument( + "--openaiservice", + help="Name of the Azure OpenAI service used to compute embeddings", + ) + parser.add_argument( + "--openaideployment", + help="Name of the Azure OpenAI model deployment for an embedding model ('text-embedding-ada-002' recommended)", + ) + parser.add_argument( + "--openaimodelname", + help="Name of the Azure OpenAI embedding model ('text-embedding-ada-002' recommended)", + ) + parser.add_argument( + "--openaidimensions", + required=False, + default=1536, + type=int, + help="Dimensions for the embedding model (defaults to 1536 for 'text-embedding-ada-002')", + ) + parser.add_argument( + "--novectors", + action="store_true", + help="Don't compute embeddings for the sections (e.g. don't call the OpenAI embeddings API during indexing)", + ) + parser.add_argument( + "--disablebatchvectors", + action="store_true", + help="Don't compute embeddings in batch for the sections", + ) parser.add_argument( - "--category", help="Value for the category field in the search index for all sections indexed in this run" + "--openaicustomurl", + required=False, + help="Optional. Use this custom OpenAI URL instead of the default OpenAI URL", ) parser.add_argument( - "--skipblobs", action="store_true", help="Skip uploading individual pages to Azure Blob Storage" + "--openaikey", + required=False, + help="Optional. Use this OpenAI account key instead of the current Azure user identity to login.", ) parser.add_argument( - "--disablebatchvectors", action="store_true", help="Don't compute embeddings in batch for the sections" + "--openaiorg", + required=False, + help="This is required only when using non-Azure endpoints.", ) parser.add_argument( "--remove", @@ -272,20 +384,20 @@ async def main(strategy: Strategy, setup_index: bool = True): action="store_true", help="Remove all blobs from blob storage and documents from the search index", ) - - # Optional key specification: parser.add_argument( - "--searchkey", - required=False, - help="Optional. Use this Azure AI Search account key instead of the current user identity to login (use az login to set current user for Azure)", + "--localpdfparser", + action="store_true", + help="Use PyPdf local PDF parser (supports only digital PDFs) instead of Azure Document Intelligence service to extract text, tables and layout from the documents", ) parser.add_argument( - "--storagekey", - required=False, - help="Optional. 
Use this Azure Blob Storage account key instead of the current user identity to login (use az login to set current user for Azure)", + "--localhtmlparser", + action="store_true", + help="Use Beautiful soap local HTML parser instead of Azure Document Intelligence service to extract text, tables and layout from the documents", ) parser.add_argument( - "--datalakekey", required=False, help="Optional. Use this key when authenticating to Azure Data Lake Gen2" + "--documentintelligenceservice", + required=False, + help="Optional. Name of the Azure Document Intelligence service which will be used to extract text, tables and layout from the documents (must exist already)", ) parser.add_argument( "--documentintelligencekey", @@ -293,11 +405,21 @@ async def main(strategy: Strategy, setup_index: bool = True): help="Optional. Use this Azure Document Intelligence account key instead of the current user identity to login (use az login to set current user for Azure)", ) parser.add_argument( - "--searchserviceassignedid", + "--searchimages", + action="store_true", required=False, - help="Search service system assigned Identity (Managed identity) (used for integrated vectorization)", + help="Optional. Generate image embeddings to enable each page to be searched as an image", + ) + parser.add_argument( + "--visionendpoint", + required=False, + help="Optional, required if --searchimages is specified. Endpoint of Azure AI Vision service to use when embedding images.", + ) + parser.add_argument( + "--useintvectorization", + required=False, + help="Required if --useintvectorization is specified. Enable Integrated vectorizer indexer support which is in preview)", ) - parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output") args = parser.parse_args() @@ -307,19 +429,23 @@ async def main(strategy: Strategy, setup_index: bool = True): # to avoid seeing the noisy INFO level logs from the Azure SDKs logger.setLevel(logging.INFO) - load_azd_env() - - use_int_vectorization = os.getenv("USE_FEATURE_INT_VECTORIZATION", "").lower() == "true" - use_gptvision = os.getenv("USE_GPT4V", "").lower() == "true" - use_acls = os.getenv("AZURE_ADLS_GEN2_STORAGE_ACCOUNT") is not None - dont_use_vectors = os.getenv("USE_VECTORS", "").lower() == "false" + use_int_vectorization = ( + args.useintvectorization and args.useintvectorization.lower() == "true" + ) - # Use the current user identity to connect to Azure services. See infra/main.bicep for role assignments. 
- if tenant_id := os.getenv("AZURE_TENANT_ID"): - logger.info("Connecting to Azure services using the azd credential for tenant %s", tenant_id) - azd_credential = AzureDeveloperCliCredential(tenant_id=tenant_id, process_timeout=60) + # Use the current user identity to connect to Azure services unless a key is explicitly set for any of them + if args.tenantid: + logger.info( + "Connecting to Azure services using the azd credential for tenant %s", + args.tenantid, + ) + azd_credential = AzureDeveloperCliCredential( + tenant_id=args.tenantid, process_timeout=60 + ) else: - logger.info("Connecting to Azure services using the azd credential for home tenant") + logger.info( + "Connecting to Azure services using the azd credential for home tenant" + ) azd_credential = AzureDeveloperCliCredential(process_timeout=60) if args.removeall: @@ -334,51 +460,40 @@ async def main(strategy: Strategy, setup_index: bool = True): search_info = loop.run_until_complete( setup_search_info( - search_service=os.environ["AZURE_SEARCH_SERVICE"], - index_name=os.environ["AZURE_SEARCH_INDEX"], + search_service=args.searchservice, + index_name=args.index, azure_credential=azd_credential, search_key=clean_key_if_exists(args.searchkey), ) ) blob_manager = setup_blob_manager( azure_credential=azd_credential, - storage_account=os.environ["AZURE_STORAGE_ACCOUNT"], - storage_container=os.environ["AZURE_STORAGE_CONTAINER"], - storage_resource_group=os.environ["AZURE_STORAGE_RESOURCE_GROUP"], - subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"], - search_images=use_gptvision, + storage_account=args.storageaccount, + storage_container=args.container, + storage_resource_group=args.storageresourcegroup, + subscription_id=args.subscriptionid, + search_images=args.searchimages, storage_key=clean_key_if_exists(args.storagekey), ) list_file_strategy = setup_list_file_strategy( azure_credential=azd_credential, local_files=args.files, - datalake_storage_account=os.getenv("AZURE_ADLS_GEN2_STORAGE_ACCOUNT"), - datalake_filesystem=os.getenv("AZURE_ADLS_GEN2_FILESYSTEM"), - datalake_path=os.getenv("AZURE_ADLS_GEN2_FILESYSTEM_PATH"), + datalake_storage_account=args.datalakestorageaccount, + datalake_filesystem=args.datalakefilesystem, + datalake_path=args.datalakepath, datalake_key=clean_key_if_exists(args.datalakekey), ) - - openai_host = os.environ["OPENAI_HOST"] - openai_key = None - if os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE"): - openai_key = os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE") - elif not openai_host.startswith("azure") and os.getenv("OPENAI_API_KEY"): - openai_key = os.getenv("OPENAI_API_KEY") - - openai_dimensions = 1536 - if os.getenv("AZURE_OPENAI_EMB_DIMENSIONS"): - openai_dimensions = int(os.environ["AZURE_OPENAI_EMB_DIMENSIONS"]) openai_embeddings_service = setup_embeddings_service( azure_credential=azd_credential, - openai_host=openai_host, - openai_model_name=os.environ["AZURE_OPENAI_EMB_MODEL_NAME"], - openai_service=os.getenv("AZURE_OPENAI_SERVICE"), - openai_custom_url=os.getenv("AZURE_OPENAI_CUSTOM_URL"), - openai_deployment=os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT"), - openai_dimensions=openai_dimensions, - openai_key=clean_key_if_exists(openai_key), - openai_org=os.getenv("OPENAI_ORGANIZATION"), - disable_vectors=dont_use_vectors, + openai_host=args.openaihost, + openai_model_name=args.openaimodelname, + openai_service=args.openaiservice, + openai_custom_url=args.openaicustomurl, + openai_deployment=args.openaideployment, + openai_dimensions=args.openaidimensions, + openai_key=clean_key_if_exists(args.openaikey), + 
openai_org=args.openaiorg, + disable_vectors=args.novectors, disable_batch_vectors=args.disablebatchvectors, ) @@ -390,25 +505,25 @@ async def main(strategy: Strategy, setup_index: bool = True): blob_manager=blob_manager, document_action=document_action, embeddings=openai_embeddings_service, - subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"], + subscription_id=args.subscriptionid, search_service_user_assigned_id=args.searchserviceassignedid, - search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"), - use_acls=use_acls, + search_analyzer_name=args.searchanalyzername, + use_acls=args.useacls, category=args.category, ) else: file_processors = setup_file_processors( azure_credential=azd_credential, - document_intelligence_service=os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE"), + document_intelligence_service=args.documentintelligenceservice, document_intelligence_key=clean_key_if_exists(args.documentintelligencekey), - local_pdf_parser=os.getenv("USE_LOCAL_PDF_PARSER") == "true", - local_html_parser=os.getenv("USE_LOCAL_HTML_PARSER") == "true", - search_images=use_gptvision, + local_pdf_parser=args.localpdfparser, + local_html_parser=args.localhtmlparser, + search_images=args.searchimages, ) image_embeddings_service = setup_image_embeddings_service( azure_credential=azd_credential, - vision_endpoint=os.getenv("AZURE_VISION_ENDPOINT"), - search_images=use_gptvision, + vision_endpoint=args.visionendpoint, + search_images=args.searchimages, ) ingestion_strategy = FileStrategy( @@ -419,10 +534,12 @@ async def main(strategy: Strategy, setup_index: bool = True): document_action=document_action, embeddings=openai_embeddings_service, image_embeddings=image_embeddings_service, - search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"), - use_acls=use_acls, + search_analyzer_name=args.searchanalyzername, + use_acls=args.useacls, category=args.category, ) - loop.run_until_complete(main(ingestion_strategy, setup_index=not args.remove and not args.removeall)) + loop.run_until_complete( + main(ingestion_strategy, setup_index=not args.remove and not args.removeall) + ) loop.close() From 887df6ff1d8c3eb66fbb948df6b0510341a5845e Mon Sep 17 00:00:00 2001 From: Saravana Date: Tue, 1 Oct 2024 12:25:32 +0530 Subject: [PATCH 07/10] Update prepdocs.py --- app/backend/prepdocs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index 46abbc01b5..d8a9149744 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -8,6 +8,7 @@ from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider from prepdocslib.blobmanager import BlobManager +from prepdocslib.csvparser import CsvParser from prepdocslib.embeddings import ( AzureOpenAIEmbeddingService, ImageEmbeddings, @@ -20,7 +21,6 @@ IntegratedVectorizerStrategy, ) from prepdocslib.jsonparser import JsonParser -from prepdocslib.csvparser import CsvParser from prepdocslib.listfilestrategy import ( ADLSGen2ListFileStrategy, ListFileStrategy, From daf097ce820a82aa89280e7f76a2d53b5a74e98e Mon Sep 17 00:00:00 2001 From: Saravana Date: Tue, 1 Oct 2024 22:37:11 +0530 Subject: [PATCH 08/10] Update csvparser.py --- app/backend/prepdocslib/csvparser.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/app/backend/prepdocslib/csvparser.py b/app/backend/prepdocslib/csvparser.py index 0277f752cf..cc8cb6d90d 100644 --- a/app/backend/prepdocslib/csvparser.py +++ b/app/backend/prepdocslib/csvparser.py @@ -12,13 +12,14 @@ class CsvParser(Parser): async 
def parse(self, content: IO) -> AsyncGenerator[Page, None]: # Check if content is in bytes (binary file) and decode to string + content_str: str if isinstance(content, (bytes, bytearray)): - content = content.decode("utf-8") + content_str = content.decode("utf-8") elif hasattr(content, "read"): # Handle BufferedReader - content = content.read().decode("utf-8") + content_str = content.read().decode("utf-8") # Create a CSV reader from the text content - reader = csv.reader(content.splitlines()) + reader = csv.reader(content_str.splitlines()) offset = 0 # Skip the header row From cae6f3ad86dd8a080f701c5db5e3d4f58f0cbf76 Mon Sep 17 00:00:00 2001 From: Saravana Date: Wed, 2 Oct 2024 01:28:42 +0530 Subject: [PATCH 09/10] Update prepdocs.py --- app/backend/prepdocs.py | 56 +++++++++++------------------------------ 1 file changed, 14 insertions(+), 42 deletions(-) diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index d8a9149744..d276240ec0 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -68,9 +68,7 @@ def setup_blob_manager( search_images: bool, storage_key: Union[str, None] = None, ): - storage_creds: Union[AsyncTokenCredential, str] = ( - azure_credential if storage_key is None else storage_key - ) + storage_creds: Union[AsyncTokenCredential, str] = azure_credential if storage_key is None else storage_key return BlobManager( endpoint=f"https://{storage_account}.blob.core.windows.net", container=storage_container, @@ -93,15 +91,9 @@ def setup_list_file_strategy( list_file_strategy: ListFileStrategy if datalake_storage_account: if datalake_filesystem is None or datalake_path is None: - raise ValueError( - "DataLake file system and path are required when using Azure Data Lake Gen2" - ) - adls_gen2_creds: Union[AsyncTokenCredential, str] = ( - azure_credential if datalake_key is None else datalake_key - ) - logger.info( - "Using Data Lake Gen2 Storage Account: %s", datalake_storage_account - ) + raise ValueError("DataLake file system and path are required when using Azure Data Lake Gen2") + adls_gen2_creds: Union[AsyncTokenCredential, str] = azure_credential if datalake_key is None else datalake_key + logger.info("Using Data Lake Gen2 Storage Account: %s", datalake_storage_account) list_file_strategy = ADLSGen2ListFileStrategy( data_lake_storage_account=datalake_storage_account, data_lake_filesystem=datalake_filesystem, @@ -112,9 +104,7 @@ def setup_list_file_strategy( logger.info("Using local files: %s", local_files) list_file_strategy = LocalListFileStrategy(path_pattern=local_files) else: - raise ValueError( - "Either local_files or datalake_storage_account must be provided." 
- ) + raise ValueError("Either local_files or datalake_storage_account must be provided.") return list_file_strategy @@ -150,9 +140,7 @@ def setup_embeddings_service( ) else: if openai_key is None: - raise ValueError( - "OpenAI key is required when using the non-Azure OpenAI API" - ) + raise ValueError("OpenAI key is required when using the non-Azure OpenAI API") return OpenAIEmbeddingService( open_ai_model_name=openai_model_name, open_ai_dimensions=openai_dimensions, @@ -177,9 +165,7 @@ def setup_file_processors( # check if Azure Document Intelligence credentials are provided if document_intelligence_service is not None: documentintelligence_creds: Union[AsyncTokenCredential, AzureKeyCredential] = ( - azure_credential - if document_intelligence_key is None - else AzureKeyCredential(document_intelligence_key) + azure_credential if document_intelligence_key is None else AzureKeyCredential(document_intelligence_key) ) doc_int_parser = DocumentAnalysisParser( endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/", @@ -221,14 +207,10 @@ def setup_image_embeddings_service( image_embeddings_service: Optional[ImageEmbeddings] = None if search_images: if vision_endpoint is None: - raise ValueError( - "A computer vision endpoint is required when GPT-4-vision is enabled." - ) + raise ValueError("A computer vision endpoint is required when GPT-4-vision is enabled.") image_embeddings_service = ImageEmbeddings( endpoint=vision_endpoint, - token_provider=get_bearer_token_provider( - azure_credential, "https://cognitiveservices.azure.com/.default" - ), + token_provider=get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default"), ) return image_embeddings_service @@ -283,9 +265,7 @@ async def main(strategy: Strategy, setup_index: bool = True): ) parser.add_argument("--storageaccount", help="Azure Blob Storage account name") parser.add_argument("--container", help="Azure Blob Storage container name") - parser.add_argument( - "--storageresourcegroup", help="Azure blob storage resource group" - ) + parser.add_argument("--storageresourcegroup", help="Azure blob storage resource group") parser.add_argument( "--storagekey", required=False, @@ -429,9 +409,7 @@ async def main(strategy: Strategy, setup_index: bool = True): # to avoid seeing the noisy INFO level logs from the Azure SDKs logger.setLevel(logging.INFO) - use_int_vectorization = ( - args.useintvectorization and args.useintvectorization.lower() == "true" - ) + use_int_vectorization = args.useintvectorization and args.useintvectorization.lower() == "true" # Use the current user identity to connect to Azure services unless a key is explicitly set for any of them if args.tenantid: @@ -439,13 +417,9 @@ async def main(strategy: Strategy, setup_index: bool = True): "Connecting to Azure services using the azd credential for tenant %s", args.tenantid, ) - azd_credential = AzureDeveloperCliCredential( - tenant_id=args.tenantid, process_timeout=60 - ) + azd_credential = AzureDeveloperCliCredential(tenant_id=args.tenantid, process_timeout=60) else: - logger.info( - "Connecting to Azure services using the azd credential for home tenant" - ) + logger.info("Connecting to Azure services using the azd credential for home tenant") azd_credential = AzureDeveloperCliCredential(process_timeout=60) if args.removeall: @@ -539,7 +513,5 @@ async def main(strategy: Strategy, setup_index: bool = True): category=args.category, ) - loop.run_until_complete( - main(ingestion_strategy, setup_index=not args.remove and not 
args.removeall) - ) + loop.run_until_complete(main(ingestion_strategy, setup_index=not args.remove and not args.removeall)) loop.close() From 5aa16495ad56779639ba9bd5654f1a16f0901037 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Wed, 2 Oct 2024 12:41:11 -0700 Subject: [PATCH 10/10] Fix prepdocs and tests to match main --- app/backend/prepdocs.py | 316 ++++++++++++++------------------------- tests/test_app_config.py | 8 +- 2 files changed, 113 insertions(+), 211 deletions(-) diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index d276240ec0..420b4af39f 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -1,12 +1,14 @@ import argparse import asyncio import logging +import os from typing import Optional, Union from azure.core.credentials import AzureKeyCredential from azure.core.credentials_async import AsyncTokenCredential from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider +from load_azd_env import load_azd_env from prepdocslib.blobmanager import BlobManager from prepdocslib.csvparser import CsvParser from prepdocslib.embeddings import ( @@ -32,7 +34,7 @@ from prepdocslib.textparser import TextParser from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter -logger = logging.getLogger("ingester") +logger = logging.getLogger("scripts") def clean_key_if_exists(key: Union[str, None]) -> Union[str, None]: @@ -43,10 +45,7 @@ def clean_key_if_exists(key: Union[str, None]) -> Union[str, None]: async def setup_search_info( - search_service: str, - index_name: str, - azure_credential: AsyncTokenCredential, - search_key: Union[str, None] = None, + search_service: str, index_name: str, azure_credential: AsyncTokenCredential, search_key: Union[str, None] = None ) -> SearchInfo: search_creds: Union[AsyncTokenCredential, AzureKeyCredential] = ( azure_credential if search_key is None else AzureKeyCredential(search_key) @@ -158,10 +157,9 @@ def setup_file_processors( local_html_parser: bool = False, search_images: bool = False, ): - html_parser: Parser - pdf_parser: Parser - doc_int_parser: DocumentAnalysisParser + sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=search_images) + doc_int_parser: Optional[DocumentAnalysisParser] = None # check if Azure Document Intelligence credentials are provided if document_intelligence_service is not None: documentintelligence_creds: Union[AsyncTokenCredential, AzureKeyCredential] = ( @@ -171,38 +169,55 @@ def setup_file_processors( endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/", credential=documentintelligence_creds, ) + + pdf_parser: Optional[Parser] = None if local_pdf_parser or document_intelligence_service is None: pdf_parser = LocalPdfParser() - else: + elif document_intelligence_service is not None: pdf_parser = doc_int_parser + else: + logger.warning("No PDF parser available") + + html_parser: Optional[Parser] = None if local_html_parser or document_intelligence_service is None: html_parser = LocalHTMLParser() - else: + elif document_intelligence_service is not None: html_parser = doc_int_parser - sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=search_images) - return { - ".pdf": FileProcessor(pdf_parser, sentence_text_splitter), - ".html": FileProcessor(html_parser, sentence_text_splitter), + else: + logger.warning("No HTML parser available") + + # These file formats can always be parsed: + file_processors = { ".json": FileProcessor(JsonParser(), SimpleTextSplitter()), - ".docx": 
FileProcessor(doc_int_parser, sentence_text_splitter), - ".pptx": FileProcessor(doc_int_parser, sentence_text_splitter), - ".xlsx": FileProcessor(doc_int_parser, sentence_text_splitter), - ".csv": FileProcessor(CsvParser(), sentence_text_splitter), - ".png": FileProcessor(doc_int_parser, sentence_text_splitter), - ".jpg": FileProcessor(doc_int_parser, sentence_text_splitter), - ".jpeg": FileProcessor(doc_int_parser, sentence_text_splitter), - ".tiff": FileProcessor(doc_int_parser, sentence_text_splitter), - ".bmp": FileProcessor(doc_int_parser, sentence_text_splitter), - ".heic": FileProcessor(doc_int_parser, sentence_text_splitter), ".md": FileProcessor(TextParser(), sentence_text_splitter), ".txt": FileProcessor(TextParser(), sentence_text_splitter), + ".csv": FileProcessor(CsvParser(), sentence_text_splitter), } + # These require either a Python package or Document Intelligence + if pdf_parser is not None: + file_processors.update({".pdf": FileProcessor(pdf_parser, sentence_text_splitter)}) + if html_parser is not None: + file_processors.update({".html": FileProcessor(html_parser, sentence_text_splitter)}) + # These file formats require Document Intelligence + if doc_int_parser is not None: + file_processors.update( + { + ".docx": FileProcessor(doc_int_parser, sentence_text_splitter), + ".pptx": FileProcessor(doc_int_parser, sentence_text_splitter), + ".xlsx": FileProcessor(doc_int_parser, sentence_text_splitter), + ".png": FileProcessor(doc_int_parser, sentence_text_splitter), + ".jpg": FileProcessor(doc_int_parser, sentence_text_splitter), + ".jpeg": FileProcessor(doc_int_parser, sentence_text_splitter), + ".tiff": FileProcessor(doc_int_parser, sentence_text_splitter), + ".bmp": FileProcessor(doc_int_parser, sentence_text_splitter), + ".heic": FileProcessor(doc_int_parser, sentence_text_splitter), + } + ) + return file_processors def setup_image_embeddings_service( - azure_credential: AsyncTokenCredential, - vision_endpoint: Union[str, None], - search_images: bool, + azure_credential: AsyncTokenCredential, vision_endpoint: Union[str, None], search_images: bool ) -> Union[ImageEmbeddings, None]: image_embeddings_service: Optional[ImageEmbeddings] = None if search_images: @@ -225,134 +240,18 @@ async def main(strategy: Strategy, setup_index: bool = True): if __name__ == "__main__": parser = argparse.ArgumentParser( description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index.", - epilog="Example: prepdocs.py '.\\data\*' --storageaccount myaccount --container mycontainer --searchservice mysearch --index myindex -v", + epilog="Example: prepdocs.py '.\\data\*' -v", ) parser.add_argument("files", nargs="?", help="Files to be processed") - parser.add_argument( - "--datalakestorageaccount", - required=False, - help="Optional. Azure Data Lake Storage Gen2 Account name", - ) - parser.add_argument( - "--datalakefilesystem", - required=False, - default="gptkbcontainer", - help="Optional. Azure Data Lake Storage Gen2 filesystem name", - ) - parser.add_argument( - "--datalakepath", - required=False, - help="Optional. Azure Data Lake Storage Gen2 filesystem path containing files to index. If omitted, index the entire filesystem", - ) - parser.add_argument( - "--datalakekey", - required=False, - help="Optional. 
Use this key when authenticating to Azure Data Lake Gen2", - ) - parser.add_argument( - "--useacls", - action="store_true", - help="Store ACLs from Azure Data Lake Gen2 Filesystem in the search index", - ) - parser.add_argument( - "--category", - help="Value for the category field in the search index for all sections indexed in this run", - ) - parser.add_argument( - "--skipblobs", - action="store_true", - help="Skip uploading individual pages to Azure Blob Storage", - ) - parser.add_argument("--storageaccount", help="Azure Blob Storage account name") - parser.add_argument("--container", help="Azure Blob Storage container name") - parser.add_argument("--storageresourcegroup", help="Azure blob storage resource group") - parser.add_argument( - "--storagekey", - required=False, - help="Optional. Use this Azure Blob Storage account key instead of the current user identity to login (use az login to set current user for Azure)", - ) - parser.add_argument( - "--tenantid", - required=False, - help="Optional. Use this to define the Azure directory where to authenticate)", - ) - parser.add_argument( - "--subscriptionid", - required=False, - help="Optional. Use this to define managed identity connection string in integrated vectorization", - ) - parser.add_argument( - "--searchservice", - help="Name of the Azure AI Search service where content should be indexed (must exist already)", - ) - parser.add_argument( - "--searchserviceassignedid", - required=False, - help="Search service system assigned Identity (Managed identity) (used for integrated vectorization)", - ) - parser.add_argument( - "--index", - help="Name of the Azure AI Search index where content should be indexed (will be created if it doesn't exist)", - ) - parser.add_argument( - "--searchkey", - required=False, - help="Optional. Use this Azure AI Search account key instead of the current user identity to login (use az login to set current user for Azure)", - ) - parser.add_argument( - "--searchanalyzername", - required=False, - default="en.microsoft", - help="Optional. Name of the Azure AI Search analyzer to use for the content field in the index", - ) - parser.add_argument( - "--openaihost", - help="Host of the API used to compute embeddings ('azure' or 'openai')", - ) - parser.add_argument( - "--openaiservice", - help="Name of the Azure OpenAI service used to compute embeddings", - ) - parser.add_argument( - "--openaideployment", - help="Name of the Azure OpenAI model deployment for an embedding model ('text-embedding-ada-002' recommended)", - ) - parser.add_argument( - "--openaimodelname", - help="Name of the Azure OpenAI embedding model ('text-embedding-ada-002' recommended)", - ) - parser.add_argument( - "--openaidimensions", - required=False, - default=1536, - type=int, - help="Dimensions for the embedding model (defaults to 1536 for 'text-embedding-ada-002')", - ) - parser.add_argument( - "--novectors", - action="store_true", - help="Don't compute embeddings for the sections (e.g. don't call the OpenAI embeddings API during indexing)", - ) - parser.add_argument( - "--disablebatchvectors", - action="store_true", - help="Don't compute embeddings in batch for the sections", - ) parser.add_argument( - "--openaicustomurl", - required=False, - help="Optional. Use this custom OpenAI URL instead of the default OpenAI URL", + "--category", help="Value for the category field in the search index for all sections indexed in this run" ) parser.add_argument( - "--openaikey", - required=False, - help="Optional. 
Use this OpenAI account key instead of the current Azure user identity to login.", + "--skipblobs", action="store_true", help="Skip uploading individual pages to Azure Blob Storage" ) parser.add_argument( - "--openaiorg", - required=False, - help="This is required only when using non-Azure endpoints.", + "--disablebatchvectors", action="store_true", help="Don't compute embeddings in batch for the sections" ) parser.add_argument( "--remove", @@ -364,42 +263,32 @@ async def main(strategy: Strategy, setup_index: bool = True): action="store_true", help="Remove all blobs from blob storage and documents from the search index", ) + + # Optional key specification: parser.add_argument( - "--localpdfparser", - action="store_true", - help="Use PyPdf local PDF parser (supports only digital PDFs) instead of Azure Document Intelligence service to extract text, tables and layout from the documents", - ) - parser.add_argument( - "--localhtmlparser", - action="store_true", - help="Use Beautiful soap local HTML parser instead of Azure Document Intelligence service to extract text, tables and layout from the documents", - ) - parser.add_argument( - "--documentintelligenceservice", + "--searchkey", required=False, - help="Optional. Name of the Azure Document Intelligence service which will be used to extract text, tables and layout from the documents (must exist already)", + help="Optional. Use this Azure AI Search account key instead of the current user identity to login (use az login to set current user for Azure)", ) parser.add_argument( - "--documentintelligencekey", + "--storagekey", required=False, - help="Optional. Use this Azure Document Intelligence account key instead of the current user identity to login (use az login to set current user for Azure)", + help="Optional. Use this Azure Blob Storage account key instead of the current user identity to login (use az login to set current user for Azure)", ) parser.add_argument( - "--searchimages", - action="store_true", - required=False, - help="Optional. Generate image embeddings to enable each page to be searched as an image", + "--datalakekey", required=False, help="Optional. Use this key when authenticating to Azure Data Lake Gen2" ) parser.add_argument( - "--visionendpoint", + "--documentintelligencekey", required=False, - help="Optional, required if --searchimages is specified. Endpoint of Azure AI Vision service to use when embedding images.", + help="Optional. Use this Azure Document Intelligence account key instead of the current user identity to login (use az login to set current user for Azure)", ) parser.add_argument( - "--useintvectorization", + "--searchserviceassignedid", required=False, - help="Required if --useintvectorization is specified. 
Enable Integrated vectorizer indexer support which is in preview)", + help="Search service system assigned Identity (Managed identity) (used for integrated vectorization)", ) + parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output") args = parser.parse_args() @@ -409,15 +298,17 @@ async def main(strategy: Strategy, setup_index: bool = True): # to avoid seeing the noisy INFO level logs from the Azure SDKs logger.setLevel(logging.INFO) - use_int_vectorization = args.useintvectorization and args.useintvectorization.lower() == "true" + load_azd_env() - # Use the current user identity to connect to Azure services unless a key is explicitly set for any of them - if args.tenantid: - logger.info( - "Connecting to Azure services using the azd credential for tenant %s", - args.tenantid, - ) - azd_credential = AzureDeveloperCliCredential(tenant_id=args.tenantid, process_timeout=60) + use_int_vectorization = os.getenv("USE_FEATURE_INT_VECTORIZATION", "").lower() == "true" + use_gptvision = os.getenv("USE_GPT4V", "").lower() == "true" + use_acls = os.getenv("AZURE_ADLS_GEN2_STORAGE_ACCOUNT") is not None + dont_use_vectors = os.getenv("USE_VECTORS", "").lower() == "false" + + # Use the current user identity to connect to Azure services. See infra/main.bicep for role assignments. + if tenant_id := os.getenv("AZURE_TENANT_ID"): + logger.info("Connecting to Azure services using the azd credential for tenant %s", tenant_id) + azd_credential = AzureDeveloperCliCredential(tenant_id=tenant_id, process_timeout=60) else: logger.info("Connecting to Azure services using the azd credential for home tenant") azd_credential = AzureDeveloperCliCredential(process_timeout=60) @@ -434,40 +325,51 @@ async def main(strategy: Strategy, setup_index: bool = True): search_info = loop.run_until_complete( setup_search_info( - search_service=args.searchservice, - index_name=args.index, + search_service=os.environ["AZURE_SEARCH_SERVICE"], + index_name=os.environ["AZURE_SEARCH_INDEX"], azure_credential=azd_credential, search_key=clean_key_if_exists(args.searchkey), ) ) blob_manager = setup_blob_manager( azure_credential=azd_credential, - storage_account=args.storageaccount, - storage_container=args.container, - storage_resource_group=args.storageresourcegroup, - subscription_id=args.subscriptionid, - search_images=args.searchimages, + storage_account=os.environ["AZURE_STORAGE_ACCOUNT"], + storage_container=os.environ["AZURE_STORAGE_CONTAINER"], + storage_resource_group=os.environ["AZURE_STORAGE_RESOURCE_GROUP"], + subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"], + search_images=use_gptvision, storage_key=clean_key_if_exists(args.storagekey), ) list_file_strategy = setup_list_file_strategy( azure_credential=azd_credential, local_files=args.files, - datalake_storage_account=args.datalakestorageaccount, - datalake_filesystem=args.datalakefilesystem, - datalake_path=args.datalakepath, + datalake_storage_account=os.getenv("AZURE_ADLS_GEN2_STORAGE_ACCOUNT"), + datalake_filesystem=os.getenv("AZURE_ADLS_GEN2_FILESYSTEM"), + datalake_path=os.getenv("AZURE_ADLS_GEN2_FILESYSTEM_PATH"), datalake_key=clean_key_if_exists(args.datalakekey), ) + + openai_host = os.environ["OPENAI_HOST"] + openai_key = None + if os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE"): + openai_key = os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE") + elif not openai_host.startswith("azure") and os.getenv("OPENAI_API_KEY"): + openai_key = os.getenv("OPENAI_API_KEY") + + openai_dimensions = 1536 + if os.getenv("AZURE_OPENAI_EMB_DIMENSIONS"): + 
openai_dimensions = int(os.environ["AZURE_OPENAI_EMB_DIMENSIONS"]) openai_embeddings_service = setup_embeddings_service( azure_credential=azd_credential, - openai_host=args.openaihost, - openai_model_name=args.openaimodelname, - openai_service=args.openaiservice, - openai_custom_url=args.openaicustomurl, - openai_deployment=args.openaideployment, - openai_dimensions=args.openaidimensions, - openai_key=clean_key_if_exists(args.openaikey), - openai_org=args.openaiorg, - disable_vectors=args.novectors, + openai_host=openai_host, + openai_model_name=os.environ["AZURE_OPENAI_EMB_MODEL_NAME"], + openai_service=os.getenv("AZURE_OPENAI_SERVICE"), + openai_custom_url=os.getenv("AZURE_OPENAI_CUSTOM_URL"), + openai_deployment=os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT"), + openai_dimensions=openai_dimensions, + openai_key=clean_key_if_exists(openai_key), + openai_org=os.getenv("OPENAI_ORGANIZATION"), + disable_vectors=dont_use_vectors, disable_batch_vectors=args.disablebatchvectors, ) @@ -479,25 +381,25 @@ async def main(strategy: Strategy, setup_index: bool = True): blob_manager=blob_manager, document_action=document_action, embeddings=openai_embeddings_service, - subscription_id=args.subscriptionid, + subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"], search_service_user_assigned_id=args.searchserviceassignedid, - search_analyzer_name=args.searchanalyzername, - use_acls=args.useacls, + search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"), + use_acls=use_acls, category=args.category, ) else: file_processors = setup_file_processors( azure_credential=azd_credential, - document_intelligence_service=args.documentintelligenceservice, + document_intelligence_service=os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE"), document_intelligence_key=clean_key_if_exists(args.documentintelligencekey), - local_pdf_parser=args.localpdfparser, - local_html_parser=args.localhtmlparser, - search_images=args.searchimages, + local_pdf_parser=os.getenv("USE_LOCAL_PDF_PARSER") == "true", + local_html_parser=os.getenv("USE_LOCAL_HTML_PARSER") == "true", + search_images=use_gptvision, ) image_embeddings_service = setup_image_embeddings_service( azure_credential=azd_credential, - vision_endpoint=args.visionendpoint, - search_images=args.searchimages, + vision_endpoint=os.getenv("AZURE_VISION_ENDPOINT"), + search_images=use_gptvision, ) ingestion_strategy = FileStrategy( @@ -508,8 +410,8 @@ async def main(strategy: Strategy, setup_index: bool = True): document_action=document_action, embeddings=openai_embeddings_service, image_embeddings=image_embeddings_service, - search_analyzer_name=args.searchanalyzername, - use_acls=args.useacls, + search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"), + use_acls=use_acls, category=args.category, ) diff --git a/tests/test_app_config.py b/tests/test_app_config.py index 29139d2a02..cc7f440083 100644 --- a/tests/test_app_config.py +++ b/tests/test_app_config.py @@ -63,7 +63,7 @@ async def test_app_user_upload_processors(monkeypatch, minimal_env): async with quart_app.test_app(): ingester = quart_app.config[app.CONFIG_INGESTER] assert ingester is not None - assert len(ingester.file_processors.keys()) == 5 + assert len(ingester.file_processors.keys()) == 6 @pytest.mark.asyncio @@ -77,7 +77,7 @@ async def test_app_user_upload_processors_docint(monkeypatch, minimal_env): async with quart_app.test_app(): ingester = quart_app.config[app.CONFIG_INGESTER] assert ingester is not None - assert len(ingester.file_processors.keys()) == 14 + assert len(ingester.file_processors.keys()) == 15 
@pytest.mark.asyncio @@ -92,7 +92,7 @@ async def test_app_user_upload_processors_docint_localpdf(monkeypatch, minimal_e async with quart_app.test_app(): ingester = quart_app.config[app.CONFIG_INGESTER] assert ingester is not None - assert len(ingester.file_processors.keys()) == 14 + assert len(ingester.file_processors.keys()) == 15 assert ingester.file_processors[".pdf"] is not ingester.file_processors[".pptx"] @@ -108,7 +108,7 @@ async def test_app_user_upload_processors_docint_localhtml(monkeypatch, minimal_ async with quart_app.test_app(): ingester = quart_app.config[app.CONFIG_INGESTER] assert ingester is not None - assert len(ingester.file_processors.keys()) == 14 + assert len(ingester.file_processors.keys()) == 15 assert ingester.file_processors[".html"] is not ingester.file_processors[".pptx"]
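
---

Note for reviewers: below is a minimal usage sketch of the CsvParser this series adds, for trying it out locally. It is not part of the patches. It assumes you run from app/backend so that prepdocslib is importable, and that Page exposes page_num, offset, and text, as the tests assert; the asyncio wrapper and demo() function are illustrative only.

import asyncio
import io

from prepdocslib.csvparser import CsvParser


async def demo() -> None:
    # The ingestion pipeline opens files in binary mode, so use BytesIO here.
    file = io.BytesIO(b"col1,col2,col3\nvalue1,value2,value3\nvalue4,value5,value6")
    pages = [page async for page in CsvParser().parse(file)]
    # The header row is skipped before enumeration, so only data rows come back,
    # and each offset is the previous offset plus len(text) + 1 for the newline:
    #   page 0: offset 0,  text "value1,value2,value3"
    #   page 1: offset 21, text "value4,value5,value6"
    for page in pages:
        print(page.page_num, page.offset, page.text)

    # parse() also accepts raw bytes, per the isinstance check added in the series:
    raw_pages = [page async for page in CsvParser().parse(b"col1,col2\na,b")]
    assert raw_pages[0].text == "a,b"


if __name__ == "__main__":
    asyncio.run(demo())

Because the header is consumed with next(reader, None) before the loop starts, the first data row is page 0 at offset 0; the header's length never enters the offset arithmetic.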