From bc1b5e30552071be1b99174fe5a82afae3e72e5b Mon Sep 17 00:00:00 2001
From: Saravana
Date: Wed, 25 Sep 2024 01:57:39 +0530
Subject: [PATCH 01/10] Create csvparser.py

Add the CSV parser Python code. It works with basic CSV files.
---
 app/backend/prepdocslib/csvparser.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 app/backend/prepdocslib/csvparser.py

diff --git a/app/backend/prepdocslib/csvparser.py b/app/backend/prepdocslib/csvparser.py
new file mode 100644
index 0000000000..05f0c92d56
--- /dev/null
+++ b/app/backend/prepdocslib/csvparser.py
@@ -0,0 +1,20 @@
+import csv
+from typing import IO, AsyncGenerator
+from .page import Page
+from .parser import Parser
+
+
+class CsvParser(Parser):
+    """
+    Concrete parser that can parse CSV into Page objects. Each row becomes a Page object.
+    """
+
+    async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
+        # Ensure the file is read in text mode
+        text_content = content.read().decode('utf-8')  # Decode bytes to string if opened in binary mode
+        reader = csv.reader(text_content.splitlines())  # Create CSV reader from text lines
+        offset = 0
+        for i, row in enumerate(reader):
+            page_text = ",".join(row)  # Combine CSV row elements back to a string
+            yield Page(i, offset, page_text)
+            offset += len(page_text) + 1  # Add 1 for the newline character or comma

From 15307b8532ff018f378697224545e13a18fd942d Mon Sep 17 00:00:00 2001
From: Saravana
Date: Wed, 25 Sep 2024 01:59:40 +0530
Subject: [PATCH 02/10] Update prepdocs.py

Import the CsvParser class and register it for .csv files in the file processors.
---
 app/backend/prepdocs.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py
index deea428139..f649d5c8b5 100644
--- a/app/backend/prepdocs.py
+++ b/app/backend/prepdocs.py
@@ -20,6 +20,7 @@
     IntegratedVectorizerStrategy,
 )
 from prepdocslib.jsonparser import JsonParser
+from prepdocslib.csvparser import CsvParser
 from prepdocslib.listfilestrategy import (
     ADLSGen2ListFileStrategy,
     ListFileStrategy,
@@ -183,6 +184,7 @@ def setup_file_processors(
         ".docx": FileProcessor(doc_int_parser, sentence_text_splitter),
         ".pptx": FileProcessor(doc_int_parser, sentence_text_splitter),
         ".xlsx": FileProcessor(doc_int_parser, sentence_text_splitter),
+        ".csv": FileProcessor(CsvParser(), sentence_text_splitter),
         ".png": FileProcessor(doc_int_parser, sentence_text_splitter),
         ".jpg": FileProcessor(doc_int_parser, sentence_text_splitter),
         ".jpeg": FileProcessor(doc_int_parser, sentence_text_splitter),

From e4f0dc6467be2b8d8690abc16aae38671f21a111 Mon Sep 17 00:00:00 2001
From: Saravana
Date: Thu, 26 Sep 2024 22:03:08 +0530
Subject: [PATCH 03/10] Create test_csvparser.py

Add the CSV parser test file.
---
 tests/test_csvparser.py | 54 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 tests/test_csvparser.py

diff --git a/tests/test_csvparser.py b/tests/test_csvparser.py
new file mode 100644
index 0000000000..910fdf92f6
--- /dev/null
+++ b/tests/test_csvparser.py
@@ -0,0 +1,54 @@
+import io
+import pytest
+from prepdocslib.csvparser import CsvParser  # Adjust import to the correct module
+
+@pytest.mark.asyncio
+async def test_csvparser_single_row():
+    # Mock CSV content with a single row in binary format
+    file = io.BytesIO(b"col1,col2,col3\nvalue1,value2,value3")
+    file.name = "test.csv"
+    csvparser = CsvParser()
+
+    # Parse the file
+    pages = [page async for page in csvparser.parse(file)]
+
+    # Assertions
+    assert len(pages) == 1
+    assert pages[0].page_num == 0
+
assert pages[0].offset == 0
+    assert pages[0].text == "value1,value2,value3"
+
+
+@pytest.mark.asyncio
+async def test_csvparser_multiple_rows():
+    # Mock CSV content with multiple rows in binary format
+    file = io.BytesIO(b"col1,col2,col3\nvalue1,value2,value3\nvalue4,value5,value6")
+    file.name = "test.csv"
+    csvparser = CsvParser()
+
+    # Parse the file
+    pages = [page async for page in csvparser.parse(file)]
+
+    # Assertions
+    assert len(pages) == 2  # Expect only data rows, skipping the header
+    assert pages[0].page_num == 0
+    assert pages[0].offset == 0
+    assert pages[0].text == "value1,value2,value3"
+
+    assert pages[1].page_num == 1
+    assert pages[1].offset == len(pages[0].text) + 1  # Length of the first row plus a newline
+    assert pages[1].text == "value4,value5,value6"
+
+
+@pytest.mark.asyncio
+async def test_csvparser_empty_file():
+    # Mock empty CSV content in binary format
+    file = io.BytesIO(b"")
+    file.name = "test.csv"
+    csvparser = CsvParser()
+
+    # Parse the file
+    pages = [page async for page in csvparser.parse(file)]
+
+    # Assertions
+    assert len(pages) == 0  # No rows should be parsed from an empty file

From 3d7c887bfeaaae6f359e5c5025f9e97ee8b517e7 Mon Sep 17 00:00:00 2001
From: Saravana
Date: Fri, 27 Sep 2024 02:28:03 +0530
Subject: [PATCH 04/10] Update test_csvparser.py

Formatted the file
---
 tests/test_csvparser.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/test_csvparser.py b/tests/test_csvparser.py
index 910fdf92f6..3db9dc13f8 100644
--- a/tests/test_csvparser.py
+++ b/tests/test_csvparser.py
@@ -1,7 +1,10 @@
 import io
+
 import pytest
+
 from prepdocslib.csvparser import CsvParser  # Adjust import to the correct module
+
 
 @pytest.mark.asyncio
 async def test_csvparser_single_row():
     # Mock CSV content with a single row in binary format
@@ -34,7 +37,7 @@ async def test_csvparser_multiple_rows():
     assert pages[0].page_num == 0
     assert pages[0].offset == 0
     assert pages[0].text == "value1,value2,value3"
-    
+
     assert pages[1].page_num == 1
     assert pages[1].offset == len(pages[0].text) + 1  # Length of the first row plus a newline
     assert pages[1].text == "value4,value5,value6"

From 02dd0f5fba513f3b7c29389ba97d16d9c3d2481b Mon Sep 17 00:00:00 2001
From: Saravana
Date: Fri, 27 Sep 2024 02:29:58 +0530
Subject: [PATCH 05/10] Update csvparser.py

Formatted the file, decoded binary input explicitly, and skipped the header row when parsing.
---
 app/backend/prepdocslib/csvparser.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/app/backend/prepdocslib/csvparser.py b/app/backend/prepdocslib/csvparser.py
index 05f0c92d56..0277f752cf 100644
--- a/app/backend/prepdocslib/csvparser.py
+++ b/app/backend/prepdocslib/csvparser.py
@@ -1,5 +1,6 @@
 import csv
 from typing import IO, AsyncGenerator
+
 from .page import Page
 from .parser import Parser
@@ -10,11 +11,20 @@ class CsvParser(Parser):
     """
 
     async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
-        # Ensure the file is read in text mode
-        text_content = content.read().decode('utf-8')  # Decode bytes to string if opened in binary mode
-        reader = csv.reader(text_content.splitlines())  # Create CSV reader from text lines
+        # Check if content is in bytes (binary file) and decode to string
+        if isinstance(content, (bytes, bytearray)):
+            content = content.decode("utf-8")
+        elif hasattr(content, "read"):  # Handle BufferedReader
+            content = content.read().decode("utf-8")
+
+        # Create a CSV reader from the text content
+        reader = csv.reader(content.splitlines())
         offset = 0
+
+        # Skip the header row
+        next(reader, None)
+
         for i, row in
enumerate(reader): - page_text = ",".join(row) # Combine CSV row elements back to a string + page_text = ",".join(row) yield Page(i, offset, page_text) - offset += len(page_text) + 1 # Add 1 for the newline character or comma + offset += len(page_text) + 1 # Account for newline character From 79007022ae90b5e9c4185b361f3db84f4c129319 Mon Sep 17 00:00:00 2001 From: Saravana Date: Fri, 27 Sep 2024 19:52:52 +0530 Subject: [PATCH 06/10] Update prepdocs.py --- app/backend/prepdocs.py | 355 ++++++++++++++++++++++++++-------------- 1 file changed, 236 insertions(+), 119 deletions(-) diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index 898ae3709f..46abbc01b5 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -1,14 +1,12 @@ import argparse import asyncio import logging -import os from typing import Optional, Union from azure.core.credentials import AzureKeyCredential from azure.core.credentials_async import AsyncTokenCredential from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider -from load_azd_env import load_azd_env from prepdocslib.blobmanager import BlobManager from prepdocslib.embeddings import ( AzureOpenAIEmbeddingService, @@ -34,7 +32,7 @@ from prepdocslib.textparser import TextParser from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter -logger = logging.getLogger("scripts") +logger = logging.getLogger("ingester") def clean_key_if_exists(key: Union[str, None]) -> Union[str, None]: @@ -45,7 +43,10 @@ def clean_key_if_exists(key: Union[str, None]) -> Union[str, None]: async def setup_search_info( - search_service: str, index_name: str, azure_credential: AsyncTokenCredential, search_key: Union[str, None] = None + search_service: str, + index_name: str, + azure_credential: AsyncTokenCredential, + search_key: Union[str, None] = None, ) -> SearchInfo: search_creds: Union[AsyncTokenCredential, AzureKeyCredential] = ( azure_credential if search_key is None else AzureKeyCredential(search_key) @@ -67,7 +68,9 @@ def setup_blob_manager( search_images: bool, storage_key: Union[str, None] = None, ): - storage_creds: Union[AsyncTokenCredential, str] = azure_credential if storage_key is None else storage_key + storage_creds: Union[AsyncTokenCredential, str] = ( + azure_credential if storage_key is None else storage_key + ) return BlobManager( endpoint=f"https://{storage_account}.blob.core.windows.net", container=storage_container, @@ -90,9 +93,15 @@ def setup_list_file_strategy( list_file_strategy: ListFileStrategy if datalake_storage_account: if datalake_filesystem is None or datalake_path is None: - raise ValueError("DataLake file system and path are required when using Azure Data Lake Gen2") - adls_gen2_creds: Union[AsyncTokenCredential, str] = azure_credential if datalake_key is None else datalake_key - logger.info("Using Data Lake Gen2 Storage Account: %s", datalake_storage_account) + raise ValueError( + "DataLake file system and path are required when using Azure Data Lake Gen2" + ) + adls_gen2_creds: Union[AsyncTokenCredential, str] = ( + azure_credential if datalake_key is None else datalake_key + ) + logger.info( + "Using Data Lake Gen2 Storage Account: %s", datalake_storage_account + ) list_file_strategy = ADLSGen2ListFileStrategy( data_lake_storage_account=datalake_storage_account, data_lake_filesystem=datalake_filesystem, @@ -103,7 +112,9 @@ def setup_list_file_strategy( logger.info("Using local files: %s", local_files) list_file_strategy = LocalListFileStrategy(path_pattern=local_files) else: - raise 
ValueError("Either local_files or datalake_storage_account must be provided.") + raise ValueError( + "Either local_files or datalake_storage_account must be provided." + ) return list_file_strategy @@ -139,7 +150,9 @@ def setup_embeddings_service( ) else: if openai_key is None: - raise ValueError("OpenAI key is required when using the non-Azure OpenAI API") + raise ValueError( + "OpenAI key is required when using the non-Azure OpenAI API" + ) return OpenAIEmbeddingService( open_ai_model_name=openai_model_name, open_ai_dimensions=openai_dimensions, @@ -157,37 +170,33 @@ def setup_file_processors( local_html_parser: bool = False, search_images: bool = False, ): - sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=search_images) + html_parser: Parser + pdf_parser: Parser + doc_int_parser: DocumentAnalysisParser - doc_int_parser: Optional[DocumentAnalysisParser] = None # check if Azure Document Intelligence credentials are provided if document_intelligence_service is not None: documentintelligence_creds: Union[AsyncTokenCredential, AzureKeyCredential] = ( - azure_credential if document_intelligence_key is None else AzureKeyCredential(document_intelligence_key) + azure_credential + if document_intelligence_key is None + else AzureKeyCredential(document_intelligence_key) ) doc_int_parser = DocumentAnalysisParser( endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/", credential=documentintelligence_creds, ) - - pdf_parser: Optional[Parser] = None if local_pdf_parser or document_intelligence_service is None: pdf_parser = LocalPdfParser() - elif document_intelligence_service is not None: - pdf_parser = doc_int_parser else: - logger.warning("No PDF parser available") - - html_parser: Optional[Parser] = None + pdf_parser = doc_int_parser if local_html_parser or document_intelligence_service is None: html_parser = LocalHTMLParser() - elif document_intelligence_service is not None: - html_parser = doc_int_parser else: - logger.warning("No HTML parser available") - - # These file formats can always be parsed: - file_processors = { + html_parser = doc_int_parser + sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=search_images) + return { + ".pdf": FileProcessor(pdf_parser, sentence_text_splitter), + ".html": FileProcessor(html_parser, sentence_text_splitter), ".json": FileProcessor(JsonParser(), SimpleTextSplitter()), ".docx": FileProcessor(doc_int_parser, sentence_text_splitter), ".pptx": FileProcessor(doc_int_parser, sentence_text_splitter), @@ -202,39 +211,24 @@ def setup_file_processors( ".md": FileProcessor(TextParser(), sentence_text_splitter), ".txt": FileProcessor(TextParser(), sentence_text_splitter), } - # These require either a Python package or Document Intelligence - if pdf_parser is not None: - file_processors.update({".pdf": FileProcessor(pdf_parser, sentence_text_splitter)}) - if html_parser is not None: - file_processors.update({".html": FileProcessor(html_parser, sentence_text_splitter)}) - # These file formats require Document Intelligence - if doc_int_parser is not None: - file_processors.update( - { - ".docx": FileProcessor(doc_int_parser, sentence_text_splitter), - ".pptx": FileProcessor(doc_int_parser, sentence_text_splitter), - ".xlsx": FileProcessor(doc_int_parser, sentence_text_splitter), - ".png": FileProcessor(doc_int_parser, sentence_text_splitter), - ".jpg": FileProcessor(doc_int_parser, sentence_text_splitter), - ".jpeg": FileProcessor(doc_int_parser, sentence_text_splitter), - ".tiff": 
FileProcessor(doc_int_parser, sentence_text_splitter), - ".bmp": FileProcessor(doc_int_parser, sentence_text_splitter), - ".heic": FileProcessor(doc_int_parser, sentence_text_splitter), - } - ) - return file_processors def setup_image_embeddings_service( - azure_credential: AsyncTokenCredential, vision_endpoint: Union[str, None], search_images: bool + azure_credential: AsyncTokenCredential, + vision_endpoint: Union[str, None], + search_images: bool, ) -> Union[ImageEmbeddings, None]: image_embeddings_service: Optional[ImageEmbeddings] = None if search_images: if vision_endpoint is None: - raise ValueError("A computer vision endpoint is required when GPT-4-vision is enabled.") + raise ValueError( + "A computer vision endpoint is required when GPT-4-vision is enabled." + ) image_embeddings_service = ImageEmbeddings( endpoint=vision_endpoint, - token_provider=get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default"), + token_provider=get_bearer_token_provider( + azure_credential, "https://cognitiveservices.azure.com/.default" + ), ) return image_embeddings_service @@ -249,18 +243,136 @@ async def main(strategy: Strategy, setup_index: bool = True): if __name__ == "__main__": parser = argparse.ArgumentParser( description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index.", - epilog="Example: prepdocs.py '.\\data\*' -v", + epilog="Example: prepdocs.py '.\\data\*' --storageaccount myaccount --container mycontainer --searchservice mysearch --index myindex -v", ) parser.add_argument("files", nargs="?", help="Files to be processed") + parser.add_argument( + "--datalakestorageaccount", + required=False, + help="Optional. Azure Data Lake Storage Gen2 Account name", + ) + parser.add_argument( + "--datalakefilesystem", + required=False, + default="gptkbcontainer", + help="Optional. Azure Data Lake Storage Gen2 filesystem name", + ) + parser.add_argument( + "--datalakepath", + required=False, + help="Optional. Azure Data Lake Storage Gen2 filesystem path containing files to index. If omitted, index the entire filesystem", + ) + parser.add_argument( + "--datalakekey", + required=False, + help="Optional. Use this key when authenticating to Azure Data Lake Gen2", + ) + parser.add_argument( + "--useacls", + action="store_true", + help="Store ACLs from Azure Data Lake Gen2 Filesystem in the search index", + ) + parser.add_argument( + "--category", + help="Value for the category field in the search index for all sections indexed in this run", + ) + parser.add_argument( + "--skipblobs", + action="store_true", + help="Skip uploading individual pages to Azure Blob Storage", + ) + parser.add_argument("--storageaccount", help="Azure Blob Storage account name") + parser.add_argument("--container", help="Azure Blob Storage container name") + parser.add_argument( + "--storageresourcegroup", help="Azure blob storage resource group" + ) + parser.add_argument( + "--storagekey", + required=False, + help="Optional. Use this Azure Blob Storage account key instead of the current user identity to login (use az login to set current user for Azure)", + ) + parser.add_argument( + "--tenantid", + required=False, + help="Optional. Use this to define the Azure directory where to authenticate)", + ) + parser.add_argument( + "--subscriptionid", + required=False, + help="Optional. 
Use this to define managed identity connection string in integrated vectorization", + ) + parser.add_argument( + "--searchservice", + help="Name of the Azure AI Search service where content should be indexed (must exist already)", + ) + parser.add_argument( + "--searchserviceassignedid", + required=False, + help="Search service system assigned Identity (Managed identity) (used for integrated vectorization)", + ) + parser.add_argument( + "--index", + help="Name of the Azure AI Search index where content should be indexed (will be created if it doesn't exist)", + ) + parser.add_argument( + "--searchkey", + required=False, + help="Optional. Use this Azure AI Search account key instead of the current user identity to login (use az login to set current user for Azure)", + ) + parser.add_argument( + "--searchanalyzername", + required=False, + default="en.microsoft", + help="Optional. Name of the Azure AI Search analyzer to use for the content field in the index", + ) + parser.add_argument( + "--openaihost", + help="Host of the API used to compute embeddings ('azure' or 'openai')", + ) + parser.add_argument( + "--openaiservice", + help="Name of the Azure OpenAI service used to compute embeddings", + ) + parser.add_argument( + "--openaideployment", + help="Name of the Azure OpenAI model deployment for an embedding model ('text-embedding-ada-002' recommended)", + ) + parser.add_argument( + "--openaimodelname", + help="Name of the Azure OpenAI embedding model ('text-embedding-ada-002' recommended)", + ) + parser.add_argument( + "--openaidimensions", + required=False, + default=1536, + type=int, + help="Dimensions for the embedding model (defaults to 1536 for 'text-embedding-ada-002')", + ) + parser.add_argument( + "--novectors", + action="store_true", + help="Don't compute embeddings for the sections (e.g. don't call the OpenAI embeddings API during indexing)", + ) + parser.add_argument( + "--disablebatchvectors", + action="store_true", + help="Don't compute embeddings in batch for the sections", + ) parser.add_argument( - "--category", help="Value for the category field in the search index for all sections indexed in this run" + "--openaicustomurl", + required=False, + help="Optional. Use this custom OpenAI URL instead of the default OpenAI URL", ) parser.add_argument( - "--skipblobs", action="store_true", help="Skip uploading individual pages to Azure Blob Storage" + "--openaikey", + required=False, + help="Optional. Use this OpenAI account key instead of the current Azure user identity to login.", ) parser.add_argument( - "--disablebatchvectors", action="store_true", help="Don't compute embeddings in batch for the sections" + "--openaiorg", + required=False, + help="This is required only when using non-Azure endpoints.", ) parser.add_argument( "--remove", @@ -272,20 +384,20 @@ async def main(strategy: Strategy, setup_index: bool = True): action="store_true", help="Remove all blobs from blob storage and documents from the search index", ) - - # Optional key specification: parser.add_argument( - "--searchkey", - required=False, - help="Optional. Use this Azure AI Search account key instead of the current user identity to login (use az login to set current user for Azure)", + "--localpdfparser", + action="store_true", + help="Use PyPdf local PDF parser (supports only digital PDFs) instead of Azure Document Intelligence service to extract text, tables and layout from the documents", ) parser.add_argument( - "--storagekey", - required=False, - help="Optional. 
Use this Azure Blob Storage account key instead of the current user identity to login (use az login to set current user for Azure)", + "--localhtmlparser", + action="store_true", + help="Use Beautiful soap local HTML parser instead of Azure Document Intelligence service to extract text, tables and layout from the documents", ) parser.add_argument( - "--datalakekey", required=False, help="Optional. Use this key when authenticating to Azure Data Lake Gen2" + "--documentintelligenceservice", + required=False, + help="Optional. Name of the Azure Document Intelligence service which will be used to extract text, tables and layout from the documents (must exist already)", ) parser.add_argument( "--documentintelligencekey", @@ -293,11 +405,21 @@ async def main(strategy: Strategy, setup_index: bool = True): help="Optional. Use this Azure Document Intelligence account key instead of the current user identity to login (use az login to set current user for Azure)", ) parser.add_argument( - "--searchserviceassignedid", + "--searchimages", + action="store_true", required=False, - help="Search service system assigned Identity (Managed identity) (used for integrated vectorization)", + help="Optional. Generate image embeddings to enable each page to be searched as an image", + ) + parser.add_argument( + "--visionendpoint", + required=False, + help="Optional, required if --searchimages is specified. Endpoint of Azure AI Vision service to use when embedding images.", + ) + parser.add_argument( + "--useintvectorization", + required=False, + help="Required if --useintvectorization is specified. Enable Integrated vectorizer indexer support which is in preview)", ) - parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output") args = parser.parse_args() @@ -307,19 +429,23 @@ async def main(strategy: Strategy, setup_index: bool = True): # to avoid seeing the noisy INFO level logs from the Azure SDKs logger.setLevel(logging.INFO) - load_azd_env() - - use_int_vectorization = os.getenv("USE_FEATURE_INT_VECTORIZATION", "").lower() == "true" - use_gptvision = os.getenv("USE_GPT4V", "").lower() == "true" - use_acls = os.getenv("AZURE_ADLS_GEN2_STORAGE_ACCOUNT") is not None - dont_use_vectors = os.getenv("USE_VECTORS", "").lower() == "false" + use_int_vectorization = ( + args.useintvectorization and args.useintvectorization.lower() == "true" + ) - # Use the current user identity to connect to Azure services. See infra/main.bicep for role assignments. 
- if tenant_id := os.getenv("AZURE_TENANT_ID"): - logger.info("Connecting to Azure services using the azd credential for tenant %s", tenant_id) - azd_credential = AzureDeveloperCliCredential(tenant_id=tenant_id, process_timeout=60) + # Use the current user identity to connect to Azure services unless a key is explicitly set for any of them + if args.tenantid: + logger.info( + "Connecting to Azure services using the azd credential for tenant %s", + args.tenantid, + ) + azd_credential = AzureDeveloperCliCredential( + tenant_id=args.tenantid, process_timeout=60 + ) else: - logger.info("Connecting to Azure services using the azd credential for home tenant") + logger.info( + "Connecting to Azure services using the azd credential for home tenant" + ) azd_credential = AzureDeveloperCliCredential(process_timeout=60) if args.removeall: @@ -334,51 +460,40 @@ async def main(strategy: Strategy, setup_index: bool = True): search_info = loop.run_until_complete( setup_search_info( - search_service=os.environ["AZURE_SEARCH_SERVICE"], - index_name=os.environ["AZURE_SEARCH_INDEX"], + search_service=args.searchservice, + index_name=args.index, azure_credential=azd_credential, search_key=clean_key_if_exists(args.searchkey), ) ) blob_manager = setup_blob_manager( azure_credential=azd_credential, - storage_account=os.environ["AZURE_STORAGE_ACCOUNT"], - storage_container=os.environ["AZURE_STORAGE_CONTAINER"], - storage_resource_group=os.environ["AZURE_STORAGE_RESOURCE_GROUP"], - subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"], - search_images=use_gptvision, + storage_account=args.storageaccount, + storage_container=args.container, + storage_resource_group=args.storageresourcegroup, + subscription_id=args.subscriptionid, + search_images=args.searchimages, storage_key=clean_key_if_exists(args.storagekey), ) list_file_strategy = setup_list_file_strategy( azure_credential=azd_credential, local_files=args.files, - datalake_storage_account=os.getenv("AZURE_ADLS_GEN2_STORAGE_ACCOUNT"), - datalake_filesystem=os.getenv("AZURE_ADLS_GEN2_FILESYSTEM"), - datalake_path=os.getenv("AZURE_ADLS_GEN2_FILESYSTEM_PATH"), + datalake_storage_account=args.datalakestorageaccount, + datalake_filesystem=args.datalakefilesystem, + datalake_path=args.datalakepath, datalake_key=clean_key_if_exists(args.datalakekey), ) - - openai_host = os.environ["OPENAI_HOST"] - openai_key = None - if os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE"): - openai_key = os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE") - elif not openai_host.startswith("azure") and os.getenv("OPENAI_API_KEY"): - openai_key = os.getenv("OPENAI_API_KEY") - - openai_dimensions = 1536 - if os.getenv("AZURE_OPENAI_EMB_DIMENSIONS"): - openai_dimensions = int(os.environ["AZURE_OPENAI_EMB_DIMENSIONS"]) openai_embeddings_service = setup_embeddings_service( azure_credential=azd_credential, - openai_host=openai_host, - openai_model_name=os.environ["AZURE_OPENAI_EMB_MODEL_NAME"], - openai_service=os.getenv("AZURE_OPENAI_SERVICE"), - openai_custom_url=os.getenv("AZURE_OPENAI_CUSTOM_URL"), - openai_deployment=os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT"), - openai_dimensions=openai_dimensions, - openai_key=clean_key_if_exists(openai_key), - openai_org=os.getenv("OPENAI_ORGANIZATION"), - disable_vectors=dont_use_vectors, + openai_host=args.openaihost, + openai_model_name=args.openaimodelname, + openai_service=args.openaiservice, + openai_custom_url=args.openaicustomurl, + openai_deployment=args.openaideployment, + openai_dimensions=args.openaidimensions, + openai_key=clean_key_if_exists(args.openaikey), + 
openai_org=args.openaiorg, + disable_vectors=args.novectors, disable_batch_vectors=args.disablebatchvectors, ) @@ -390,25 +505,25 @@ async def main(strategy: Strategy, setup_index: bool = True): blob_manager=blob_manager, document_action=document_action, embeddings=openai_embeddings_service, - subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"], + subscription_id=args.subscriptionid, search_service_user_assigned_id=args.searchserviceassignedid, - search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"), - use_acls=use_acls, + search_analyzer_name=args.searchanalyzername, + use_acls=args.useacls, category=args.category, ) else: file_processors = setup_file_processors( azure_credential=azd_credential, - document_intelligence_service=os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE"), + document_intelligence_service=args.documentintelligenceservice, document_intelligence_key=clean_key_if_exists(args.documentintelligencekey), - local_pdf_parser=os.getenv("USE_LOCAL_PDF_PARSER") == "true", - local_html_parser=os.getenv("USE_LOCAL_HTML_PARSER") == "true", - search_images=use_gptvision, + local_pdf_parser=args.localpdfparser, + local_html_parser=args.localhtmlparser, + search_images=args.searchimages, ) image_embeddings_service = setup_image_embeddings_service( azure_credential=azd_credential, - vision_endpoint=os.getenv("AZURE_VISION_ENDPOINT"), - search_images=use_gptvision, + vision_endpoint=args.visionendpoint, + search_images=args.searchimages, ) ingestion_strategy = FileStrategy( @@ -419,10 +534,12 @@ async def main(strategy: Strategy, setup_index: bool = True): document_action=document_action, embeddings=openai_embeddings_service, image_embeddings=image_embeddings_service, - search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"), - use_acls=use_acls, + search_analyzer_name=args.searchanalyzername, + use_acls=args.useacls, category=args.category, ) - loop.run_until_complete(main(ingestion_strategy, setup_index=not args.remove and not args.removeall)) + loop.run_until_complete( + main(ingestion_strategy, setup_index=not args.remove and not args.removeall) + ) loop.close() From 887df6ff1d8c3eb66fbb948df6b0510341a5845e Mon Sep 17 00:00:00 2001 From: Saravana Date: Tue, 1 Oct 2024 12:25:32 +0530 Subject: [PATCH 07/10] Update prepdocs.py --- app/backend/prepdocs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index 46abbc01b5..d8a9149744 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -8,6 +8,7 @@ from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider from prepdocslib.blobmanager import BlobManager +from prepdocslib.csvparser import CsvParser from prepdocslib.embeddings import ( AzureOpenAIEmbeddingService, ImageEmbeddings, @@ -20,7 +21,6 @@ IntegratedVectorizerStrategy, ) from prepdocslib.jsonparser import JsonParser -from prepdocslib.csvparser import CsvParser from prepdocslib.listfilestrategy import ( ADLSGen2ListFileStrategy, ListFileStrategy, From daf097ce820a82aa89280e7f76a2d53b5a74e98e Mon Sep 17 00:00:00 2001 From: Saravana Date: Tue, 1 Oct 2024 22:37:11 +0530 Subject: [PATCH 08/10] Update csvparser.py --- app/backend/prepdocslib/csvparser.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/app/backend/prepdocslib/csvparser.py b/app/backend/prepdocslib/csvparser.py index 0277f752cf..cc8cb6d90d 100644 --- a/app/backend/prepdocslib/csvparser.py +++ b/app/backend/prepdocslib/csvparser.py @@ -12,13 +12,14 @@ class CsvParser(Parser): async 
def parse(self, content: IO) -> AsyncGenerator[Page, None]: # Check if content is in bytes (binary file) and decode to string + content_str: str if isinstance(content, (bytes, bytearray)): - content = content.decode("utf-8") + content_str = content.decode("utf-8") elif hasattr(content, "read"): # Handle BufferedReader - content = content.read().decode("utf-8") + content_str = content.read().decode("utf-8") # Create a CSV reader from the text content - reader = csv.reader(content.splitlines()) + reader = csv.reader(content_str.splitlines()) offset = 0 # Skip the header row From cae6f3ad86dd8a080f701c5db5e3d4f58f0cbf76 Mon Sep 17 00:00:00 2001 From: Saravana Date: Wed, 2 Oct 2024 01:28:42 +0530 Subject: [PATCH 09/10] Update prepdocs.py --- app/backend/prepdocs.py | 56 +++++++++++------------------------------ 1 file changed, 14 insertions(+), 42 deletions(-) diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index d8a9149744..d276240ec0 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -68,9 +68,7 @@ def setup_blob_manager( search_images: bool, storage_key: Union[str, None] = None, ): - storage_creds: Union[AsyncTokenCredential, str] = ( - azure_credential if storage_key is None else storage_key - ) + storage_creds: Union[AsyncTokenCredential, str] = azure_credential if storage_key is None else storage_key return BlobManager( endpoint=f"https://{storage_account}.blob.core.windows.net", container=storage_container, @@ -93,15 +91,9 @@ def setup_list_file_strategy( list_file_strategy: ListFileStrategy if datalake_storage_account: if datalake_filesystem is None or datalake_path is None: - raise ValueError( - "DataLake file system and path are required when using Azure Data Lake Gen2" - ) - adls_gen2_creds: Union[AsyncTokenCredential, str] = ( - azure_credential if datalake_key is None else datalake_key - ) - logger.info( - "Using Data Lake Gen2 Storage Account: %s", datalake_storage_account - ) + raise ValueError("DataLake file system and path are required when using Azure Data Lake Gen2") + adls_gen2_creds: Union[AsyncTokenCredential, str] = azure_credential if datalake_key is None else datalake_key + logger.info("Using Data Lake Gen2 Storage Account: %s", datalake_storage_account) list_file_strategy = ADLSGen2ListFileStrategy( data_lake_storage_account=datalake_storage_account, data_lake_filesystem=datalake_filesystem, @@ -112,9 +104,7 @@ def setup_list_file_strategy( logger.info("Using local files: %s", local_files) list_file_strategy = LocalListFileStrategy(path_pattern=local_files) else: - raise ValueError( - "Either local_files or datalake_storage_account must be provided." 
- ) + raise ValueError("Either local_files or datalake_storage_account must be provided.") return list_file_strategy @@ -150,9 +140,7 @@ def setup_embeddings_service( ) else: if openai_key is None: - raise ValueError( - "OpenAI key is required when using the non-Azure OpenAI API" - ) + raise ValueError("OpenAI key is required when using the non-Azure OpenAI API") return OpenAIEmbeddingService( open_ai_model_name=openai_model_name, open_ai_dimensions=openai_dimensions, @@ -177,9 +165,7 @@ def setup_file_processors( # check if Azure Document Intelligence credentials are provided if document_intelligence_service is not None: documentintelligence_creds: Union[AsyncTokenCredential, AzureKeyCredential] = ( - azure_credential - if document_intelligence_key is None - else AzureKeyCredential(document_intelligence_key) + azure_credential if document_intelligence_key is None else AzureKeyCredential(document_intelligence_key) ) doc_int_parser = DocumentAnalysisParser( endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/", @@ -221,14 +207,10 @@ def setup_image_embeddings_service( image_embeddings_service: Optional[ImageEmbeddings] = None if search_images: if vision_endpoint is None: - raise ValueError( - "A computer vision endpoint is required when GPT-4-vision is enabled." - ) + raise ValueError("A computer vision endpoint is required when GPT-4-vision is enabled.") image_embeddings_service = ImageEmbeddings( endpoint=vision_endpoint, - token_provider=get_bearer_token_provider( - azure_credential, "https://cognitiveservices.azure.com/.default" - ), + token_provider=get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default"), ) return image_embeddings_service @@ -283,9 +265,7 @@ async def main(strategy: Strategy, setup_index: bool = True): ) parser.add_argument("--storageaccount", help="Azure Blob Storage account name") parser.add_argument("--container", help="Azure Blob Storage container name") - parser.add_argument( - "--storageresourcegroup", help="Azure blob storage resource group" - ) + parser.add_argument("--storageresourcegroup", help="Azure blob storage resource group") parser.add_argument( "--storagekey", required=False, @@ -429,9 +409,7 @@ async def main(strategy: Strategy, setup_index: bool = True): # to avoid seeing the noisy INFO level logs from the Azure SDKs logger.setLevel(logging.INFO) - use_int_vectorization = ( - args.useintvectorization and args.useintvectorization.lower() == "true" - ) + use_int_vectorization = args.useintvectorization and args.useintvectorization.lower() == "true" # Use the current user identity to connect to Azure services unless a key is explicitly set for any of them if args.tenantid: @@ -439,13 +417,9 @@ async def main(strategy: Strategy, setup_index: bool = True): "Connecting to Azure services using the azd credential for tenant %s", args.tenantid, ) - azd_credential = AzureDeveloperCliCredential( - tenant_id=args.tenantid, process_timeout=60 - ) + azd_credential = AzureDeveloperCliCredential(tenant_id=args.tenantid, process_timeout=60) else: - logger.info( - "Connecting to Azure services using the azd credential for home tenant" - ) + logger.info("Connecting to Azure services using the azd credential for home tenant") azd_credential = AzureDeveloperCliCredential(process_timeout=60) if args.removeall: @@ -539,7 +513,5 @@ async def main(strategy: Strategy, setup_index: bool = True): category=args.category, ) - loop.run_until_complete( - main(ingestion_strategy, setup_index=not args.remove and not 
args.removeall) - ) + loop.run_until_complete(main(ingestion_strategy, setup_index=not args.remove and not args.removeall)) loop.close() From 5aa16495ad56779639ba9bd5654f1a16f0901037 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Wed, 2 Oct 2024 12:41:11 -0700 Subject: [PATCH 10/10] Fix prepdocs and tests to match main --- app/backend/prepdocs.py | 316 ++++++++++++++------------------------- tests/test_app_config.py | 8 +- 2 files changed, 113 insertions(+), 211 deletions(-) diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index d276240ec0..420b4af39f 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -1,12 +1,14 @@ import argparse import asyncio import logging +import os from typing import Optional, Union from azure.core.credentials import AzureKeyCredential from azure.core.credentials_async import AsyncTokenCredential from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider +from load_azd_env import load_azd_env from prepdocslib.blobmanager import BlobManager from prepdocslib.csvparser import CsvParser from prepdocslib.embeddings import ( @@ -32,7 +34,7 @@ from prepdocslib.textparser import TextParser from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter -logger = logging.getLogger("ingester") +logger = logging.getLogger("scripts") def clean_key_if_exists(key: Union[str, None]) -> Union[str, None]: @@ -43,10 +45,7 @@ def clean_key_if_exists(key: Union[str, None]) -> Union[str, None]: async def setup_search_info( - search_service: str, - index_name: str, - azure_credential: AsyncTokenCredential, - search_key: Union[str, None] = None, + search_service: str, index_name: str, azure_credential: AsyncTokenCredential, search_key: Union[str, None] = None ) -> SearchInfo: search_creds: Union[AsyncTokenCredential, AzureKeyCredential] = ( azure_credential if search_key is None else AzureKeyCredential(search_key) @@ -158,10 +157,9 @@ def setup_file_processors( local_html_parser: bool = False, search_images: bool = False, ): - html_parser: Parser - pdf_parser: Parser - doc_int_parser: DocumentAnalysisParser + sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=search_images) + doc_int_parser: Optional[DocumentAnalysisParser] = None # check if Azure Document Intelligence credentials are provided if document_intelligence_service is not None: documentintelligence_creds: Union[AsyncTokenCredential, AzureKeyCredential] = ( @@ -171,38 +169,55 @@ def setup_file_processors( endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/", credential=documentintelligence_creds, ) + + pdf_parser: Optional[Parser] = None if local_pdf_parser or document_intelligence_service is None: pdf_parser = LocalPdfParser() - else: + elif document_intelligence_service is not None: pdf_parser = doc_int_parser + else: + logger.warning("No PDF parser available") + + html_parser: Optional[Parser] = None if local_html_parser or document_intelligence_service is None: html_parser = LocalHTMLParser() - else: + elif document_intelligence_service is not None: html_parser = doc_int_parser - sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=search_images) - return { - ".pdf": FileProcessor(pdf_parser, sentence_text_splitter), - ".html": FileProcessor(html_parser, sentence_text_splitter), + else: + logger.warning("No HTML parser available") + + # These file formats can always be parsed: + file_processors = { ".json": FileProcessor(JsonParser(), SimpleTextSplitter()), - ".docx": 
FileProcessor(doc_int_parser, sentence_text_splitter), - ".pptx": FileProcessor(doc_int_parser, sentence_text_splitter), - ".xlsx": FileProcessor(doc_int_parser, sentence_text_splitter), - ".csv": FileProcessor(CsvParser(), sentence_text_splitter), - ".png": FileProcessor(doc_int_parser, sentence_text_splitter), - ".jpg": FileProcessor(doc_int_parser, sentence_text_splitter), - ".jpeg": FileProcessor(doc_int_parser, sentence_text_splitter), - ".tiff": FileProcessor(doc_int_parser, sentence_text_splitter), - ".bmp": FileProcessor(doc_int_parser, sentence_text_splitter), - ".heic": FileProcessor(doc_int_parser, sentence_text_splitter), ".md": FileProcessor(TextParser(), sentence_text_splitter), ".txt": FileProcessor(TextParser(), sentence_text_splitter), + ".csv": FileProcessor(CsvParser(), sentence_text_splitter), } + # These require either a Python package or Document Intelligence + if pdf_parser is not None: + file_processors.update({".pdf": FileProcessor(pdf_parser, sentence_text_splitter)}) + if html_parser is not None: + file_processors.update({".html": FileProcessor(html_parser, sentence_text_splitter)}) + # These file formats require Document Intelligence + if doc_int_parser is not None: + file_processors.update( + { + ".docx": FileProcessor(doc_int_parser, sentence_text_splitter), + ".pptx": FileProcessor(doc_int_parser, sentence_text_splitter), + ".xlsx": FileProcessor(doc_int_parser, sentence_text_splitter), + ".png": FileProcessor(doc_int_parser, sentence_text_splitter), + ".jpg": FileProcessor(doc_int_parser, sentence_text_splitter), + ".jpeg": FileProcessor(doc_int_parser, sentence_text_splitter), + ".tiff": FileProcessor(doc_int_parser, sentence_text_splitter), + ".bmp": FileProcessor(doc_int_parser, sentence_text_splitter), + ".heic": FileProcessor(doc_int_parser, sentence_text_splitter), + } + ) + return file_processors def setup_image_embeddings_service( - azure_credential: AsyncTokenCredential, - vision_endpoint: Union[str, None], - search_images: bool, + azure_credential: AsyncTokenCredential, vision_endpoint: Union[str, None], search_images: bool ) -> Union[ImageEmbeddings, None]: image_embeddings_service: Optional[ImageEmbeddings] = None if search_images: @@ -225,134 +240,18 @@ async def main(strategy: Strategy, setup_index: bool = True): if __name__ == "__main__": parser = argparse.ArgumentParser( description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index.", - epilog="Example: prepdocs.py '.\\data\*' --storageaccount myaccount --container mycontainer --searchservice mysearch --index myindex -v", + epilog="Example: prepdocs.py '.\\data\*' -v", ) parser.add_argument("files", nargs="?", help="Files to be processed") - parser.add_argument( - "--datalakestorageaccount", - required=False, - help="Optional. Azure Data Lake Storage Gen2 Account name", - ) - parser.add_argument( - "--datalakefilesystem", - required=False, - default="gptkbcontainer", - help="Optional. Azure Data Lake Storage Gen2 filesystem name", - ) - parser.add_argument( - "--datalakepath", - required=False, - help="Optional. Azure Data Lake Storage Gen2 filesystem path containing files to index. If omitted, index the entire filesystem", - ) - parser.add_argument( - "--datalakekey", - required=False, - help="Optional. 
Use this key when authenticating to Azure Data Lake Gen2", - ) - parser.add_argument( - "--useacls", - action="store_true", - help="Store ACLs from Azure Data Lake Gen2 Filesystem in the search index", - ) - parser.add_argument( - "--category", - help="Value for the category field in the search index for all sections indexed in this run", - ) - parser.add_argument( - "--skipblobs", - action="store_true", - help="Skip uploading individual pages to Azure Blob Storage", - ) - parser.add_argument("--storageaccount", help="Azure Blob Storage account name") - parser.add_argument("--container", help="Azure Blob Storage container name") - parser.add_argument("--storageresourcegroup", help="Azure blob storage resource group") - parser.add_argument( - "--storagekey", - required=False, - help="Optional. Use this Azure Blob Storage account key instead of the current user identity to login (use az login to set current user for Azure)", - ) - parser.add_argument( - "--tenantid", - required=False, - help="Optional. Use this to define the Azure directory where to authenticate)", - ) - parser.add_argument( - "--subscriptionid", - required=False, - help="Optional. Use this to define managed identity connection string in integrated vectorization", - ) - parser.add_argument( - "--searchservice", - help="Name of the Azure AI Search service where content should be indexed (must exist already)", - ) - parser.add_argument( - "--searchserviceassignedid", - required=False, - help="Search service system assigned Identity (Managed identity) (used for integrated vectorization)", - ) - parser.add_argument( - "--index", - help="Name of the Azure AI Search index where content should be indexed (will be created if it doesn't exist)", - ) - parser.add_argument( - "--searchkey", - required=False, - help="Optional. Use this Azure AI Search account key instead of the current user identity to login (use az login to set current user for Azure)", - ) - parser.add_argument( - "--searchanalyzername", - required=False, - default="en.microsoft", - help="Optional. Name of the Azure AI Search analyzer to use for the content field in the index", - ) - parser.add_argument( - "--openaihost", - help="Host of the API used to compute embeddings ('azure' or 'openai')", - ) - parser.add_argument( - "--openaiservice", - help="Name of the Azure OpenAI service used to compute embeddings", - ) - parser.add_argument( - "--openaideployment", - help="Name of the Azure OpenAI model deployment for an embedding model ('text-embedding-ada-002' recommended)", - ) - parser.add_argument( - "--openaimodelname", - help="Name of the Azure OpenAI embedding model ('text-embedding-ada-002' recommended)", - ) - parser.add_argument( - "--openaidimensions", - required=False, - default=1536, - type=int, - help="Dimensions for the embedding model (defaults to 1536 for 'text-embedding-ada-002')", - ) - parser.add_argument( - "--novectors", - action="store_true", - help="Don't compute embeddings for the sections (e.g. don't call the OpenAI embeddings API during indexing)", - ) - parser.add_argument( - "--disablebatchvectors", - action="store_true", - help="Don't compute embeddings in batch for the sections", - ) parser.add_argument( - "--openaicustomurl", - required=False, - help="Optional. Use this custom OpenAI URL instead of the default OpenAI URL", + "--category", help="Value for the category field in the search index for all sections indexed in this run" ) parser.add_argument( - "--openaikey", - required=False, - help="Optional. 
Use this OpenAI account key instead of the current Azure user identity to login.", + "--skipblobs", action="store_true", help="Skip uploading individual pages to Azure Blob Storage" ) parser.add_argument( - "--openaiorg", - required=False, - help="This is required only when using non-Azure endpoints.", + "--disablebatchvectors", action="store_true", help="Don't compute embeddings in batch for the sections" ) parser.add_argument( "--remove", @@ -364,42 +263,32 @@ async def main(strategy: Strategy, setup_index: bool = True): action="store_true", help="Remove all blobs from blob storage and documents from the search index", ) + + # Optional key specification: parser.add_argument( - "--localpdfparser", - action="store_true", - help="Use PyPdf local PDF parser (supports only digital PDFs) instead of Azure Document Intelligence service to extract text, tables and layout from the documents", - ) - parser.add_argument( - "--localhtmlparser", - action="store_true", - help="Use Beautiful soap local HTML parser instead of Azure Document Intelligence service to extract text, tables and layout from the documents", - ) - parser.add_argument( - "--documentintelligenceservice", + "--searchkey", required=False, - help="Optional. Name of the Azure Document Intelligence service which will be used to extract text, tables and layout from the documents (must exist already)", + help="Optional. Use this Azure AI Search account key instead of the current user identity to login (use az login to set current user for Azure)", ) parser.add_argument( - "--documentintelligencekey", + "--storagekey", required=False, - help="Optional. Use this Azure Document Intelligence account key instead of the current user identity to login (use az login to set current user for Azure)", + help="Optional. Use this Azure Blob Storage account key instead of the current user identity to login (use az login to set current user for Azure)", ) parser.add_argument( - "--searchimages", - action="store_true", - required=False, - help="Optional. Generate image embeddings to enable each page to be searched as an image", + "--datalakekey", required=False, help="Optional. Use this key when authenticating to Azure Data Lake Gen2" ) parser.add_argument( - "--visionendpoint", + "--documentintelligencekey", required=False, - help="Optional, required if --searchimages is specified. Endpoint of Azure AI Vision service to use when embedding images.", + help="Optional. Use this Azure Document Intelligence account key instead of the current user identity to login (use az login to set current user for Azure)", ) parser.add_argument( - "--useintvectorization", + "--searchserviceassignedid", required=False, - help="Required if --useintvectorization is specified. 
Enable Integrated vectorizer indexer support which is in preview)", + help="Search service system assigned Identity (Managed identity) (used for integrated vectorization)", ) + parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output") args = parser.parse_args() @@ -409,15 +298,17 @@ async def main(strategy: Strategy, setup_index: bool = True): # to avoid seeing the noisy INFO level logs from the Azure SDKs logger.setLevel(logging.INFO) - use_int_vectorization = args.useintvectorization and args.useintvectorization.lower() == "true" + load_azd_env() - # Use the current user identity to connect to Azure services unless a key is explicitly set for any of them - if args.tenantid: - logger.info( - "Connecting to Azure services using the azd credential for tenant %s", - args.tenantid, - ) - azd_credential = AzureDeveloperCliCredential(tenant_id=args.tenantid, process_timeout=60) + use_int_vectorization = os.getenv("USE_FEATURE_INT_VECTORIZATION", "").lower() == "true" + use_gptvision = os.getenv("USE_GPT4V", "").lower() == "true" + use_acls = os.getenv("AZURE_ADLS_GEN2_STORAGE_ACCOUNT") is not None + dont_use_vectors = os.getenv("USE_VECTORS", "").lower() == "false" + + # Use the current user identity to connect to Azure services. See infra/main.bicep for role assignments. + if tenant_id := os.getenv("AZURE_TENANT_ID"): + logger.info("Connecting to Azure services using the azd credential for tenant %s", tenant_id) + azd_credential = AzureDeveloperCliCredential(tenant_id=tenant_id, process_timeout=60) else: logger.info("Connecting to Azure services using the azd credential for home tenant") azd_credential = AzureDeveloperCliCredential(process_timeout=60) @@ -434,40 +325,51 @@ async def main(strategy: Strategy, setup_index: bool = True): search_info = loop.run_until_complete( setup_search_info( - search_service=args.searchservice, - index_name=args.index, + search_service=os.environ["AZURE_SEARCH_SERVICE"], + index_name=os.environ["AZURE_SEARCH_INDEX"], azure_credential=azd_credential, search_key=clean_key_if_exists(args.searchkey), ) ) blob_manager = setup_blob_manager( azure_credential=azd_credential, - storage_account=args.storageaccount, - storage_container=args.container, - storage_resource_group=args.storageresourcegroup, - subscription_id=args.subscriptionid, - search_images=args.searchimages, + storage_account=os.environ["AZURE_STORAGE_ACCOUNT"], + storage_container=os.environ["AZURE_STORAGE_CONTAINER"], + storage_resource_group=os.environ["AZURE_STORAGE_RESOURCE_GROUP"], + subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"], + search_images=use_gptvision, storage_key=clean_key_if_exists(args.storagekey), ) list_file_strategy = setup_list_file_strategy( azure_credential=azd_credential, local_files=args.files, - datalake_storage_account=args.datalakestorageaccount, - datalake_filesystem=args.datalakefilesystem, - datalake_path=args.datalakepath, + datalake_storage_account=os.getenv("AZURE_ADLS_GEN2_STORAGE_ACCOUNT"), + datalake_filesystem=os.getenv("AZURE_ADLS_GEN2_FILESYSTEM"), + datalake_path=os.getenv("AZURE_ADLS_GEN2_FILESYSTEM_PATH"), datalake_key=clean_key_if_exists(args.datalakekey), ) + + openai_host = os.environ["OPENAI_HOST"] + openai_key = None + if os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE"): + openai_key = os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE") + elif not openai_host.startswith("azure") and os.getenv("OPENAI_API_KEY"): + openai_key = os.getenv("OPENAI_API_KEY") + + openai_dimensions = 1536 + if os.getenv("AZURE_OPENAI_EMB_DIMENSIONS"): + 
openai_dimensions = int(os.environ["AZURE_OPENAI_EMB_DIMENSIONS"]) openai_embeddings_service = setup_embeddings_service( azure_credential=azd_credential, - openai_host=args.openaihost, - openai_model_name=args.openaimodelname, - openai_service=args.openaiservice, - openai_custom_url=args.openaicustomurl, - openai_deployment=args.openaideployment, - openai_dimensions=args.openaidimensions, - openai_key=clean_key_if_exists(args.openaikey), - openai_org=args.openaiorg, - disable_vectors=args.novectors, + openai_host=openai_host, + openai_model_name=os.environ["AZURE_OPENAI_EMB_MODEL_NAME"], + openai_service=os.getenv("AZURE_OPENAI_SERVICE"), + openai_custom_url=os.getenv("AZURE_OPENAI_CUSTOM_URL"), + openai_deployment=os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT"), + openai_dimensions=openai_dimensions, + openai_key=clean_key_if_exists(openai_key), + openai_org=os.getenv("OPENAI_ORGANIZATION"), + disable_vectors=dont_use_vectors, disable_batch_vectors=args.disablebatchvectors, ) @@ -479,25 +381,25 @@ async def main(strategy: Strategy, setup_index: bool = True): blob_manager=blob_manager, document_action=document_action, embeddings=openai_embeddings_service, - subscription_id=args.subscriptionid, + subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"], search_service_user_assigned_id=args.searchserviceassignedid, - search_analyzer_name=args.searchanalyzername, - use_acls=args.useacls, + search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"), + use_acls=use_acls, category=args.category, ) else: file_processors = setup_file_processors( azure_credential=azd_credential, - document_intelligence_service=args.documentintelligenceservice, + document_intelligence_service=os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE"), document_intelligence_key=clean_key_if_exists(args.documentintelligencekey), - local_pdf_parser=args.localpdfparser, - local_html_parser=args.localhtmlparser, - search_images=args.searchimages, + local_pdf_parser=os.getenv("USE_LOCAL_PDF_PARSER") == "true", + local_html_parser=os.getenv("USE_LOCAL_HTML_PARSER") == "true", + search_images=use_gptvision, ) image_embeddings_service = setup_image_embeddings_service( azure_credential=azd_credential, - vision_endpoint=args.visionendpoint, - search_images=args.searchimages, + vision_endpoint=os.getenv("AZURE_VISION_ENDPOINT"), + search_images=use_gptvision, ) ingestion_strategy = FileStrategy( @@ -508,8 +410,8 @@ async def main(strategy: Strategy, setup_index: bool = True): document_action=document_action, embeddings=openai_embeddings_service, image_embeddings=image_embeddings_service, - search_analyzer_name=args.searchanalyzername, - use_acls=args.useacls, + search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"), + use_acls=use_acls, category=args.category, ) diff --git a/tests/test_app_config.py b/tests/test_app_config.py index 29139d2a02..cc7f440083 100644 --- a/tests/test_app_config.py +++ b/tests/test_app_config.py @@ -63,7 +63,7 @@ async def test_app_user_upload_processors(monkeypatch, minimal_env): async with quart_app.test_app(): ingester = quart_app.config[app.CONFIG_INGESTER] assert ingester is not None - assert len(ingester.file_processors.keys()) == 5 + assert len(ingester.file_processors.keys()) == 6 @pytest.mark.asyncio @@ -77,7 +77,7 @@ async def test_app_user_upload_processors_docint(monkeypatch, minimal_env): async with quart_app.test_app(): ingester = quart_app.config[app.CONFIG_INGESTER] assert ingester is not None - assert len(ingester.file_processors.keys()) == 14 + assert len(ingester.file_processors.keys()) == 15 
@pytest.mark.asyncio @@ -92,7 +92,7 @@ async def test_app_user_upload_processors_docint_localpdf(monkeypatch, minimal_e async with quart_app.test_app(): ingester = quart_app.config[app.CONFIG_INGESTER] assert ingester is not None - assert len(ingester.file_processors.keys()) == 14 + assert len(ingester.file_processors.keys()) == 15 assert ingester.file_processors[".pdf"] is not ingester.file_processors[".pptx"] @@ -108,7 +108,7 @@ async def test_app_user_upload_processors_docint_localhtml(monkeypatch, minimal_ async with quart_app.test_app(): ingester = quart_app.config[app.CONFIG_INGESTER] assert ingester is not None - assert len(ingester.file_processors.keys()) == 14 + assert len(ingester.file_processors.keys()) == 15 assert ingester.file_processors[".html"] is not ingester.file_processors[".pptx"]
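
---

Note for reviewers: below is a minimal usage sketch of the CsvParser this series adds, for trying it out locally. It is not part of the patches. It assumes you run from app/backend so that prepdocslib is importable, and that Page exposes page_num, offset, and text, as the tests assert; the asyncio wrapper and demo() function are illustrative only.

import asyncio
import io

from prepdocslib.csvparser import CsvParser


async def demo() -> None:
    # The ingestion pipeline opens files in binary mode, so use BytesIO here.
    file = io.BytesIO(b"col1,col2,col3\nvalue1,value2,value3\nvalue4,value5,value6")
    pages = [page async for page in CsvParser().parse(file)]
    # The header row is skipped before enumeration, so only data rows come back,
    # and each offset is the previous offset plus len(text) + 1 for the newline:
    #   page 0: offset 0,  text "value1,value2,value3"
    #   page 1: offset 21, text "value4,value5,value6"
    for page in pages:
        print(page.page_num, page.offset, page.text)

    # parse() also accepts raw bytes, per the isinstance check added in the series:
    raw_pages = [page async for page in CsvParser().parse(b"col1,col2\na,b")]
    assert raw_pages[0].text == "a,b"


if __name__ == "__main__":
    asyncio.run(demo())

Because the header is consumed with next(reader, None) before the loop starts, the first data row is page 0 at offset 0; the header's length never enters the offset arithmetic.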