From 60b37c1f17ccdd1686c8d066246d8b09abef4065 Mon Sep 17 00:00:00 2001
From: Jenkins
Date: Wed, 24 Apr 2024 23:37:26 +0000
Subject: [PATCH 01/25] test status updated

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index fcc9189..c3cf72a 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 ![GitHub License](https://img.shields.io/github/license/arjbingly/Capstone_5)
 ![Linting](https://img.shields.io/github/actions/workflow/status/arjbingly/Capstone_5/sphinx-gitpg.yml?label=Docs)
 ![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/arjbingly/Capstone_5/build_linting.yml?label=Linting)
-![Static Badge](https://img.shields.io/badge/Tests-failing-red)
+![Static Badge](https://img.shields.io/badge/Tests-passing-darkgreen)
 ![Static Badge](https://img.shields.io/badge/docstring%20style-google-yellow)
 ![Static Badge](https://img.shields.io/badge/linter%20-ruff-yellow)
 ![Static Badge](https://img.shields.io/badge/buildstyle-hatchling-purple?labelColor=white)

From 9b1af7816121a728ec568b2ea1b1bb09da91418d Mon Sep 17 00:00:00 2001
From: sanchitvj
Date: Tue, 30 Apr 2024 16:26:10 -0400
Subject: [PATCH 02/25] quantization modified

---
 config.ini                        |   6 +-
 pyproject.toml                    |   3 +-
 src/grag/quantize/get_releases.py |  68 +++++++++++
 src/grag/quantize/get_repo.py     |  33 +++++
 src/grag/quantize/quantize.py     |  64 ++++++----
 src/grag/quantize/utils.py        | 194 ++++++++++++++++++------------
 6 files changed, 263 insertions(+), 105 deletions(-)
 create mode 100644 src/grag/quantize/get_releases.py
 create mode 100644 src/grag/quantize/get_repo.py

diff --git a/config.ini b/config.ini
index 339d77c..b8573a7 100644
--- a/config.ini
+++ b/config.ini
@@ -12,7 +12,7 @@ n_ctx : 6000
 n_gpu_layers : -1
 # The number of layers to put on the GPU. Mixtral-18, gemma-20
 std_out : True
-base_dir : ${root:root_path}/models
+;base_dir : ${root:root_path}/models
 
 [chroma_client]
 host : localhost
@@ -64,5 +64,5 @@ env_path : ${root:root_path}/.env
 
 [root]
 root_path : /home/ubuntu/volume_2k/Capstone_5
-[quantize]
-llama_cpp_path : ${root:root_path}
+;[quantize]
+;llama_cpp_path : ${root:root_path}

diff --git a/pyproject.toml b/pyproject.toml
index 33fd0b7..bc92122 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,7 +46,8 @@ dependencies = [
     "bitsandbytes>=0.42.0",
     "accelerate>=0.28.0",
     "poppler-utils>=0.1.0",
-    "tesseract>=0.1.3"
+    "tesseract>=0.1.3",
+    "GitPython>=4.0",
 ]
 
 [project.optional-dependencies]

diff --git a/src/grag/quantize/get_releases.py b/src/grag/quantize/get_releases.py
new file mode 100644
index 0000000..5eb8e5b
--- /dev/null
+++ b/src/grag/quantize/get_releases.py
@@ -0,0 +1,68 @@
+import requests
+
+#
+# def get_github_releases(user, repo):
+#     url = f"https://api.github.com/repos/{user}/{repo}/releases/latest"
+#     response = requests.get(url)
+#     releases = response.json()
+#     return releases
+#
+#
+# # Example usage
+# latest_release = get_github_releases('ggerganov',
+#                                      'llama.cpp')  # Replace 'nodejs' and 'node' with the appropriate user and repository
+# if 'tag_name' in latest_release:
+#     print(f"Latest Release Tag: {latest_release['tag_name']}, Assets: {len(latest_release['assets'])}")
+# else:
+#     print("Error fetching latest release:", latest_release.get('message', 'No error message provided'))
+
+
+def get_asset_download_url(user, repo, asset_name_pattern):
+    """Fetches the download URL of the first asset that matches a given name pattern in the latest release of the specified repository.
+ + Args: + user (str): GitHub username or organization of the repository. + repo (str): Repository name. + asset_name_pattern (str): Substring to match in the asset's name. + + Returns: + str: The download URL of the matching asset, or None if no match is found. + """ + url = f"https://api.github.com/repos/{user}/{repo}/releases/latest" + response = requests.get(url) + if response.status_code == 200: + release = response.json() + for asset in release.get('assets', []): + if asset_name_pattern in asset['name']: + return asset['browser_download_url'] + print("No asset found matching the pattern.") + else: + print("Failed to fetch release info:", response.status_code) + return None + + +def download_release_asset(download_url, target_path): + """Downloads a file from a given URL and saves it to a specified path. + + Args: + download_url (str): The URL of the file to download. + target_path (str): Path where the file will be saved. + """ + response = requests.get(download_url, stream=True) + if response.status_code == 200: + with open(target_path, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + print(f"Downloaded successfully to {target_path}") + else: + print(f"Failed to download file: {response.status_code}") + + +# Example usage +user = 'ggerganov' +repo = 'llama.cpp' +asset_name_pattern = 'ubuntu-x64' # Adjust the pattern to match the desired asset +target_path = 'llama-cpp-ubuntu-x64.zip' +download_url = get_asset_download_url(user, repo, asset_name_pattern) +if download_url: + download_release_asset(download_url, target_path) diff --git a/src/grag/quantize/get_repo.py b/src/grag/quantize/get_repo.py new file mode 100644 index 0000000..f3981e0 --- /dev/null +++ b/src/grag/quantize/get_repo.py @@ -0,0 +1,33 @@ +import os + +from git import Repo + + +def clone_or_update_repo_with_gitpython(repo_url, destination_folder): + """Clones a GitHub repository to a specified local directory or updates it if it already exists using GitPython. + + Args: + repo_url (str): The URL of the repository to clone. + destination_folder (str): The local path where the repository should be cloned or updated. + + Returns: + None + """ + if os.path.isdir(destination_folder) and os.path.isdir(os.path.join(destination_folder, '.git')): + try: + repo = Repo(destination_folder) + origin = repo.remotes.origin + origin.pull() + print(f"Repository updated successfully in {destination_folder}") + except Exception as e: + print(f"Failed to update repository: {str(e)}") + else: + try: + Repo.clone_from(repo_url, destination_folder) + print(f"Repository cloned successfully into {destination_folder}") + except Exception as e: + print(f"Failed to clone repository: {str(e)}") + + +# Example usage +clone_or_update_repo_with_gitpython('https://github.com/ggerganov/llama.cpp.git', './llama.cpp') diff --git a/src/grag/quantize/quantize.py b/src/grag/quantize/quantize.py index 64fba47..74f735f 100644 --- a/src/grag/quantize/quantize.py +++ b/src/grag/quantize/quantize.py @@ -1,52 +1,66 @@ """Interactive file for quantizing models.""" +import platform from pathlib import Path from grag.components.utils import get_config from grag.quantize.utils import ( - building_llamacpp, + download_release_asset, fetch_model_repo, + get_asset_download_url, get_llamacpp_repo, quantize_model, ) config = get_config() -root_path = Path(config["quantize"]["llama_cpp_path"]) if __name__ == "__main__": user_input = input( - "Enter the path to the llama_cpp cloned repo, or where you'd like to clone it. 
Press Enter to use the default config path: " - ).strip() + "Enter the path which you want to download all the source files. Press Enter to use the default path: ").strip() - if user_input != "": + if user_input == "": + try: + root_path = Path(config["quantize"]["llama_cpp_path"]) + except KeyError: + root_path = Path('./grag-quantize') + else: root_path = Path(user_input) - res = get_llamacpp_repo(root_path) + get_llamacpp_repo(destination_folder=root_path) + os_name = platform.system() + architecture = platform.machine() + asset_name_pattern = 'bin' + match os_name, architecture: + case ('Darwin', 'x86_64'): + asset_name_pattern += '-macos-x64' + case ('Darwin', 'arm64'): + asset_name_pattern += '-macos-arm64' + case ('Windows', 'x86_64'): + asset_name_pattern += '-win-arm64-x64' + case ('Windows', 'arm64'): + asset_name_pattern += '-win-arm64-x64' + case ('Linux', 'x86_64'): + asset_name_pattern += '-ubuntu-x64' + case _: + raise ValueError(f"{os_name=}, {architecture=} is not supported by llama.cpp releases.") - if "Already up to date." in str(res.stdout): - print("Repository is already up to date. Skipping build.") - else: - print("Updates found. Starting build...") - building_llamacpp(root_path) - - response = ( - input("Do you want us to download the model? (y/n) [Enter for yes]: ") - .strip() - .lower() - ) + download_url = get_asset_download_url(asset_name_pattern) + if download_url: + download_release_asset(download_url, root_path) + + response = input("Do you want us to download the model? (y/n) [Enter for yes]: ").strip().lower() if response == "n": - print("Please copy the model folder to 'llama.cpp/models/' folder.") - _ = input("Enter if you have already copied the model:") - model_dir = Path(input("Enter the model directory name: ")) + model_dir = Path(input("Enter path to the model directory: ")) elif response == "y" or response == "": repo_id = input( "Please enter the repo_id for the model (you can check on https://huggingface.co/models): " ).strip() - fetch_model_repo(repo_id, root_path) - # model_dir = repo_id.split('/')[1] - model_dir = root_path / "llama.cpp" / "models" / repo_id.split("/")[1] + model_dir = fetch_model_repo(repo_id, root_path / 'models') quantization = input( "Enter quantization, recommended - Q5_K_M or Q4_K_M for more check https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp#L19 : " - ) - quantize_model(model_dir, quantization, root_path) + ).strip() + output_dir = input( + f"Enter path where you want to save the quantized model, else the following path will be used [{model_dir}]: ").strip() + + quantize_model(model_dir, quantization, root_path, output_dir) diff --git a/src/grag/quantize/utils.py b/src/grag/quantize/utils.py index 0b90516..d390724 100644 --- a/src/grag/quantize/utils.py +++ b/src/grag/quantize/utils.py @@ -2,97 +2,105 @@ import os import subprocess +import sys +import zipfile from pathlib import Path from typing import Optional, Union +import requests +from git import Repo from grag.components.utils import get_config from huggingface_hub import snapshot_download config = get_config() -def get_llamacpp_repo(root_path: Union[str, Path]) -> subprocess.CompletedProcess: - """Clones or pulls the llama.cpp repository into the specified root path. 
+def get_llamacpp_repo(repo_url: str = 'https://github.com/ggerganov/llama.cpp.git', + destination_folder: Union[str, Path] = './grag-quantize') -> None: + """Clones a GitHub repository to a specified local directory or updates it if it already exists using GitPython. Args: - root_path: The root directory where the llama.cpp repository will be cloned or updated. + repo_url (str): The URL of the repository to clone. + destination_folder (str, Path): The local path where the repository should be cloned or updated. Returns: - A subprocess.CompletedProcess instance containing the result of the git operation. + None """ - if os.path.exists(f"{root_path}/llama.cpp"): - print(f"Repo exists at: {root_path}/llama.cpp") - res = subprocess.run( - ["git", "-C", f"{root_path}/llama.cpp", "pull"], - check=True, - capture_output=True, - ) + destination_folder = Path(destination_folder) / 'llama.cpp' + destination_folder.mkdir(parents=True, exist_ok=True) + if os.path.isdir(destination_folder) and os.path.isdir(os.path.join(destination_folder, '.git')): + try: + repo = Repo(destination_folder) + origin = repo.remotes.origin + origin.pull() + print(f"Repository updated successfully in {destination_folder}") + except Exception as e: + print(f"Failed to update repository: {str(e)}") else: - res = subprocess.run( - [ - "git", - "clone", - "https://github.com/ggerganov/llama.cpp.git", - f"{root_path}/llama.cpp", - ], - check=True, - capture_output=True, - ) + try: + Repo.clone_from(repo_url, destination_folder) + print(f"Repository cloned successfully into {destination_folder}") + except Exception as e: + print(f"Failed to clone repository: {str(e)}") - return res +def get_asset_download_url(asset_name_pattern: str, user: str = 'ggerganov', repo: str = 'llama.cpp') -> Optional[str]: + """Fetches the download URL of the first asset that matches a given name pattern in the latest release of the specified repository. -def building_llamacpp(root_path: Union[str, Path]) -> None: - """Attempts to build the llama.cpp project using make or cmake. + Args: + user (str): GitHub username or organization of the repository. + repo (str): Repository name. + asset_name_pattern (str): Substring to match in the asset's name. + + Returns: + str: The download URL of the matching asset, or None if no match is found. + """ + url = f"https://api.github.com/repos/{user}/{repo}/releases/latest" + response = requests.get(url) + if response.status_code == 200: + release = response.json() + for asset in release.get('assets', []): + if asset_name_pattern in asset['name']: + return asset['browser_download_url'] + print("No asset found matching the pattern.") + else: + print("Failed to fetch release info:", response.status_code) + return None + + +def download_release_asset(download_url: str, target_path: Union[Path, str] = './grag-quantize') -> None: + """Downloads a file from a given URL and saves it to a specified path. Args: - root_path (str): The root directory where the llama.cpp project is located. + download_url (str): The URL of the file to download. + target_path (str, Path): Path where the file will be saved. 
""" - os.chdir(f"{root_path}/llama.cpp/") - try: - subprocess.run(["which", "make"], check=True, stdout=subprocess.DEVNULL) - subprocess.run(["make", "LLAMA_CUDA=1"], check=True) - print("Llama.cpp build successful.") - except subprocess.CalledProcessError: - try: - subprocess.run(["which", "cmake"], check=True, stdout=subprocess.DEVNULL) - subprocess.run(["mkdir", "build"], check=True) - subprocess.run( - [ - "cd", - "build", - "&&", - "cmake", - "..", - "-DLLAMA_CUDA=ON", - "&&", - "cmake", - "--build", - ".", - "--config", - "Release", - ], - shell=True, - check=True, - ) - print("Llama.cpp build successful.") - except subprocess.CalledProcessError: - print("Unable to build, cannot find make or cmake.") - finally: - os.chdir( - Path(__file__).parent - ) # Assuming you want to return to the root path after operation - - -def fetch_model_repo(repo_id: str, root_path: Union[str, Path]) -> None: + target_path = Path(target_path) + target_path.mkdir(parents=True, exist_ok=True) + response = requests.get(download_url, stream=True) + if response.status_code == 200: + with open(target_path / 'llamacpp_release.zip', 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + print(f"Downloaded successfully to {target_path}") + with zipfile.ZipFile(target_path / 'llamacpp_release.zip', 'r') as zip_ref: + # Extract all the contents into the destination directory + zip_ref.extractall(target_path) + print(f"Files extracted to {target_path}") + else: + print(f"Failed to download file: {response.status_code}") + + +def fetch_model_repo(repo_id: str, model_path: Union[str, Path] = './grag-quantize/models') -> None: """Download model from huggingface.co/models. Args: repo_id (str): Repository ID of the model to download. - root_path (str): The root path where the model should be downloaded or copied. + model_path (str): The root path where the model should be downloaded or copied. """ - local_dir = f"{root_path}/llama.cpp/models/{repo_id.split('/')[1]}" - os.makedirs(local_dir, exist_ok=True) + model_path = Path(model_path) + local_dir = model_path / f"{repo_id.split('/')[1]}" + local_dir.mkdir(parents=True, exist_ok=True) snapshot_download( repo_id=repo_id, local_dir=local_dir, @@ -100,12 +108,13 @@ def fetch_model_repo(repo_id: str, root_path: Union[str, Path]) -> None: resume_download=True, ) print(f"Model downloaded in {local_dir}") + return local_dir def quantize_model( model_dir_path: Union[str, Path], quantization: str, - root_path: Union[str, Path], + target_path: Union[str, Path] = './grag-quantize', # path with both bulid and llamacpp output_dir: Optional[Union[str, Path]] = None, ) -> None: """Quantizes a specified model using a given quantization level. @@ -116,20 +125,53 @@ def quantize_model( quantization (str): The quantization level to apply. root_path (str, Path): The root directory path of the project. 
""" - os.chdir(f"{root_path}/llama.cpp/") - model_dir_path = Path(model_dir_path) + # os.chdir(f"{root_path}/llama.cpp/") + model_dir_path = Path(model_dir_path).resolve() if output_dir is None: - output_dir = config["llm"]["base_dir"] + try: + output_dir = config["llm"]["base_dir"] + except KeyError: + output_dir = Path('.') output_dir = Path(output_dir) / model_dir_path.name - os.makedirs(output_dir, exist_ok=True) + output_dir.mkdir(parents=True, exist_ok=True) + output_dir = output_dir.resolve() + + target_path = Path(target_path).resolve() + os.chdir(target_path / 'llama.cpp') + convert_script_path = os.path.join(target_path, 'llama.cpp') + sys.path.append(convert_script_path) + + from convert import main as convert - subprocess.run(["python3", "convert.py", f"{model_dir_path}/"], check=True) - model_file = model_dir_path / "ggml-model-f32.gguf" + args_list = [f'{model_dir_path}', + '--outfile', f'{output_dir}/ggml-model-f32.gguf'] + if not os.path.exists(f'{output_dir}/ggml-model-f32.gguf'): + try: + convert(args_list) + except TypeError as e: + if 'with BpeVocab' in str(e): + args_list.extend(['--vocab-type', 'bpe']) + convert(args_list) + else: + raise e + + model_file = output_dir / "ggml-model-f32.gguf" quantized_model_file = output_dir / f"ggml-model-{quantization}.gguf" - subprocess.run( - ["./quantize", str(model_file), str(quantized_model_file), quantization], - check=True, - ) + binary_path = target_path / 'build' / 'bin' / 'quantize' + main_path = target_path / 'build' / 'bin' / 'main' + # cmd = f'{binary_path} {str(model_file)} {str(quantized_model_file)} {quantization}' + # os.system(cmd) + cmd = [str(binary_path), str(model_file), str(quantized_model_file), quantization] + run_cmd = [str(binary_path), '-m', str(quantized_model_file)] + + try: + subprocess.run(cmd, check=True) + except PermissionError: + os.chmod(binary_path, 0o777) + os.chmod(main_path, 0o777) + subprocess.run(cmd, check=True) + subprocess.run(run_cmd, check=True) print(f"Quantized model present at {output_dir}") + os.chdir(Path(__file__).parent) # Return to the root path after operation From f8923a16871034402231c37bddb8a1dddc3a2a5c Mon Sep 17 00:00:00 2001 From: sanchitvj Date: Tue, 30 Apr 2024 18:18:40 -0400 Subject: [PATCH 03/25] added inference in quantization --- src/grag/quantize/quantize.py | 10 ++++++++- src/grag/quantize/utils.py | 39 ++++++++++++++++++++++++++++------- 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/src/grag/quantize/quantize.py b/src/grag/quantize/quantize.py index 74f735f..d010a97 100644 --- a/src/grag/quantize/quantize.py +++ b/src/grag/quantize/quantize.py @@ -9,6 +9,7 @@ fetch_model_repo, get_asset_download_url, get_llamacpp_repo, + inference_quantized_model, quantize_model, ) @@ -55,6 +56,8 @@ repo_id = input( "Please enter the repo_id for the model (you can check on https://huggingface.co/models): " ).strip() + if repo_id == "": + raise ValueError("Repo ID you entered was empty. Please enter the repo_id for the model.") model_dir = fetch_model_repo(repo_id, root_path / 'models') quantization = input( @@ -63,4 +66,9 @@ output_dir = input( f"Enter path where you want to save the quantized model, else the following path will be used [{model_dir}]: ").strip() - quantize_model(model_dir, quantization, root_path, output_dir) + target_path, quantized_model_file = quantize_model(model_dir, quantization, root_path, output_dir) + + inference = input( + "Do you want to inference the quantized model to check if quantization is successful? 
(y/n) [Enter for yes]: ").strip().lower() + inference = True if inference == "y" or inference == "" else False + inference_quantized_model(target_path, quantized_model_file) diff --git a/src/grag/quantize/utils.py b/src/grag/quantize/utils.py index d390724..f294677 100644 --- a/src/grag/quantize/utils.py +++ b/src/grag/quantize/utils.py @@ -111,6 +111,13 @@ def fetch_model_repo(repo_id: str, model_path: Union[str, Path] = './grag-quanti return local_dir +def exec_quantize(quantized_model_file: Union[str, Path], cmd: list): + if not os.path.exists(quantized_model_file): + subprocess.run(cmd, check=True) + else: + print("Quantized model already exists for given quantization, skipping...") + + def quantize_model( model_dir_path: Union[str, Path], quantization: str, @@ -155,23 +162,39 @@ def quantize_model( convert(args_list) else: raise e + else: + print('f32 gguf file already exists, skipping conversion...') model_file = output_dir / "ggml-model-f32.gguf" quantized_model_file = output_dir / f"ggml-model-{quantization}.gguf" binary_path = target_path / 'build' / 'bin' / 'quantize' - main_path = target_path / 'build' / 'bin' / 'main' - # cmd = f'{binary_path} {str(model_file)} {str(quantized_model_file)} {quantization}' - # os.system(cmd) + cmd = [str(binary_path), str(model_file), str(quantized_model_file), quantization] - run_cmd = [str(binary_path), '-m', str(quantized_model_file)] try: - subprocess.run(cmd, check=True) + exec_quantize(quantized_model_file, cmd) except PermissionError: os.chmod(binary_path, 0o777) - os.chmod(main_path, 0o777) - subprocess.run(cmd, check=True) - subprocess.run(run_cmd, check=True) + exec_quantize(quantized_model_file, cmd) + print(f"Quantized model present at {output_dir}") os.chdir(Path(__file__).parent) # Return to the root path after operation + + return target_path, quantized_model_file + + +def inference_quantized_model(target_path: Union[str, Path], quantized_model_file: Union[str, Path]): + main_path = target_path / 'build' / 'bin' / 'main' + run_cmd = [str(main_path), '-m', str(quantized_model_file)] + try: + res = subprocess.run(run_cmd, check=True, text=True, capture_output=True) + except PermissionError: + os.chmod(main_path, 0o777) + res = subprocess.run(run_cmd, check=True, text=True, capture_output=True) + + if subprocess.CalledProcessError: + raise RuntimeError(subprocess.CalledProcessError.stderr) + else: + print('Inference successfull for this quantized model.') + # print(res.stdout) From cf05b50661982c5de15db6b76d167868345617fd Mon Sep 17 00:00:00 2001 From: sanchitvj Date: Tue, 30 Apr 2024 23:03:21 -0400 Subject: [PATCH 04/25] added some tests --- src/grag/quantize/quantize.py | 5 ++--- src/grag/quantize/utils.py | 15 +++++-------- src/tests/quantize/quantize_test.py | 34 ++++++++++++++++++++--------- 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/src/grag/quantize/quantize.py b/src/grag/quantize/quantize.py index d010a97..18db975 100644 --- a/src/grag/quantize/quantize.py +++ b/src/grag/quantize/quantize.py @@ -69,6 +69,5 @@ target_path, quantized_model_file = quantize_model(model_dir, quantization, root_path, output_dir) inference = input( - "Do you want to inference the quantized model to check if quantization is successful? (y/n) [Enter for yes]: ").strip().lower() - inference = True if inference == "y" or inference == "" else False - inference_quantized_model(target_path, quantized_model_file) + "Do you want to inference the quantized model to check if quantization is successful? 
Warning: It takes time as it inferences on CPU. (y/n) [Enter for yes]: ").strip().lower() + inference_quantized_model(target_path, quantized_model_file) if inference == "y" or inference == "" else None diff --git a/src/grag/quantize/utils.py b/src/grag/quantize/utils.py index f294677..b4d5337 100644 --- a/src/grag/quantize/utils.py +++ b/src/grag/quantize/utils.py @@ -91,7 +91,7 @@ def download_release_asset(download_url: str, target_path: Union[Path, str] = '. print(f"Failed to download file: {response.status_code}") -def fetch_model_repo(repo_id: str, model_path: Union[str, Path] = './grag-quantize/models') -> None: +def fetch_model_repo(repo_id: str, model_path: Union[str, Path] = './grag-quantize/models') -> Union[str, Path]: """Download model from huggingface.co/models. Args: @@ -111,7 +111,7 @@ def fetch_model_repo(repo_id: str, model_path: Union[str, Path] = './grag-quanti return local_dir -def exec_quantize(quantized_model_file: Union[str, Path], cmd: list): +def exec_quantize(quantized_model_file: Union[str, Path], cmd: list) -> None: if not os.path.exists(quantized_model_file): subprocess.run(cmd, check=True) else: @@ -188,13 +188,8 @@ def inference_quantized_model(target_path: Union[str, Path], quantized_model_fil main_path = target_path / 'build' / 'bin' / 'main' run_cmd = [str(main_path), '-m', str(quantized_model_file)] try: - res = subprocess.run(run_cmd, check=True, text=True, capture_output=True) + subprocess.run(run_cmd, check=True, text=True, capture_output=True) except PermissionError: os.chmod(main_path, 0o777) - res = subprocess.run(run_cmd, check=True, text=True, capture_output=True) - - if subprocess.CalledProcessError: - raise RuntimeError(subprocess.CalledProcessError.stderr) - else: - print('Inference successfull for this quantized model.') - # print(res.stdout) + subprocess.run(run_cmd, check=True, text=True, capture_output=True) + print('Inference successfull for this quantized model.') diff --git a/src/tests/quantize/quantize_test.py b/src/tests/quantize/quantize_test.py index 68078fe..5575cf1 100644 --- a/src/tests/quantize/quantize_test.py +++ b/src/tests/quantize/quantize_test.py @@ -2,18 +2,22 @@ import shutil from pathlib import Path +import pytest from grag.quantize.utils import ( - building_llamacpp, - fetch_model_repo, get_llamacpp_repo, - quantize_model, + get_asset_download_url, + download_release_asset, + fetch_model_repo, + ) -root_path = Path(__file__).parent / "test_data" +root_path = Path(__file__).parent / "test_quantization" if os.path.exists(root_path): shutil.rmtree(root_path) os.makedirs(root_path, exist_ok=True) +asset_pattern_list = ['-macos-x64', '-macos-arm64', '-win-arm64-x64', '-win-arm64-x64', '-ubuntu-x64'] + def test_get_llamacpp_repo(): get_llamacpp_repo(root_path) @@ -21,15 +25,25 @@ def test_get_llamacpp_repo(): assert os.path.exists(repo_path) -def test_build_llamacpp(): - building_llamacpp(root_path) - bin_path = root_path / "llama.cpp" / "quantize" - assert os.path.exists(bin_path) +@pytest.mark.parametrize("asset_pattern", asset_pattern_list) +def test_get_asset_download_url(asset_pattern): + url = get_asset_download_url(asset_pattern, 'ggerganov', 'llama.cpp') + response = requests.get(url, stream=True) + assert response.status_code == 200 + + +@pytest.mark.parametrize("asset_pattern", asset_pattern_list) +def test_download_release_asset(asset_pattern): + url = get_asset_download_url(asset_pattern, 'ggerganov', 'llama.cpp') + response = requests.get(url, stream=True) + download_release_asset(response, root_path) + assert 
os.path.exists(root_path / 'build' / 'bin' / 'quantize') + assert os.path.exists(root_path / 'build' / 'bin' / 'main') def test_fetch_model_repo(): - fetch_model_repo("meta-llama/Llama-2-7b-chat", root_path) - model_dir_path = root_path / "llama.cpp" / "models" / "Llama-2-7b-chat" + fetch_model_repo("meta-llama/Llama-2-7b-chat", root_path / 'models') + model_dir_path = root_path / "models" / "Llama-2-7b-chat" assert os.path.exists(model_dir_path) From a2353934fa68db1e0394ab2307131afd32245d26 Mon Sep 17 00:00:00 2001 From: sanchitvj Date: Wed, 1 May 2024 17:48:29 -0400 Subject: [PATCH 05/25] ruff & type checked, all tests passed --- src/grag/components/llm.py | 2 +- src/grag/quantize/get_releases.py | 68 -------------- src/grag/quantize/get_repo.py | 33 ------- src/grag/quantize/utils.py | 137 ++++++++++++++++------------ src/tests/quantize/quantize_test.py | 36 +++++--- 5 files changed, 100 insertions(+), 176 deletions(-) delete mode 100644 src/grag/quantize/get_releases.py delete mode 100644 src/grag/quantize/get_repo.py diff --git a/src/grag/components/llm.py b/src/grag/components/llm.py index f7ede73..e09b5bb 100644 --- a/src/grag/components/llm.py +++ b/src/grag/components/llm.py @@ -50,7 +50,7 @@ def __init__( device_map: str = "auto", task: str = "text-generation", max_new_tokens: str = "1024", - temperature: Union[str, int] = 0.1, + temperature: Union[str, float] = 0.1, n_batch: Union[str, int] = 1024, n_ctx: Union[str, int] = 6000, n_gpu_layers: Union[str, int] = -1, diff --git a/src/grag/quantize/get_releases.py b/src/grag/quantize/get_releases.py deleted file mode 100644 index 5eb8e5b..0000000 --- a/src/grag/quantize/get_releases.py +++ /dev/null @@ -1,68 +0,0 @@ -import requests - -# -# def get_github_releases(user, repo): -# url = f"https://api.github.com/repos/{user}/{repo}/releases/latest" -# response = requests.get(url) -# releases = response.json() -# return releases -# -# -# # Example usage -# latest_release = get_github_releases('ggerganov', -# 'llama.cpp') # Replace 'nodejs' and 'node' with the appropriate user and repository -# if 'tag_name' in latest_release: -# print(f"Latest Release Tag: {latest_release['tag_name']}, Assets: {len(latest_release['assets'])}") -# else: -# print("Error fetching latest release:", latest_release.get('message', 'No error message provided')) - - -def get_asset_download_url(user, repo, asset_name_pattern): - """Fetches the download URL of the first asset that matches a given name pattern in the latest release of the specified repository. - - Args: - user (str): GitHub username or organization of the repository. - repo (str): Repository name. - asset_name_pattern (str): Substring to match in the asset's name. - - Returns: - str: The download URL of the matching asset, or None if no match is found. - """ - url = f"https://api.github.com/repos/{user}/{repo}/releases/latest" - response = requests.get(url) - if response.status_code == 200: - release = response.json() - for asset in release.get('assets', []): - if asset_name_pattern in asset['name']: - return asset['browser_download_url'] - print("No asset found matching the pattern.") - else: - print("Failed to fetch release info:", response.status_code) - return None - - -def download_release_asset(download_url, target_path): - """Downloads a file from a given URL and saves it to a specified path. - - Args: - download_url (str): The URL of the file to download. - target_path (str): Path where the file will be saved. 
- """ - response = requests.get(download_url, stream=True) - if response.status_code == 200: - with open(target_path, 'wb') as f: - for chunk in response.iter_content(chunk_size=8192): - f.write(chunk) - print(f"Downloaded successfully to {target_path}") - else: - print(f"Failed to download file: {response.status_code}") - - -# Example usage -user = 'ggerganov' -repo = 'llama.cpp' -asset_name_pattern = 'ubuntu-x64' # Adjust the pattern to match the desired asset -target_path = 'llama-cpp-ubuntu-x64.zip' -download_url = get_asset_download_url(user, repo, asset_name_pattern) -if download_url: - download_release_asset(download_url, target_path) diff --git a/src/grag/quantize/get_repo.py b/src/grag/quantize/get_repo.py deleted file mode 100644 index f3981e0..0000000 --- a/src/grag/quantize/get_repo.py +++ /dev/null @@ -1,33 +0,0 @@ -import os - -from git import Repo - - -def clone_or_update_repo_with_gitpython(repo_url, destination_folder): - """Clones a GitHub repository to a specified local directory or updates it if it already exists using GitPython. - - Args: - repo_url (str): The URL of the repository to clone. - destination_folder (str): The local path where the repository should be cloned or updated. - - Returns: - None - """ - if os.path.isdir(destination_folder) and os.path.isdir(os.path.join(destination_folder, '.git')): - try: - repo = Repo(destination_folder) - origin = repo.remotes.origin - origin.pull() - print(f"Repository updated successfully in {destination_folder}") - except Exception as e: - print(f"Failed to update repository: {str(e)}") - else: - try: - Repo.clone_from(repo_url, destination_folder) - print(f"Repository cloned successfully into {destination_folder}") - except Exception as e: - print(f"Failed to clone repository: {str(e)}") - - -# Example usage -clone_or_update_repo_with_gitpython('https://github.com/ggerganov/llama.cpp.git', './llama.cpp') diff --git a/src/grag/quantize/utils.py b/src/grag/quantize/utils.py index b4d5337..7338a04 100644 --- a/src/grag/quantize/utils.py +++ b/src/grag/quantize/utils.py @@ -5,7 +5,7 @@ import sys import zipfile from pathlib import Path -from typing import Optional, Union +from typing import Optional, Tuple, Union import requests from git import Repo @@ -17,11 +17,11 @@ def get_llamacpp_repo(repo_url: str = 'https://github.com/ggerganov/llama.cpp.git', destination_folder: Union[str, Path] = './grag-quantize') -> None: - """Clones a GitHub repository to a specified local directory or updates it if it already exists using GitPython. + """Clones a GitHub repository to a specified local directory or updates it if it already exists. The directory is created if it does not exist. If the repository is already cloned, it pulls updates. Args: - repo_url (str): The URL of the repository to clone. - destination_folder (str, Path): The local path where the repository should be cloned or updated. + repo_url: The URL of the repository to clone. + destination_folder: The local path where the repository should be cloned or updated. Returns: None @@ -48,12 +48,12 @@ def get_asset_download_url(asset_name_pattern: str, user: str = 'ggerganov', rep """Fetches the download URL of the first asset that matches a given name pattern in the latest release of the specified repository. Args: - user (str): GitHub username or organization of the repository. - repo (str): Repository name. - asset_name_pattern (str): Substring to match in the asset's name. + asset_name_pattern: Substring to match in the asset's name. 
+ user: GitHub username or organization of the repository. + repo: Repository name. Returns: - str: The download URL of the matching asset, or None if no match is found. + The download URL of the matching asset, or None if no match is found. """ url = f"https://api.github.com/repos/{user}/{repo}/releases/latest" response = requests.get(url) @@ -68,35 +68,41 @@ def get_asset_download_url(asset_name_pattern: str, user: str = 'ggerganov', rep return None -def download_release_asset(download_url: str, target_path: Union[Path, str] = './grag-quantize') -> None: - """Downloads a file from a given URL and saves it to a specified path. +def download_release_asset(download_url: str, root_quantize: Union[Path, str] = './grag-quantize') -> None: + """Downloads a file from a given URL and saves it to a specified path. It also attempts to extract the file if it is a ZIP archive. Args: - download_url (str): The URL of the file to download. - target_path (str, Path): Path where the file will be saved. + download_url: The URL of the file to download. + root_quantize: Path where the file will be saved. + + Returns: + None """ - target_path = Path(target_path) - target_path.mkdir(parents=True, exist_ok=True) + root_quantize = Path(root_quantize) + root_quantize.mkdir(parents=True, exist_ok=True) response = requests.get(download_url, stream=True) if response.status_code == 200: - with open(target_path / 'llamacpp_release.zip', 'wb') as f: + with open(root_quantize / 'llamacpp_release.zip', 'wb') as f: for chunk in response.iter_content(chunk_size=8192): f.write(chunk) - print(f"Downloaded successfully to {target_path}") - with zipfile.ZipFile(target_path / 'llamacpp_release.zip', 'r') as zip_ref: + print(f"Downloaded successfully to {root_quantize}") + with zipfile.ZipFile(root_quantize / 'llamacpp_release.zip', 'r') as zip_ref: # Extract all the contents into the destination directory - zip_ref.extractall(target_path) - print(f"Files extracted to {target_path}") + zip_ref.extractall(root_quantize) + print(f"Files extracted to {root_quantize}") else: print(f"Failed to download file: {response.status_code}") def fetch_model_repo(repo_id: str, model_path: Union[str, Path] = './grag-quantize/models') -> Union[str, Path]: - """Download model from huggingface.co/models. + """Downloads a model from huggingface.co/models to a specified directory. Args: - repo_id (str): Repository ID of the model to download. - model_path (str): The root path where the model should be downloaded or copied. + repo_id: Repository ID of the model to download (e.g., 'huggingface/gpt2'). + model_path: The local directory where the model should be downloaded. + + Returns: + The path to the directory where the model is downloaded. """ model_path = Path(model_path) local_dir = model_path / f"{repo_id.split('/')[1]}" @@ -111,42 +117,41 @@ def fetch_model_repo(repo_id: str, model_path: Union[str, Path] = './grag-quanti return local_dir -def exec_quantize(quantized_model_file: Union[str, Path], cmd: list) -> None: - if not os.path.exists(quantized_model_file): - subprocess.run(cmd, check=True) - else: - print("Quantized model already exists for given quantization, skipping...") - - def quantize_model( model_dir_path: Union[str, Path], quantization: str, - target_path: Union[str, Path] = './grag-quantize', # path with both bulid and llamacpp - output_dir: Optional[Union[str, Path]] = None, -) -> None: - """Quantizes a specified model using a given quantization level. 
+ root_quantize: Union[str, Path] = './grag-quantize', # path with both build and llamacpp + output_dir: Optional[Path] = None, +) -> Tuple[Path, Path]: + """Quantizes a specified model using a given quantization level and saves it to an optional directory. If the output directory is not specified, it defaults to a subdirectory under the provided model directory. The function also handles specific exceptions during the conversion process and ensures the creation of the necessary directories. Args: - output_dir (str, Path, optional): Directory to save quantized model. Defaults to None - model_dir_path (str, Path): The directory path of the model to be quantized. - quantization (str): The quantization level to apply. - root_path (str, Path): The root directory path of the project. + model_dir_path: The directory path of the model to be quantized. This path must exist and contain the model files. + quantization: The quantization level to apply (e.g., 'f32', 'f16'). This affects the precision and size of the model. + root_quantize: The root directory containing the quantization tools and scripts. This directory should have the necessary binary files and scripts for the quantization process. + output_dir: Optional directory to save the quantized model. If not specified, the function uses a default directory based on the model directory path. + + Returns: + Tuple[Path, Path]: Returns a tuple containing the path to the root of the quantization tools and the path to the quantized model file. + + Raises: + PermissionError: If the function lacks permissions to execute the quantization binaries, it will attempt to modify permissions and retry. + TypeError: If there are issues with the provided model directory or quantization parameters. """ - # os.chdir(f"{root_path}/llama.cpp/") model_dir_path = Path(model_dir_path).resolve() if output_dir is None: try: - output_dir = config["llm"]["base_dir"] + output_dir = Path(config["llm"]["base_dir"]) except KeyError: output_dir = Path('.') - output_dir = Path(output_dir) / model_dir_path.name + output_dir = Path(output_dir) / model_dir_path.name if output_dir.stem != model_dir_path.name else output_dir output_dir.mkdir(parents=True, exist_ok=True) output_dir = output_dir.resolve() - target_path = Path(target_path).resolve() - os.chdir(target_path / 'llama.cpp') - convert_script_path = os.path.join(target_path, 'llama.cpp') + root_quantize = Path(root_quantize).resolve() + os.chdir(root_quantize / 'llama.cpp') + convert_script_path = os.path.join(root_quantize, 'llama.cpp') sys.path.append(convert_script_path) from convert import main as convert @@ -165,31 +170,43 @@ def quantize_model( else: print('f32 gguf file already exists, skipping conversion...') - model_file = output_dir / "ggml-model-f32.gguf" quantized_model_file = output_dir / f"ggml-model-{quantization}.gguf" - binary_path = target_path / 'build' / 'bin' / 'quantize' - - cmd = [str(binary_path), str(model_file), str(quantized_model_file), quantization] + if not os.path.exists(quantized_model_file): + converted_model_file = output_dir / "ggml-model-f32.gguf" + binary_path = root_quantize / 'build' / 'bin' / 'quantize' + cmd = [str(binary_path), str(converted_model_file), str(quantized_model_file), quantization] - try: - exec_quantize(quantized_model_file, cmd) - except PermissionError: - os.chmod(binary_path, 0o777) - exec_quantize(quantized_model_file, cmd) + try: + subprocess.run(cmd, check=True) + except PermissionError: + os.chmod(binary_path, 0o777) + subprocess.run(cmd, check=True) + 
print(f"Quantized model present at {output_dir}") + else: + print("Quantized model already exists for given quantization, skipping...") + os.chdir(Path(__file__).parent) # Return to the root path after operation - print(f"Quantized model present at {output_dir}") + return root_quantize, quantized_model_file - os.chdir(Path(__file__).parent) # Return to the root path after operation - return target_path, quantized_model_file +def inference_quantized_model(root_quantize: Union[str, Path], + quantized_model_file: Union[str, Path]) -> subprocess.CompletedProcess: + """Runs inference using a quantized model binary. + Args: + root_quantize: The root directory containing the compiled inference executable. + quantized_model_file: The file path to the quantized model to use for inference. -def inference_quantized_model(target_path: Union[str, Path], quantized_model_file: Union[str, Path]): - main_path = target_path / 'build' / 'bin' / 'main' - run_cmd = [str(main_path), '-m', str(quantized_model_file)] + Returns: + The subprocess.CompletedProcess object containing the inference execution result. + """ + root_quantize = Path(root_quantize) + main_path = root_quantize / 'build' / 'bin' / 'main' + run_cmd = [str(main_path), '-m', str(quantized_model_file), '-ngl', '-1'] try: - subprocess.run(run_cmd, check=True, text=True, capture_output=True) + res = subprocess.run(run_cmd, check=True, text=True, capture_output=True) except PermissionError: os.chmod(main_path, 0o777) - subprocess.run(run_cmd, check=True, text=True, capture_output=True) + res = subprocess.run(run_cmd, check=True, text=True, capture_output=True) print('Inference successfull for this quantized model.') + return res diff --git a/src/tests/quantize/quantize_test.py b/src/tests/quantize/quantize_test.py index 5575cf1..86f2b12 100644 --- a/src/tests/quantize/quantize_test.py +++ b/src/tests/quantize/quantize_test.py @@ -3,12 +3,14 @@ from pathlib import Path import pytest +import requests from grag.quantize.utils import ( get_llamacpp_repo, get_asset_download_url, download_release_asset, fetch_model_repo, - + quantize_model, + inference_quantized_model, ) root_path = Path(__file__).parent / "test_quantization" @@ -16,11 +18,14 @@ shutil.rmtree(root_path) os.makedirs(root_path, exist_ok=True) +repo_id = 'meta-llama/Llama-2-7b-chat' +model = 'Llama-2-7b-chat' +quantization = 'Q4_K_M' asset_pattern_list = ['-macos-x64', '-macos-arm64', '-win-arm64-x64', '-win-arm64-x64', '-ubuntu-x64'] def test_get_llamacpp_repo(): - get_llamacpp_repo(root_path) + get_llamacpp_repo(destination_folder=root_path) repo_path = root_path / "llama.cpp" / ".git" assert os.path.exists(repo_path) @@ -32,25 +37,28 @@ def test_get_asset_download_url(asset_pattern): assert response.status_code == 200 -@pytest.mark.parametrize("asset_pattern", asset_pattern_list) -def test_download_release_asset(asset_pattern): +def test_download_release_asset(): + asset_pattern = '-ubuntu-x64' url = get_asset_download_url(asset_pattern, 'ggerganov', 'llama.cpp') - response = requests.get(url, stream=True) - download_release_asset(response, root_path) + download_release_asset(url, root_path) assert os.path.exists(root_path / 'build' / 'bin' / 'quantize') assert os.path.exists(root_path / 'build' / 'bin' / 'main') def test_fetch_model_repo(): - fetch_model_repo("meta-llama/Llama-2-7b-chat", root_path / 'models') - model_dir_path = root_path / "models" / "Llama-2-7b-chat" - assert os.path.exists(model_dir_path) + local_dir = fetch_model_repo(repo_id, root_path / 'models') + assert 
os.path.exists(local_dir) def test_quantize_model(): - model_dir_path = root_path / "llama.cpp" / "models" / "Llama-2-7b-chat" - quantize_model( - model_dir_path, "Q3_K_M", root_path, output_dir=model_dir_path.parent - ) - gguf_file_path = model_dir_path / "ggml-model-Q3_K_M.gguf" + model_dir_path = root_path / "models" / model + # output_dir = root_path / "models" / "Llama-2-7b-chat" + quantize_model(model_dir_path, quantization, root_path, model_dir_path) + gguf_file_path = model_dir_path / f"ggml-model-{quantization}.gguf" assert os.path.exists(gguf_file_path) + + +def test_inference_quantized_model(): + quantized_model_file = root_path / 'models' / model / f'ggml-model-{quantization}.gguf' + res = inference_quantized_model(root_path, quantized_model_file) + assert isinstance(res.stdout, str) From 118b2ff1102ca03032021884e49b5662fac95d70 Mon Sep 17 00:00:00 2001 From: Sanchit Vijay Date: Wed, 1 May 2024 17:56:10 -0400 Subject: [PATCH 06/25] Update pyproject.toml --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b49db60..bd7aba6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,6 @@ dependencies = [ "accelerate>=0.28.0", "poppler-utils>=0.1.0", "tesseract>=0.1.3", - "GitPython>=4.0", ] [project.optional-dependencies] From 82c21b4cfcb4cc3d977f9261605395882581842d Mon Sep 17 00:00:00 2001 From: Sanchit Vijay Date: Wed, 1 May 2024 18:09:24 -0400 Subject: [PATCH 07/25] Update branch_Jenkinsfile --- ci/branch_Jenkinsfile | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/branch_Jenkinsfile b/ci/branch_Jenkinsfile index 5032e25..473746a 100644 --- a/ci/branch_Jenkinsfile +++ b/ci/branch_Jenkinsfile @@ -70,6 +70,7 @@ pipeline { steps { withPythonEnv(PYTHONPATH){ sh 'pip install mypy' + sh 'mypy --install-types' catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE'){ sh 'python3 -m mypy -p src.grag --junit-xml mypy-report.xml' } From b06280a63b71a49b9f5546b1e842f31caa20c7f5 Mon Sep 17 00:00:00 2001 From: Sanchit Vijay Date: Wed, 1 May 2024 18:19:38 -0400 Subject: [PATCH 08/25] Update branch_Jenkinsfile --- ci/branch_Jenkinsfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ci/branch_Jenkinsfile b/ci/branch_Jenkinsfile index 473746a..2f5e0fb 100644 --- a/ci/branch_Jenkinsfile +++ b/ci/branch_Jenkinsfile @@ -70,9 +70,8 @@ pipeline { steps { withPythonEnv(PYTHONPATH){ sh 'pip install mypy' - sh 'mypy --install-types' catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE'){ - sh 'python3 -m mypy -p src.grag --junit-xml mypy-report.xml' + sh 'python3 -m mypy -p src.grag --install-types --junit-xml mypy-report.xml' } } } From 783919e2595af55dcf0ed800aed2d35e991fc1df Mon Sep 17 00:00:00 2001 From: Sanchit Vijay Date: Wed, 1 May 2024 18:25:59 -0400 Subject: [PATCH 09/25] Update branch_Jenkinsfile --- ci/branch_Jenkinsfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/branch_Jenkinsfile b/ci/branch_Jenkinsfile index 2f5e0fb..34770c7 100644 --- a/ci/branch_Jenkinsfile +++ b/ci/branch_Jenkinsfile @@ -70,8 +70,9 @@ pipeline { steps { withPythonEnv(PYTHONPATH){ sh 'pip install mypy' + sh 'python3 -m pip install types-requests' catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE'){ - sh 'python3 -m mypy -p src.grag --install-types --junit-xml mypy-report.xml' + sh 'python3 -m mypy -p src.grag --junit-xml mypy-report.xml' } } } From afd150eb3f31a5259f15925d1d99f748230dbae2 Mon Sep 17 00:00:00 2001 From: Sanchit Vijay Date: Wed, 1 May 2024 18:59:01 -0400 Subject: [PATCH 10/25] added 
mypy types-requests

---
 ci/Jenkinsfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ci/Jenkinsfile b/ci/Jenkinsfile
index bcce629..3bcc2f6 100644
--- a/ci/Jenkinsfile
+++ b/ci/Jenkinsfile
@@ -70,6 +70,7 @@ pipeline {
       steps {
        withPythonEnv(PYTHONPATH){
         sh 'pip install mypy'
+        sh 'python3 -m pip install types-requests'
         catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE'){
          sh 'python3 -m mypy -p src.grag --junit-xml mypy-report.xml'
         }

From 12cc461c52423f42b8e2128fc0b45dc7bf14dbcc Mon Sep 17 00:00:00 2001
From: Arjun Bingly
Date: Thu, 2 May 2024 14:39:48 -0400
Subject: [PATCH 11/25] Change min python version to 3.10

---
 pyproject.toml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index bd7aba6..e858d0b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ name = "grag"
 dynamic = ["version"]
 description = 'A simple package for implementing RAG'
 readme = "README.md"
-requires-python = ">=3.8"
+requires-python = ">=3.10"
 license = { file = 'LICENSE' }
 keywords = ["RAG", "Retrieval Augmented Generation", "LLM", "retrieval", "quantization"]
 authors = [
@@ -17,8 +17,6 @@ authors = [
 classifiers = [
     "Development Status :: 4 - Beta",
     "Programming Language :: Python",
-    "Programming Language :: Python :: 3.8",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",

From b386cf1c808304927b92a8bdeaee6b2764e32af3 Mon Sep 17 00:00:00 2001
From: sanchitvj
Date: Thu, 2 May 2024 14:58:17 -0400
Subject: [PATCH 12/25] added exception for root path

---
 src/grag/quantize/quantize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/grag/quantize/quantize.py b/src/grag/quantize/quantize.py
index 18db975..feaeb76 100644
--- a/src/grag/quantize/quantize.py
+++ b/src/grag/quantize/quantize.py
@@ -22,7 +22,7 @@
     if user_input == "":
         try:
             root_path = Path(config["quantize"]["llama_cpp_path"])
-        except KeyError:
+        except KeyError or TypeError:
             root_path = Path('./grag-quantize')
     else:
         root_path = Path(user_input)

From 2a8c71cf3195073ff8f501eefdf4bf2d00510b84 Mon Sep 17 00:00:00 2001
From: Arjun Bingly
Date: Thu, 2 May 2024 15:42:35 -0400
Subject: [PATCH 13/25] Syntax error for multiple exceptions

---
 src/grag/quantize/quantize.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/grag/quantize/quantize.py b/src/grag/quantize/quantize.py
index feaeb76..2b41963 100644
--- a/src/grag/quantize/quantize.py
+++ b/src/grag/quantize/quantize.py
@@ -22,8 +22,10 @@
     if user_input == "":
         try:
             root_path = Path(config["quantize"]["llama_cpp_path"])
-        except KeyError or TypeError:
+            print(f'Using {root_path} from config.ini')
+        except (KeyError, TypeError):
             root_path = Path('./grag-quantize')
+            print(f'Using {root_path}, default.')
     else:
         root_path = Path(user_input)
 

From fa6757d4a46acd928a20486054ad9ee950bd31de Mon Sep 17 00:00:00 2001
From: Arjun Bingly
Date: Thu, 2 May 2024 15:42:49 -0400
Subject: [PATCH 14/25] Parse yes/no

---
 src/grag/quantize/quantize.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/grag/quantize/quantize.py b/src/grag/quantize/quantize.py
index 2b41963..03ea440 100644
--- a/src/grag/quantize/quantize.py
+++ b/src/grag/quantize/quantize.py
@@ -51,16 +51,20 @@
     if download_url:
         download_release_asset(download_url, root_path)
 
-    response = input("Do you want us to download the model? 
(y/n) [Enter for yes]: ").strip().lower() - if response == "n": + response = input("Do you want us to download the model? (yes[y]/no[n]) [Enter for yes]: ").strip().lower() + if response == '': + response = 'yes' + if response.lower()[0] == "n": model_dir = Path(input("Enter path to the model directory: ")) - elif response == "y" or response == "": + elif response.lower()[0] == "y": repo_id = input( "Please enter the repo_id for the model (you can check on https://huggingface.co/models): " ).strip() if repo_id == "": raise ValueError("Repo ID you entered was empty. Please enter the repo_id for the model.") model_dir = fetch_model_repo(repo_id, root_path / 'models') + else: + raise ValueError("Please enter either 'yes', 'y' or 'no', 'n'.") quantization = input( "Enter quantization, recommended - Q5_K_M or Q4_K_M for more check https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp#L19 : " From bc9a48db6ffe8f3a85b68a3389159b32b54c15bc Mon Sep 17 00:00:00 2001 From: sanchitvj Date: Thu, 2 May 2024 17:03:36 -0400 Subject: [PATCH 15/25] gated repo exception handling --- src/grag/quantize/utils.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/src/grag/quantize/utils.py b/src/grag/quantize/utils.py index 7338a04..b497be2 100644 --- a/src/grag/quantize/utils.py +++ b/src/grag/quantize/utils.py @@ -10,7 +10,8 @@ import requests from git import Repo from grag.components.utils import get_config -from huggingface_hub import snapshot_download +from huggingface_hub import login, snapshot_download +from huggingface_hub.utils import GatedRepoError config = get_config() @@ -107,12 +108,29 @@ def fetch_model_repo(repo_id: str, model_path: Union[str, Path] = './grag-quanti model_path = Path(model_path) local_dir = model_path / f"{repo_id.split('/')[1]}" local_dir.mkdir(parents=True, exist_ok=True) - snapshot_download( - repo_id=repo_id, - local_dir=local_dir, - local_dir_use_symlinks="auto", - resume_download=True, - ) + + try: + snapshot_download( + repo_id=repo_id, + local_dir=local_dir, + local_dir_use_symlinks="auto", + resume_download=True, + ) + except GatedRepoError: + print( + "This model comes under gated repository. You must be authenticated to download the model. 
For more: https://huggingface.co/docs/hub/en/models-gated") + resp = input("If you have auth token, please provide it here ['n' or enter to exit]: ") + if resp == 'n' or resp == '': + print("No token provided, exiting.") + exit(0) + else: + login(resp) + snapshot_download( + repo_id=repo_id, + local_dir=local_dir, + local_dir_use_symlinks="auto", + resume_download=True, + ) print(f"Model downloaded in {local_dir}") return local_dir From 26037b8621b9a20905ce253f60a2947c762427a3 Mon Sep 17 00:00:00 2001 From: Arjun Bingly Date: Thu, 2 May 2024 17:18:29 -0400 Subject: [PATCH 16/25] Error handling output_dir in quantize_model --- src/grag/quantize/utils.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/grag/quantize/utils.py b/src/grag/quantize/utils.py index b497be2..66dff09 100644 --- a/src/grag/quantize/utils.py +++ b/src/grag/quantize/utils.py @@ -136,10 +136,10 @@ def fetch_model_repo(repo_id: str, model_path: Union[str, Path] = './grag-quanti def quantize_model( - model_dir_path: Union[str, Path], - quantization: str, - root_quantize: Union[str, Path] = './grag-quantize', # path with both build and llamacpp - output_dir: Optional[Path] = None, + model_dir_path: Union[str, Path], + quantization: str, + root_quantize: Union[str, Path] = './grag-quantize', # path with both build and llamacpp + output_dir: Optional[Union[Path, str]] = None, ) -> Tuple[Path, Path]: """Quantizes a specified model using a given quantization level and saves it to an optional directory. If the output directory is not specified, it defaults to a subdirectory under the provided model directory. The function also handles specific exceptions during the conversion process and ensures the creation of the necessary directories. @@ -160,10 +160,11 @@ def quantize_model( if output_dir is None: try: output_dir = Path(config["llm"]["base_dir"]) - except KeyError: + except (KeyError, TypeError): output_dir = Path('.') - - output_dir = Path(output_dir) / model_dir_path.name if output_dir.stem != model_dir_path.name else output_dir + else: + output_dir = Path(output_dir) + output_dir = output_dir / model_dir_path.name if output_dir.stem != model_dir_path.name else output_dir output_dir.mkdir(parents=True, exist_ok=True) output_dir = output_dir.resolve() From bb6d0c83aeff2d376270bf63dd96a81fc9acb408 Mon Sep 17 00:00:00 2001 From: Arjun Bingly Date: Thu, 2 May 2024 17:27:59 -0400 Subject: [PATCH 17/25] Huggingface-cli login response handling --- src/grag/quantize/utils.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/grag/quantize/utils.py b/src/grag/quantize/utils.py index 66dff09..1dda7da 100644 --- a/src/grag/quantize/utils.py +++ b/src/grag/quantize/utils.py @@ -119,18 +119,20 @@ def fetch_model_repo(repo_id: str, model_path: Union[str, Path] = './grag-quanti except GatedRepoError: print( "This model comes under gated repository. You must be authenticated to download the model. For more: https://huggingface.co/docs/hub/en/models-gated") - resp = input("If you have auth token, please provide it here ['n' or enter to exit]: ") - if resp == 'n' or resp == '': - print("No token provided, exiting.") + resp = input("You will be redirected to hugginface-cli to login. 
+        if resp.lower() == "n":
+            print("User exited.")
             exit(0)
-        else:
-            login(resp)
+        elif resp == "":
+            login()
             snapshot_download(
                 repo_id=repo_id,
                 local_dir=local_dir,
                 local_dir_use_symlinks="auto",
                 resume_download=True,
             )
+        else:
+            raise ValueError('Invalid response received.')

     print(f"Model downloaded in {local_dir}")
     return local_dir

From 5847a3afd9c1d7da0151f8a58247c13d8f2edb7c Mon Sep 17 00:00:00 2001
From: Arjun Bingly
Date: Thu, 2 May 2024 23:27:05 -0400
Subject: [PATCH 18/25] HuggingFace url resolver

---
 src/grag/quantize/quantize.py | 11 +++++++++--
 src/grag/quantize/utils.py    |  9 +++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/grag/quantize/quantize.py b/src/grag/quantize/quantize.py
index 03ea440..e8d9a98 100644
--- a/src/grag/quantize/quantize.py
+++ b/src/grag/quantize/quantize.py
@@ -11,6 +11,7 @@
     get_llamacpp_repo,
     inference_quantized_model,
     quantize_model,
+    repo_id_resolver,
 )

 config = get_config()
@@ -58,10 +59,11 @@
         model_dir = Path(input("Enter path to the model directory: "))
     elif response.lower()[0] == "y":
         repo_id = input(
-            "Please enter the repo_id for the model (you can check on https://huggingface.co/models): "
+            "Please enter the repo_id or the URL for the model (you can check on https://huggingface.co/models): "
         ).strip()
         if repo_id == "":
             raise ValueError("Repo ID you entered was empty. Please enter the repo_id for the model.")
+        repo_id = repo_id_resolver(repo_id)
         model_dir = fetch_model_repo(repo_id, root_path / 'models')
     else:
         raise ValueError("Please enter either 'yes', 'y' or 'no', 'n'.")
@@ -76,4 +78,9 @@
     inference = input(
         "Do you want to inference the quantized model to check if quantization is successful? Warning: It takes time as it inferences on CPU. (y/n) [Enter for yes]: ").strip().lower()

-    inference_quantized_model(target_path, quantized_model_file) if inference == "y" or inference == "" else None
+    if inference == '':
+        inference = 'yes'
+    if inference.lower()[0] == "y":
+        inference_quantized_model(target_path, quantized_model_file)
+    else:
+        print("Model quantized, but not tested.")

diff --git a/src/grag/quantize/utils.py b/src/grag/quantize/utils.py
index 1dda7da..d9a65c9 100644
--- a/src/grag/quantize/utils.py
+++ b/src/grag/quantize/utils.py
@@ -95,6 +95,15 @@ def download_release_asset(download_url: str, root_quantize: Union[Path, str] =
     print(f"Failed to download file: {response.status_code}")


+def repo_id_resolver(repo_url: str) -> str:
+    """Resolves the HuggingFace repository ID given a URL."""
+    repo_url = repo_url.rstrip(' ')
+    repo_url = repo_url.lstrip(' ')
+    repo_url = repo_url.rstrip('/')
+    repo_lst = repo_url.split('/')
+    return f'{repo_lst[-2]}/{repo_lst[-1]}'
+
+
 def fetch_model_repo(repo_id: str, model_path: Union[str, Path] = './grag-quantize/models') -> Union[str, Path]:
     """Downloads a model from huggingface.co/models to a specified directory.

From 305801c049f6a6c9f02c53ca269c861f292c3a49 Mon Sep 17 00:00:00 2001
From: sanchitvj
Date: Sat, 4 May 2024 16:33:39 -0400
Subject: [PATCH 19/25] modified url resolver

Corrected path issues, type checked, ruff formatted.
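
For illustration only (not part of the committed code), a sketch of the behavior
the resolver is meant to have after this change, matching the test added later in
the series; the meta-llama repo is just an example value:

    from grag.quantize.utils import repo_id_resolver

    repo_id_resolver('https://huggingface.co/meta-llama/Llama-2-7b-chat')
    # -> 'meta-llama/Llama-2-7b-chat'

    repo_id_resolver('meta-llama/Llama-2-7b-chat')
    # -> 'meta-llama/Llama-2-7b-chat' (non-URL input is returned unchanged)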
---
 src/grag/quantize/quantize.py |  7 +++--
 src/grag/quantize/utils.py    | 55 ++++++++++++++++++++++++++---------
 2 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/src/grag/quantize/quantize.py b/src/grag/quantize/quantize.py
index e8d9a98..702908b 100644
--- a/src/grag/quantize/quantize.py
+++ b/src/grag/quantize/quantize.py
@@ -1,6 +1,7 @@
 """Interactive file for quantizing models."""

 import platform
+import sys
 from pathlib import Path

 from grag.components.utils import get_config
@@ -68,11 +69,13 @@
     else:
         raise ValueError("Please enter either 'yes', 'y' or 'no', 'n'.")

+    sys.stdin.flush()
+
+    output_dir = input(
+        f"Enter path where you want to save the quantized model, else the following path will be used [{model_dir}]: ").strip()
     quantization = input(
         "Enter quantization, recommended - Q5_K_M or Q4_K_M for more check https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp#L19 : "
     ).strip()
-    output_dir = input(
-        f"Enter path where you want to save the quantized model, else the following path will be used [{model_dir}]: ").strip()

     target_path, quantized_model_file = quantize_model(model_dir, quantization, root_path, output_dir)

diff --git a/src/grag/quantize/utils.py b/src/grag/quantize/utils.py
index d9a65c9..fe108cb 100644
--- a/src/grag/quantize/utils.py
+++ b/src/grag/quantize/utils.py
@@ -96,12 +96,37 @@ def download_release_asset(download_url: str, root_quantize: Union[Path, str] =

 def repo_id_resolver(repo_url: str) -> str:
-    """Resolves the HuggingFace repository ID given a URL."""
-    repo_url = repo_url.rstrip(' ')
-    repo_url = repo_url.lstrip(' ')
-    repo_url = repo_url.rstrip('/')
-    repo_lst = repo_url.split('/')
-    return f'{repo_lst[-2]}/{repo_lst[-1]}'
+    """Resolves the HuggingFace repository ID given a full URL to a model or dataset page.
+
+    This function parses a HuggingFace URL to extract the repository ID, which typically
+    consists of a user or organization name followed by the repository name. If the URL
+    does not start with the expected HuggingFace URL prefix, it returns the input URL unchanged.
+
+    Args:
+        repo_url: The full URL string pointing to a specific HuggingFace repository.
+
+    Returns:
+        The repository ID in the format 'username/repository_name' if the URL is valid,
+        otherwise returns the original URL.
+
+    Examples:
+        Input: "https://huggingface.co/meta-llama/Llama-2-7b-chat"
+        Output: "meta-llama/Llama-2-7b-chat"
+
+        Input: "https://huggingface.co/facebook/bart-large"
+        Output: "facebook/bart-large"
+
+        Input: "some_other_url"
+        Output: "some_other_url"
+    """
+    if repo_url.startswith('https://huggingface'):
+        repo_url = repo_url.rstrip(' ')
+        repo_url = repo_url.lstrip(' ')
+        repo_url = repo_url.rstrip('/')
+        repo_lst = repo_url.split('/')
+        return f'{repo_lst[-2]}/{repo_lst[-1]}'
+    else:
+        return repo_url


 def fetch_model_repo(repo_id: str, model_path: Union[str, Path] = './grag-quantize/models') -> Union[str, Path]:
@@ -128,7 +153,8 @@ def fetch_model_repo(repo_id: str, model_path: Union[str, Path] = './grag-quanti
     except GatedRepoError:
         print(
             "This model is in a gated repository. You must be authenticated to download the model. For more: https://huggingface.co/docs/hub/en/models-gated")
-        resp = input("You will be redirected to huggingface-cli to login. [To exit, enter 'n']: ")
+        resp = input(
+            "You will be redirected to huggingface-cli to login. If you don't have a token, check the above link; otherwise paste the token when prompted. 
[To exit, enter 'n']: ") if resp.lower() == "n": print("User exited.") exit(0) @@ -147,10 +173,10 @@ def fetch_model_repo(repo_id: str, model_path: Union[str, Path] = './grag-quanti def quantize_model( - model_dir_path: Union[str, Path], - quantization: str, - root_quantize: Union[str, Path] = './grag-quantize', # path with both build and llamacpp - output_dir: Optional[Union[Path, str]] = None, + model_dir_path: Union[str, Path], + quantization: str, + root_quantize: Union[str, Path] = './grag-quantize', # path with both build and llamacpp + output_dir: Optional[Union[Path, str]] = None, ) -> Tuple[Path, Path]: """Quantizes a specified model using a given quantization level and saves it to an optional directory. If the output directory is not specified, it defaults to a subdirectory under the provided model directory. The function also handles specific exceptions during the conversion process and ensures the creation of the necessary directories. @@ -168,14 +194,15 @@ def quantize_model( TypeError: If there are issues with the provided model directory or quantization parameters. """ model_dir_path = Path(model_dir_path).resolve() - if output_dir is None: + if output_dir == '' or output_dir is None: try: output_dir = Path(config["llm"]["base_dir"]) except (KeyError, TypeError): - output_dir = Path('.') + output_dir = model_dir_path else: output_dir = Path(output_dir) - output_dir = output_dir / model_dir_path.name if output_dir.stem != model_dir_path.name else output_dir + + output_dir = output_dir / model_dir_path.name if output_dir.name != model_dir_path.name else output_dir output_dir.mkdir(parents=True, exist_ok=True) output_dir = output_dir.resolve() From c4bc51e724fcb55fe5d551046923c333b6ce263a Mon Sep 17 00:00:00 2001 From: sanchitvj Date: Sat, 4 May 2024 16:58:34 -0400 Subject: [PATCH 20/25] quantize test passed --- src/tests/quantize/quantize_test.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/tests/quantize/quantize_test.py b/src/tests/quantize/quantize_test.py index 86f2b12..574bada 100644 --- a/src/tests/quantize/quantize_test.py +++ b/src/tests/quantize/quantize_test.py @@ -8,6 +8,7 @@ get_llamacpp_repo, get_asset_download_url, download_release_asset, + repo_id_resolver, fetch_model_repo, quantize_model, inference_quantized_model, @@ -19,8 +20,9 @@ os.makedirs(root_path, exist_ok=True) repo_id = 'meta-llama/Llama-2-7b-chat' +repo_url = 'https://huggingface.co/meta-llama/Llama-2-7b-chat' model = 'Llama-2-7b-chat' -quantization = 'Q4_K_M' +quantization = 'Q2_K' asset_pattern_list = ['-macos-x64', '-macos-arm64', '-win-arm64-x64', '-win-arm64-x64', '-ubuntu-x64'] @@ -45,6 +47,11 @@ def test_download_release_asset(): assert os.path.exists(root_path / 'build' / 'bin' / 'main') +def test_repo_id_resolver(): + repo_id_ = repo_id_resolver(repo_url) + assert repo_id == repo_id_ + + def test_fetch_model_repo(): local_dir = fetch_model_repo(repo_id, root_path / 'models') assert os.path.exists(local_dir) From 89ca88c9b0ba0cb10507a40d97a1799afdf29573 Mon Sep 17 00:00:00 2001 From: sanchitvj Date: Tue, 7 May 2024 17:09:36 -0400 Subject: [PATCH 21/25] lower cased the system and arch --- src/grag/quantize/quantize.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/grag/quantize/quantize.py b/src/grag/quantize/quantize.py index 702908b..246d5f0 100644 --- a/src/grag/quantize/quantize.py +++ b/src/grag/quantize/quantize.py @@ -32,19 +32,19 @@ root_path = Path(user_input) get_llamacpp_repo(destination_folder=root_path) - 
os_name = platform.system()
-    architecture = platform.machine()
+    os_name = str(platform.system()).lower()
+    architecture = str(platform.machine()).lower()
     asset_name_pattern = 'bin'
     match os_name, architecture:
-        case ('Darwin', 'x86_64'):
+        case ('darwin', 'x86_64'):
             asset_name_pattern += '-macos-x64'
-        case ('Darwin', 'arm64'):
+        case ('darwin', 'arm64'):
             asset_name_pattern += '-macos-arm64'
-        case ('Windows', 'x86_64'):
+        case ('windows', 'x86_64'):
             asset_name_pattern += '-win-arm64-x64'
-        case ('Windows', 'arm64'):
+        case ('windows', 'arm64'):
             asset_name_pattern += '-win-arm64-x64'
-        case ('Linux', 'x86_64'):
+        case ('linux', 'x86_64'):
             asset_name_pattern += '-ubuntu-x64'
         case _:
             raise ValueError(f"{os_name=}, {architecture=} is not supported by llama.cpp releases.")

From 9ab317affbe825b1a1092547605f810092f0a0da Mon Sep 17 00:00:00 2001
From: sanchitvj
Date: Tue, 7 May 2024 17:20:27 -0400
Subject: [PATCH 22/25] support for AMD architecture

---
 src/grag/quantize/quantize.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/grag/quantize/quantize.py b/src/grag/quantize/quantize.py
index 246d5f0..ec1e3ca 100644
--- a/src/grag/quantize/quantize.py
+++ b/src/grag/quantize/quantize.py
@@ -44,6 +44,8 @@
             asset_name_pattern += '-win-arm64-x64'
         case ('windows', 'arm64'):
             asset_name_pattern += '-win-arm64-x64'
+        case ('windows', 'amd64'):
+            asset_name_pattern += '-win-arm64-x64'
         case ('linux', 'x86_64'):
             asset_name_pattern += '-ubuntu-x64'
         case _:
             raise ValueError(f"{os_name=}, {architecture=} is not supported by llama.cpp releases.")

From 688bdd1954cc9a39dfcd43c57bbc8b8713ea4013 Mon Sep 17 00:00:00 2001
From: Sanchit Vijay
Date: Tue, 7 May 2024 17:31:28 -0400
Subject: [PATCH 23/25] Update get_started.llms.rst

---
 src/docs/get_started.llms.rst | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/docs/get_started.llms.rst b/src/docs/get_started.llms.rst
index 53f0f21..bfde5ad 100644
--- a/src/docs/get_started.llms.rst
+++ b/src/docs/get_started.llms.rst
@@ -30,8 +30,12 @@ After running the above command, user will be prompted with the following:

 2. Input the **model path**:

-* If user wants to download a model from `HuggingFace <https://huggingface.co/models>`_, the user should provide the repository path from HuggingFace.
+* If the user wants to download a model from `HuggingFace <https://huggingface.co/models>`_, they should provide the repository path or URL from HuggingFace.
 * If the user has the model downloaded locally, they will be instructed to copy the model and input the name of the model directory.

-3. Finally, the user will be prompted to enter **quantization** settings (recommended Q5_K_M or Q4_K_M, etc.). For more details, check `llama.cpp/examples/quantize/quantize.cpp <https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp>`_.
+3. The user will be asked where to save the quantized model; otherwise, it will be saved in the directory where the model repository was downloaded.
+
+4. Finally, the user will be prompted to enter **quantization** settings (recommended Q5_K_M or Q4_K_M, etc.). For more details, check `llama.cpp/examples/quantize/quantize.cpp <https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp>`_.
+
+5. Optionally, the user can run inference on the quantized model at the next prompt. This inference runs on the CPU, so it can take a while if the model is large.
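
The interactive flow documented above can also be driven programmatically with the
utilities these patches build up. A minimal sketch, assuming the llama.cpp sources
and release binaries have already been fetched under root_path (the interactive
script does this via get_llamacpp_repo and download_release_asset); the repo URL
and quantization level are example values, not prescribed ones:

    from pathlib import Path

    from grag.quantize.utils import (
        fetch_model_repo,
        inference_quantized_model,
        quantize_model,
        repo_id_resolver,
    )

    root_path = Path('./grag-quantize')  # assumed root holding the llama.cpp checkout and build

    # Accepts a full HuggingFace URL or a bare repo_id
    repo_id = repo_id_resolver('https://huggingface.co/meta-llama/Llama-2-7b-chat')

    # Downloads the model under root_path/models and returns the local directory
    model_dir = fetch_model_repo(repo_id, root_path / 'models')

    # Converts the model to GGUF and quantizes it; returns the output directory
    # and the path of the quantized .gguf file
    target_path, quantized_model_file = quantize_model(model_dir, 'Q5_K_M', root_path)

    # Optional CPU-only smoke test of the quantized model
    inference_quantized_model(target_path, quantized_model_file)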
From 5a570f6690ccf5bd12b300968d4b638b5df0f84f Mon Sep 17 00:00:00 2001
From: sanchitvj
Date: Tue, 7 May 2024 18:19:48 -0400
Subject: [PATCH 24/25] quantize compatible for windows

---
 src/grag/quantize/utils.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/grag/quantize/utils.py b/src/grag/quantize/utils.py
index fe108cb..2a7ebe9 100644
--- a/src/grag/quantize/utils.py
+++ b/src/grag/quantize/utils.py
@@ -1,6 +1,7 @@
 """Utility functions for quantization."""

 import os
+import platform
 import subprocess
 import sys
 import zipfile
@@ -230,7 +231,11 @@ def quantize_model(
     quantized_model_file = output_dir / f"ggml-model-{quantization}.gguf"
     if not os.path.exists(quantized_model_file):
         converted_model_file = output_dir / "ggml-model-f32.gguf"
-        binary_path = root_quantize / 'build' / 'bin' / 'quantize'
+        os_name = str(platform.system()).lower()
+        if os_name == 'windows':
+            binary_path = root_quantize / 'quantize.exe'
+        else:
+            binary_path = root_quantize / 'build' / 'bin' / 'quantize'
         cmd = [str(binary_path), str(converted_model_file), str(quantized_model_file), quantization]

         try:

From 78f1d9b3a322181375bdd0e9675eaccb189527e9 Mon Sep 17 00:00:00 2001
From: Sanchit Vijay
Date: Thu, 9 May 2024 15:02:09 -0400
Subject: [PATCH 25/25] Update get_started.llms.rst

---
 src/docs/get_started.llms.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/docs/get_started.llms.rst b/src/docs/get_started.llms.rst
index bfde5ad..8ecfe3f 100644
--- a/src/docs/get_started.llms.rst
+++ b/src/docs/get_started.llms.rst
@@ -39,3 +39,5 @@ After running the above command, user will be prompted with the following:
 4. Finally, the user will be prompted to enter **quantization** settings (recommended Q5_K_M or Q4_K_M, etc.). For more details, check `llama.cpp/examples/quantize/quantize.cpp <https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp>`_.

 5. Optionally, the user can run inference on the quantized model at the next prompt. This inference runs on the CPU, so it can take a while if the model is large.
+
+Note: Windows users have to use WSL and follow the Linux guidelines for quantizing models.
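
Since the quantize binary now lives in a different place per platform, a reader may
want to fail fast when it is missing before starting a long conversion. A
hypothetical helper (the function name and error message are not from the patches)
mirroring the lookup added in PATCH 24/25:

    import platform
    from pathlib import Path

    def find_quantize_binary(root_quantize: Path) -> Path:
        """Locate the llama.cpp quantize binary the way quantize_model does."""
        if str(platform.system()).lower() == 'windows':
            binary = root_quantize / 'quantize.exe'
        else:
            binary = root_quantize / 'build' / 'bin' / 'quantize'
        if not binary.exists():
            raise FileNotFoundError(
                f"quantize binary not found at {binary}; on Windows, consider WSL "
                "and the Linux build, as the docs note."
            )
        return binary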