Updated quantization to support more OS and architectures #118

Closed
wants to merge 30 commits
Commits (30)
- 60b37c1: test status updated (Apr 24, 2024)
- 68db81a: Merge branch 'main' of https://github.com/arjbingly/Capstone_5 (sanchitvj, Apr 26, 2024)
- 81c8dc4: Merge branch 'main' of https://github.com/arjbingly/Capstone_5 (sanchitvj, Apr 26, 2024)
- 08bd61f: Merge branch 'main' of https://github.com/arjbingly/Capstone_5 (sanchitvj, Apr 29, 2024)
- 9b1af78: quantization modified (sanchitvj, Apr 30, 2024)
- f8923a1: added inference in quantization (sanchitvj, Apr 30, 2024)
- f7a0fb8: Merge remote-tracking branch 'origin/main' into quantize (sanchitvj, May 1, 2024)
- cf05b50: added some tests (sanchitvj, May 1, 2024)
- a235393: ruff & type checked, all tests passed (sanchitvj, May 1, 2024)
- 118b2ff: Update pyproject.toml (sanchitvj, May 1, 2024)
- 82c21b4: Update branch_Jenkinsfile (sanchitvj, May 1, 2024)
- b06280a: Update branch_Jenkinsfile (sanchitvj, May 1, 2024)
- 783919e: Update branch_Jenkinsfile (sanchitvj, May 1, 2024)
- afd150e: added mypy type-requests (sanchitvj, May 1, 2024)
- 12cc461: Change min python version to 3.10 (arjbingly, May 2, 2024)
- b386cf1: added exception for root path (sanchitvj, May 2, 2024)
- 2a8c71c: Syntax error for multiple exceptions (arjbingly, May 2, 2024)
- fa6757d: Parse yes/no (arjbingly, May 2, 2024)
- bc9a48d: gated repo exception handling (sanchitvj, May 2, 2024)
- 26037b8: Error handling output_dir in quantize_model (arjbingly, May 2, 2024)
- bb6d0c8: Huggingface-cli login response handling (arjbingly, May 2, 2024)
- 5847a3a: HuggingFace url resolver (arjbingly, May 3, 2024)
- 305801c: modified url resolver (sanchitvj, May 4, 2024)
- c4bc51e: quantize test passed (sanchitvj, May 4, 2024)
- 89ca88c: lower cased the system and arch (sanchitvj, May 7, 2024)
- 9ab317a: support for AMD architecture (sanchitvj, May 7, 2024)
- 688bdd1: Update get_started.llms.rst (sanchitvj, May 7, 2024)
- 5a570f6: quantize compatible for windows (sanchitvj, May 7, 2024)
- 6f8bc90: Merge branch 'quantize' of https://github.com/arjbingly/Capstone_5 in… (sanchitvj, May 7, 2024)
- 78f1d9b: Update get_started.llms.rst (sanchitvj, May 9, 2024)

Files changed
1 change: 1 addition & 0 deletions ci/Jenkinsfile
@@ -70,6 +70,7 @@ pipeline {
steps {
withPythonEnv(PYTHONPATH){
sh 'pip install mypy'
sh 'python3 -m pip install types-requests'
catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE'){
sh 'python3 -m mypy -p src.grag --junit-xml mypy-report.xml'
}
1 change: 1 addition & 0 deletions ci/branch_Jenkinsfile
@@ -70,6 +70,7 @@ pipeline {
steps {
withPythonEnv(PYTHONPATH){
sh 'pip install mypy'
sh 'python3 -m pip install types-requests'
catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE'){
sh 'python3 -m mypy -p src.grag --junit-xml mypy-report.xml'
}
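Both Jenkins pipelines now install the `types-requests` stub package before the mypy run. For context, here is a minimal sketch of the kind of code that makes the stubs necessary; `fetch_url` is a hypothetical helper, not something added by this PR. Without `types-requests` in the same environment as mypy, the `requests` import is treated as untyped and mypy suggests installing the stubs, which would pollute the `--junit-xml` report used by the pipeline.

```python
# Hypothetical example: code like this is only fully type-checked by mypy
# when the types-requests stub package is installed alongside requests.
import requests


def fetch_url(url: str, timeout: float = 10.0) -> str:
    """Return the body of `url`, raising on HTTP errors."""
    response: requests.Response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text
```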
6 changes: 3 additions & 3 deletions config.ini
@@ -12,7 +12,7 @@ n_ctx : 6000
n_gpu_layers : -1
# The number of layers to put on the GPU. Mixtral-18, gemma-20
std_out : True
base_dir : ${root:root_path}/models
;base_dir : ${root:root_path}/models

[chroma_client]
host : localhost
@@ -64,5 +64,5 @@ env_path : ${root:root_path}/.env
[root]
root_path : /home/ubuntu/volume_2k/Capstone_5

[quantize]
llama_cpp_path : ${root:root_path}
;[quantize]
;llama_cpp_path : ${root:root_path}
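With the `[quantize]` section commented out, `config['quantize']['llama_cpp_path']` is no longer guaranteed to exist, which is why the rewritten quantize script further down wraps the lookup in a try/except. A minimal sketch of that fallback, assuming the project's `get_config` is backed by `configparser` with `ExtendedInterpolation` (an assumption; only the try/except logic mirrors the diff in quantize.py):

```python
from configparser import ConfigParser, ExtendedInterpolation
from pathlib import Path

# Assumption: get_config() returns a ConfigParser-like object. With [quantize]
# commented out in config.ini, the lookup below raises KeyError.
config = ConfigParser(interpolation=ExtendedInterpolation())
config.read("config.ini")

try:
    root_path = Path(config["quantize"]["llama_cpp_path"])
    print(f"Using {root_path} from config.ini")
except (KeyError, TypeError):
    # Mirrors the default path added in quantize.py when the section is absent.
    root_path = Path("./grag-quantize")
    print(f"Using {root_path}, default.")
```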
6 changes: 2 additions & 4 deletions pyproject.toml
@@ -7,7 +7,7 @@ name = "grag"
dynamic = ["version"]
description = 'A simple package for implementing RAG'
readme = "README.md"
requires-python = ">=3.8"
requires-python = ">=3.10"
license = { file = 'LICENSE' }
keywords = ["RAG", "Retrieval Augmented Generation", "LLM", "retrieval", "quantization"]
authors = [
@@ -17,8 +17,6 @@ authors = [
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
@@ -46,7 +44,7 @@ dependencies = [
"bitsandbytes>=0.42.0",
"accelerate>=0.28.0",
"poppler-utils>=0.1.0",
"tesseract>=0.1.3"
"tesseract>=0.1.3",
]

[project.optional-dependencies]
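Raising `requires-python` to 3.10 and dropping the 3.8/3.9 classifiers is consistent with the rewritten quantize script below, which selects a llama.cpp release asset with a `match` statement over `(os_name, architecture)`; structural pattern matching only exists from Python 3.10 onward. A minimal sketch of the construct, trimmed to two of the cases used in the diff:

```python
import platform

# match/case over a (system, machine) tuple requires Python >= 3.10.
os_name = platform.system().lower()
architecture = platform.machine().lower()

match os_name, architecture:
    case ("linux", "x86_64"):
        asset_suffix = "-ubuntu-x64"
    case ("darwin", "arm64"):
        asset_suffix = "-macos-arm64"
    case _:
        raise ValueError(f"{os_name=}, {architecture=} is not supported.")
```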
10 changes: 8 additions & 2 deletions src/docs/get_started.llms.rst
@@ -30,8 +30,14 @@ After running the above command, user will be prompted with the following:

2. Input the **model path**:

* If user wants to download a model from `HuggingFace <https://huggingface.co/models>`_, the user should provide the repository path from HuggingFace.
* If the user wants to download a model from `HuggingFace <https://huggingface.co/models>`_, they should provide the repository path or URL from HuggingFace.

* If the user has the model downloaded locally, then user will be instructed to copy the model and input the name of the model directory.

3. Finally, the user will be prompted to enter **quantization** settings (recommended Q5_K_M or Q4_K_M, etc.). For more details, check `llama.cpp/examples/quantize/quantize.cpp <https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp#L19>`_.
3. The user will be asked where to save the quantized model; otherwise it will be placed in the directory where the model repository was downloaded.

4. Finally, the user will be prompted to enter **quantization** settings (recommended Q5_K_M or Q4_K_M, etc.). For more details, check `llama.cpp/examples/quantize/quantize.cpp <https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp#L19>`_.

5. Optionally, the user can run inference on the quantized model at the next prompt. This inference runs on the CPU, so it can take time for large models.

Note: Windows users have to use WSL and follow the Linux guidelines for quantizing models.
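The docs stop at "inference on CPU" without showing what that looks like outside the interactive script. As a rough sketch only: CPU inference on the quantized GGUF file produced by llama.cpp can be done with the llama-cpp-python bindings. The model path and prompt below are placeholders, and this is not the `inference_quantized_model` helper used by the script, whose implementation is not shown in this diff.

```python
from llama_cpp import Llama

# Placeholder path: point this at the quantized .gguf file produced above.
# n_gpu_layers=0 keeps inference on the CPU, matching the behaviour described in the docs.
llm = Llama(model_path="./grag-quantize/models/model-Q5_K_M.gguf", n_ctx=512, n_gpu_layers=0)

output = llm("Q: What is retrieval augmented generation? A:", max_tokens=64)
print(output["choices"][0]["text"])
```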
2 changes: 1 addition & 1 deletion src/grag/components/llm.py
@@ -50,7 +50,7 @@ def __init__(
device_map: str = "auto",
task: str = "text-generation",
max_new_tokens: str = "1024",
temperature: Union[str, int] = 0.1,
temperature: Union[str, float] = 0.1,
n_batch: Union[str, int] = 1024,
n_ctx: Union[str, int] = 6000,
n_gpu_layers: Union[str, int] = -1,
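The only change here widens `temperature` from `Union[str, int]` to `Union[str, float]`, so that the default of `0.1` actually matches the declared type; under the mypy stage run in CI, a float default on a parameter annotated `Union[str, int]` is flagged as an incompatible default. A minimal standalone sketch (not the full `__init__` signature):

```python
from typing import Union


def bad(temperature: Union[str, int] = 0.1) -> None:
    # mypy flags this default: 0.1 is a float, which is neither str nor int.
    ...


def good(temperature: Union[str, float] = 0.1) -> None:
    # Accepted: the default value now matches the annotation.
    ...
```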
95 changes: 67 additions & 28 deletions src/grag/quantize/quantize.py
@@ -1,52 +1,91 @@
"""Interactive file for quantizing models."""

import platform
import sys
from pathlib import Path

from grag.components.utils import get_config
from grag.quantize.utils import (
building_llamacpp,
download_release_asset,
fetch_model_repo,
get_asset_download_url,
get_llamacpp_repo,
inference_quantized_model,
quantize_model,
repo_id_resolver,
)

config = get_config()
root_path = Path(config["quantize"]["llama_cpp_path"])

if __name__ == "__main__":
user_input = input(
"Enter the path to the llama_cpp cloned repo, or where you'd like to clone it. Press Enter to use the default config path: "
).strip()
"Enter the path which you want to download all the source files. Press Enter to use the default path: ").strip()

if user_input != "":
if user_input == "":
try:
root_path = Path(config["quantize"]["llama_cpp_path"])
print(f'Using {root_path} from config.ini')
except (KeyError, TypeError):
root_path = Path('./grag-quantize')
print(f'Using {root_path}, default.')
else:
root_path = Path(user_input)

res = get_llamacpp_repo(root_path)
get_llamacpp_repo(destination_folder=root_path)
os_name = str(platform.system()).lower()
architecture = str(platform.machine()).lower()
asset_name_pattern = 'bin'
match os_name, architecture:
case ('darwin', 'x86_64'):
asset_name_pattern += '-macos-x64'
case ('darwin', 'arm64'):
asset_name_pattern += '-macos-arm64'
case ('windows', 'x86_64'):
asset_name_pattern += '-win-arm64-x64'
case ('windows', 'arm64'):
asset_name_pattern += '-win-arm64-x64'
case ('windows', 'amd64'):
asset_name_pattern += '-win-arm64-x64'
case ('linux', 'x86_64'):
asset_name_pattern += '-ubuntu-x64'
case _:
raise ValueError(f"{os_name=}, {architecture=} is not supported by llama.cpp releases.")

if "Already up to date." in str(res.stdout):
print("Repository is already up to date. Skipping build.")
else:
print("Updates found. Starting build...")
building_llamacpp(root_path)

response = (
input("Do you want us to download the model? (y/n) [Enter for yes]: ")
.strip()
.lower()
)
if response == "n":
print("Please copy the model folder to 'llama.cpp/models/' folder.")
_ = input("Enter if you have already copied the model:")
model_dir = Path(input("Enter the model directory name: "))
elif response == "y" or response == "":
download_url = get_asset_download_url(asset_name_pattern)
if download_url:
download_release_asset(download_url, root_path)

response = input("Do you want us to download the model? (yes[y]/no[n]) [Enter for yes]: ").strip().lower()
if response == '':
response = 'yes'
if response.lower()[0] == "n":
model_dir = Path(input("Enter path to the model directory: "))
elif response.lower()[0] == "y":
repo_id = input(
"Please enter the repo_id for the model (you can check on https://huggingface.co/models): "
"Please enter the repo_id or the url for the model (you can check on https://huggingface.co/models): "
).strip()
fetch_model_repo(repo_id, root_path)
# model_dir = repo_id.split('/')[1]
model_dir = root_path / "llama.cpp" / "models" / repo_id.split("/")[1]
if repo_id == "":
raise ValueError("Repo ID you entered was empty. Please enter the repo_id for the model.")
repo_id = repo_id_resolver(repo_id)
model_dir = fetch_model_repo(repo_id, root_path / 'models')
else:
raise ValueError("Please enter either 'yes', 'y' or 'no', 'n'.")

sys.stdin.flush()

output_dir = input(
f"Enter path where you want to save the quantized model, else the following path will be used [{model_dir}]: ").strip()
quantization = input(
"Enter quantization, recommended - Q5_K_M or Q4_K_M for more check https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp#L19 : "
)
quantize_model(model_dir, quantization, root_path)
).strip()

target_path, quantized_model_file = quantize_model(model_dir, quantization, root_path, output_dir)

inference = input(
"Do you want to inference the quantized model to check if quantization is successful? Warning: It takes time as it inferences on CPU. (y/n) [Enter for yes]: ").strip().lower()
if inference == '':
inference = 'yes'
if inference.lower()[0] == "y":
inference_quantized_model(target_path, quantized_model_file)
else:
print("Model quantized, but not tested.")