diff --git a/ci/Jenkinsfile b/ci/Jenkinsfile
index bcce629..3bcc2f6 100644
--- a/ci/Jenkinsfile
+++ b/ci/Jenkinsfile
@@ -70,6 +70,7 @@ pipeline {
             steps {
                 withPythonEnv(PYTHONPATH){
                     sh 'pip install mypy'
+                    sh 'python3 -m pip install types-requests'
                     catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE'){
                         sh 'python3 -m mypy -p src.grag --junit-xml mypy-report.xml'
                     }
diff --git a/ci/branch_Jenkinsfile b/ci/branch_Jenkinsfile
index 5032e25..34770c7 100644
--- a/ci/branch_Jenkinsfile
+++ b/ci/branch_Jenkinsfile
@@ -70,6 +70,7 @@ pipeline {
             steps {
                 withPythonEnv(PYTHONPATH){
                     sh 'pip install mypy'
+                    sh 'python3 -m pip install types-requests'
                     catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE'){
                         sh 'python3 -m mypy -p src.grag --junit-xml mypy-report.xml'
                     }
diff --git a/config.ini b/config.ini
index 339d77c..b8573a7 100644
--- a/config.ini
+++ b/config.ini
@@ -12,7 +12,7 @@ n_ctx : 6000
 n_gpu_layers : -1 # The number of layers to put on the GPU. Mixtral-18, gemma-20
 std_out : True
-base_dir : ${root:root_path}/models
+;base_dir : ${root:root_path}/models
 
 [chroma_client]
 host : localhost
@@ -64,5 +64,5 @@ env_path : ${root:root_path}/.env
 
 [root]
 root_path : /home/ubuntu/volume_2k/Capstone_5
-[quantize]
-llama_cpp_path : ${root:root_path}
+;[quantize]
+;llama_cpp_path : ${root:root_path}
diff --git a/pyproject.toml b/pyproject.toml
index ce25a6e..e858d0b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ name = "grag"
 dynamic = ["version"]
 description = 'A simple package for implementing RAG'
 readme = "README.md"
-requires-python = ">=3.8"
+requires-python = ">=3.10"
 license = { file = 'LICENSE' }
 keywords = ["RAG", "Retrieval Augmented Generation", "LLM", "retrieval", "quantization"]
 authors = [
@@ -17,8 +17,6 @@ authors = [
 classifiers = [
     "Development Status :: 4 - Beta",
     "Programming Language :: Python",
-    "Programming Language :: Python :: 3.8",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
@@ -46,7 +44,7 @@ dependencies = [
     "bitsandbytes>=0.42.0",
     "accelerate>=0.28.0",
     "poppler-utils>=0.1.0",
-    "tesseract>=0.1.3"
+    "tesseract>=0.1.3",
 ]
 
 [project.optional-dependencies]
diff --git a/src/docs/get_started.llms.rst b/src/docs/get_started.llms.rst
index 53f0f21..8ecfe3f 100644
--- a/src/docs/get_started.llms.rst
+++ b/src/docs/get_started.llms.rst
@@ -30,8 +30,14 @@ After running the above command, user will be prompted with the following:
 
 2. Input the **model path**:
 
-* If user wants to download a model from `HuggingFace <https://huggingface.co/models>`_, the user should provide the repository path from HuggingFace.
+* If the user wants to download a model from `HuggingFace <https://huggingface.co/models>`_, they should provide the repository ID or URL from HuggingFace.
 * If the user has the model downloaded locally, then user will be instructed to copy the model and input the name of the model directory.
 
-3. Finally, the user will be prompted to enter **quantization** settings (recommended Q5_K_M or Q4_K_M, etc.). For more details, check `llama.cpp/examples/quantize/quantize.cpp <https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp>`_.
+3. The user will be asked where to save the quantized model; if no path is given, it is saved in the directory where the model repository was downloaded.
+
+4. Next, the user will be prompted to enter **quantization** settings (recommended Q5_K_M or Q4_K_M, etc.). For more details, check `llama.cpp/examples/quantize/quantize.cpp <https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp>`_.
+
+5. Optionally, the user can run inference with the quantized model at the next prompt. This inference runs on the CPU, so it can take a while if the model is large.
+
+Note: Windows users have to use WSL and follow the Linux guidelines for quantizing models.
diff --git a/src/grag/components/llm.py b/src/grag/components/llm.py
index f7ede73..e09b5bb 100644
--- a/src/grag/components/llm.py
+++ b/src/grag/components/llm.py
@@ -50,7 +50,7 @@ def __init__(
         device_map: str = "auto",
         task: str = "text-generation",
         max_new_tokens: str = "1024",
-        temperature: Union[str, int] = 0.1,
+        temperature: Union[str, float] = 0.1,
         n_batch: Union[str, int] = 1024,
         n_ctx: Union[str, int] = 6000,
         n_gpu_layers: Union[str, int] = -1,
diff --git a/src/grag/quantize/quantize.py b/src/grag/quantize/quantize.py
index 64fba47..ec1e3ca 100644
--- a/src/grag/quantize/quantize.py
+++ b/src/grag/quantize/quantize.py
@@ -1,52 +1,91 @@
 """Interactive file for quantizing models."""
 
+import platform
+import sys
 from pathlib import Path
 
 from grag.components.utils import get_config
 from grag.quantize.utils import (
-    building_llamacpp,
+    download_release_asset,
     fetch_model_repo,
+    get_asset_download_url,
     get_llamacpp_repo,
+    inference_quantized_model,
     quantize_model,
+    repo_id_resolver,
 )
 
 config = get_config()
-root_path = Path(config["quantize"]["llama_cpp_path"])
 
 if __name__ == "__main__":
     user_input = input(
-        "Enter the path to the llama_cpp cloned repo, or where you'd like to clone it. Press Enter to use the default config path: "
-    ).strip()
+        "Enter the path where you want to download all the source files. Press Enter to use the default path: ").strip()
 
-    if user_input != "":
+    if user_input == "":
+        try:
+            root_path = Path(config["quantize"]["llama_cpp_path"])
+            print(f'Using {root_path} from config.ini')
+        except (KeyError, TypeError):
+            root_path = Path('./grag-quantize')
+            print(f'Using {root_path}, default.')
+    else:
         root_path = Path(user_input)
 
-    res = get_llamacpp_repo(root_path)
+    get_llamacpp_repo(destination_folder=root_path)
+    os_name = str(platform.system()).lower()
+    architecture = str(platform.machine()).lower()
+    asset_name_pattern = 'bin'
+    match os_name, architecture:
+        case ('darwin', 'x86_64'):
+            asset_name_pattern += '-macos-x64'
+        case ('darwin', 'arm64'):
+            asset_name_pattern += '-macos-arm64'
+        case ('windows', 'x86_64'):
+            asset_name_pattern += '-win-arm64-x64'
+        case ('windows', 'arm64'):
+            asset_name_pattern += '-win-arm64-x64'
+        case ('windows', 'amd64'):
+            asset_name_pattern += '-win-arm64-x64'
+        case ('linux', 'x86_64'):
+            asset_name_pattern += '-ubuntu-x64'
+        case _:
+            raise ValueError(f"{os_name=}, {architecture=} is not supported by llama.cpp releases.")
 
-    if "Already up to date." in str(res.stdout):
-        print("Repository is already up to date. Skipping build.")
-    else:
-        print("Updates found. Starting build...")
-        building_llamacpp(root_path)
-
-    response = (
-        input("Do you want us to download the model? (y/n) [Enter for yes]: ")
-        .strip()
-        .lower()
-    )
-    if response == "n":
-        print("Please copy the model folder to 'llama.cpp/models/' folder.")
-        _ = input("Enter if you have already copied the model:")
-        model_dir = Path(input("Enter the model directory name: "))
-    elif response == "y" or response == "":
+    download_url = get_asset_download_url(asset_name_pattern)
+    if download_url:
+        download_release_asset(download_url, root_path)
+
+    response = input("Do you want us to download the model? (yes[y]/no[n]) [Enter for yes]: ").strip().lower()
+    if response == '':
+        response = 'yes'
+    if response.lower()[0] == "n":
+        model_dir = Path(input("Enter path to the model directory: "))
+    elif response.lower()[0] == "y":
         repo_id = input(
-            "Please enter the repo_id for the model (you can check on https://huggingface.co/models): "
+            "Please enter the repo_id or the url for the model (you can check on https://huggingface.co/models): "
         ).strip()
-        fetch_model_repo(repo_id, root_path)
-        # model_dir = repo_id.split('/')[1]
-        model_dir = root_path / "llama.cpp" / "models" / repo_id.split("/")[1]
+        if repo_id == "":
+            raise ValueError("Repo ID you entered was empty. Please enter the repo_id for the model.")
+        repo_id = repo_id_resolver(repo_id)
+        model_dir = fetch_model_repo(repo_id, root_path / 'models')
+    else:
+        raise ValueError("Please enter either 'yes', 'y' or 'no', 'n'.")
+    sys.stdin.flush()
+
+    output_dir = input(
+        f"Enter the path where you want to save the quantized model; if left empty, the following path will be used [{model_dir}]: ").strip()
 
     quantization = input(
         "Enter quantization, recommended - Q5_K_M or Q4_K_M for more check https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp#L19 : "
-    )
-    quantize_model(model_dir, quantization, root_path)
+    ).strip()
+
+    target_path, quantized_model_file = quantize_model(model_dir, quantization, root_path, output_dir)
+
+    inference = input(
+        "Do you want to run inference with the quantized model to check if quantization was successful? Warning: It takes time as it runs on CPU. (y/n) [Enter for yes]: ").strip().lower()
+    if inference == '':
+        inference = 'yes'
+    if inference.lower()[0] == "y":
+        inference_quantized_model(target_path, quantized_model_file)
+    else:
+        print("Model quantized, but not tested.")
diff --git a/src/grag/quantize/utils.py b/src/grag/quantize/utils.py
index 0b90516..2a7ebe9 100644
--- a/src/grag/quantize/utils.py
+++ b/src/grag/quantize/utils.py
@@ -1,135 +1,274 @@
 """Utility functions for quantization."""
 
 import os
+import platform
 import subprocess
+import sys
+import zipfile
 from pathlib import Path
-from typing import Optional, Union
+from typing import Optional, Tuple, Union
 
+import requests
+from git import Repo
 from grag.components.utils import get_config
-from huggingface_hub import snapshot_download
+from huggingface_hub import login, snapshot_download
+from huggingface_hub.utils import GatedRepoError
 
 config = get_config()
 
 
-def get_llamacpp_repo(root_path: Union[str, Path]) -> subprocess.CompletedProcess:
-    """Clones or pulls the llama.cpp repository into the specified root path.
+def get_llamacpp_repo(repo_url: str = 'https://github.com/ggerganov/llama.cpp.git',
+                      destination_folder: Union[str, Path] = './grag-quantize') -> None:
+    """Clones a GitHub repository to a specified local directory or updates it if it already exists. The directory is created if it does not exist. If the repository is already cloned, it pulls updates.
 
     Args:
-        root_path: The root directory where the llama.cpp repository will be cloned or updated.
+        repo_url: The URL of the repository to clone.
+        destination_folder: The local path where the repository should be cloned or updated.
 
     Returns:
-        A subprocess.CompletedProcess instance containing the result of the git operation.
+        None
     """
-    if os.path.exists(f"{root_path}/llama.cpp"):
-        print(f"Repo exists at: {root_path}/llama.cpp")
-        res = subprocess.run(
-            ["git", "-C", f"{root_path}/llama.cpp", "pull"],
-            check=True,
-            capture_output=True,
-        )
+    destination_folder = Path(destination_folder) / 'llama.cpp'
+    destination_folder.mkdir(parents=True, exist_ok=True)
+    if os.path.isdir(destination_folder) and os.path.isdir(os.path.join(destination_folder, '.git')):
+        try:
+            repo = Repo(destination_folder)
+            origin = repo.remotes.origin
+            origin.pull()
+            print(f"Repository updated successfully in {destination_folder}")
+        except Exception as e:
+            print(f"Failed to update repository: {str(e)}")
     else:
-        res = subprocess.run(
-            [
-                "git",
-                "clone",
-                "https://github.com/ggerganov/llama.cpp.git",
-                f"{root_path}/llama.cpp",
-            ],
-            check=True,
-            capture_output=True,
-        )
+        try:
+            Repo.clone_from(repo_url, destination_folder)
+            print(f"Repository cloned successfully into {destination_folder}")
+        except Exception as e:
+            print(f"Failed to clone repository: {str(e)}")
 
-    return res
 
 
+def get_asset_download_url(asset_name_pattern: str, user: str = 'ggerganov', repo: str = 'llama.cpp') -> Optional[str]:
+    """Fetches the download URL of the first asset that matches a given name pattern in the latest release of the specified repository.
 
-def building_llamacpp(root_path: Union[str, Path]) -> None:
-    """Attempts to build the llama.cpp project using make or cmake.
+    Args:
+        asset_name_pattern: Substring to match in the asset's name.
+        user: GitHub username or organization of the repository.
+        repo: Repository name.
+
+    Returns:
+        The download URL of the matching asset, or None if no match is found.
+    """
+    url = f"https://api.github.com/repos/{user}/{repo}/releases/latest"
+    response = requests.get(url)
+    if response.status_code == 200:
+        release = response.json()
+        for asset in release.get('assets', []):
+            if asset_name_pattern in asset['name']:
+                return asset['browser_download_url']
+        print("No asset found matching the pattern.")
+    else:
+        print("Failed to fetch release info:", response.status_code)
+    return None
+
+
+def download_release_asset(download_url: str, root_quantize: Union[Path, str] = './grag-quantize') -> None:
+    """Downloads a file from a given URL and saves it to a specified path. It also attempts to extract the file if it is a ZIP archive.
 
     Args:
-        root_path (str): The root directory where the llama.cpp project is located.
+        download_url: The URL of the file to download.
+        root_quantize: Path where the file will be saved.
+
+    Returns:
+        None
+    """
+    root_quantize = Path(root_quantize)
+    root_quantize.mkdir(parents=True, exist_ok=True)
+    response = requests.get(download_url, stream=True)
+    if response.status_code == 200:
+        with open(root_quantize / 'llamacpp_release.zip', 'wb') as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                f.write(chunk)
+        print(f"Downloaded successfully to {root_quantize}")
+        with zipfile.ZipFile(root_quantize / 'llamacpp_release.zip', 'r') as zip_ref:
+            # Extract all the contents into the destination directory
+            zip_ref.extractall(root_quantize)
+            print(f"Files extracted to {root_quantize}")
+    else:
+        print(f"Failed to download file: {response.status_code}")
+
+
+def repo_id_resolver(repo_url: str) -> str:
+    """Resolves the HuggingFace repository ID given a full URL to a model or dataset page.
+
+    This function parses a HuggingFace URL to extract the repository ID, which typically
+    consists of a user or organization name followed by the repository name. If the URL
+    does not start with the expected HuggingFace URL prefix, it returns the input URL unchanged.
+
+    Args:
+        repo_url: The full URL string pointing to a specific HuggingFace repository.
+
+    Returns:
+        The repository ID in the format 'username/repository_name' if the URL is valid,
+        otherwise returns the original URL.
+
+    Examples:
+        Input: "https://huggingface.co/gpt2/models"
+        Output: "gpt2/models"
+
+        Input: "https://huggingface.co/facebook/bart-large"
+        Output: "facebook/bart-large"
+
+        Input: "some_other_url"
+        Output: "some_other_url"
+    """
+    if repo_url.startswith('https://huggingface'):
+        repo_url = repo_url.rstrip(' ')
+        repo_url = repo_url.lstrip(' ')
+        repo_url = repo_url.rstrip('/')
+        repo_lst = repo_url.split('/')
+        return f'{repo_lst[-2]}/{repo_lst[-1]}'
+    else:
+        return repo_url
 
 
-def fetch_model_repo(repo_id: str, root_path: Union[str, Path]) -> None:
-    """Download model from huggingface.co/models.
+def fetch_model_repo(repo_id: str, model_path: Union[str, Path] = './grag-quantize/models') -> Union[str, Path]:
+    """Downloads a model from huggingface.co/models to a specified directory.
 
     Args:
-        repo_id (str): Repository ID of the model to download.
-        root_path (str): The root path where the model should be downloaded or copied.
+        repo_id: Repository ID of the model to download (e.g., 'huggingface/gpt2').
+        model_path: The local directory where the model should be downloaded.
+
+    Returns:
+        The path to the directory where the model is downloaded.
""" - local_dir = f"{root_path}/llama.cpp/models/{repo_id.split('/')[1]}" - os.makedirs(local_dir, exist_ok=True) - snapshot_download( - repo_id=repo_id, - local_dir=local_dir, - local_dir_use_symlinks="auto", - resume_download=True, - ) + model_path = Path(model_path) + local_dir = model_path / f"{repo_id.split('/')[1]}" + local_dir.mkdir(parents=True, exist_ok=True) + + try: + snapshot_download( + repo_id=repo_id, + local_dir=local_dir, + local_dir_use_symlinks="auto", + resume_download=True, + ) + except GatedRepoError: + print( + "This model comes under gated repository. You must be authenticated to download the model. For more: https://huggingface.co/docs/hub/en/models-gated") + resp = input( + "You will be redirected to hugginface-cli to login. If you don't have token checkout above link or else paste the token when prompted. [To exit, enter 'n']: ") + if resp.lower() == "n": + print("User exited.") + exit(0) + elif resp == "": + login() + snapshot_download( + repo_id=repo_id, + local_dir=local_dir, + local_dir_use_symlinks="auto", + resume_download=True, + ) + else: + raise ValueError('Invalid response received.') print(f"Model downloaded in {local_dir}") + return local_dir def quantize_model( model_dir_path: Union[str, Path], quantization: str, - root_path: Union[str, Path], - output_dir: Optional[Union[str, Path]] = None, -) -> None: - """Quantizes a specified model using a given quantization level. + root_quantize: Union[str, Path] = './grag-quantize', # path with both build and llamacpp + output_dir: Optional[Union[Path, str]] = None, +) -> Tuple[Path, Path]: + """Quantizes a specified model using a given quantization level and saves it to an optional directory. If the output directory is not specified, it defaults to a subdirectory under the provided model directory. The function also handles specific exceptions during the conversion process and ensures the creation of the necessary directories. Args: - output_dir (str, Path, optional): Directory to save quantized model. Defaults to None - model_dir_path (str, Path): The directory path of the model to be quantized. - quantization (str): The quantization level to apply. - root_path (str, Path): The root directory path of the project. + model_dir_path: The directory path of the model to be quantized. This path must exist and contain the model files. + quantization: The quantization level to apply (e.g., 'f32', 'f16'). This affects the precision and size of the model. + root_quantize: The root directory containing the quantization tools and scripts. This directory should have the necessary binary files and scripts for the quantization process. + output_dir: Optional directory to save the quantized model. If not specified, the function uses a default directory based on the model directory path. + + Returns: + Tuple[Path, Path]: Returns a tuple containing the path to the root of the quantization tools and the path to the quantized model file. + + Raises: + PermissionError: If the function lacks permissions to execute the quantization binaries, it will attempt to modify permissions and retry. + TypeError: If there are issues with the provided model directory or quantization parameters. 
""" - os.chdir(f"{root_path}/llama.cpp/") - model_dir_path = Path(model_dir_path) - if output_dir is None: - output_dir = config["llm"]["base_dir"] + model_dir_path = Path(model_dir_path).resolve() + if output_dir == '' or output_dir is None: + try: + output_dir = Path(config["llm"]["base_dir"]) + except (KeyError, TypeError): + output_dir = model_dir_path + else: + output_dir = Path(output_dir) + + output_dir = output_dir / model_dir_path.name if output_dir.name != model_dir_path.name else output_dir + output_dir.mkdir(parents=True, exist_ok=True) + output_dir = output_dir.resolve() - output_dir = Path(output_dir) / model_dir_path.name - os.makedirs(output_dir, exist_ok=True) + root_quantize = Path(root_quantize).resolve() + os.chdir(root_quantize / 'llama.cpp') + convert_script_path = os.path.join(root_quantize, 'llama.cpp') + sys.path.append(convert_script_path) + + from convert import main as convert + + args_list = [f'{model_dir_path}', + '--outfile', f'{output_dir}/ggml-model-f32.gguf'] + if not os.path.exists(f'{output_dir}/ggml-model-f32.gguf'): + try: + convert(args_list) + except TypeError as e: + if 'with BpeVocab' in str(e): + args_list.extend(['--vocab-type', 'bpe']) + convert(args_list) + else: + raise e + else: + print('f32 gguf file already exists, skipping conversion...') - subprocess.run(["python3", "convert.py", f"{model_dir_path}/"], check=True) - model_file = model_dir_path / "ggml-model-f32.gguf" quantized_model_file = output_dir / f"ggml-model-{quantization}.gguf" - subprocess.run( - ["./quantize", str(model_file), str(quantized_model_file), quantization], - check=True, - ) - print(f"Quantized model present at {output_dir}") + if not os.path.exists(quantized_model_file): + converted_model_file = output_dir / "ggml-model-f32.gguf" + os_name = str(platform.system()).lower() + if os_name == 'windows': + binary_path = root_quantize / 'quantize.exe' + else: + binary_path = root_quantize / 'build' / 'bin' / 'quantize' + cmd = [str(binary_path), str(converted_model_file), str(quantized_model_file), quantization] + + try: + subprocess.run(cmd, check=True) + except PermissionError: + os.chmod(binary_path, 0o777) + subprocess.run(cmd, check=True) + print(f"Quantized model present at {output_dir}") + else: + print("Quantized model already exists for given quantization, skipping...") os.chdir(Path(__file__).parent) # Return to the root path after operation + + return root_quantize, quantized_model_file + + +def inference_quantized_model(root_quantize: Union[str, Path], + quantized_model_file: Union[str, Path]) -> subprocess.CompletedProcess: + """Runs inference using a quantized model binary. + + Args: + root_quantize: The root directory containing the compiled inference executable. + quantized_model_file: The file path to the quantized model to use for inference. + + Returns: + The subprocess.CompletedProcess object containing the inference execution result. 
+    """
+    root_quantize = Path(root_quantize)
+    main_path = root_quantize / 'build' / 'bin' / 'main'
+    run_cmd = [str(main_path), '-m', str(quantized_model_file), '-ngl', '-1']
+    try:
+        res = subprocess.run(run_cmd, check=True, text=True, capture_output=True)
+    except PermissionError:
+        os.chmod(main_path, 0o777)
+        res = subprocess.run(run_cmd, check=True, text=True, capture_output=True)
+    print('Inference successful for this quantized model.')
+    return res
diff --git a/src/tests/quantize/quantize_test.py b/src/tests/quantize/quantize_test.py
index 68078fe..574bada 100644
--- a/src/tests/quantize/quantize_test.py
+++ b/src/tests/quantize/quantize_test.py
@@ -2,41 +2,70 @@
 import shutil
 from pathlib import Path
 
+import pytest
+import requests
 from grag.quantize.utils import (
-    building_llamacpp,
-    fetch_model_repo,
     get_llamacpp_repo,
+    get_asset_download_url,
+    download_release_asset,
+    repo_id_resolver,
+    fetch_model_repo,
     quantize_model,
+    inference_quantized_model,
 )
 
-root_path = Path(__file__).parent / "test_data"
+root_path = Path(__file__).parent / "test_quantization"
 if os.path.exists(root_path):
     shutil.rmtree(root_path)
 os.makedirs(root_path, exist_ok=True)
 
+repo_id = 'meta-llama/Llama-2-7b-chat'
+repo_url = 'https://huggingface.co/meta-llama/Llama-2-7b-chat'
+model = 'Llama-2-7b-chat'
+quantization = 'Q2_K'
+asset_pattern_list = ['-macos-x64', '-macos-arm64', '-win-arm64-x64', '-win-arm64-x64', '-ubuntu-x64']
+
 
 def test_get_llamacpp_repo():
-    get_llamacpp_repo(root_path)
+    get_llamacpp_repo(destination_folder=root_path)
     repo_path = root_path / "llama.cpp" / ".git"
     assert os.path.exists(repo_path)
 
 
-def test_build_llamacpp():
-    building_llamacpp(root_path)
-    bin_path = root_path / "llama.cpp" / "quantize"
-    assert os.path.exists(bin_path)
+@pytest.mark.parametrize("asset_pattern", asset_pattern_list)
+def test_get_asset_download_url(asset_pattern):
+    url = get_asset_download_url(asset_pattern, 'ggerganov', 'llama.cpp')
+    response = requests.get(url, stream=True)
+    assert response.status_code == 200
+
+
+def test_download_release_asset():
+    asset_pattern = '-ubuntu-x64'
+    url = get_asset_download_url(asset_pattern, 'ggerganov', 'llama.cpp')
+    download_release_asset(url, root_path)
+    assert os.path.exists(root_path / 'build' / 'bin' / 'quantize')
+    assert os.path.exists(root_path / 'build' / 'bin' / 'main')
+
+
+def test_repo_id_resolver():
+    repo_id_ = repo_id_resolver(repo_url)
+    assert repo_id == repo_id_
 
 
 def test_fetch_model_repo():
-    fetch_model_repo("meta-llama/Llama-2-7b-chat", root_path)
-    model_dir_path = root_path / "llama.cpp" / "models" / "Llama-2-7b-chat"
-    assert os.path.exists(model_dir_path)
+    local_dir = fetch_model_repo(repo_id, root_path / 'models')
+    assert os.path.exists(local_dir)
 
 
 def test_quantize_model():
-    model_dir_path = root_path / "llama.cpp" / "models" / "Llama-2-7b-chat"
-    quantize_model(
-        model_dir_path, "Q3_K_M", root_path, output_dir=model_dir_path.parent
-    )
-    gguf_file_path = model_dir_path / "ggml-model-Q3_K_M.gguf"
+    model_dir_path = root_path / "models" / model
+    # output_dir = root_path / "models" / "Llama-2-7b-chat"
+    quantize_model(model_dir_path, quantization, root_path, model_dir_path)
+    gguf_file_path = model_dir_path / f"ggml-model-{quantization}.gguf"
     assert os.path.exists(gguf_file_path)
+
+
+def test_inference_quantized_model():
+    quantized_model_file = root_path / 'models' / model / f'ggml-model-{quantization}.gguf'
+    res = inference_quantized_model(root_path, quantized_model_file)
+    assert isinstance(res.stdout, str)
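Reviewer note: for anyone evaluating this change, the interactive flow in src/grag/quantize/quantize.py reduces to the sequence sketched below. This is a minimal, non-interactive sketch against the utility signatures introduced in this diff; the '-ubuntu-x64' asset pattern and the Llama-2-7b-chat URL are illustrative placeholders borrowed from the tests, not project defaults.

# Minimal sketch of the quantization flow using the utilities added in this diff.
# Assumptions (illustrative only): a Linux x86_64 host ('-ubuntu-x64' release asset)
# and an example HuggingFace repo URL; substitute values for your environment.
from pathlib import Path

from grag.quantize.utils import (
    download_release_asset,
    fetch_model_repo,
    get_asset_download_url,
    get_llamacpp_repo,
    inference_quantized_model,
    quantize_model,
    repo_id_resolver,
)

root = Path("./grag-quantize")

# Fetch the llama.cpp sources and a prebuilt release binary for this platform.
get_llamacpp_repo(destination_folder=root)
url = get_asset_download_url("-ubuntu-x64")
if url:
    download_release_asset(url, root)

# Resolve a repo id from a URL and download the model files.
repo_id = repo_id_resolver("https://huggingface.co/meta-llama/Llama-2-7b-chat")
model_dir = fetch_model_repo(repo_id, root / "models")

# Convert, quantize, and optionally sanity-check with a CPU inference run.
target_path, quantized_file = quantize_model(model_dir, "Q5_K_M", root, output_dir=model_dir)
inference_quantized_model(target_path, quantized_file)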