diff --git a/ci/Jenkinsfile b/ci/Jenkinsfile
index bcce629..3bcc2f6 100644
--- a/ci/Jenkinsfile
+++ b/ci/Jenkinsfile
@@ -70,6 +70,7 @@ pipeline {
             steps {
                 withPythonEnv(PYTHONPATH){
                     sh 'pip install mypy'
+                    sh 'python3 -m pip install types-requests'
                     catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE'){
                         sh 'python3 -m mypy -p src.grag --junit-xml mypy-report.xml'
                     }
diff --git a/ci/branch_Jenkinsfile b/ci/branch_Jenkinsfile
index 5032e25..34770c7 100644
--- a/ci/branch_Jenkinsfile
+++ b/ci/branch_Jenkinsfile
@@ -70,6 +70,7 @@ pipeline {
             steps {
                 withPythonEnv(PYTHONPATH){
                     sh 'pip install mypy'
+                    sh 'python3 -m pip install types-requests'
                     catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE'){
                         sh 'python3 -m mypy -p src.grag --junit-xml mypy-report.xml'
                     }
diff --git a/config.ini b/config.ini
index 339d77c..b8573a7 100644
--- a/config.ini
+++ b/config.ini
@@ -12,7 +12,7 @@ n_ctx : 6000
 n_gpu_layers : -1 # The number of layers to put on the GPU. Mixtral-18, gemma-20
 std_out : True
-base_dir : ${root:root_path}/models
+;base_dir : ${root:root_path}/models
 
 [chroma_client]
 host : localhost
@@ -64,5 +64,5 @@ env_path : ${root:root_path}/.env
 
 [root]
 root_path : /home/ubuntu/volume_2k/Capstone_5
-[quantize]
-llama_cpp_path : ${root:root_path}
+;[quantize]
+;llama_cpp_path : ${root:root_path}
diff --git a/pyproject.toml b/pyproject.toml
index ce25a6e..e858d0b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ name = "grag"
 dynamic = ["version"]
 description = 'A simple package for implementing RAG'
 readme = "README.md"
-requires-python = ">=3.8"
+requires-python = ">=3.10"
 license = { file = 'LICENSE' }
 keywords = ["RAG", "Retrieval Augmented Generation", "LLM", "retrieval", "quantization"]
 authors = [
@@ -17,8 +17,6 @@ authors = [
 classifiers = [
     "Development Status :: 4 - Beta",
     "Programming Language :: Python",
-    "Programming Language :: Python :: 3.8",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
@@ -46,7 +44,7 @@ dependencies = [
     "bitsandbytes>=0.42.0",
     "accelerate>=0.28.0",
     "poppler-utils>=0.1.0",
-    "tesseract>=0.1.3"
+    "tesseract>=0.1.3",
 ]
 
 [project.optional-dependencies]
diff --git a/src/docs/get_started.llms.rst b/src/docs/get_started.llms.rst
index 53f0f21..8ecfe3f 100644
--- a/src/docs/get_started.llms.rst
+++ b/src/docs/get_started.llms.rst
@@ -30,8 +30,14 @@ After running the above command, user will be prompted with the following:
 
 2. Input the **model path**:
 
-* If user wants to download a model from `HuggingFace <https://huggingface.co/models>`_, the user should provide the repository path from HuggingFace.
+* If the user wants to download a model from `HuggingFace <https://huggingface.co/models>`_, they should provide the repository ID or URL from HuggingFace.
 * If the user has the model downloaded locally, then user will be instructed to copy the model and input the name of the model directory.
 
-3. Finally, the user will be prompted to enter **quantization** settings (recommended Q5_K_M or Q4_K_M, etc.). For more details, check `llama.cpp/examples/quantize/quantize.cpp <https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp>`_.
+3. The user will be asked where to save the quantized model; if no path is given, it is saved in the directory where the model repository was downloaded.
+
+4. Next, the user will be prompted to enter **quantization** settings (recommended Q5_K_M or Q4_K_M, etc.). For more details, check `llama.cpp/examples/quantize/quantize.cpp <https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp>`_.
+
+5. Optionally, the user can run inference with the quantized model at the next prompt. This inference runs on the CPU, so it can take a while if the model is large.
+
+Note: Windows users have to use WSL and follow the Linux guidelines for quantizing models.
diff --git a/src/grag/components/llm.py b/src/grag/components/llm.py
index f7ede73..e09b5bb 100644
--- a/src/grag/components/llm.py
+++ b/src/grag/components/llm.py
@@ -50,7 +50,7 @@ def __init__(
         device_map: str = "auto",
         task: str = "text-generation",
         max_new_tokens: str = "1024",
-        temperature: Union[str, int] = 0.1,
+        temperature: Union[str, float] = 0.1,
         n_batch: Union[str, int] = 1024,
         n_ctx: Union[str, int] = 6000,
         n_gpu_layers: Union[str, int] = -1,
diff --git a/src/grag/quantize/quantize.py b/src/grag/quantize/quantize.py
index 64fba47..ec1e3ca 100644
--- a/src/grag/quantize/quantize.py
+++ b/src/grag/quantize/quantize.py
@@ -1,52 +1,91 @@
 """Interactive file for quantizing models."""
 
+import platform
+import sys
 from pathlib import Path
 
 from grag.components.utils import get_config
 from grag.quantize.utils import (
-    building_llamacpp,
+    download_release_asset,
     fetch_model_repo,
+    get_asset_download_url,
     get_llamacpp_repo,
+    inference_quantized_model,
     quantize_model,
+    repo_id_resolver,
 )
 
 config = get_config()
-root_path = Path(config["quantize"]["llama_cpp_path"])
 
 if __name__ == "__main__":
     user_input = input(
-        "Enter the path to the llama_cpp cloned repo, or where you'd like to clone it. Press Enter to use the default config path: "
-    ).strip()
+        "Enter the path where you want to download all the source files. Press Enter to use the default path: ").strip()
 
-    if user_input != "":
+    if user_input == "":
+        try:
+            root_path = Path(config["quantize"]["llama_cpp_path"])
+            print(f'Using {root_path} from config.ini')
+        except (KeyError, TypeError):
+            root_path = Path('./grag-quantize')
+            print(f'Using {root_path}, default.')
+    else:
         root_path = Path(user_input)
 
-    res = get_llamacpp_repo(root_path)
+    get_llamacpp_repo(destination_folder=root_path)
+    os_name = str(platform.system()).lower()
+    architecture = str(platform.machine()).lower()
+    asset_name_pattern = 'bin'
+    match os_name, architecture:
+        case ('darwin', 'x86_64'):
+            asset_name_pattern += '-macos-x64'
+        case ('darwin', 'arm64'):
+            asset_name_pattern += '-macos-arm64'
+        case ('windows', 'x86_64'):
+            asset_name_pattern += '-win-arm64-x64'
+        case ('windows', 'arm64'):
+            asset_name_pattern += '-win-arm64-x64'
+        case ('windows', 'amd64'):
+            asset_name_pattern += '-win-arm64-x64'
+        case ('linux', 'x86_64'):
+            asset_name_pattern += '-ubuntu-x64'
+        case _:
+            raise ValueError(f"{os_name=}, {architecture=} is not supported by llama.cpp releases.")
 
-    if "Already up to date." in str(res.stdout):
-        print("Repository is already up to date. Skipping build.")
-    else:
-        print("Updates found. Starting build...")
-        building_llamacpp(root_path)
-
-    response = (
-        input("Do you want us to download the model? (y/n) [Enter for yes]: ")
-        .strip()
-        .lower()
-    )
-    if response == "n":
-        print("Please copy the model folder to 'llama.cpp/models/' folder.")
-        _ = input("Enter if you have already copied the model:")
-        model_dir = Path(input("Enter the model directory name: "))
-    elif response == "y" or response == "":
+    download_url = get_asset_download_url(asset_name_pattern)
+    if download_url:
+        download_release_asset(download_url, root_path)
+
+    response = input("Do you want us to download the model? (yes[y]/no[n]) [Enter for yes]: ").strip().lower()
+    if response == '':
+        response = 'yes'
+    if response.lower()[0] == "n":
+        model_dir = Path(input("Enter path to the model directory: "))
+    elif response.lower()[0] == "y":
         repo_id = input(
-            "Please enter the repo_id for the model (you can check on https://huggingface.co/models): "
+            "Please enter the repo_id or the url for the model (you can check on https://huggingface.co/models): "
         ).strip()
-        fetch_model_repo(repo_id, root_path)
-        # model_dir = repo_id.split('/')[1]
-        model_dir = root_path / "llama.cpp" / "models" / repo_id.split("/")[1]
+        if repo_id == "":
+            raise ValueError("Repo ID you entered was empty. Please enter the repo_id for the model.")
+        repo_id = repo_id_resolver(repo_id)
+        model_dir = fetch_model_repo(repo_id, root_path / 'models')
+    else:
+        raise ValueError("Please enter either 'yes', 'y' or 'no', 'n'.")
+    sys.stdin.flush()
+
+    output_dir = input(
+        f"Enter the path where you want to save the quantized model; if left empty, the following path will be used [{model_dir}]: ").strip()
 
     quantization = input(
         "Enter quantization, recommended - Q5_K_M or Q4_K_M for more check https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp#L19 : "
-    )
-    quantize_model(model_dir, quantization, root_path)
+    ).strip()
+
+    target_path, quantized_model_file = quantize_model(model_dir, quantization, root_path, output_dir)
+
+    inference = input(
+        "Do you want to run inference with the quantized model to check if quantization was successful? Warning: It takes time as it runs on CPU. (y/n) [Enter for yes]: ").strip().lower()
+    if inference == '':
+        inference = 'yes'
+    if inference.lower()[0] == "y":
+        inference_quantized_model(target_path, quantized_model_file)
+    else:
+        print("Model quantized, but not tested.")
diff --git a/src/grag/quantize/utils.py b/src/grag/quantize/utils.py
index 0b90516..2a7ebe9 100644
--- a/src/grag/quantize/utils.py
+++ b/src/grag/quantize/utils.py
@@ -1,135 +1,274 @@
 """Utility functions for quantization."""
 
 import os
+import platform
 import subprocess
+import sys
+import zipfile
 from pathlib import Path
-from typing import Optional, Union
+from typing import Optional, Tuple, Union
 
+import requests
+from git import Repo
 from grag.components.utils import get_config
-from huggingface_hub import snapshot_download
+from huggingface_hub import login, snapshot_download
+from huggingface_hub.utils import GatedRepoError
 
 config = get_config()
 
 
-def get_llamacpp_repo(root_path: Union[str, Path]) -> subprocess.CompletedProcess:
-    """Clones or pulls the llama.cpp repository into the specified root path.
+def get_llamacpp_repo(repo_url: str = 'https://github.com/ggerganov/llama.cpp.git',
+                      destination_folder: Union[str, Path] = './grag-quantize') -> None:
+    """Clones a GitHub repository to a specified local directory or updates it if it already exists. The directory is created if it does not exist. If the repository is already cloned, it pulls updates.
 
     Args:
-        root_path: The root directory where the llama.cpp repository will be cloned or updated.
+        repo_url: The URL of the repository to clone.
+        destination_folder: The local path where the repository should be cloned or updated.
 
     Returns:
-        A subprocess.CompletedProcess instance containing the result of the git operation.
+        None
     """
-    if os.path.exists(f"{root_path}/llama.cpp"):
-        print(f"Repo exists at: {root_path}/llama.cpp")
-        res = subprocess.run(
-            ["git", "-C", f"{root_path}/llama.cpp", "pull"],
-            check=True,
-            capture_output=True,
-        )
+    destination_folder = Path(destination_folder) / 'llama.cpp'
+    destination_folder.mkdir(parents=True, exist_ok=True)
+    if os.path.isdir(destination_folder) and os.path.isdir(os.path.join(destination_folder, '.git')):
+        try:
+            repo = Repo(destination_folder)
+            origin = repo.remotes.origin
+            origin.pull()
+            print(f"Repository updated successfully in {destination_folder}")
+        except Exception as e:
+            print(f"Failed to update repository: {str(e)}")
     else:
-        res = subprocess.run(
-            [
-                "git",
-                "clone",
-                "https://github.com/ggerganov/llama.cpp.git",
-                f"{root_path}/llama.cpp",
-            ],
-            check=True,
-            capture_output=True,
-        )
+        try:
+            Repo.clone_from(repo_url, destination_folder)
+            print(f"Repository cloned successfully into {destination_folder}")
+        except Exception as e:
+            print(f"Failed to clone repository: {str(e)}")
 
-    return res
 
 
+def get_asset_download_url(asset_name_pattern: str, user: str = 'ggerganov', repo: str = 'llama.cpp') -> Optional[str]:
+    """Fetches the download URL of the first asset that matches a given name pattern in the latest release of the specified repository.
 
-def building_llamacpp(root_path: Union[str, Path]) -> None:
-    """Attempts to build the llama.cpp project using make or cmake.
+    Args:
+        asset_name_pattern: Substring to match in the asset's name.
+        user: GitHub username or organization of the repository.
+        repo: Repository name.
+
+    Returns:
+        The download URL of the matching asset, or None if no match is found.
+    """
+    url = f"https://api.github.com/repos/{user}/{repo}/releases/latest"
+    response = requests.get(url)
+    if response.status_code == 200:
+        release = response.json()
+        for asset in release.get('assets', []):
+            if asset_name_pattern in asset['name']:
+                return asset['browser_download_url']
+        print("No asset found matching the pattern.")
+    else:
+        print("Failed to fetch release info:", response.status_code)
+    return None
+
+
+def download_release_asset(download_url: str, root_quantize: Union[Path, str] = './grag-quantize') -> None:
+    """Downloads a file from a given URL and saves it to a specified path. It also attempts to extract the file if it is a ZIP archive.
 
     Args:
-        root_path (str): The root directory where the llama.cpp project is located.
+        download_url: The URL of the file to download.
+        root_quantize: Path where the file will be saved.
+
+    Returns:
+        None
+    """
+    root_quantize = Path(root_quantize)
+    root_quantize.mkdir(parents=True, exist_ok=True)
+    response = requests.get(download_url, stream=True)
+    if response.status_code == 200:
+        with open(root_quantize / 'llamacpp_release.zip', 'wb') as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                f.write(chunk)
+        print(f"Downloaded successfully to {root_quantize}")
+        with zipfile.ZipFile(root_quantize / 'llamacpp_release.zip', 'r') as zip_ref:
+            # Extract all the contents into the destination directory
+            zip_ref.extractall(root_quantize)
+            print(f"Files extracted to {root_quantize}")
+    else:
+        print(f"Failed to download file: {response.status_code}")
+
+
+def repo_id_resolver(repo_url: str) -> str:
+    """Resolves the HuggingFace repository ID given a full URL to a model or dataset page.
+
+    This function parses a HuggingFace URL to extract the repository ID, which typically
+    consists of a user or organization name followed by the repository name. If the URL
+    does not start with the expected HuggingFace URL prefix, it returns the input URL unchanged.
+
+    Args:
+        repo_url: The full URL string pointing to a specific HuggingFace repository.
+
+    Returns:
+        The repository ID in the format 'username/repository_name' if the URL is valid,
+        otherwise returns the original URL.
+
+    Examples:
+        Input: "https://huggingface.co/gpt2/models"
+        Output: "gpt2/models"
+
+        Input: "https://huggingface.co/facebook/bart-large"
+        Output: "facebook/bart-large"
+
+        Input: "some_other_url"
+        Output: "some_other_url"
+    """
+    if repo_url.startswith('https://huggingface'):
+        repo_url = repo_url.rstrip(' ')
+        repo_url = repo_url.lstrip(' ')
+        repo_url = repo_url.rstrip('/')
+        repo_lst = repo_url.split('/')
+        return f'{repo_lst[-2]}/{repo_lst[-1]}'
+    else:
+        return repo_url
 
 
-def fetch_model_repo(repo_id: str, root_path: Union[str, Path]) -> None:
-    """Download model from huggingface.co/models.
+def fetch_model_repo(repo_id: str, model_path: Union[str, Path] = './grag-quantize/models') -> Union[str, Path]:
+    """Downloads a model from huggingface.co/models to a specified directory.
 
     Args:
-        repo_id (str): Repository ID of the model to download.
-        root_path (str): The root path where the model should be downloaded or copied.
+        repo_id: Repository ID of the model to download (e.g., 'huggingface/gpt2').
+        model_path: The local directory where the model should be downloaded.
+
+    Returns:
+        The path to the directory where the model is downloaded.
""" - local_dir = f"{root_path}/llama.cpp/models/{repo_id.split('/')[1]}" - os.makedirs(local_dir, exist_ok=True) - snapshot_download( - repo_id=repo_id, - local_dir=local_dir, - local_dir_use_symlinks="auto", - resume_download=True, - ) + model_path = Path(model_path) + local_dir = model_path / f"{repo_id.split('/')[1]}" + local_dir.mkdir(parents=True, exist_ok=True) + + try: + snapshot_download( + repo_id=repo_id, + local_dir=local_dir, + local_dir_use_symlinks="auto", + resume_download=True, + ) + except GatedRepoError: + print( + "This model comes under gated repository. You must be authenticated to download the model. For more: https://huggingface.co/docs/hub/en/models-gated") + resp = input( + "You will be redirected to hugginface-cli to login. If you don't have token checkout above link or else paste the token when prompted. [To exit, enter 'n']: ") + if resp.lower() == "n": + print("User exited.") + exit(0) + elif resp == "": + login() + snapshot_download( + repo_id=repo_id, + local_dir=local_dir, + local_dir_use_symlinks="auto", + resume_download=True, + ) + else: + raise ValueError('Invalid response received.') print(f"Model downloaded in {local_dir}") + return local_dir def quantize_model( model_dir_path: Union[str, Path], quantization: str, - root_path: Union[str, Path], - output_dir: Optional[Union[str, Path]] = None, -) -> None: - """Quantizes a specified model using a given quantization level. + root_quantize: Union[str, Path] = './grag-quantize', # path with both build and llamacpp + output_dir: Optional[Union[Path, str]] = None, +) -> Tuple[Path, Path]: + """Quantizes a specified model using a given quantization level and saves it to an optional directory. If the output directory is not specified, it defaults to a subdirectory under the provided model directory. The function also handles specific exceptions during the conversion process and ensures the creation of the necessary directories. Args: - output_dir (str, Path, optional): Directory to save quantized model. Defaults to None - model_dir_path (str, Path): The directory path of the model to be quantized. - quantization (str): The quantization level to apply. - root_path (str, Path): The root directory path of the project. + model_dir_path: The directory path of the model to be quantized. This path must exist and contain the model files. + quantization: The quantization level to apply (e.g., 'f32', 'f16'). This affects the precision and size of the model. + root_quantize: The root directory containing the quantization tools and scripts. This directory should have the necessary binary files and scripts for the quantization process. + output_dir: Optional directory to save the quantized model. If not specified, the function uses a default directory based on the model directory path. + + Returns: + Tuple[Path, Path]: Returns a tuple containing the path to the root of the quantization tools and the path to the quantized model file. + + Raises: + PermissionError: If the function lacks permissions to execute the quantization binaries, it will attempt to modify permissions and retry. + TypeError: If there are issues with the provided model directory or quantization parameters. 
""" - os.chdir(f"{root_path}/llama.cpp/") - model_dir_path = Path(model_dir_path) - if output_dir is None: - output_dir = config["llm"]["base_dir"] + model_dir_path = Path(model_dir_path).resolve() + if output_dir == '' or output_dir is None: + try: + output_dir = Path(config["llm"]["base_dir"]) + except (KeyError, TypeError): + output_dir = model_dir_path + else: + output_dir = Path(output_dir) + + output_dir = output_dir / model_dir_path.name if output_dir.name != model_dir_path.name else output_dir + output_dir.mkdir(parents=True, exist_ok=True) + output_dir = output_dir.resolve() - output_dir = Path(output_dir) / model_dir_path.name - os.makedirs(output_dir, exist_ok=True) + root_quantize = Path(root_quantize).resolve() + os.chdir(root_quantize / 'llama.cpp') + convert_script_path = os.path.join(root_quantize, 'llama.cpp') + sys.path.append(convert_script_path) + + from convert import main as convert + + args_list = [f'{model_dir_path}', + '--outfile', f'{output_dir}/ggml-model-f32.gguf'] + if not os.path.exists(f'{output_dir}/ggml-model-f32.gguf'): + try: + convert(args_list) + except TypeError as e: + if 'with BpeVocab' in str(e): + args_list.extend(['--vocab-type', 'bpe']) + convert(args_list) + else: + raise e + else: + print('f32 gguf file already exists, skipping conversion...') - subprocess.run(["python3", "convert.py", f"{model_dir_path}/"], check=True) - model_file = model_dir_path / "ggml-model-f32.gguf" quantized_model_file = output_dir / f"ggml-model-{quantization}.gguf" - subprocess.run( - ["./quantize", str(model_file), str(quantized_model_file), quantization], - check=True, - ) - print(f"Quantized model present at {output_dir}") + if not os.path.exists(quantized_model_file): + converted_model_file = output_dir / "ggml-model-f32.gguf" + os_name = str(platform.system()).lower() + if os_name == 'windows': + binary_path = root_quantize / 'quantize.exe' + else: + binary_path = root_quantize / 'build' / 'bin' / 'quantize' + cmd = [str(binary_path), str(converted_model_file), str(quantized_model_file), quantization] + + try: + subprocess.run(cmd, check=True) + except PermissionError: + os.chmod(binary_path, 0o777) + subprocess.run(cmd, check=True) + print(f"Quantized model present at {output_dir}") + else: + print("Quantized model already exists for given quantization, skipping...") os.chdir(Path(__file__).parent) # Return to the root path after operation + + return root_quantize, quantized_model_file + + +def inference_quantized_model(root_quantize: Union[str, Path], + quantized_model_file: Union[str, Path]) -> subprocess.CompletedProcess: + """Runs inference using a quantized model binary. + + Args: + root_quantize: The root directory containing the compiled inference executable. + quantized_model_file: The file path to the quantized model to use for inference. + + Returns: + The subprocess.CompletedProcess object containing the inference execution result. 
+    """
+    root_quantize = Path(root_quantize)
+    main_path = root_quantize / 'build' / 'bin' / 'main'
+    run_cmd = [str(main_path), '-m', str(quantized_model_file), '-ngl', '-1']
+    try:
+        res = subprocess.run(run_cmd, check=True, text=True, capture_output=True)
+    except PermissionError:
+        os.chmod(main_path, 0o777)
+        res = subprocess.run(run_cmd, check=True, text=True, capture_output=True)
+    print('Inference successful for this quantized model.')
+    return res
diff --git a/src/tests/quantize/quantize_test.py b/src/tests/quantize/quantize_test.py
index 68078fe..574bada 100644
--- a/src/tests/quantize/quantize_test.py
+++ b/src/tests/quantize/quantize_test.py
@@ -2,41 +2,70 @@
 import shutil
 from pathlib import Path
 
+import pytest
+import requests
 from grag.quantize.utils import (
-    building_llamacpp,
-    fetch_model_repo,
     get_llamacpp_repo,
+    get_asset_download_url,
+    download_release_asset,
+    repo_id_resolver,
+    fetch_model_repo,
     quantize_model,
+    inference_quantized_model,
 )
 
-root_path = Path(__file__).parent / "test_data"
+root_path = Path(__file__).parent / "test_quantization"
 if os.path.exists(root_path):
     shutil.rmtree(root_path)
 os.makedirs(root_path, exist_ok=True)
 
+repo_id = 'meta-llama/Llama-2-7b-chat'
+repo_url = 'https://huggingface.co/meta-llama/Llama-2-7b-chat'
+model = 'Llama-2-7b-chat'
+quantization = 'Q2_K'
+asset_pattern_list = ['-macos-x64', '-macos-arm64', '-win-arm64-x64', '-win-arm64-x64', '-ubuntu-x64']
+
 
 def test_get_llamacpp_repo():
-    get_llamacpp_repo(root_path)
+    get_llamacpp_repo(destination_folder=root_path)
     repo_path = root_path / "llama.cpp" / ".git"
     assert os.path.exists(repo_path)
 
 
-def test_build_llamacpp():
-    building_llamacpp(root_path)
-    bin_path = root_path / "llama.cpp" / "quantize"
-    assert os.path.exists(bin_path)
+@pytest.mark.parametrize("asset_pattern", asset_pattern_list)
+def test_get_asset_download_url(asset_pattern):
+    url = get_asset_download_url(asset_pattern, 'ggerganov', 'llama.cpp')
+    response = requests.get(url, stream=True)
+    assert response.status_code == 200
+
+
+def test_download_release_asset():
+    asset_pattern = '-ubuntu-x64'
+    url = get_asset_download_url(asset_pattern, 'ggerganov', 'llama.cpp')
+    download_release_asset(url, root_path)
+    assert os.path.exists(root_path / 'build' / 'bin' / 'quantize')
+    assert os.path.exists(root_path / 'build' / 'bin' / 'main')
+
+
+def test_repo_id_resolver():
+    repo_id_ = repo_id_resolver(repo_url)
+    assert repo_id == repo_id_
 
 
 def test_fetch_model_repo():
-    fetch_model_repo("meta-llama/Llama-2-7b-chat", root_path)
-    model_dir_path = root_path / "llama.cpp" / "models" / "Llama-2-7b-chat"
-    assert os.path.exists(model_dir_path)
+    local_dir = fetch_model_repo(repo_id, root_path / 'models')
+    assert os.path.exists(local_dir)
 
 
 def test_quantize_model():
-    model_dir_path = root_path / "llama.cpp" / "models" / "Llama-2-7b-chat"
-    quantize_model(
-        model_dir_path, "Q3_K_M", root_path, output_dir=model_dir_path.parent
-    )
-    gguf_file_path = model_dir_path / "ggml-model-Q3_K_M.gguf"
+    model_dir_path = root_path / "models" / model
+    # output_dir = root_path / "models" / "Llama-2-7b-chat"
+    quantize_model(model_dir_path, quantization, root_path, model_dir_path)
+    gguf_file_path = model_dir_path / f"ggml-model-{quantization}.gguf"
     assert os.path.exists(gguf_file_path)
+
+
+def test_inference_quantized_model():
+    quantized_model_file = root_path / 'models' / model / f'ggml-model-{quantization}.gguf'
+    res = inference_quantized_model(root_path, quantized_model_file)
+    assert isinstance(res.stdout, str)
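Reviewer note: for anyone evaluating this change, the interactive flow in src/grag/quantize/quantize.py reduces to the sequence sketched below. This is a minimal, non-interactive sketch against the utility signatures introduced in this diff; the '-ubuntu-x64' asset pattern and the Llama-2-7b-chat URL are illustrative placeholders borrowed from the tests, not project defaults.

# Minimal sketch of the quantization flow using the utilities added in this diff.
# Assumptions (illustrative only): a Linux x86_64 host ('-ubuntu-x64' release asset)
# and an example HuggingFace repo URL; substitute values for your environment.
from pathlib import Path

from grag.quantize.utils import (
    download_release_asset,
    fetch_model_repo,
    get_asset_download_url,
    get_llamacpp_repo,
    inference_quantized_model,
    quantize_model,
    repo_id_resolver,
)

root = Path("./grag-quantize")

# Fetch the llama.cpp sources and a prebuilt release binary for this platform.
get_llamacpp_repo(destination_folder=root)
url = get_asset_download_url("-ubuntu-x64")
if url:
    download_release_asset(url, root)

# Resolve a repo id from a URL and download the model files.
repo_id = repo_id_resolver("https://huggingface.co/meta-llama/Llama-2-7b-chat")
model_dir = fetch_model_repo(repo_id, root / "models")

# Convert, quantize, and optionally sanity-check with a CPU inference run.
target_path, quantized_file = quantize_model(model_dir, "Q5_K_M", root, output_dir=model_dir)
inference_quantized_model(target_path, quantized_file)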