diff --git a/README.md b/README.md
index cada9e4..63b56a2 100644
--- a/README.md
+++ b/README.md
@@ -101,6 +101,8 @@ ggufy run -h
 - `-t`, `--max-tokens`: Maximum number of tokens to generate (default: 200)
 - `--cpu`: Force CPU usage even if GPU is available
 - `--stream`: Enable streaming output for real-time text generation
+- `--gpu-layers`: Number of layers to offload to GPU (default: all)
+- `--force-download`: Force re-download of the model even if it exists in cache
 
 ### 4. 4. Examples
 
@@ -173,7 +175,15 @@ By using the `--stream` flag, you can enable real-time streaming of the generate
 
-## 9. Troubleshooting
+## 9. Checksum Verification and Force Download
+
+GGUFy includes checksum verification to ensure the integrity of downloaded model files. After each download, the script compares the file's MD5 hash with the ETag reported by the server, when one is provided.
+
+If you encounter issues with a model file, such as corruption or an incomplete download, you can use the `--force-download` flag to re-download the entire file:
+
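+For example, to discard the cached copy and fetch the model again from scratch (the model path below is a placeholder; use the same format shown in the Examples section above):
+
+```bash
+ggufy run <model-path> --force-download
+```
+
+Without `--force-download`, an interrupted download resumes from where it left off the next time you run the same command.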
+
+## 10. Troubleshooting
 
 1. If `ggufy` command is not found, make sure you've restarted your terminal or sourced your shell configuration file after running the setup script.
 
@@ -195,13 +205,13 @@ By using the `--stream` flag, you can enable real-time streaming of the generate
-## 10. Contributing
+## 11. Contributing
 
 Contributions are welcome! Please feel free to submit a Pull Request.
 
-## 11. License
+## 12. License
 
 MIT License
diff --git a/ggufy.py b/ggufy.py
index 88c146d..b53a52d 100644
--- a/ggufy.py
+++ b/ggufy.py
@@ -63,7 +63,29 @@ def find_latest_gguf(username, repo, token):
     print(f"Latest GGUF file found: {latest_file}\n")
     return latest_file
 
-def download_model(model_path, token):
+def get_cached_model_path(username, repo, gguf_file):
+    # Create a unique filename based on the model path
+    model_id = hashlib.md5(f"{username}/{repo}/{gguf_file}".encode()).hexdigest()
+    cached_path = os.path.join(CACHE_DIR, f"ggufy-{model_id}.gguf")
+
+    # Save metadata
+    metadata = {
+        "repo_name": f"{username}/{repo}",
+        "file_name": gguf_file
+    }
+    with open(f"{cached_path}.json", "w") as f:
+        json.dump(metadata, f)
+
+    return cached_path
+
+def get_file_hash(file_path):
+    hasher = hashlib.md5()
+    with open(file_path, 'rb') as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            hasher.update(chunk)
+    return hasher.hexdigest()
+
+def download_model(model_path, token, force_download=False):
     username, repo, file_name = parse_model_path(model_path)
 
     if file_name == 'latest':
@@ -73,70 +95,100 @@ def download_model(model_path, token):
 
     cached_path = get_cached_model_path(username, repo, gguf_file)
 
-    if os.path.exists(cached_path):
-        print(f"Using cached model: {cached_path}\n")
+    if os.path.exists(cached_path) and not force_download:
+        print(f"Using cached model: {cached_path}")
         return cached_path, gguf_file
 
     model_url = f"https://huggingface.co/{username}/{repo}/resolve/main/{gguf_file}"
-    print(f"Downloading model: {gguf_file}\n")
-    response = requests.get(model_url, stream=True, headers=get_headers(token))
+    print(f"Downloading model: {gguf_file}")
+
+    # Check if partial download exists
+    file_mode = 'ab' if os.path.exists(cached_path) and not force_download else 'wb'
+    initial_pos = os.path.getsize(cached_path) if os.path.exists(cached_path) and not force_download else 0
+
+    headers = get_headers(token)
+    if initial_pos > 0:
+        headers['Range'] = f'bytes={initial_pos}-'
+
+    response = requests.get(model_url, stream=True, headers=headers)
+
     if response.status_code == 404:
         raise ValueError(f"GGUF file '{gguf_file}' not found in {username}/{repo}")
     response.raise_for_status()
 
-    total_size = int(response.headers.get('content-length', 0))
+    total_size = int(response.headers.get('content-length', 0)) + initial_pos
 
     os.makedirs(CACHE_DIR, exist_ok=True)
-    with open(cached_path, 'wb') as file:
-        progress_bar = tqdm(total=total_size, unit='iB', unit_scale=True)
+    with open(cached_path, file_mode) as file, tqdm(
+        desc=gguf_file,
+        initial=initial_pos,
+        total=total_size,
+        unit='iB',
+        unit_scale=True,
+        unit_divisor=1024,
+    ) as progress_bar:
         for chunk in response.iter_content(chunk_size=8192):
             size = file.write(chunk)
             progress_bar.update(size)
-    progress_bar.close()
-    print(f"Model downloaded and cached: {cached_path}\n")
+    print(f"Model downloaded and cached: {cached_path}")
+
+    # Verify the download
+    print("Verifying download...")
+    expected_hash = response.headers.get('ETag', '').strip('"')
+    if expected_hash:
+        actual_hash = get_file_hash(cached_path)
+        if actual_hash != expected_hash:
+            print("Warning: Downloaded file hash does not match expected hash.")
+            print("The file may be corrupted. You might want to try downloading again with --force-download.")
+    else:
+        print("Warning: Unable to verify file integrity. ETag not provided by server.")
+
     return cached_path, gguf_file
 
-def animated_loading():
-    chars = [' ', '. ', '.. ', '... ']
-    while getattr(threading.current_thread(), "do_run", True):
-        for char in chars:
-            sys.stdout.write('\r' + f"Generating {char}")
-            sys.stdout.flush()
-            time.sleep(0.8)
-    sys.stdout.write('\r' + ' ' * 20 + '\r')
-    sys.stdout.flush()
-
-def run_gguf_model(model_path, context, max_tokens, token, force_cpu=False, stream=False):
+def run_gguf_model(model_path, context, max_tokens, token, force_cpu=False, stream=False, n_gpu_layers=None, force_download=False):
     try:
-        print("\nInitializing GGUFy...\n")
-        model_file, gguf_file = download_model(model_path, token)
-        print(f"Model file: {model_file}\n")
+        print("Initializing GGUFy Runner...")
+        model_file, gguf_file = download_model(model_path, token, force_download)
+        print(f"Model file: {model_file}")
+
+        print("Loading model into memory...")
 
-        print("Loading model into memory...\n")
-        # Check for GPU availability
         gpu_layers = 0
         if not force_cpu:
             try:
                 from llama_cpp import llama_cpp
-                gpu_layers = llama_cpp.llama_n_gpu_layers
-                print(f"GPU acceleration is available. Using {gpu_layers} GPU layers.\n")
+                if n_gpu_layers is None:
+                    # Offload all layers by default; -1 means "all" in llama-cpp-python
+                    n_gpu_layers = -1
+                gpu_layers = n_gpu_layers
+                print(f"GPU acceleration is available. Using {'all' if gpu_layers == -1 else gpu_layers} GPU layers.")
             except AttributeError:
-                print("GPU acceleration is not available. Using CPU.\n")
+                print("GPU acceleration is not available. Using CPU.")
         else:
-            print("Forced CPU usage. GPU will not be used even if available.\n")
-
-        llm = Llama(model_path=model_file, n_ctx=context, n_gpu_layers=gpu_layers)
-        print("\nModel loaded successfully.\n")
+            print("Forced CPU usage. GPU will not be used even if available.")
+
+        try:
+            llm = Llama(model_path=model_file, n_ctx=context, n_gpu_layers=gpu_layers)
+            print("Model loaded successfully.")
+        except RuntimeError as e:
+            if "tensor" in str(e) and "data is not within the file bounds" in str(e):
+                print("Error: The model file appears to be corrupted or incomplete.")
+                print("Try running the command again with the --force-download flag to re-download the model.")
+                return
+            else:
+                raise
 
         while True:
-            prompt = input("Any questions? (or 'quit' to exit): ").strip()
+            prompt = input("Enter your prompt (or 'quit' to exit): ").strip()
             if prompt.lower() == 'quit':
                 break
+
+            print(f"Generating text with prompt: '{prompt}'")
+
             if stream:
+                print("\nGenerated text:")
                 for chunk in llm(prompt, max_tokens=max_tokens, stream=True):
                     print(chunk['choices'][0]['text'], end='', flush=True)
                 print("\n")
@@ -153,21 +205,31 @@ def run_gguf_model(model_path, context, max_tokens, token, force_cpu=False, stre
                 loading_thread.do_run = False
                 loading_thread.join()
 
-            print("\n")
+            print("\nGenerated text:")
             print(output['choices'][0]['text'])
-
+            print("\n" + "-"*50 + "\n")
 
     except Exception as e:
         print(f"An error occurred: {e}")
 
+def animated_loading():
+    chars = [' ', '. ', '.. ', '... ']
+    while getattr(threading.current_thread(), "do_run", True):
+        for char in chars:
+            sys.stdout.write('\r' + f"Generating {char}")
+            sys.stdout.flush()
+            time.sleep(0.8)
+    sys.stdout.write('\r' + ' ' * 20 + '\r')
+    sys.stdout.flush()
+
 def remove_ggufy():
     print("Removing GGUFy and all related files...\n")
 
     # Remove configuration directory
     if os.path.exists(CONFIG_DIR):
         shutil.rmtree(CONFIG_DIR)
-        print(f"Removed configuration directory: {CONFIG_DIR}")
+        print(f"Removed configuration directory: {CONFIG_DIR}\n")
 
     # Remove cache directory
     if os.path.exists(CACHE_DIR):
@@ -208,26 +270,6 @@ def list_cached_models():
             print(f"- {repo_name}: {file_name}")
         except FileNotFoundError:
             print(f"- Unknown: {filename}")
-
-def get_cached_model_path(username, repo, gguf_file):
-    # Create a unique filename based on the model path
-    model_id = hashlib.md5(f"{username}/{repo}/{gguf_file}".encode()).hexdigest()
-    cached_path = os.path.join(CACHE_DIR, f"ggufy-{model_id}.gguf")
-
-    # Save metadata
-    metadata = {
-        "repo_name": f"{username}/{repo}",
-        "file_name": gguf_file
-    }
-    with open(f"{cached_path}.json", "w") as f:
-        json.dump(metadata, f)
-
-    return cached_path
-
-""" def get_cached_model_path(username, repo, gguf_file):
-    # Create a unique filename based on the model path
-    model_id = hashlib.md5(f"{username}/{repo}/{gguf_file}".encode()).hexdigest()
-    return os.path.join(CACHE_DIR, f"ggufy-{model_id}.gguf") """
 
 def main():
     parser = argparse.ArgumentParser(description="Run GGUF models from Hugging Face Hub")
@@ -243,6 +285,8 @@ def main():
     run_parser.add_argument("-t", "--max-tokens", type=int, default=200, help="Maximum number of tokens to generate")
     run_parser.add_argument("--cpu", action="store_true", help="Force CPU usage even if GPU is available")
     run_parser.add_argument("--stream", action="store_true", help="Enable streaming output")
+    run_parser.add_argument("--gpu-layers", type=int, help="Number of layers to offload to GPU (default: all)")
+    run_parser.add_argument("--force-download", action="store_true", help="Force re-download of the model even if it exists in cache")
 
     # List command
     list_parser = subparsers.add_parser("list", help="List cached models")
@@ -260,7 +304,9 @@ def main():
         print("No API token found. Please run 'ggufy login' first.\n")
         sys.exit(1)
     try:
-        run_gguf_model(args.model_path, args.context, args.max_tokens, token, force_cpu=args.cpu, stream=args.stream)
+        run_gguf_model(args.model_path, args.context, args.max_tokens, token,
+                       force_cpu=args.cpu, stream=args.stream, n_gpu_layers=args.gpu_layers,
+                       force_download=args.force_download)
     except Exception as e:
         print(f"Error: {e}")
         sys.exit(1)