diff --git a/README.md b/README.md
index cada9e4..63b56a2 100644
--- a/README.md
+++ b/README.md
@@ -101,6 +101,8 @@ ggufy run -h
- `-t`, `--max-tokens`: Maximum number of tokens to generate (default: 200)
- `--cpu`: Force CPU usage even if GPU is available
- `--stream`: Enable streaming output for real-time text generation
+- `--gpu-layers`: Number of layers to offload to GPU (default: all)
+- `--force-download`: Force re-download of the model even if it exists in cache
### 4. Examples
@@ -173,7 +175,15 @@ By using the `--stream` flag, you can enable real-time streaming of the generate
-## 9. Troubleshooting
+## 9. Checksum Verification and Force Download
+
+GGUFy includes checksum verification to help ensure the integrity of downloaded model files. After each download, it compares the file's hash against the hash reported by the server (when one is provided) and prints a warning if they do not match.
+
+If you run into problems with a model file, such as a corrupted or incomplete download, you can use the `--force-download` flag to re-download the entire file:
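+
+```bash
+ggufy run <model-path> --force-download
+```
+
+Replace `<model-path>` with the same model path you normally pass to `ggufy run`.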
+
+
+
+## 10. Troubleshooting
1. If `ggufy` command is not found, make sure you've restarted your terminal or sourced your shell configuration file after running the setup script.
@@ -195,13 +205,13 @@ By using the `--stream` flag, you can enable real-time streaming of the generate
-## 10. Contributing
+## 11. Contributing
Contributions are welcome! Please feel free to submit a Pull Request.
-## 11. License
+## 12. License
MIT License
diff --git a/ggufy.py b/ggufy.py
index 88c146d..b53a52d 100644
--- a/ggufy.py
+++ b/ggufy.py
@@ -63,7 +63,29 @@ def find_latest_gguf(username, repo, token):
print(f"Latest GGUF file found: {latest_file}\n")
return latest_file
-def download_model(model_path, token):
+def get_cached_model_path(username, repo, gguf_file):
+    # Create a unique, filesystem-safe filename based on the model path
+    model_id = hashlib.md5(f"{username}/{repo}/{gguf_file}".encode()).hexdigest()
+    cached_path = os.path.join(CACHE_DIR, f"ggufy-{model_id}.gguf")
+
+    # Ensure the cache directory exists before writing metadata next to the model file
+    os.makedirs(CACHE_DIR, exist_ok=True)
+
+    # Save metadata so `ggufy list` can map the hashed filename back to the repo
+    metadata = {
+        "repo_name": f"{username}/{repo}",
+        "file_name": gguf_file
+    }
+    with open(f"{cached_path}.json", "w") as f:
+        json.dump(metadata, f)
+
+    return cached_path
+
+def get_file_hash(file_path):
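+    # Hash the file in 4 KB chunks so multi-gigabyte models are never loaded into memory at once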
+ hasher = hashlib.md5()
+ with open(file_path, 'rb') as f:
+ for chunk in iter(lambda: f.read(4096), b""):
+ hasher.update(chunk)
+ return hasher.hexdigest()
+
+def download_model(model_path, token, force_download=False):
username, repo, file_name = parse_model_path(model_path)
if file_name == 'latest':
@@ -73,70 +95,100 @@ def download_model(model_path, token):
cached_path = get_cached_model_path(username, repo, gguf_file)
- if os.path.exists(cached_path):
- print(f"Using cached model: {cached_path}\n")
+ if os.path.exists(cached_path) and not force_download:
+ print(f"Using cached model: {cached_path}")
return cached_path, gguf_file
model_url = f"https://huggingface.co/{username}/{repo}/resolve/main/{gguf_file}"
- print(f"Downloading model: {gguf_file}\n")
- response = requests.get(model_url, stream=True, headers=get_headers(token))
+ print(f"Downloading model: {gguf_file}")
+
+    # Resume a partial download unless the caller asked for a fresh copy
+    resume = os.path.exists(cached_path) and not force_download
+    file_mode = 'ab' if resume else 'wb'
+    initial_pos = os.path.getsize(cached_path) if resume else 0
+
+ headers = get_headers(token)
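+    # Request only the bytes we are missing when resuming a partial download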
+ if initial_pos > 0:
+ headers['Range'] = f'bytes={initial_pos}-'
+
+ response = requests.get(model_url, stream=True, headers=headers)
+
if response.status_code == 404:
raise ValueError(f"GGUF file '{gguf_file}' not found in {username}/{repo}")
response.raise_for_status()
- total_size = int(response.headers.get('content-length', 0))
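+    # For a Range request, Content-Length covers only the remaining bytes, so add what is already on disk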
+ total_size = int(response.headers.get('content-length', 0)) + initial_pos
os.makedirs(CACHE_DIR, exist_ok=True)
- with open(cached_path, 'wb') as file:
- progress_bar = tqdm(total=total_size, unit='iB', unit_scale=True)
+ with open(cached_path, file_mode) as file, tqdm(
+ desc=gguf_file,
+ initial=initial_pos,
+ total=total_size,
+ unit='iB',
+ unit_scale=True,
+ unit_divisor=1024,
+ ) as progress_bar:
for chunk in response.iter_content(chunk_size=8192):
size = file.write(chunk)
progress_bar.update(size)
- progress_bar.close()
- print(f"Model downloaded and cached: {cached_path}\n")
+ print(f"Model downloaded and cached: {cached_path}")
+
+ # Verify the download
+ print("Verifying download...")
+ expected_hash = response.headers.get('ETag', '').strip('"')
+ if expected_hash:
+ actual_hash = get_file_hash(cached_path)
+ if actual_hash != expected_hash:
+ print("Warning: Downloaded file hash does not match expected hash.")
+ print("The file may be corrupted. You might want to try downloading again with --force-download.")
+ else:
+ print("Warning: Unable to verify file integrity. ETag not provided by server.")
+
return cached_path, gguf_file
-def animated_loading():
- chars = [' ', '. ', '.. ', '... ']
- while getattr(threading.current_thread(), "do_run", True):
- for char in chars:
- sys.stdout.write('\r' + f"Generating {char}")
- sys.stdout.flush()
- time.sleep(0.8)
- sys.stdout.write('\r' + ' ' * 20 + '\r')
- sys.stdout.flush()
-
-def run_gguf_model(model_path, context, max_tokens, token, force_cpu=False, stream=False):
+def run_gguf_model(model_path, context, max_tokens, token, force_cpu=False, stream=False, n_gpu_layers=None, force_download=False):
try:
- print("\nInitializing GGUFy...\n")
- model_file, gguf_file = download_model(model_path, token)
- print(f"Model file: {model_file}\n")
+ print("Initializing GGUFY Runner...")
+ model_file, gguf_file = download_model(model_path, token, force_download)
+ print(f"Model file: {model_file}")
+
+ print("Loading model into memory...")
- print("Loading model into memory...\n")
-
# Check for GPU availability
gpu_layers = 0
if not force_cpu:
try:
from llama_cpp import llama_cpp
- gpu_layers = llama_cpp.llama_n_gpu_layers
- print(f"GPU acceleration is available. Using {gpu_layers} GPU layers.\n")
+                if n_gpu_layers is None:
+                    n_gpu_layers = -1  # -1 asks llama-cpp-python to offload every layer
+                # Older llama.cpp builds lack llama_supports_gpu_offload(); the resulting
+                # AttributeError is caught below and we fall back to the CPU message
+                if not llama_cpp.llama_supports_gpu_offload():
+                    raise AttributeError("llama.cpp was built without GPU support")
+                gpu_layers = n_gpu_layers
+                print(f"GPU acceleration is available. Using {'all' if gpu_layers == -1 else gpu_layers} GPU layers.")
except AttributeError:
- print("GPU acceleration is not available. Using CPU.\n")
+ print("GPU acceleration is not available. Using CPU.")
else:
- print("Forced CPU usage. GPU will not be used even if available.\n")
-
- llm = Llama(model_path=model_file, n_ctx=context, n_gpu_layers=gpu_layers)
- print("\nModel loaded successfully.\n")
+ print("Forced CPU usage. GPU will not be used even if available.")
+
+ try:
+ llm = Llama(model_path=model_file, n_ctx=context, n_gpu_layers=gpu_layers)
+ print("Model loaded successfully.")
+ except RuntimeError as e:
+ if "tensor" in str(e) and "data is not within the file bounds" in str(e):
+ print("Error: The model file appears to be corrupted or incomplete.")
+ print("Try running the command again with the --force-download flag to re-download the model.")
+ return
+ else:
+ raise
while True:
- prompt = input("Any questions? (or 'quit' to exit): ").strip()
+ prompt = input("Enter your prompt (or 'quit' to exit): ").strip()
if prompt.lower() == 'quit':
break
+ print(f"Generating text with prompt: '{prompt}'")
+
if stream:
+ print("\nGenerated text:")
for chunk in llm(prompt, max_tokens=max_tokens, stream=True):
print(chunk['choices'][0]['text'], end='', flush=True)
print("\n")
@@ -153,21 +205,31 @@ def run_gguf_model(model_path, context, max_tokens, token, force_cpu=False, stre
loading_thread.do_run = False
loading_thread.join()
- print("\n")
+ print("\nGenerated text:")
print(output['choices'][0]['text'])
-
+
print("\n" + "-"*50 + "\n")
except Exception as e:
print(f"An error occurred: {e}")
+def animated_loading():
+ chars = [' ', '. ', '.. ', '... ']
+ while getattr(threading.current_thread(), "do_run", True):
+ for char in chars:
+ sys.stdout.write('\r' + f"Generating {char}")
+ sys.stdout.flush()
+ time.sleep(0.8)
+ sys.stdout.write('\r' + ' ' * 20 + '\r')
+ sys.stdout.flush()
+
def remove_ggufy():
print("Removing GGUFy and all related files...\n")
# Remove configuration directory
if os.path.exists(CONFIG_DIR):
shutil.rmtree(CONFIG_DIR)
- print(f"Removed configuration directory: {CONFIG_DIR}")
+ print(f"Removed configuration directory: {CONFIG_DIR}\n")
# Remove cache directory
if os.path.exists(CACHE_DIR):
@@ -208,26 +270,6 @@ def list_cached_models():
print(f"- {repo_name}: {file_name}")
except FileNotFoundError:
print(f"- Unknown: {filename}")
-
-def get_cached_model_path(username, repo, gguf_file):
- # Create a unique filename based on the model path
- model_id = hashlib.md5(f"{username}/{repo}/{gguf_file}".encode()).hexdigest()
- cached_path = os.path.join(CACHE_DIR, f"ggufy-{model_id}.gguf")
-
- # Save metadata
- metadata = {
- "repo_name": f"{username}/{repo}",
- "file_name": gguf_file
- }
- with open(f"{cached_path}.json", "w") as f:
- json.dump(metadata, f)
-
- return cached_path
-
-""" def get_cached_model_path(username, repo, gguf_file):
- # Create a unique filename based on the model path
- model_id = hashlib.md5(f"{username}/{repo}/{gguf_file}".encode()).hexdigest()
- return os.path.join(CACHE_DIR, f"ggufy-{model_id}.gguf") """
def main():
parser = argparse.ArgumentParser(description="Run GGUF models from Hugging Face Hub")
@@ -243,6 +285,8 @@ def main():
run_parser.add_argument("-t", "--max-tokens", type=int, default=200, help="Maximum number of tokens to generate")
run_parser.add_argument("--cpu", action="store_true", help="Force CPU usage even if GPU is available")
run_parser.add_argument("--stream", action="store_true", help="Enable streaming output")
+ run_parser.add_argument("--gpu-layers", type=int, help="Number of layers to offload to GPU (default: all)")
+ run_parser.add_argument("--force-download", action="store_true", help="Force re-download of the model even if it exists in cache")
# List command
list_parser = subparsers.add_parser("list", help="List cached models")
@@ -260,7 +304,9 @@ def main():
print("No API token found. Please run 'ggufy login' first.\n")
sys.exit(1)
try:
- run_gguf_model(args.model_path, args.context, args.max_tokens, token, force_cpu=args.cpu, stream=args.stream)
+ run_gguf_model(args.model_path, args.context, args.max_tokens, token,
+ force_cpu=args.cpu, stream=args.stream, n_gpu_layers=args.gpu_layers,
+ force_download=args.force_download)
except Exception as e:
print(f"Error: {e}")
sys.exit(1)