improvement: add --force-download for resumable model download
wansatya committed Oct 21, 2024
1 parent 85405f0 commit 1bd0dc8
Showing 2 changed files with 117 additions and 61 deletions.
16 changes: 13 additions & 3 deletions README.md
@@ -101,6 +101,8 @@ ggufy run -h
- `-t`, `--max-tokens`: Maximum number of tokens to generate (default: 200)
- `--cpu`: Force CPU usage even if GPU is available
- `--stream`: Enable streaming output for real-time text generation
- `--gpu-layers`: Number of layers to offload to GPU (default: all)
- `--force-download`: Force re-download of the model even if it exists in cache

### 4. Examples

@@ -173,7 +175,15 @@ By using the `--stream` flag, you can enable real-time streaming of the generate

<br/>

## 9. Troubleshooting
## 9. Checksum Verification and Force Download

GGUFy includes checksum verification to help ensure the integrity of downloaded model files. After each download, the script compares the file's hash against the expected hash reported by the server (via the `ETag` header), when one is available.

If you run into problems with a model file, such as corruption or an incomplete download, pass the `--force-download` flag to discard the cached copy and re-download the entire file:
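
A minimal example (the model path below is a placeholder; use the same `ggufy run` model path format shown in the Examples section above):

```bash
# Ignore any cached or partial copy and download the model from scratch
ggufy run <username>/<repo>/latest --force-download
```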

<br/>

## 10. Troubleshooting

1. If `ggufy` command is not found, make sure you've restarted your terminal or sourced your shell configuration file after running the setup script.

@@ -195,13 +205,13 @@ By using the `--stream` flag, you can enable real-time streaming of the generate

<br/>

## 10. Contributing
## 11. Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

<br/>

## 11. License
## 12. License

MIT License

Expand Down
162 changes: 104 additions & 58 deletions ggufy.py
@@ -63,7 +63,29 @@ def find_latest_gguf(username, repo, token):
print(f"Latest GGUF file found: {latest_file}\n")
return latest_file

def download_model(model_path, token):
def get_cached_model_path(username, repo, gguf_file):
# Create a unique filename based on the model path
model_id = hashlib.md5(f"{username}/{repo}/{gguf_file}".encode()).hexdigest()
cached_path = os.path.join(CACHE_DIR, f"ggufy-{model_id}.gguf")

# Save metadata
metadata = {
"repo_name": f"{username}/{repo}",
"file_name": gguf_file
}
with open(f"{cached_path}.json", "w") as f:
json.dump(metadata, f)

return cached_path

def get_file_hash(file_path):
hasher = hashlib.md5()
with open(file_path, 'rb') as f:
for chunk in iter(lambda: f.read(4096), b""):
hasher.update(chunk)
return hasher.hexdigest()

def download_model(model_path, token, force_download=False):
username, repo, file_name = parse_model_path(model_path)

if file_name == 'latest':
@@ -73,70 +95,100 @@ def download_model(model_path, token):

cached_path = get_cached_model_path(username, repo, gguf_file)

if os.path.exists(cached_path):
print(f"Using cached model: {cached_path}\n")
if os.path.exists(cached_path) and not force_download:
print(f"Using cached model: {cached_path}")
return cached_path, gguf_file

model_url = f"https://huggingface.co/{username}/{repo}/resolve/main/{gguf_file}"

print(f"Downloading model: {gguf_file}\n")
response = requests.get(model_url, stream=True, headers=get_headers(token))
print(f"Downloading model: {gguf_file}")

# Check if partial download exists
file_mode = 'ab' if os.path.exists(cached_path) and not force_download else 'wb'
initial_pos = os.path.getsize(cached_path) if os.path.exists(cached_path) and not force_download else 0

headers = get_headers(token)
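# Resume note: when a partial file is already on disk, only the missing byte range is
# requested so the download continues where it left off (assumes the server honors HTTP Range requests).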
if initial_pos > 0:
headers['Range'] = f'bytes={initial_pos}-'

response = requests.get(model_url, stream=True, headers=headers)

if response.status_code == 404:
raise ValueError(f"GGUF file '{gguf_file}' not found in {username}/{repo}")
response.raise_for_status()

total_size = int(response.headers.get('content-length', 0))
total_size = int(response.headers.get('content-length', 0)) + initial_pos
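# On a resumed (ranged) request, Content-Length covers only the remaining bytes,
# so the bytes already on disk are added back to give the progress bar an accurate total.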

os.makedirs(CACHE_DIR, exist_ok=True)
with open(cached_path, 'wb') as file:
progress_bar = tqdm(total=total_size, unit='iB', unit_scale=True)
with open(cached_path, file_mode) as file, tqdm(
desc=gguf_file,
initial=initial_pos,
total=total_size,
unit='iB',
unit_scale=True,
unit_divisor=1024,
) as progress_bar:
for chunk in response.iter_content(chunk_size=8192):
size = file.write(chunk)
progress_bar.update(size)
progress_bar.close()

print(f"Model downloaded and cached: {cached_path}\n")
print(f"Model downloaded and cached: {cached_path}")

# Verify the download
print("Verifying download...")
expected_hash = response.headers.get('ETag', '').strip('"')
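# The ETag header is treated as an MD5 digest of the file; for files served via Git LFS
# or a CDN the ETag may not be a plain content MD5, so this check is best-effort.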
if expected_hash:
actual_hash = get_file_hash(cached_path)
if actual_hash != expected_hash:
print("Warning: Downloaded file hash does not match expected hash.")
print("The file may be corrupted. You might want to try downloading again with --force-download.")
else:
print("Warning: Unable to verify file integrity. ETag not provided by server.")

return cached_path, gguf_file

def animated_loading():
chars = [' ', '. ', '.. ', '... ']
while getattr(threading.current_thread(), "do_run", True):
for char in chars:
sys.stdout.write('\r' + f"Generating {char}")
sys.stdout.flush()
time.sleep(0.8)
sys.stdout.write('\r' + ' ' * 20 + '\r')
sys.stdout.flush()

def run_gguf_model(model_path, context, max_tokens, token, force_cpu=False, stream=False):
def run_gguf_model(model_path, context, max_tokens, token, force_cpu=False, stream=False, n_gpu_layers=None, force_download=False):
try:
print("\nInitializing GGUFy...\n")
model_file, gguf_file = download_model(model_path, token)
print(f"Model file: {model_file}\n")
print("Initializing GGUFY Runner...")
model_file, gguf_file = download_model(model_path, token, force_download)
print(f"Model file: {model_file}")

print("Loading model into memory...")

print("Loading model into memory...\n")

# Check for GPU availability
gpu_layers = 0
if not force_cpu:
try:
from llama_cpp import llama_cpp
gpu_layers = llama_cpp.llama_n_gpu_layers
print(f"GPU acceleration is available. Using {gpu_layers} GPU layers.\n")
if n_gpu_layers is None:
n_gpu_layers = llama_cpp.llama_n_gpu_layers(model_file)
gpu_layers = n_gpu_layers
print(f"GPU acceleration is available. Using {gpu_layers} GPU layers.")
except AttributeError:
print("GPU acceleration is not available. Using CPU.\n")
print("GPU acceleration is not available. Using CPU.")
else:
print("Forced CPU usage. GPU will not be used even if available.\n")

llm = Llama(model_path=model_file, n_ctx=context, n_gpu_layers=gpu_layers)
print("\nModel loaded successfully.\n")
print("Forced CPU usage. GPU will not be used even if available.")

try:
llm = Llama(model_path=model_file, n_ctx=context, n_gpu_layers=gpu_layers)
print("Model loaded successfully.")
except RuntimeError as e:
if "tensor" in str(e) and "data is not within the file bounds" in str(e):
print("Error: The model file appears to be corrupted or incomplete.")
print("Try running the command again with the --force-download flag to re-download the model.")
return
else:
raise

while True:
prompt = input("Any questions? (or 'quit' to exit): ").strip()
prompt = input("Enter your prompt (or 'quit' to exit): ").strip()
if prompt.lower() == 'quit':
break

print(f"Generating text with prompt: '{prompt}'")

if stream:
print("\nGenerated text:")
for chunk in llm(prompt, max_tokens=max_tokens, stream=True):
print(chunk['choices'][0]['text'], end='', flush=True)
print("\n")
@@ -153,21 +205,31 @@ def run_gguf_model(model_path, context, max_tokens, token, force_cpu=False, stre
loading_thread.do_run = False
loading_thread.join()

print("\n")
print("\nGenerated text:")
print(output['choices'][0]['text'])

print("\n" + "-"*50 + "\n")

except Exception as e:
print(f"An error occurred: {e}")

def animated_loading():
chars = [' ', '. ', '.. ', '... ']
while getattr(threading.current_thread(), "do_run", True):
for char in chars:
sys.stdout.write('\r' + f"Generating {char}")
sys.stdout.flush()
time.sleep(0.8)
sys.stdout.write('\r' + ' ' * 20 + '\r')
sys.stdout.flush()

def remove_ggufy():
print("Removing GGUFy and all related files...\n")

# Remove configuration directory
if os.path.exists(CONFIG_DIR):
shutil.rmtree(CONFIG_DIR)
print(f"Removed configuration directory: {CONFIG_DIR}")
print(f"Removed configuration directory: {CONFIG_DIR}\n")

# Remove cache directory
if os.path.exists(CACHE_DIR):
@@ -208,26 +270,6 @@ def list_cached_models():
print(f"- {repo_name}: {file_name}")
except FileNotFoundError:
print(f"- Unknown: {filename}")

def get_cached_model_path(username, repo, gguf_file):
# Create a unique filename based on the model path
model_id = hashlib.md5(f"{username}/{repo}/{gguf_file}".encode()).hexdigest()
cached_path = os.path.join(CACHE_DIR, f"ggufy-{model_id}.gguf")

# Save metadata
metadata = {
"repo_name": f"{username}/{repo}",
"file_name": gguf_file
}
with open(f"{cached_path}.json", "w") as f:
json.dump(metadata, f)

return cached_path

""" def get_cached_model_path(username, repo, gguf_file):
# Create a unique filename based on the model path
model_id = hashlib.md5(f"{username}/{repo}/{gguf_file}".encode()).hexdigest()
return os.path.join(CACHE_DIR, f"ggufy-{model_id}.gguf") """

def main():
parser = argparse.ArgumentParser(description="Run GGUF models from Hugging Face Hub")
@@ -243,6 +285,8 @@ def main():
run_parser.add_argument("-t", "--max-tokens", type=int, default=200, help="Maximum number of tokens to generate")
run_parser.add_argument("--cpu", action="store_true", help="Force CPU usage even if GPU is available")
run_parser.add_argument("--stream", action="store_true", help="Enable streaming output")
run_parser.add_argument("--gpu-layers", type=int, help="Number of layers to offload to GPU (default: all)")
run_parser.add_argument("--force-download", action="store_true", help="Force re-download of the model even if it exists in cache")

# List command
list_parser = subparsers.add_parser("list", help="List cached models")
@@ -260,7 +304,9 @@ def main():
print("No API token found. Please run 'ggufy login' first.\n")
sys.exit(1)
try:
run_gguf_model(args.model_path, args.context, args.max_tokens, token, force_cpu=args.cpu, stream=args.stream)
run_gguf_model(args.model_path, args.context, args.max_tokens, token,
force_cpu=args.cpu, stream=args.stream, n_gpu_layers=args.gpu_layers,
force_download=args.force_download)
except Exception as e:
print(f"Error: {e}")
sys.exit(1)
