improvement: add --force-download for resumable model download
wansatya committed Oct 21, 2024
1 parent 85405f0 commit 1bd0dc8
Showing 2 changed files with 117 additions and 61 deletions.
16 changes: 13 additions & 3 deletions README.md
@@ -101,6 +101,8 @@ ggufy run -h
- `-t`, `--max-tokens`: Maximum number of tokens to generate (default: 200)
- `--cpu`: Force CPU usage even if GPU is available
- `--stream`: Enable streaming output for real-time text generation
- `--gpu-layers`: Number of layers to offload to GPU (default: all)
- `--force-download`: Force re-download of the model even if it exists in cache

### 4. Examples

@@ -173,7 +175,15 @@ By using the `--stream` flag, you can enable real-time streaming of the generate

<br/>

## 9. Troubleshooting
## 9. Checksum Verification and Force Download

GGUFy includes checksum verification to help ensure the integrity of downloaded model files. After each download, the script compares the file's hash against the expected hash reported by the server (via the `ETag` header), when one is available.

If you run into problems with a model file, such as corruption or an incomplete download, pass the `--force-download` flag to discard the cached copy and re-download the entire file:
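
A minimal example (the model path below is a placeholder; use the same `ggufy run` model path format shown in the Examples section above):

```bash
# Ignore any cached or partial copy and download the model from scratch
ggufy run <username>/<repo>/latest --force-download
```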

<br/>

## 10. Troubleshooting

1. If `ggufy` command is not found, make sure you've restarted your terminal or sourced your shell configuration file after running the setup script.

@@ -195,13 +205,13 @@ By using the `--stream` flag, you can enable real-time streaming of the generate

<br/>

## 10. Contributing
## 11. Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

<br/>

## 11. License
## 12. License

MIT License

Expand Down
162 changes: 104 additions & 58 deletions ggufy.py
@@ -63,7 +63,29 @@ def find_latest_gguf(username, repo, token):
print(f"Latest GGUF file found: {latest_file}\n")
return latest_file

def download_model(model_path, token):
def get_cached_model_path(username, repo, gguf_file):
# Create a unique filename based on the model path
model_id = hashlib.md5(f"{username}/{repo}/{gguf_file}".encode()).hexdigest()
cached_path = os.path.join(CACHE_DIR, f"ggufy-{model_id}.gguf")

# Save metadata
metadata = {
"repo_name": f"{username}/{repo}",
"file_name": gguf_file
}
with open(f"{cached_path}.json", "w") as f:
json.dump(metadata, f)

return cached_path

def get_file_hash(file_path):
hasher = hashlib.md5()
with open(file_path, 'rb') as f:
for chunk in iter(lambda: f.read(4096), b""):
hasher.update(chunk)
return hasher.hexdigest()

def download_model(model_path, token, force_download=False):
username, repo, file_name = parse_model_path(model_path)

if file_name == 'latest':
@@ -73,70 +95,100 @@ def download_model(model_path, token):

cached_path = get_cached_model_path(username, repo, gguf_file)

if os.path.exists(cached_path):
print(f"Using cached model: {cached_path}\n")
if os.path.exists(cached_path) and not force_download:
print(f"Using cached model: {cached_path}")
return cached_path, gguf_file

model_url = f"https://huggingface.co/{username}/{repo}/resolve/main/{gguf_file}"

print(f"Downloading model: {gguf_file}\n")
response = requests.get(model_url, stream=True, headers=get_headers(token))
print(f"Downloading model: {gguf_file}")

# Check if partial download exists
file_mode = 'ab' if os.path.exists(cached_path) and not force_download else 'wb'
initial_pos = os.path.getsize(cached_path) if os.path.exists(cached_path) and not force_download else 0

headers = get_headers(token)
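# Resume note: when a partial file is already on disk, only the missing byte range is
# requested so the download continues where it left off (assumes the server honors HTTP Range requests).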
if initial_pos > 0:
headers['Range'] = f'bytes={initial_pos}-'

response = requests.get(model_url, stream=True, headers=headers)

if response.status_code == 404:
raise ValueError(f"GGUF file '{gguf_file}' not found in {username}/{repo}")
response.raise_for_status()

total_size = int(response.headers.get('content-length', 0))
total_size = int(response.headers.get('content-length', 0)) + initial_pos
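# On a resumed (ranged) request, Content-Length covers only the remaining bytes,
# so the bytes already on disk are added back to give the progress bar an accurate total.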

os.makedirs(CACHE_DIR, exist_ok=True)
with open(cached_path, 'wb') as file:
progress_bar = tqdm(total=total_size, unit='iB', unit_scale=True)
with open(cached_path, file_mode) as file, tqdm(
desc=gguf_file,
initial=initial_pos,
total=total_size,
unit='iB',
unit_scale=True,
unit_divisor=1024,
) as progress_bar:
for chunk in response.iter_content(chunk_size=8192):
size = file.write(chunk)
progress_bar.update(size)
progress_bar.close()

print(f"Model downloaded and cached: {cached_path}\n")
print(f"Model downloaded and cached: {cached_path}")

# Verify the download
print("Verifying download...")
expected_hash = response.headers.get('ETag', '').strip('"')
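# The ETag header is treated as an MD5 digest of the file; for files served via Git LFS
# or a CDN the ETag may not be a plain content MD5, so this check is best-effort.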
if expected_hash:
actual_hash = get_file_hash(cached_path)
if actual_hash != expected_hash:
print("Warning: Downloaded file hash does not match expected hash.")
print("The file may be corrupted. You might want to try downloading again with --force-download.")
else:
print("Warning: Unable to verify file integrity. ETag not provided by server.")

return cached_path, gguf_file

def animated_loading():
chars = [' ', '. ', '.. ', '... ']
while getattr(threading.current_thread(), "do_run", True):
for char in chars:
sys.stdout.write('\r' + f"Generating {char}")
sys.stdout.flush()
time.sleep(0.8)
sys.stdout.write('\r' + ' ' * 20 + '\r')
sys.stdout.flush()

def run_gguf_model(model_path, context, max_tokens, token, force_cpu=False, stream=False):
def run_gguf_model(model_path, context, max_tokens, token, force_cpu=False, stream=False, n_gpu_layers=None, force_download=False):
try:
print("\nInitializing GGUFy...\n")
model_file, gguf_file = download_model(model_path, token)
print(f"Model file: {model_file}\n")
print("Initializing GGUFY Runner...")
model_file, gguf_file = download_model(model_path, token, force_download)
print(f"Model file: {model_file}")

print("Loading model into memory...")

print("Loading model into memory...\n")

# Check for GPU availability
gpu_layers = 0
if not force_cpu:
try:
from llama_cpp import llama_cpp
gpu_layers = llama_cpp.llama_n_gpu_layers
print(f"GPU acceleration is available. Using {gpu_layers} GPU layers.\n")
if n_gpu_layers is None:
n_gpu_layers = llama_cpp.llama_n_gpu_layers(model_file)
gpu_layers = n_gpu_layers
print(f"GPU acceleration is available. Using {gpu_layers} GPU layers.")
except AttributeError:
print("GPU acceleration is not available. Using CPU.\n")
print("GPU acceleration is not available. Using CPU.")
else:
print("Forced CPU usage. GPU will not be used even if available.\n")

llm = Llama(model_path=model_file, n_ctx=context, n_gpu_layers=gpu_layers)
print("\nModel loaded successfully.\n")
print("Forced CPU usage. GPU will not be used even if available.")

try:
llm = Llama(model_path=model_file, n_ctx=context, n_gpu_layers=gpu_layers)
print("Model loaded successfully.")
except RuntimeError as e:
if "tensor" in str(e) and "data is not within the file bounds" in str(e):
print("Error: The model file appears to be corrupted or incomplete.")
print("Try running the command again with the --force-download flag to re-download the model.")
return
else:
raise

while True:
prompt = input("Any questions? (or 'quit' to exit): ").strip()
prompt = input("Enter your prompt (or 'quit' to exit): ").strip()
if prompt.lower() == 'quit':
break

print(f"Generating text with prompt: '{prompt}'")

if stream:
print("\nGenerated text:")
for chunk in llm(prompt, max_tokens=max_tokens, stream=True):
print(chunk['choices'][0]['text'], end='', flush=True)
print("\n")
@@ -153,21 +205,31 @@ def run_gguf_model(model_path, context, max_tokens, token, force_cpu=False, stre
loading_thread.do_run = False
loading_thread.join()

print("\n")
print("\nGenerated text:")
print(output['choices'][0]['text'])

print("\n" + "-"*50 + "\n")

except Exception as e:
print(f"An error occurred: {e}")

def animated_loading():
chars = [' ', '. ', '.. ', '... ']
while getattr(threading.current_thread(), "do_run", True):
for char in chars:
sys.stdout.write('\r' + f"Generating {char}")
sys.stdout.flush()
time.sleep(0.8)
sys.stdout.write('\r' + ' ' * 20 + '\r')
sys.stdout.flush()

def remove_ggufy():
print("Removing GGUFy and all related files...\n")

# Remove configuration directory
if os.path.exists(CONFIG_DIR):
shutil.rmtree(CONFIG_DIR)
print(f"Removed configuration directory: {CONFIG_DIR}")
print(f"Removed configuration directory: {CONFIG_DIR}\n")

# Remove cache directory
if os.path.exists(CACHE_DIR):
@@ -208,26 +270,6 @@ def list_cached_models():
print(f"- {repo_name}: {file_name}")
except FileNotFoundError:
print(f"- Unknown: {filename}")

def get_cached_model_path(username, repo, gguf_file):
# Create a unique filename based on the model path
model_id = hashlib.md5(f"{username}/{repo}/{gguf_file}".encode()).hexdigest()
cached_path = os.path.join(CACHE_DIR, f"ggufy-{model_id}.gguf")

# Save metadata
metadata = {
"repo_name": f"{username}/{repo}",
"file_name": gguf_file
}
with open(f"{cached_path}.json", "w") as f:
json.dump(metadata, f)

return cached_path

""" def get_cached_model_path(username, repo, gguf_file):
# Create a unique filename based on the model path
model_id = hashlib.md5(f"{username}/{repo}/{gguf_file}".encode()).hexdigest()
return os.path.join(CACHE_DIR, f"ggufy-{model_id}.gguf") """

def main():
parser = argparse.ArgumentParser(description="Run GGUF models from Hugging Face Hub")
@@ -243,6 +285,8 @@ def main():
run_parser.add_argument("-t", "--max-tokens", type=int, default=200, help="Maximum number of tokens to generate")
run_parser.add_argument("--cpu", action="store_true", help="Force CPU usage even if GPU is available")
run_parser.add_argument("--stream", action="store_true", help="Enable streaming output")
run_parser.add_argument("--gpu-layers", type=int, help="Number of layers to offload to GPU (default: all)")
run_parser.add_argument("--force-download", action="store_true", help="Force re-download of the model even if it exists in cache")

# List command
list_parser = subparsers.add_parser("list", help="List cached models")
@@ -260,7 +304,9 @@ def main():
print("No API token found. Please run 'ggufy login' first.\n")
sys.exit(1)
try:
run_gguf_model(args.model_path, args.context, args.max_tokens, token, force_cpu=args.cpu, stream=args.stream)
run_gguf_model(args.model_path, args.context, args.max_tokens, token,
force_cpu=args.cpu, stream=args.stream, n_gpu_layers=args.gpu_layers,
force_download=args.force_download)
except Exception as e:
print(f"Error: {e}")
sys.exit(1)
