auto detect which model to run (#36)
* auto detect which model to run

* fix model name

* remove max_samples

* fix bash script

* chore: max tokens

---------

Co-authored-by: Justus Mattern <[email protected]>
felix-red-panda and justusmattern27 authored Feb 4, 2025
1 parent 8a539ea commit 0fba3ac
Showing 5 changed files with 61 additions and 3 deletions.
7 changes: 5 additions & 2 deletions README.md
@@ -31,6 +31,11 @@ export GCP_CREDENTIALS_BASE64=$(base64 -w 0 /path/to/your/service-account-key.js
 uv run python src/genesys/generate.py @ configs/debug.toml --gcp_bucket checkpoints_pi/test_data
 ```
 
+to automatically detect the right model to run:
+```sh
+./script/entrypoint.sh
+```
+
 for dev setup:
 
 ```
@@ -59,5 +64,3 @@ run
 ```
 sudo docker run --gpus all -it primeintellect/genesys:latest uv run python src/genesys/generate.py @ configs/debug.toml
 ```
-
-
1 change: 0 additions & 1 deletion configs/r1.toml → configs/deepseek_r1.toml
@@ -6,6 +6,5 @@ top_p = 0.95 # 0.95 is the value from the R1 model card for evaluation
 max_tokens = 12288
 
 [data]
-max_samples = 2000000
 batch_size = 512
 path = "justus27/test-vcu,justus27/test-vfc,justus27/test-jdg,justus27/test-vfm"
9 changes: 9 additions & 0 deletions configs/llama70b.toml
@@ -0,0 +1,9 @@
name_model = "deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
num_gpus = 8
sample_per_file = 512
temperature = 0.6 # 0.6 is the value from the R1 model card for evaluation
top_p = 0.95 # 0.95 is the value from the R1 model card for evaluation

[data]
batch_size = 512
path = "justus27/test-vcu,justus27/test-vfc,justus27/test-jdg,justus27/test-vfm"
17 changes: 17 additions & 0 deletions script/entrypoint.sh
@@ -0,0 +1,17 @@
#!/bin/bash

# Save the original stdout on FD 3 so tee can still write to the terminal
# from inside the command substitution below
exec 3>&1

# Capture both stdout and stderr while also printing to the terminal.
# pipefail makes the substitution's exit status reflect the python script
# rather than tee (PIPESTATUS is not visible outside the subshell).
output=$(set -o pipefail; uv run python src/genesys/auto_detect_model_config.py 2>&1 | tee /dev/fd/3)
exit_code=$?

if [ $exit_code -ne 0 ]; then
    echo "Error: $output"
    exit $exit_code
fi

model_name=$output
echo "The model to run is: $model_name"
uv run python src/genesys/generate.py @ configs/"$model_name".toml
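
For readers who prefer Python, here is a minimal sketch of the same flow as the shell script above, assuming the same repository layout: detect the model, surface the detector's output, then launch generation. The shell script remains the actual entrypoint.

```python
import subprocess
import sys

# Run the detector, capturing its output while still echoing it,
# much like the tee /dev/fd/3 trick in the shell script.
result = subprocess.run(
    ["uv", "run", "python", "src/genesys/auto_detect_model_config.py"],
    capture_output=True,
    text=True,
)
print(result.stdout, end="")
if result.returncode != 0:
    print(f"Error: {result.stderr}", file=sys.stderr)
    sys.exit(result.returncode)

model_name = result.stdout.strip()
print(f"The model to run is: {model_name}")
subprocess.run(
    ["uv", "run", "python", "src/genesys/generate.py",
     "@", f"configs/{model_name}.toml"],
    check=True,
)
```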
30 changes: 30 additions & 0 deletions src/genesys/auto_detect_model_config.py
@@ -0,0 +1,30 @@
import sys

import torch


def get_model_name(device_name: str) -> str:
    """Pick the config name for the model this GPU can serve."""
    if "A100" in device_name:
        return "llama70b"
    elif "H100" in device_name:
        # All H100 variants (NVL, PCIe, SXM) get the same 70B config;
        # specs: https://www.nvidia.com/en-us/data-center/h100/
        return "llama70b"
    elif "H200" in device_name:
        return "deepseek_r1"
    else:  # for other GPU types, assume A100-class capacity
        return "llama70b"


if __name__ == "__main__":
    try:
        device_info_torch = torch.cuda.get_device_name(torch.device("cuda:0"))
        run_model_name = get_model_name(device_info_torch)
        print(run_model_name)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
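
A quick spot-check of the mapping in get_model_name. The device strings below are hypothetical approximations of what torch.cuda.get_device_name reports for these SKUs and may differ across drivers:

```python
assert get_model_name("NVIDIA A100-SXM4-80GB") == "llama70b"
assert get_model_name("NVIDIA H100 PCIe") == "llama70b"
assert get_model_name("NVIDIA H200") == "deepseek_r1"
assert get_model_name("NVIDIA GeForce RTX 4090") == "llama70b"  # fallback branch
```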
