forked from ajbrock/BigGAN-PyTorch
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrun_on_raven.sh
69 lines (59 loc) · 2.28 KB
/
run_on_raven.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/bin/bash -l
### SLURM batch script that launches train.py on one GPU node of raven.
### The "#SBATCH" lines are directives read by sbatch; bash itself treats them as
### comments. They must stay above the first executable command of the script.
### Run the job in the directory it was submitted from, so that relative paths
### used further down (raven.env, train.py) resolve there.
#SBATCH -D ./
### OUTPUTS, ERRORS AND JOB NAME:
### Replace these with a path to an existing folder where you have write access.
### The %j will be replaced by the unique SLURM job id (useful to avoid overwriting existing files)
### Standard output (-o) and standard error (-e) point to the same file here.
### NOTE(review): the name appears to encode the run settings passed to train.py
### (dataset, batch size, workers, G/D learning rates, seed, parallel) -- confirm
### and keep it in sync when those settings change.
#SBATCH -o /ptmp/pierocor/BigGan_out/output/E256_174_w8_1e-5_2e-5_s0_p.%j
#SBATCH -e /ptmp/pierocor/BigGan_out/output/E256_174_w8_1e-5_2e-5_s0_p.%j
#SBATCH -J E256_174_w8_1e-5_2e-5_s0_p
### TIME LIMIT: e.g.
### 1-00:00:00 -> 1 day (Maximum)
### 0-00:20:00 -> 20 minutes
#SBATCH --time=1-00:00:00
### NODE features:
### No need to modify them on raven!
### One node with four A100 GPUs; --mem=0 is the SLURM convention for
### "all memory of the node". A single task is started with 18 CPUs and
### one hardware thread per core.
### NOTE(review): --ntasks-per-socket=2 looks redundant next to
### --ntasks-per-node=1 -- confirm it is intended.
#SBATCH --constraint="gpu"
#SBATCH --gres=gpu:a100:4
#SBATCH --mem=0
#SBATCH --nodes=1
#SBATCH --ntasks-per-socket=2
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=18
#SBATCH --threads-per-core=1
### Modules and env variables
### raven.env is taken from the job's working directory. The explicit "./"
### keeps bash from searching PATH for a file of the same name first.
### Abort right away when the file is missing: continuing would run the
### training in whatever environment the login shell happens to provide.
if [[ ! -r ./raven.env ]]; then
  printf 'error: ./raven.env not found or not readable in %s; aborting job\n' "${PWD}" >&2
  exit 1
fi
source ./raven.env
### print loaded modules and basic SLURM info
module list
printf 'Nodes: %s \t NTASK: %s\n' "${SLURM_JOB_NUM_NODES}" "${SLURM_NTASKS}"
printf '%s\n' "${SLURM_NODELIST}"
### Data and output locations handed to train.py.
DATA_ROOT="/ptmp/pierocor/datasets" # This should work but you can use a different one
WEIGHTS_ROOT="/ptmp/pierocor/BigGan_out/weights" # Replace with a folder where you have write access
LOGS_ROOT="/ptmp/pierocor/BigGan_out/logs" # Replace with a folder where you have write access
SAMPLE_ROOT="/ptmp/pierocor/BigGan_out/samples" # Replace with a folder where you have write access
### Run the program:
### Change whatever you want and have fun!
### The arguments are collected in an array and the path variables are quoted,
### so each value reaches train.py as exactly one word even if a replacement
### path contains spaces or glob characters.
train_args=(
  ### data and output folders
  --data_root "$DATA_ROOT"
  --weights_root "$WEIGHTS_ROOT"
  --logs_root "$LOGS_ROOT"
  --samples_root "$SAMPLE_ROOT"
  ### remaining train.py options
  --num_epochs 5
  --dataset E256_hdf5
  --shuffle --num_workers 8 --batch_size 174
  --num_G_accumulations 1 --num_D_accumulations 1
  --num_D_steps 1 --G_lr 1e-5 --D_lr 2e-5 --D_B2 0.999 --G_B2 0.999
  --G_attn 64 --D_attn 64
  --G_nl inplace_relu --D_nl inplace_relu
  --SN_eps 1e-6 --BN_eps 1e-5 --adam_eps 1e-6
  --G_ortho 0.0
  --G_shared
  --G_init ortho --D_init ortho
  --hier --dim_z 120 --shared_dim 128
  --G_eval_mode
  --G_ch 96 --D_ch 96
  --ema --use_ema --ema_start 20000
  --test_every 1000 --save_every 1000
  --num_best_copies 5 --num_save_copies 2
  --seed 0 --parallel
)
### srun stays the last command so that its exit status is the job's exit status.
srun python train.py "${train_args[@]}"
### RESUME:
### To continue a previous run, submit the same config again with the --resume flag
### added to the train.py arguments; it will load the last checkpoint.