forked from Surrey-EEEM071-CVDL/CourseWork
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTrioAug-Finetune-Arch.submit_file
57 lines (45 loc) · 2.34 KB
/
TrioAug-Finetune-Arch.submit_file
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
########################################################################
# HTCondor submit description: TrioAug fine-tuning across architectures
########################################################################

# --- Batch label ------------------------------------------------------
# Label the jobs are grouped under, so they are easy to spot in condor_q.
JobBatchName = "Trio-Finetune-Architecture-and-Eval"

# --- Interpreter ------------------------------------------------------
# Python interpreter from the cu118_py311 conda environment. Keep the CUDA
# build of this environment in step with the docker_image chosen below.
executable   = /mnt/fast/nobackup/users/nt00601/miniconda3/envs/cu118_py311/bin/python3.11

# --- Universe ---------------------------------------------------------
# Run inside a Docker container (the alternative would be "vanilla").
# Pick an image whose CUDA version suits the CUDADriverVersion that
# condor_status shows for the target machines; see
# https://docs.pages.surrey.ac.uk/research_computing/condor/tips.html#cuda-requirements
universe     = docker
docker_image = nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
# Alternative images, kept for reference:
#docker_image = nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu22.04
#docker_image = nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu20.04

# --- Log files --------------------------------------------------------
# HTCondor event log and the job's stderr, one pair per cluster/process.
# No "output" command appears in this file, so stdout is not written to
# a file by HTCondor.
log          = c$(cluster).p$(process).log
error        = c$(cluster).p$(process).error
# --------------------------------------
# Machine requirements
# Match only machines that satisfy every clause below:
#   - GPU global memory above 20000 MB and below 75000 MB
#   - CUDA capability greater than 7
#   - HasWeka advertised (presumably access to the storage behind the
#     /mnt/fast paths used in this file -- confirm)
# The 20000 MB floor means cards with less memory than that never match.
requirements = (CUDAGlobalMemoryMb > 20000) && \
               (CUDAGlobalMemoryMb < 75000) && \
               (CUDACapability > 7) && \
               (HasWeka)

# Resources requested for each job
request_GPUs   = 2
+GPUMem        = 24000
request_CPUs   = 1
request_memory = 26G

# Declared run time for the job.
# NOTE(review): the value is 4; the unit is presumably hours -- confirm
# against the pool's documentation.
+JobRunTime = 4

# Advertise that this job can checkpoint.
# NOTE(review): whether the training script really saves and resumes
# state is not visible in this file -- confirm before relying on it.
+CanCheckpoint = true

# Guaranteed run time in seconds before the job may be evicted. 0 means the
# job is happy to checkpoint and move at any time, which lets Condor remove
# it as soon as a machine needs rebooting. Only useful when the job can
# checkpoint and restore.
MaxJobRetirementTime = 0
# -----------------------------------
# Job parameters
# Path macros used by the argument line. $(a) is not set here: the queue
# statement at the bottom supplies it, one architecture name per job, so
# each architecture gets its own save directory.
root   = /mnt/fast/nobackup/users/nt00601/content
save   = /mnt/fast/nobackup/users/nt00601/CourseWork-main/logs/$(a)/Finetune/TrioAug
script = /mnt/fast/nobackup/users/nt00601/CourseWork-main/main.py

# Command line handed to the executable: the training script followed by
# its options.
# --workers is passed exactly once. Passing it twice with different values
# leaves the effective setting to the script's option parser (a last-wins
# parser such as argparse would use the final one), so only the single
# value 16 is kept.
# NOTE(review): 16 workers are requested while request_CPUs = 1 in this
# file -- confirm which of the two should be adjusted.
arguments = $(script) -s veri -t veri --use-avai-gpus --workers 16 -a $(a) --root $(root) --height 224 --width 224 --random-erase --color-aug --color-jitter --optim sgd --lr 0.5 --weight-decay 2e-05 --lr-scheduler sequential --label-smooth --max-epoch 10 --train-batch-size 128 --test-batch-size 100 --save-dir $(save)

# Submit one job per listed architecture; $(a) takes each value in turn.
queue 1 a in resnet50, resnet50_fc512