#!/bin/bash
function systemCheck()
{
# Grab information about CPU cores
Socket=$(lscpu | awk '/^Socket\(s\)/{ print $2 }')
Cores_per_socket=$(lscpu | awk '/^Core\(s\) per socket/{ print $4 }')
Cores=$(( Socket * Cores_per_socket ))
# Grab information about NUMA
Numa_nodes=$(lscpu | awk '/^NUMA node\(s\)/{ print $3 }')
Cores_per_numa=$(( $Cores / $Numa_nodes ))
# Grab information about devices/accelerators
Num_devices=$(rocminfo | grep "amdgcn-amd-amdhsa--" | wc -l)
# List the NUMA affinity of each GPU#id in a list
# On parrypeak this should yield Gpu_numa_affinity_list=(2 3 0 1),
# which reads as: GPU0 has affinity to NUMA node 2, and so on.
Gpu_numa_affinity_list=()
# Also, sometimes the NUMA affinity is misconfigured and the GPUs are not evenly distributed.
# Thus we attempt to find the number of NUMA nodes that actually have GPUs/devices in them.
Numa_nodes_wGPUs=1
for ((idx = 0; idx < ${Num_devices}; idx++))
do
IDX=$(rocm-smi --showtoponuma | grep "GPU\[$idx" | awk 'NR==1 {print $6}')
Gpu_numa_affinity_list+=("$IDX")
if [ $idx -gt 0 ] && [ ${Gpu_numa_affinity_list[$(($idx-1))]} -ne ${Gpu_numa_affinity_list[$idx]} ]
then
Numa_nodes_wGPUs=$(( $Numa_nodes_wGPUs + 1 ))
fi
done
# Let's find the max. NUMA node index to ensure we are not going out of bounds.
# This will allow us to correctly set up the start and stop counts of the CPUs.
Gpu_numa_affinity_list_max=${Gpu_numa_affinity_list[0]}
for n in "${Gpu_numa_affinity_list[@]}" ; do
((n > Gpu_numa_affinity_list_max)) && Gpu_numa_affinity_list_max=$n
done
if [[ $(( Gpu_numa_affinity_list_max + 1 )) -gt $Numa_nodes ]]; then
echo "ERROR: Something is wrong with the NUMA affinity and setup.
Please check your system config again. Exiting..."
exit 1
fi
# Find number of GPUs/accelerators on each NUMA node.
# This assumes that the GPUs/accelerators are evenly distributed.
Ngus_perNuma=$(( $Num_devices/$Numa_nodes_wGPUs ))
}
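# Worked example of the arithmetic above (hypothetical node, not a real query):
# with Socket=2, Cores_per_socket=64, Numa_nodes=4 and Num_devices=4,
# systemCheck yields Cores=128 and Cores_per_numa=32; if each GPU reports a
# distinct NUMA node, Numa_nodes_wGPUs=4 and Ngus_perNuma=1.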
function inputCheck()
{
# Check any user inputs for accelerators
if [ -z "${NUM_GPUS}" ]; then
let NUM_GPUS=$Num_devices
fi
if [ -z "${GPU_STRIDE}" ]; then
let GPU_STRIDE=1
fi
if [ -z "${GPU_START}" ]; then
let GPU_START=0
Available_devices=$(( ($Num_devices - $GPU_START) / $GPU_STRIDE ))
if [[ $NUM_GPUS -gt $Available_devices ]]; then
if [ $OMPI_COMM_WORLD_LOCAL_RANK -eq 0 ]; then
echo " ======================================================================
WARNING: After skipping first $GPU_START devices, with stride $GPU_STRIDE, only $Available_devices devices
are available. Cannot meet the requested $NUM_GPUS GPUs.
Will instead use $Available_devices GPUs for run!
======================================================================"
fi
NUM_GPUS=$Available_devices
fi
fi
if [ -z "${OMP_STRIDE}" ]; then
let OMP_STRIDE=1
fi
if [ -z "${RANK_STRIDE}" ]; then
Cores_requested=$(( $NUM_GPUS * $Cores_per_numa / ${Ngus_perNuma} ))
let RANK_STRIDE=$Cores_requested/${OMPI_COMM_WORLD_LOCAL_SIZE}
fi
if [ -z "${CPU_SHIFT}" ]; then
let CPU_SHIFT=0
fi
if [ -z "${CPU_FOR_OS}" ]; then
let CPU_FOR_OS=0
fi
}
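# All of the knobs above can be exported before launch; the values below are
# purely illustrative, e.g.:
#   export NUM_GPUS=2      # use only two devices
#   export GPU_START=1     # skip device 0
#   export GPU_STRIDE=2    # take every other device
#   export OMP_STRIDE=1    # spacing between OpenMP threads
# Anything left unset falls back to the defaults derived in inputCheck.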
function setupAffinity()
{
# Evaluate how many ranks to be distributed to each device
let ranks_per_gpu=$(((${OMPI_COMM_WORLD_LOCAL_SIZE}+${NUM_GPUS}-1)/${NUM_GPUS}))
# Evaluate GPU #id for each rank
let my_gpu=$(($OMPI_COMM_WORLD_LOCAL_RANK*$GPU_STRIDE/$ranks_per_gpu))+${GPU_START}
# Evaluate local rank per device
if [[ $ranks_per_gpu -gt 1 ]]
then
if [[ $Ngus_perNuma -gt 1 ]]
then
local_rank_per_gpu=$(( $OMPI_COMM_WORLD_LOCAL_RANK % ${Cores_per_numa} ))
else
local_rank_per_gpu=$(( $OMPI_COMM_WORLD_LOCAL_RANK % ${ranks_per_gpu} ))
fi
else
local_rank_list=(0)
for ((i = 1; i < ${Num_devices}; i++))
do
if [ ${Gpu_numa_affinity_list[$(($i-1))]} -eq ${Gpu_numa_affinity_list[$i]} ]
then
entry=$(( ${local_rank_list[$(($i-1))]} + 1 ))
else
entry=0
fi
local_rank_list=("${local_rank_list[@]}" "${entry}")
done
local_rank_per_gpu=${local_rank_list[${my_gpu}]}
fi
# Evaluate the start and stop positions (including OMP threads) for each rank
let cpu_start=$(($Cores_per_numa*${Gpu_numa_affinity_list[${my_gpu}]}))+$(($RANK_STRIDE*${local_rank_per_gpu}))+${CPU_SHIFT}
let cpu_stop=$(($cpu_start+$OMP_NUM_THREADS*$OMP_STRIDE-1))+${CPU_FOR_OS}
# Total CPUs needed per rank = cpu_stop - cpu_start + 1
# Check that the MPI local size fits in the requested cores (given NUM_GPUS
# and RANK_STRIDE): if OMPI_COMM_WORLD_LOCAL_SIZE * cpus_needed > Cores_requested, then exit.
cpus_needed=$(( $cpu_stop - $cpu_start + 1 ))
# Ranks that can fit = Cores_requested / cpus_needed (the +1 is already included in cpus_needed above).
available_cores=$(( $Cores_requested/$cpus_needed ))
if [ ${OMPI_COMM_WORLD_LOCAL_SIZE} -gt $available_cores ]; then
echo "ERROR: Based on your configuration, cannot run with more than $available_cores ranks.
Please check your setup - NUM_GPUS, RANK_STRIDE in this script.
Exiting..."
exit 1
fi
export GOMP_CPU_AFFINITY=$cpu_start-$cpu_stop:$OMP_STRIDE
# export OMP_PLACES="{$cpu_start:$OMP_NUM_THREADS:$OMP_STRIDE}"
# export ROCR_VISIBLE_DEVICES=${Gpu_numa_affinity_list[${my_gpu}]}
export ROCR_VISIBLE_DEVICES=${my_gpu}
}
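# Worked example of the placement above (hypothetical values): with
# Cores_per_numa=32, RANK_STRIDE=8, CPU_SHIFT=0, OMP_NUM_THREADS=8,
# OMP_STRIDE=1 and CPU_FOR_OS=0, a rank whose GPU lives on NUMA node 2 with
# local_rank_per_gpu=1 gets cpu_start = 32*2 + 8*1 + 0 = 72 and
# cpu_stop = 72 + 8*1 - 1 + 0 = 79, i.e. GOMP_CPU_AFFINITY=72-79:1.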
function printAffinity()
{
# eval "taskset --cpu-list ${cpu_start}-${cpu_stop} $*"
echo -e "Local Rank = "$OMPI_COMM_WORLD_LOCAL_RANK"\tGOMP_CPU_AFFINITY = "$GOMP_CPU_AFFINITY"\t NUMA Node = "${Gpu_numa_affinity_list[${my_gpu}]}"\tROCR_VISIBLE_DEVICES = "$ROCR_VISIBLE_DEVICES
# echo -e "Local Rank = "$OMPI_COMM_WORLD_LOCAL_RANK"\tOMP_PLACES = "$OMP_PLACES"\t NUMA Node = "${Gpu_numa_affinity_list[${my_gpu}]}"\tROCR_VISIBLE_DEVICES = "$ROCR_VISIBLE_DEVICES
}
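# A hypothetical line printed by printAffinity (actual values depend on the node):
#   Local Rank = 0    GOMP_CPU_AFFINITY = 64-71:1     NUMA Node = 2   ROCR_VISIBLE_DEVICES = 0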
function printVars()
{
echo "============================================"
echo -e "Core \t\t\t: $Cores"
echo -e "Socket \t\t\t: $Socket"
echo -e "Cores_per_socket \t: ${Cores_per_socket}"
echo -e "Numa Nodes \t\t: ${Numa_nodes}"
echo -e "Cores_per_Numa \t\t: ${Cores_per_numa}"
echo -e "Num_devices \t\t: ${Num_devices}"
echo -e "Numa nodes w GPUs \t: ${Numa_nodes_wGPUs}"
echo -e "NGPUs per NUMA \t\t: ${Ngus_perNuma}"
echo -e "Available Devices \t: ${Available_devices}"
echo -e "Cores Requested \t: ${Cores_requested}"
echo -e "Rank Stride \t\t: ${RANK_STRIDE}"
echo -e "ranks_per_gpu \t\t: ${ranks_per_gpu}"
echo -e "available cores \t: ${available_cores}"
echo -e "GPU Numa Affinity List \t: ${Gpu_numa_affinity_list[*]}"
echo "============================================"
}
function profilePerRank()
{
# Add a date/timestamp to distinguish the profiles
now=$(date +'%m%d%Y')
prof_name=motorbike_simple_${now}
eval "${ROCM_PATH}/bin/rocprof --hsa-trace --roctx-trace -d ./rocprof/${prof_name}.${OMPI_COMM_WORLD_RANK} -o ./rocprof/${prof_name}.${OMPI_COMM_WORLD_RANK}/${prof_name}.${OMPI_COMM_WORLD_RANK}.csv $*"
}
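# To collect per-rank rocprof traces, one option (an assumption, not a
# documented workflow) is to replace the final '"$@"' line with:
#   profilePerRank "$@"
# so each rank writes its trace CSV under ./rocprof/<prof_name>.<rank>/ as
# constructed above (the ./rocprof directory is assumed to exist).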
systemCheck
inputCheck
setupAffinity
printAffinity
# wait
# sleep 3
# if [[ $OMPI_COMM_WORLD_LOCAL_RANK -eq 0 ]]; then
# printVars
# fi
# profilePerRank
# Finally, run the wrapped application with the affinity settings exported above
"$@"
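# Example launch (hypothetical executable name and rank count; adjust for your
# MPI installation): the script reads OMPI_COMM_WORLD_* variables, so it is
# meant to be used as a per-rank wrapper under mpirun, e.g.
#   mpirun -np 8 ./helper.sh ./my_app -inputFile my_case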
#
# End of file
# Author: Suyash Tandon