-
Notifications
You must be signed in to change notification settings - Fork 96
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Slurm job template: how a job can probe instance topology and hostnam…
…e-instanceid mappings
- Loading branch information
Verdi March
committed
Apr 16, 2024
1 parent
e87d5d1
commit dd72aaf
Showing
2 changed files
with
210 additions
and
0 deletions.
There are no files selected for viewing
154 changes: 154 additions & 0 deletions
154
4.validation_and_observability/6.slurm-job-template-probe-ec2/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
# Slurm job template to probe EC2 informations | ||
|
||
Usage: review and customize [job-template.sbatch](job-template.sbatch) to your need. | ||
|
||
Below shows the sample output (trimmed) of customizing the template to run Megatron-LM, i.e., | ||
replace the `srun -l /usr/bin/hostname` with the relevant parts from the [Megatron-LM | ||
example](../../3.test_cases/1.megatron-lm/)). | ||
|
||
```text | ||
... | ||
+ validate_ec2_same_spine | ||
++ lstopo_ec2 | ||
++ INSTANCE_IDS=($(srun cat /sys/devices/virtual/dmi/id/board_asset_tag)) | ||
+++ srun cat /sys/devices/virtual/dmi/id/board_asset_tag | ||
++ local INSTANCE_IDS | ||
++ aws ec2 describe-instance-topology --instance-ids i-1111111111example i-0000000000example | ||
+ local 'TOPO_JSON={ | ||
"Instances": [ | ||
{ | ||
"InstanceId": "i-0000000000example", | ||
"InstanceType": "p4de.24xlarge", | ||
"NetworkNodes": [ | ||
"nn-1111111111example", | ||
"nn-2222222222example", | ||
"nn-3333333333example" | ||
], | ||
"AvailabilityZone": "us-west-2b", | ||
"ZoneId": "usw2-az2" | ||
}, | ||
{ | ||
"InstanceId": "i-1111111111example", | ||
"InstanceType": "p4de.24xlarge", | ||
"NetworkNodes": [ | ||
"nn-1111111111example", | ||
"nn-2222222222example", | ||
"nn-3333333333example" | ||
], | ||
"AvailabilityZone": "us-west-2b", | ||
"ZoneId": "usw2-az2" | ||
} | ||
] | ||
}' | ||
+ echo '{ | ||
"Instances": [ | ||
{ | ||
"InstanceId": "i-0000000000example", | ||
"InstanceType": "p4de.24xlarge", | ||
"NetworkNodes": [ | ||
"nn-1111111111example", | ||
"nn-2222222222example", | ||
"nn-3333333333example" | ||
], | ||
"AvailabilityZone": "us-west-2b", | ||
"ZoneId": "usw2-az2" | ||
}, | ||
{ | ||
"InstanceId": "i-1111111111example", | ||
"InstanceType": "p4de.24xlarge", | ||
"NetworkNodes": [ | ||
"nn-1111111111example", | ||
"nn-2222222222example", | ||
"nn-3333333333example" | ||
], | ||
"AvailabilityZone": "us-west-2b", | ||
"ZoneId": "usw2-az2" | ||
} | ||
] | ||
}' | ||
{ | ||
"Instances": [ | ||
{ | ||
"InstanceId": "i-0000000000example", | ||
"InstanceType": "p4de.24xlarge", | ||
"NetworkNodes": [ | ||
"nn-1111111111example", | ||
"nn-2222222222example", | ||
"nn-3333333333example" | ||
], | ||
"AvailabilityZone": "us-west-2b", | ||
"ZoneId": "usw2-az2" | ||
}, | ||
{ | ||
"InstanceId": "i-1111111111example", | ||
"InstanceType": "p4de.24xlarge", | ||
"NetworkNodes": [ | ||
"nn-1111111111example", | ||
"nn-2222222222example", | ||
"nn-3333333333example" | ||
], | ||
"AvailabilityZone": "us-west-2b", | ||
"ZoneId": "usw2-az2" | ||
} | ||
] | ||
} | ||
++ echo '{ | ||
"Instances": [ | ||
{ | ||
"InstanceId": "i-0000000000example", | ||
"InstanceType": "p4de.24xlarge", | ||
"NetworkNodes": [ | ||
"nn-1111111111example", | ||
"nn-2222222222example", | ||
"nn-3333333333example" | ||
], | ||
"AvailabilityZone": "us-west-2b", | ||
"ZoneId": "usw2-az2" | ||
}, | ||
{ | ||
"InstanceId": "i-1111111111example", | ||
"InstanceType": "p4de.24xlarge", | ||
"NetworkNodes": [ | ||
"nn-1111111111example", | ||
"nn-2222222222example", | ||
"nn-3333333333example" | ||
], | ||
"AvailabilityZone": "us-west-2b", | ||
"ZoneId": "usw2-az2" | ||
} | ||
] | ||
}' | ||
++ grep '^ *"nn\-.................\"' | ||
++ sort -n | ||
++ uniq -c | ||
++ wc -l | ||
+ local UNIQ_NN=3 | ||
+ echo Expected 3 nn ids, got 3 nn ids | ||
Expected 3 nn ids, got 3 nn ids | ||
+ [[ 3 -eq 3 ]] | ||
... | ||
+ srun -l bash -c 'echo "hostname <=> instance_id mapping: $(hostname) <=> $(cat /sys/devices/virtual/dmi/id/board_asset_tag)"' | ||
1: hostname <=> instance_id mapping: p4de-st-p4de-2 <=> i-0000000000example | ||
0: hostname <=> instance_id mapping: p4de-st-p4de-1 <=> i-1111111111example | ||
+ srun -l bash -c 'echo BEFORE: $(hostname) $(sudo lctl get_param llite.*.stats | grep write_bytes)' | ||
0: BEFORE: p4de-st-p4de-1 write_bytes 10361 samples [bytes] 1 2147479552 401369912220 | ||
1: BEFORE: p4de-st-p4de-2 | ||
... | ||
++ date | ||
+ BEGIN_TRAINING='Mon Apr 8 08:33:01 UTC 2024' | ||
+ SECONDS=0 | ||
+ srun -l --container-image ... --container-mounts ... python -m torch.distributed.run ... /workspace/Megatron-LM/pretrain_gpt.py ... | ||
... | ||
++ date | ||
+ END_TRAINING='Mon Apr 8 09:02:25 UTC 2024' | ||
+ echo 'BEGIN_TRAINING: Mon Apr 8 08:33:01 UTC 2024' | ||
BEGIN_TRAINING: Mon Apr 8 08:33:01 UTC 2024 | ||
+ echo 'END_TRAINING : Mon Apr 8 09:02:25 UTC 2024' | ||
END_TRAINING : Mon Apr 8 09:02:25 UTC 2024 | ||
+ echo 'Elapsed: 29min 24sec' | ||
Elapsed: 29min 24sec | ||
+ srun -l bash -c 'echo AFTER: $(hostname) $(sudo lctl get_param llite.*.stats | grep write_bytes)' | ||
srun: Step created for StepId=191.4 | ||
1: AFTER: p4de-st-p4de-2 | ||
0: AFTER: p4de-st-p4de-1 write_bytes 11553 samples [bytes] 1 2147479552 775980517197 | ||
``` |
56 changes: 56 additions & 0 deletions
56
4.validation_and_observability/6.slurm-job-template-probe-ec2/job-template.sbatch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
#!/bin/bash | ||
|
||
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
# SPDX-License-Identifier: MIT-0 | ||
|
||
#SBATCH --nodes=2 # number of nodes to use | ||
|
||
set -exuo pipefail | ||
|
||
|
||
################################################################################ | ||
## Preamble | ||
################################################################################ | ||
# Helper function to query instance topology | ||
lstopo_ec2() { | ||
local INSTANCE_IDS=( $(srun cat /sys/devices/virtual/dmi/id/board_asset_tag) ) | ||
aws ec2 describe-instance-topology --instance-ids "${INSTANCE_IDS[@]}" | ||
} | ||
|
||
# Are SLURM_NNODES on the same network spine? | ||
validate_ec2_same_spine() { | ||
local TOPO_JSON="$(lstopo_ec2)" | ||
echo "${TOPO_JSON}" | ||
|
||
local UNIQ_NN=$(echo "$TOPO_JSON" | grep '^ *"nn\-.................\"' | sort -n | uniq -c | wc -l) | ||
echo Expected 3 nn ids, got $UNIQ_NN nn ids | ||
[[ $UNIQ_NN -eq 3 ]] || echo WARNING: ec2 instances on different network spine... | ||
} | ||
validate_ec2_same_spine | ||
|
||
# Track instance ids, later on to view their CloudWatch metrics. | ||
srun -l bash -c "echo \"hostname <=> instance_id mapping: \$(hostname) <=> \$(cat /sys/devices/virtual/dmi/id/board_asset_tag)\"" | ||
|
||
# Track per-instance cumulative Lustre statistics. In this example, we only show the write_bytes. | ||
srun -l bash -c "echo BEFORE: \$(hostname) \$(sudo lctl get_param llite.*.stats | grep write_bytes)" || true | ||
|
||
env | ||
|
||
|
||
################################################################################ | ||
## Actual scripts. Below example runs /usr/bin/hostname on all allocated nodes. | ||
################################################################################ | ||
BEGIN_TRAINING=$(date) | ||
SECONDS=0 | ||
srun -l /usr/bin/hostname | ||
END_TRAINING=$(date) | ||
echo "BEGIN_TRAINING: ${BEGIN_TRAINING}" | ||
echo "END_TRAINING : ${END_TRAINING}" | ||
echo "Elapsed: $(($SECONDS / 60))min $(($SECONDS % 60))sec" | ||
|
||
|
||
################################################################################ | ||
## Postamble | ||
################################################################################ | ||
# Track per-instance cumulative Lustre statistics. In this exmaple, we only show the write_bytes. | ||
srun -l bash -c "echo AFTER: \$(hostname) \$(sudo lctl get_param llite.*.stats | grep write_bytes)" || true |