Skip to content

Commit

Permalink
Add single node Neuron test to the e2e tester
Browse files Browse the repository at this point in the history
  • Loading branch information
weicongw committed Jun 20, 2024
1 parent 56cbd21 commit 26c6501
Show file tree
Hide file tree
Showing 12 changed files with 606 additions and 1 deletion.
7 changes: 6 additions & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,9 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- run: docker build --file e2e2/test/images/Dockerfile.aws-efa-nccl-tests .
- run: docker build --file e2e2/test/images/Dockerfile.aws-efa-nccl-tests .
build-neuronx:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- run: docker build --file e2e2/test/images/Dockerfile.neuronx-tests .
15 changes: 15 additions & 0 deletions e2e2/internal/framework_extensions/conditions.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"

appsv1 "k8s.io/api/apps/v1"
batchv1 "k8s.io/api/batch/v1"
apimachinerywait "k8s.io/apimachinery/pkg/util/wait"

"sigs.k8s.io/e2e-framework/klient/k8s"
Expand Down Expand Up @@ -41,3 +42,17 @@ func (c *ConditionExtension) DaemonSetReady(daemonset k8s.Object) apimachinerywa
return
}
}

// JobSucceeded returns a wait condition that reports done once the given Job
// has recorded the required number of successfully finished pods.
//
// On every poll the Job is re-fetched by name and namespace into job, so the
// caller's object is overwritten with the latest state. job must be a
// *batchv1.Job; any other type panics on the type assertion. A failed fetch
// is returned as the condition's error.
func (c *ConditionExtension) JobSucceeded(job k8s.Object) apimachinerywait.ConditionWithContextFunc {
	return func(ctx context.Context) (bool, error) {
		if err := c.resources.Get(ctx, job.GetName(), job.GetNamespace(), job); err != nil {
			return false, err
		}
		current := job.(*batchv1.Job)
		// Spec.Completions is an optional pointer. When it is unset the Job
		// API treats the success of any single pod as success of the Job, so
		// do not dereference it blindly.
		if current.Spec.Completions == nil {
			return current.Status.Succeeded > 0, nil
		}
		return current.Status.Succeeded == *current.Spec.Completions, nil
	}
}
80 changes: 80 additions & 0 deletions e2e2/test/cases/neuron/main_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package nvidia

import (
"context"
_ "embed"
"flag"
"log"
"os"
"slices"
"testing"
"time"

fwext "github.com/aws/aws-k8s-tester/e2e2/internal/framework_extensions"
appsv1 "k8s.io/api/apps/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/e2e-framework/klient/wait"
"sigs.k8s.io/e2e-framework/pkg/env"
"sigs.k8s.io/e2e-framework/pkg/envconf"
)

var (
// testenv is the shared e2e-framework environment; it is created in TestMain
// and used by the tests in this package.
testenv env.Environment
// neuronTestImage points at the value of the -neuronTestImage flag, which is
// registered in TestMain.
neuronTestImage *string
)

// Manifests embedded at build time; TestMain applies them during setup and
// deletes them when the run finishes.
var (
// RBAC manifest for the Neuron device plugin.
// NOTE(review): the identifier below is missing an "n" ("Plugi") — consider renaming.
//go:embed manifests/k8s-neuron-device-plugin-rbac.yml
neuronDevicePlugiRbacManifest []byte
// DaemonSet manifest for the Neuron device plugin.
//go:embed manifests/k8s-neuron-device-plugin.yml
neuronDevicePluginManifest []byte
)

// TestMain prepares the shared test environment for the Neuron suite and runs
// the package's tests.
//
// It registers the -neuronTestImage flag before calling envconf.NewFromFlags
// (which is expected to parse the command line), applies the Neuron device
// plugin manifests, and waits up to five minutes for the device plugin
// DaemonSet to become ready. When the run finishes the manifests are deleted
// in reverse order of application. The process exits with the status returned
// by testenv.Run.
func TestMain(m *testing.M) {
	neuronTestImage = flag.String("neuronTestImage", "", "image for neuron single node test")
	cfg, err := envconf.NewFromFlags()
	if err != nil {
		log.Fatalf("failed to initialize test environment: %v", err)
	}
	testenv = env.NewWithConfig(cfg)

	// all Neuron tests require the device plugin and its RBAC resources
	manifests := [][]byte{
		neuronDevicePluginManifest,
		neuronDevicePlugiRbacManifest,
	}

	testenv.Setup(
		// Install the device plugin and its RBAC resources.
		func(ctx context.Context, config *envconf.Config) (context.Context, error) {
			return ctx, fwext.ApplyManifests(config.Client().RESTConfig(), manifests...)
		},
		// Block until the device plugin DaemonSet reports ready.
		func(ctx context.Context, config *envconf.Config) (context.Context, error) {
			ds := appsv1.DaemonSet{
				ObjectMeta: metav1.ObjectMeta{Name: "neuron-device-plugin-daemonset", Namespace: "kube-system"},
			}
			ready := fwext.NewConditionExtension(config.Client().Resources()).DaemonSetReady(&ds)
			return ctx, wait.For(ready, wait.WithTimeout(time.Minute*5))
		},
	)

	testenv.Finish(
		func(ctx context.Context, config *envconf.Config) (context.Context, error) {
			// Delete in the opposite order of creation. Reverse a copy so the
			// slice shared with the setup closure is left untouched, and keep
			// the error local instead of writing to TestMain's err.
			reversed := slices.Clone(manifests)
			slices.Reverse(reversed)
			return ctx, fwext.DeleteManifests(config.Client().RESTConfig(), reversed...)
		},
	)

	os.Exit(testenv.Run(m))
}
58 changes: 58 additions & 0 deletions e2e2/test/cases/neuron/manifests/k8s-neuron-device-plugin-rbac.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: neuron-device-plugin
rules:
- apiGroups:
- ""
resources:
- nodes
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- events
verbs:
- create
- patch
- apiGroups:
- ""
resources:
- pods
verbs:
- update
- patch
- get
- list
- watch
- apiGroups:
- ""
resources:
- nodes/status
verbs:
- patch
- update
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: neuron-device-plugin
namespace: kube-system
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: neuron-device-plugin
namespace: kube-system
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: neuron-device-plugin
subjects:
- kind: ServiceAccount
name: neuron-device-plugin
namespace: kube-system
97 changes: 97 additions & 0 deletions e2e2/test/cases/neuron/manifests/k8s-neuron-device-plugin.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: neuron-device-plugin-daemonset
namespace: kube-system
spec:
selector:
matchLabels:
name: neuron-device-plugin-ds
updateStrategy:
type: RollingUpdate
template:
metadata:
# Uncomment the annotation below if k8s version is 1.13 or lower
# annotations:
# scheduler.alpha.kubernetes.io/critical-pod: ""
labels:
name: neuron-device-plugin-ds
spec:
serviceAccount: neuron-device-plugin
tolerations:
- key: CriticalAddonsOnly
operator: Exists
- key: aws.amazon.com/neuron
operator: Exists
effect: NoSchedule
# Mark this pod as a critical add-on; when enabled, the critical add-on
# scheduler reserves resources for critical add-on pods so that they can
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: "system-node-critical"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
# Uncomment following matchExpressions if using k8s 1.16 or lower
#- matchExpressions:
# - key: "beta.kubernetes.io/instance-type"
# operator: In
# values:
# - inf1.xlarge
# - inf1.2xlarge
# - inf1.6xlarge
# - inf1.24xlarge
# - inf2.xlarge
# - inf2.8xlarge
# - inf2.24xlarge
# - inf2.48xlarge
# - trn1.2xlarge
# - trn1.32xlarge
# - trn1n.32xlarge
- matchExpressions:
- key: "node.kubernetes.io/instance-type"
operator: In
values:
- inf1.xlarge
- inf1.2xlarge
- inf1.6xlarge
- inf1.24xlarge
- inf2.xlarge
- inf2.8xlarge
- inf2.24xlarge
- inf2.48xlarge
- trn1.2xlarge
- trn1.32xlarge
- trn1n.32xlarge
containers:
# Find all neuron-device-plugin images at https://gallery.ecr.aws/neuron/neuron-device-plugin
- image: public.ecr.aws/neuron/neuron-device-plugin:2.19.16.0
imagePullPolicy: Always
name: neuron-device-plugin
env:
- name: KUBECONFIG
value: /etc/kubernetes/kubelet.conf
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: infa-map
mountPath: /run
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
- name: infa-map
hostPath:
path: /run



30 changes: 30 additions & 0 deletions e2e2/test/cases/neuron/manifests/single-node-test-neuronx.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
kind: Job
apiVersion: batch/v1
metadata:
name: neuronx-single-node
labels:
app: neuronx-single-node
spec:
template:
metadata:
labels:
app: neuronx-single-node
spec:
containers:
- name: neuronx-single-node-test
image: {{.NeuronTestImage}}
command:
- /bin/bash
- ./pytorch_tests/singleNodeTest.sh
imagePullPolicy: Always
resources:
limits:
cpu: "4"
memory: 4Gi
aws.amazon.com/neuron: "1"
requests:
cpu: "1"
memory: 1Gi
aws.amazon.com/neuron: "1"
restartPolicy: Never
backoffLimit: 4
70 changes: 70 additions & 0 deletions e2e2/test/cases/neuron/neuron_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
package nvidia

import (
"context"
_ "embed"
"fmt"
"testing"
"time"

fwext "github.com/aws/aws-k8s-tester/e2e2/internal/framework_extensions"
"sigs.k8s.io/e2e-framework/klient/wait"
"sigs.k8s.io/e2e-framework/pkg/envconf"
"sigs.k8s.io/e2e-framework/pkg/features"

batchv1 "k8s.io/api/batch/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

var (
// neuronSingleNodeManifest is the embedded Job manifest template for the
// single-node test; it contains a {{.NeuronTestImage}} placeholder.
//go:embed manifests/single-node-test-neuronx.yaml
neuronSingleNodeManifest []byte
// renderedNeuronSingleNodeManifest is the package-level slot for the rendered
// manifest; the test's Teardown step passes it to DeleteManifests.
renderedNeuronSingleNodeManifest []byte
)

// neuronSingleNodeManifestTplVars carries the values passed to
// fwext.RenderManifests when rendering the single-node test Job manifest.
type neuronSingleNodeManifestTplVars struct {
// NeuronTestImage fills the {{.NeuronTestImage}} placeholder, i.e. the
// container image of the test Job.
NeuronTestImage string
}

// TestMPIJobPytorchTraining runs the single-node Neuron test Job and asserts
// that it succeeds.
//
// Setup requires -neuronTestImage to be set, renders the embedded Job manifest
// with that image and applies it. The assessment waits up to 20 minutes for
// the "neuronx-single-node" Job in the "default" namespace to succeed.
// Teardown deletes the rendered manifest again.
func TestMPIJobPytorchTraining(t *testing.T) {
	singleNode := features.New("single-node").
		WithLabel("suite", "neuron").
		WithLabel("hardware", "gpu").
		Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
			if *neuronTestImage == "" {
				t.Fatal(fmt.Errorf("neuronTestImage must be set to run neuron single node test, use https://github.com/aws/aws-k8s-tester/blob/main/e2e2/test/images/Dockerfile.neuronx-tests to build the image and -neuronTestImage to set the image url"))
			}
			// Assign with "=" rather than ":=" so the package-level variable
			// is populated; a shadowing local would leave Teardown with an
			// empty manifest and the Job would never be deleted.
			var err error
			renderedNeuronSingleNodeManifest, err = fwext.RenderManifests(neuronSingleNodeManifest, neuronSingleNodeManifestTplVars{
				NeuronTestImage: *neuronTestImage,
			})
			if err != nil {
				t.Fatal(err)
			}
			if err := fwext.ApplyManifests(cfg.Client().RESTConfig(), renderedNeuronSingleNodeManifest); err != nil {
				t.Fatal(err)
			}
			return ctx
		}).
		Assess("Single node test Job succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
			job := &batchv1.Job{
				ObjectMeta: metav1.ObjectMeta{Name: "neuronx-single-node", Namespace: "default"},
			}
			succeeded := fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job)
			if err := wait.For(succeeded, wait.WithTimeout(time.Minute*20)); err != nil {
				t.Fatal(err)
			}
			return ctx
		}).
		Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
			if err := fwext.DeleteManifests(cfg.Client().RESTConfig(), renderedNeuronSingleNodeManifest); err != nil {
				t.Fatal(err)
			}
			return ctx
		}).
		Feature()

	testenv.Test(t, singleNode)
}
5 changes: 5 additions & 0 deletions e2e2/test/images/Dockerfile.neuronx-tests
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Start with the Neuron base image
FROM public.ecr.aws/neuron/pytorch-training-neuronx:2.1.2-neuronx-py310-sdk2.18.2-ubuntu20.04

WORKDIR /
COPY e2e2/test/images/pytorch_tests/ ./pytorch_tests
5 changes: 5 additions & 0 deletions e2e2/test/images/pytorch_tests/singleNodeTest.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/usr/bin/env bash

# Runs the single-node Neuron PyTorch tests one after another with torchrun
# (2 processes per node, 1 node). Test paths are relative to the working
# directory.
#
# Abort on the first failing test: otherwise the script's exit status is that
# of the last torchrun only, and an earlier failure would go unnoticed by the
# Job that runs this script.
set -e

torchrun --nproc_per_node=2 --nnodes=1 pytorch_tests/testNeuronSingleAllReduce.py
torchrun --nproc_per_node=2 --nnodes=1 pytorch_tests/testNeuronParallelState.py
torchrun --nproc_per_node=2 --nnodes=1 pytorch_tests/testNeuronMlp.py
Loading

0 comments on commit 26c6501

Please sign in to comment.