From 5f4c1e10ae156b5273800fed558d9743bafb4a82 Mon Sep 17 00:00:00 2001 From: "Kasravi, Kam D" Date: Wed, 1 Jul 2020 09:24:57 -0700 Subject: [PATCH] k8s kustomize refactoring and improvements --- examples/common/KubernetesMlOps.md | 28 ++++++++++ .../k8s/mlops/base/kustomization.yaml | 0 examples/common/k8s/mlops/base/mlops.env | 5 ++ .../fp32 => common}/k8s/mlops/base/mlops.yaml | 0 .../mlops/base/persistent-volume-claim.yaml | 0 .../k8s/mlops/base/persistent-volume.yaml | 0 .../k8s/mlops/base/service-account.yaml | 0 .../k8s/mlops/multi-node/kustomization.yaml | 8 +++ .../k8s/mlops/multi-node/mlops.yaml | 2 + .../k8s/mlops/multi-node/mpi-job.yaml | 18 +----- .../k8s/mlops/single-node/kustomization.yaml | 8 +++ .../k8s/mlops/single-node/mlops.yaml | 2 + .../common/k8s/mlops/single-node/pod.yaml | 16 ++++++ .../training/fp32/k8s/mlops/base/mlops.env | 4 -- .../k8s/mlops/multi-node/kustomization.yaml | 24 +++++--- .../fp32/k8s/mlops/multi-node/mlops.env | 2 + .../k8s/mlops/multi-node/mpi-job-patch.yaml | 8 +-- .../fp32/k8s/mlops/multi-node/mpi-job.yaml | 56 ------------------- .../k8s/mlops/single-node/kustomization.yaml | 24 +++++--- .../fp32/k8s/mlops/single-node/mlops.env | 2 + .../fp32/k8s/mlops/single-node/mlops.yaml | 11 ---- .../fp32/k8s/mlops/single-node/pod-patch.yaml | 10 +++- .../fp32/k8s/mlops/single-node/pod.yaml | 25 --------- .../fp32/k8s/mlops/base/kustomization.yaml | 35 ------------ .../training/fp32/k8s/mlops/base/mlops.env | 4 -- .../training/fp32/k8s/mlops/base/mlops.yaml | 7 --- .../mlops/base/persistent-volume-claim.yaml | 24 -------- .../k8s/mlops/base/persistent-volume.yaml | 26 --------- .../fp32/k8s/mlops/base/service-account.yaml | 4 -- .../k8s/mlops/multi-node/kustomization.yaml | 24 +++++--- .../fp32/k8s/mlops/multi-node/mlops.env | 2 + .../fp32/k8s/mlops/multi-node/mlops.yaml | 13 ----- .../k8s/mlops/multi-node/mpi-job-patch.yaml | 9 +-- .../k8s/mlops/single-node/kustomization.yaml | 24 +++++--- .../fp32/k8s/mlops/single-node/mlops.env 
| 2 + .../fp32/k8s/mlops/single-node/pod-patch.yaml | 9 ++- .../fp32/k8s/mlops/single-node/pod.yaml | 25 --------- 37 files changed, 159 insertions(+), 302 deletions(-) create mode 100644 examples/common/KubernetesMlOps.md rename examples/{image_recognition/tensorflow/resnet50v1_5/training/fp32 => common}/k8s/mlops/base/kustomization.yaml (100%) create mode 100644 examples/common/k8s/mlops/base/mlops.env rename examples/{image_recognition/tensorflow/resnet50v1_5/training/fp32 => common}/k8s/mlops/base/mlops.yaml (100%) rename examples/{image_recognition/tensorflow/resnet50v1_5/training/fp32 => common}/k8s/mlops/base/persistent-volume-claim.yaml (100%) rename examples/{image_recognition/tensorflow/resnet50v1_5/training/fp32 => common}/k8s/mlops/base/persistent-volume.yaml (100%) rename examples/{image_recognition/tensorflow/resnet50v1_5/training/fp32 => common}/k8s/mlops/base/service-account.yaml (100%) create mode 100644 examples/common/k8s/mlops/multi-node/kustomization.yaml rename examples/{image_recognition/tensorflow/resnet50v1_5/training/fp32 => common}/k8s/mlops/multi-node/mlops.yaml (93%) rename examples/{language_modeling/tensorflow/bert_large/training/fp32 => common}/k8s/mlops/multi-node/mpi-job.yaml (59%) create mode 100644 examples/common/k8s/mlops/single-node/kustomization.yaml rename examples/{language_modeling/tensorflow/bert_large/training/fp32 => common}/k8s/mlops/single-node/mlops.yaml (88%) create mode 100644 examples/common/k8s/mlops/single-node/pod.yaml delete mode 100644 examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/base/mlops.env delete mode 100644 examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/multi-node/mpi-job.yaml delete mode 100644 examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/single-node/mlops.yaml delete mode 100644 examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/single-node/pod.yaml delete mode 100644 
examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/kustomization.yaml delete mode 100644 examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/mlops.env delete mode 100644 examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/mlops.yaml delete mode 100644 examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/persistent-volume-claim.yaml delete mode 100644 examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/persistent-volume.yaml delete mode 100644 examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/service-account.yaml delete mode 100644 examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/multi-node/mlops.yaml delete mode 100644 examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/single-node/pod.yaml diff --git a/examples/common/KubernetesMlOps.md b/examples/common/KubernetesMlOps.md new file mode 100644 index 000000000..e63f14454 --- /dev/null +++ b/examples/common/KubernetesMlOps.md @@ -0,0 +1,28 @@ +# Kubernetes Mlops + +## Using NFS as storage + +Configuring the examples to use NFS as storage requires specifying these values in the mlops.env within the tree shown below: + +``` +examples +└── common + └── k8s + └── mlops + ├── base + │   └── mlops.env + ├── multi-node + └── single-node +``` + +The NFS related values within mlops.env are shown below: + +``` +NFS_PATH=/exported_users +NFS_MOUNT_PATH=/home +NFS_SERVER=0.0.0.0 +``` + +They should reflect values specific to your NFS implementation. NFS_PATH and NFS_SERVER are typically found in /etc/mtab +and are NFS server values. NFS_MOUNT_PATH is an NFS client option indicating where the exported file system is mounted. 
+ diff --git a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/base/kustomization.yaml b/examples/common/k8s/mlops/base/kustomization.yaml similarity index 100% rename from examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/base/kustomization.yaml rename to examples/common/k8s/mlops/base/kustomization.yaml diff --git a/examples/common/k8s/mlops/base/mlops.env b/examples/common/k8s/mlops/base/mlops.env new file mode 100644 index 000000000..99a57ab5f --- /dev/null +++ b/examples/common/k8s/mlops/base/mlops.env @@ -0,0 +1,5 @@ +DATASET_DIR=/dataset +NFS_PATH=/exported_users +NFS_MOUNT_PATH=/home +NFS_SERVER=0.0.0.0 +OUTPUT_DIR=/workspace/output diff --git a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/base/mlops.yaml b/examples/common/k8s/mlops/base/mlops.yaml similarity index 100% rename from examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/base/mlops.yaml rename to examples/common/k8s/mlops/base/mlops.yaml diff --git a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/base/persistent-volume-claim.yaml b/examples/common/k8s/mlops/base/persistent-volume-claim.yaml similarity index 100% rename from examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/base/persistent-volume-claim.yaml rename to examples/common/k8s/mlops/base/persistent-volume-claim.yaml diff --git a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/base/persistent-volume.yaml b/examples/common/k8s/mlops/base/persistent-volume.yaml similarity index 100% rename from examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/base/persistent-volume.yaml rename to examples/common/k8s/mlops/base/persistent-volume.yaml diff --git a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/base/service-account.yaml b/examples/common/k8s/mlops/base/service-account.yaml similarity index 100% rename from 
examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/base/service-account.yaml rename to examples/common/k8s/mlops/base/service-account.yaml diff --git a/examples/common/k8s/mlops/multi-node/kustomization.yaml b/examples/common/k8s/mlops/multi-node/kustomization.yaml new file mode 100644 index 000000000..2d5c6bf2d --- /dev/null +++ b/examples/common/k8s/mlops/multi-node/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +bases: +- ../base +resources: +- mpi-job.yaml +configurations: +- mlops.yaml diff --git a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/multi-node/mlops.yaml b/examples/common/k8s/mlops/multi-node/mlops.yaml similarity index 93% rename from examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/multi-node/mlops.yaml rename to examples/common/k8s/mlops/multi-node/mlops.yaml index 9b6c67dc6..f42a6e3da 100644 --- a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/multi-node/mlops.yaml +++ b/examples/common/k8s/mlops/multi-node/mlops.yaml @@ -1,4 +1,6 @@ varReference: +- kind: MPIJob + path: metadata/name - kind: MPIJob path: spec/mpiReplicaSpecs/Launcher/template/spec/containers/image - kind: MPIJob diff --git a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/multi-node/mpi-job.yaml b/examples/common/k8s/mlops/multi-node/mpi-job.yaml similarity index 59% rename from examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/multi-node/mpi-job.yaml rename to examples/common/k8s/mlops/multi-node/mpi-job.yaml index ae3db8c3c..0726596bd 100644 --- a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/multi-node/mpi-job.yaml +++ b/examples/common/k8s/mlops/multi-node/mpi-job.yaml @@ -1,7 +1,7 @@ apiVersion: kubeflow.org/v1alpha2 kind: MPIJob metadata: - name: bert-large-fp32-training + name: $(MODEL_NAME) spec: slotsPerWorker: 1 cleanPodPolicy: Running 
@@ -13,17 +13,8 @@ spec: serviceAccountName: model-service containers: - name: mpi-launcher - image: $(REGISTRY)/model-zoo:2.1.0-language-modeling-bert-large-fp32-training imagePullPolicy: Always workingDir: / - command: - - $(MODEL_DIR)/examples/fp32_training_multi_node.sh - volumeMounts: - - name: datasets - mountPath: $(DATASET_DIR) - readOnly: true - - name: users - mountPath: /Users volumes: - name: datasets persistentVolumeClaim: @@ -38,15 +29,8 @@ spec: serviceAccountName: model-service containers: - name: mpi-worker - image: $(REGISTRY)/model-zoo:2.1.0-language-modeling-bert-large-fp32-training imagePullPolicy: Always workingDir: / - volumeMounts: - - name: datasets - mountPath: $(DATASET_DIR) - readOnly: true - - name: users - mountPath: /Users volumes: - name: datasets persistentVolumeClaim: diff --git a/examples/common/k8s/mlops/single-node/kustomization.yaml b/examples/common/k8s/mlops/single-node/kustomization.yaml new file mode 100644 index 000000000..cebb0b2a3 --- /dev/null +++ b/examples/common/k8s/mlops/single-node/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +bases: +- ../base +resources: +- pod.yaml +configurations: +- mlops.yaml diff --git a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/single-node/mlops.yaml b/examples/common/k8s/mlops/single-node/mlops.yaml similarity index 88% rename from examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/single-node/mlops.yaml rename to examples/common/k8s/mlops/single-node/mlops.yaml index ad885f2dd..e5d002884 100644 --- a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/single-node/mlops.yaml +++ b/examples/common/k8s/mlops/single-node/mlops.yaml @@ -1,6 +1,8 @@ varReference: - kind: PersistentVolume path: spec/hostPath/path +- kind: Pod + path: metadata/name - kind: Pod path: spec/containers/image - kind: Pod diff --git a/examples/common/k8s/mlops/single-node/pod.yaml 
b/examples/common/k8s/mlops/single-node/pod.yaml new file mode 100644 index 000000000..ea0d29da2 --- /dev/null +++ b/examples/common/k8s/mlops/single-node/pod.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Pod +metadata: + name: $(MODEL_NAME) +spec: + serviceAccountName: model-service + containers: + - name: single-node + imagePullPolicy: Always + volumes: + - name: users + persistentVolumeClaim: + claimName: users-pvc + - name: datasets + persistentVolumeClaim: + claimName: datasets-pvc diff --git a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/base/mlops.env b/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/base/mlops.env deleted file mode 100644 index a3e6d56c9..000000000 --- a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/base/mlops.env +++ /dev/null @@ -1,4 +0,0 @@ -DATASET_DIR=/tf_dataset -NFS_PATH=/aipg_lab_home_pool_01 -NFS_SERVER=10.38.210.18 -OUTPUT_DIR=/workspace/output diff --git a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/multi-node/kustomization.yaml b/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/multi-node/kustomization.yaml index c330d9687..ae3e27c8b 100644 --- a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/multi-node/kustomization.yaml +++ b/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/multi-node/kustomization.yaml @@ -1,13 +1,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization bases: -- ../base -resources: -- mpi-job.yaml -images: -- name: model-zoo - newName: model-zoo - newTag: 2.1.0-image-recognition-resnet50v1-5-fp32-training +- ../../../common/k8s/mlops/multi-node configMapGenerator: - name: mlops-env env: mlops.env @@ -17,6 +11,13 @@ generatorOptions: patchesStrategicMerge: - mpi-job-patch.yaml vars: +- name: MODEL_NAME + fieldref: + fieldPath: data.MODEL_NAME + objref: + apiVersion: v1 + kind: ConfigMap + name: mlops-env 
- name: MODEL_DIR fieldref: fieldPath: data.MODEL_DIR @@ -24,6 +25,13 @@ vars: apiVersion: v1 kind: ConfigMap name: mlops-env +- name: NFS_MOUNT_PATH + fieldref: + fieldPath: data.NFS_MOUNT_PATH + objref: + apiVersion: v1 + kind: ConfigMap + name: mlops-env - name: REGISTRY fieldref: fieldPath: data.REGISTRY @@ -31,5 +39,3 @@ vars: apiVersion: v1 kind: ConfigMap name: mlops-env -configurations: -- mlops.yaml diff --git a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/multi-node/mlops.env b/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/multi-node/mlops.env index 2ed8294d5..08694f962 100644 --- a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/multi-node/mlops.env +++ b/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/multi-node/mlops.env @@ -1,4 +1,6 @@ DATASET_DIR=/tf_dataset +MODEL_NAME=resnet50v1-5-fp32-training MODEL_DIR=/workspace/resnet50v1_5_fp32_training +NFS_MOUNT_PATH=/Users OUTPUT_DIR=/tmp/output REGISTRY=amr-registry.caas.intel.com/aipg-tf diff --git a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/multi-node/mpi-job-patch.yaml b/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/multi-node/mpi-job-patch.yaml index 094301f3e..10b74267a 100644 --- a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/multi-node/mpi-job-patch.yaml +++ b/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/multi-node/mpi-job-patch.yaml @@ -1,7 +1,7 @@ apiVersion: kubeflow.org/v1alpha2 kind: MPIJob metadata: - name: resnet50v1-5-fp32-training + name: $(MODEL_NAME) spec: mpiReplicaSpecs: Launcher: @@ -10,7 +10,6 @@ spec: containers: - name: mpi-launcher image: $(REGISTRY)/model-zoo:2.1.0-image-recognition-resnet50v1-5-fp32-training - imagePullPolicy: Always securityContext: runAsUser: 0 runAsGroup: 0 @@ -18,7 +17,6 @@ spec: envFrom: - configMapRef: name: mlops-env - 
workingDir: / command: - $(MODEL_DIR)/examples/fp32_training_multi_node.sh volumeMounts: @@ -26,7 +24,7 @@ spec: mountPath: $(DATASET_DIR) readOnly: true - name: users - mountPath: /Users + mountPath: $(NFS_MOUNT_PATH) Worker: template: spec: @@ -46,4 +44,4 @@ spec: mountPath: $(DATASET_DIR) readOnly: true - name: users - mountPath: /Users + mountPath: $(NFS_MOUNT_PATH) diff --git a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/multi-node/mpi-job.yaml b/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/multi-node/mpi-job.yaml deleted file mode 100644 index 27feb3bab..000000000 --- a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/multi-node/mpi-job.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: kubeflow.org/v1alpha2 -kind: MPIJob -metadata: - name: resnet50v1-5-fp32-training -spec: - slotsPerWorker: 1 - cleanPodPolicy: Running - mpiReplicaSpecs: - Launcher: - replicas: 1 - template: - spec: - serviceAccountName: model-service - containers: - - name: mpi-launcher - image: $(REGISTRY)/model-zoo:2.1.0-image-recognition-resnet50v1-5-fp32-training - imagePullPolicy: Always - workingDir: / - command: - - /workspace/resnet50v1_5_fp32_training/examples/fp32_training_multi_node.sh - volumeMounts: - - name: datasets - mountPath: $(DATASET_DIR) - readOnly: true - - name: users - mountPath: /Users - volumes: - - name: datasets - persistentVolumeClaim: - claimName: datasets-pvc - - name: users - persistentVolumeClaim: - claimName: users-pvc - Worker: - replicas: 2 - template: - spec: - serviceAccountName: model-service - containers: - - name: mpi-worker - image: $(REGISTRY)/model-zoo:2.1.0-image-recognition-resnet50v1-5-fp32-training - imagePullPolicy: Always - workingDir: / - volumeMounts: - - name: datasets - mountPath: $(DATASET_DIR) - readOnly: true - - name: users - mountPath: /Users - volumes: - - name: datasets - persistentVolumeClaim: - claimName: datasets-pvc - - name: users - 
persistentVolumeClaim: - claimName: users-pvc diff --git a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/single-node/kustomization.yaml b/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/single-node/kustomization.yaml index c08d952f3..7a201306a 100644 --- a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/single-node/kustomization.yaml +++ b/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/single-node/kustomization.yaml @@ -1,13 +1,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization bases: -- ../base -resources: -- pod.yaml -images: -- name: model-zoo - newName: model-zoo - newTag: 2.1.0-image-recognition-resnet50v1-5-fp32-training +- ../../../common/k8s/mlops/single-node configMapGenerator: - name: mlops-env env: mlops.env @@ -17,6 +11,13 @@ generatorOptions: patchesStrategicMerge: - pod-patch.yaml vars: +- name: MODEL_NAME + fieldref: + fieldPath: data.MODEL_NAME + objref: + apiVersion: v1 + kind: ConfigMap + name: mlops-env - name: MODEL_DIR fieldref: fieldPath: data.MODEL_DIR @@ -24,6 +25,13 @@ vars: apiVersion: v1 kind: ConfigMap name: mlops-env +- name: NFS_MOUNT_PATH + fieldref: + fieldPath: data.NFS_MOUNT_PATH + objref: + apiVersion: v1 + kind: ConfigMap + name: mlops-env - name: REGISTRY fieldref: fieldPath: data.REGISTRY @@ -31,5 +39,3 @@ vars: apiVersion: v1 kind: ConfigMap name: mlops-env -configurations: -- mlops.yaml diff --git a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/single-node/mlops.env b/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/single-node/mlops.env index 2ed8294d5..62737d4f0 100644 --- a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/single-node/mlops.env +++ b/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/single-node/mlops.env @@ -1,4 +1,6 @@ DATASET_DIR=/tf_dataset 
+MODEL_NAME=resnet50v1-5-fp32-training MODEL_DIR=/workspace/resnet50v1_5_fp32_training +NFS_MOUNT_PATH=/Users OUTPUT_DIR=/tmp/output REGISTRY=amr-registry.caas.intel.com/aipg-tf diff --git a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/single-node/mlops.yaml b/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/single-node/mlops.yaml deleted file mode 100644 index ad885f2dd..000000000 --- a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/single-node/mlops.yaml +++ /dev/null @@ -1,11 +0,0 @@ -varReference: -- kind: PersistentVolume - path: spec/hostPath/path -- kind: Pod - path: spec/containers/image -- kind: Pod - path: spec/containers/workingDir -- kind: Pod - path: spec/containers/command -- kind: Pod - path: spec/containers/volumeMounts/mountPath diff --git a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/single-node/pod-patch.yaml b/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/single-node/pod-patch.yaml index 910d92845..2d59383db 100644 --- a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/single-node/pod-patch.yaml +++ b/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/single-node/pod-patch.yaml @@ -1,7 +1,7 @@ apiVersion: v1 kind: Pod metadata: - name: resnet50v1-5-fp32-training + name: $(MODEL_NAME) spec: securityContext: runAsUser: 0 @@ -16,4 +16,10 @@ spec: name: mlops-env workingDir: / command: - - $(MODEL_DIR)/examples/fp32_training_multi_node.sh + - $(MODEL_DIR)/examples/fp32_training_demo.sh + volumeMounts: + - name: datasets + mountPath: $(DATASET_DIR) + readOnly: true + - name: users + mountPath: $(NFS_MOUNT_PATH) diff --git a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/single-node/pod.yaml b/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/single-node/pod.yaml deleted file mode 100644 index 
aad196eb0..000000000 --- a/examples/image_recognition/tensorflow/resnet50v1_5/training/fp32/k8s/mlops/single-node/pod.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: resnet50v1-5-fp32-training -spec: - serviceAccountName: model-service - containers: - - name: single-node - image: $(REGISTRY)/model-zoo:2.1.0-image-recognition-resnet50v1-5-fp32-training - imagePullPolicy: Always - command: - - $(MODEL_DIR)/examples/tensorflow/resnet50v1_5/fp32_training_full.sh - volumeMounts: - - name: datasets - mountPath: $(DATASET_DIR) - readOnly: true - - name: users - mountPath: /Users - volumes: - - name: users - persistentVolumeClaim: - claimName: users-pvc - - name: datasets - persistentVolumeClaim: - claimName: datasets-pvc diff --git a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/kustomization.yaml b/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/kustomization.yaml deleted file mode 100644 index efd184f9c..000000000 --- a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/kustomization.yaml +++ /dev/null @@ -1,35 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -resources: -- persistent-volume.yaml -- persistent-volume-claim.yaml -- service-account.yaml -configMapGenerator: -- name: mlops-env - env: mlops.env -generatorOptions: - disableNameSuffixHash: true -vars: -- name: DATASET_DIR - fieldref: - fieldPath: data.DATASET_DIR - objref: - apiVersion: v1 - kind: ConfigMap - name: mlops-env -- name: NFS_SERVER - fieldref: - fieldPath: data.NFS_SERVER - objref: - apiVersion: v1 - kind: ConfigMap - name: mlops-env -- name: NFS_PATH - fieldref: - fieldPath: data.NFS_PATH - objref: - apiVersion: v1 - kind: ConfigMap - name: mlops-env -configurations: -- mlops.yaml diff --git a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/mlops.env 
b/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/mlops.env deleted file mode 100644 index a3e6d56c9..000000000 --- a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/mlops.env +++ /dev/null @@ -1,4 +0,0 @@ -DATASET_DIR=/tf_dataset -NFS_PATH=/aipg_lab_home_pool_01 -NFS_SERVER=10.38.210.18 -OUTPUT_DIR=/workspace/output diff --git a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/mlops.yaml b/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/mlops.yaml deleted file mode 100644 index 552e821c9..000000000 --- a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/mlops.yaml +++ /dev/null @@ -1,7 +0,0 @@ -varReference: -- kind: PersistentVolume - path: spec/hostPath/path -- kind: PersistentVolume - path: spec/nfs/server -- kind: PersistentVolume - path: spec/nfs/path diff --git a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/persistent-volume-claim.yaml b/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/persistent-volume-claim.yaml deleted file mode 100644 index 6d025f709..000000000 --- a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/persistent-volume-claim.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: datasets-pvc -spec: - accessModes: - - ReadOnlyMany - resources: - requests: - storage: 5Gi ---- -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: users-pvc -spec: - accessModes: - - ReadWriteMany - resources: - requests: - storage: 6Gi - selector: - matchLabels: - storage-class: users-nfs diff --git a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/persistent-volume.yaml b/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/persistent-volume.yaml deleted file mode 100644 index aab03062c..000000000 --- 
a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/persistent-volume.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: v1 -kind: PersistentVolume -metadata: - name: datasets-pv -spec: - accessModes: - - ReadOnlyMany - capacity: - storage: 5Gi - hostPath: - path: $(DATASET_DIR) ---- -apiVersion: v1 -kind: PersistentVolume -metadata: - name: users-pv - labels: - storage-class: users-nfs -spec: - accessModes: - - ReadWriteMany - capacity: - storage: 6Gi - nfs: - server: $(NFS_SERVER) - path: $(NFS_PATH) diff --git a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/service-account.yaml b/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/service-account.yaml deleted file mode 100644 index e7fdbd2e1..000000000 --- a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/base/service-account.yaml +++ /dev/null @@ -1,4 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - name: model-service diff --git a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/multi-node/kustomization.yaml b/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/multi-node/kustomization.yaml index 7e772de43..ae3e27c8b 100644 --- a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/multi-node/kustomization.yaml +++ b/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/multi-node/kustomization.yaml @@ -1,13 +1,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization bases: -- ../base -resources: -- mpi-job.yaml -images: -- name: model-zoo - newName: model-zoo - newTag: 2.1.0-language-modeling-bert-large-fp32-training +- ../../../common/k8s/mlops/multi-node configMapGenerator: - name: mlops-env env: mlops.env @@ -17,6 +11,13 @@ generatorOptions: patchesStrategicMerge: - mpi-job-patch.yaml vars: +- name: MODEL_NAME + fieldref: + fieldPath: data.MODEL_NAME + objref: + apiVersion: v1 + kind: ConfigMap 
+ name: mlops-env - name: MODEL_DIR fieldref: fieldPath: data.MODEL_DIR @@ -24,6 +25,13 @@ vars: apiVersion: v1 kind: ConfigMap name: mlops-env +- name: NFS_MOUNT_PATH + fieldref: + fieldPath: data.NFS_MOUNT_PATH + objref: + apiVersion: v1 + kind: ConfigMap + name: mlops-env - name: REGISTRY fieldref: fieldPath: data.REGISTRY @@ -31,5 +39,3 @@ vars: apiVersion: v1 kind: ConfigMap name: mlops-env -configurations: -- mlops.yaml diff --git a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/multi-node/mlops.env b/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/multi-node/mlops.env index f771e3d1f..98079e978 100644 --- a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/multi-node/mlops.env +++ b/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/multi-node/mlops.env @@ -1,4 +1,6 @@ DATASET_DIR=/tf_dataset +MODEL_NAME=bert-large-fp32-training MODEL_DIR=/workspace/bert-large-fp32-training +NFS_MOUNT_PATH=/Users OUTPUT_DIR=/tmp/output REGISTRY=amr-registry.caas.intel.com/aipg-tf diff --git a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/multi-node/mlops.yaml b/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/multi-node/mlops.yaml deleted file mode 100644 index 9b6c67dc6..000000000 --- a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/multi-node/mlops.yaml +++ /dev/null @@ -1,13 +0,0 @@ -varReference: -- kind: MPIJob - path: spec/mpiReplicaSpecs/Launcher/template/spec/containers/image -- kind: MPIJob - path: spec/mpiReplicaSpecs/Launcher/template/spec/containers/command -- kind: MPIJob - path: spec/mpiReplicaSpecs/Launcher/template/spec/containers/volumeMounts/mountPath -- kind: MPIJob - path: spec/mpiReplicaSpecs/Worker/template/spec/containers/image -- kind: MPIJob - path: spec/mpiReplicaSpecs/Worker/template/spec/containers/command -- kind: MPIJob - path: 
spec/mpiReplicaSpecs/Worker/template/spec/containers/volumeMounts/mountPath diff --git a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/multi-node/mpi-job-patch.yaml b/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/multi-node/mpi-job-patch.yaml index cd772a3fe..793b4ec50 100644 --- a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/multi-node/mpi-job-patch.yaml +++ b/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/multi-node/mpi-job-patch.yaml @@ -1,7 +1,7 @@ apiVersion: kubeflow.org/v1alpha2 kind: MPIJob metadata: - name: bert-large-fp32-training + name: $(MODEL_NAME) spec: mpiReplicaSpecs: Launcher: @@ -10,7 +10,6 @@ spec: containers: - name: mpi-launcher image: $(REGISTRY)/model-zoo:2.1.0-language-modeling-bert-large-fp32-training - imagePullPolicy: Always securityContext: runAsUser: 0 runAsGroup: 0 @@ -18,7 +17,6 @@ spec: envFrom: - configMapRef: name: mlops-env - workingDir: / command: - $(MODEL_DIR)/examples/fp32_training_multi_node.sh volumeMounts: @@ -26,7 +24,7 @@ spec: mountPath: $(DATASET_DIR) readOnly: true - name: users - mountPath: /Users + mountPath: $(NFS_MOUNT_PATH) Worker: template: spec: @@ -40,10 +38,9 @@ spec: envFrom: - configMapRef: name: mlops-env - workingDir: / volumeMounts: - name: datasets mountPath: $(DATASET_DIR) readOnly: true - name: users - mountPath: /Users + mountPath: $(NFS_MOUNT_PATH) diff --git a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/single-node/kustomization.yaml b/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/single-node/kustomization.yaml index baaaa482d..7a201306a 100644 --- a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/single-node/kustomization.yaml +++ b/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/single-node/kustomization.yaml @@ -1,13 +1,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: 
Kustomization bases: -- ../base -resources: -- pod.yaml -images: -- name: model-zoo - newName: model-zoo - newTag: 2.1.0-language-modeling-bert-large-fp32-training +- ../../../common/k8s/mlops/single-node configMapGenerator: - name: mlops-env env: mlops.env @@ -17,6 +11,13 @@ generatorOptions: patchesStrategicMerge: - pod-patch.yaml vars: +- name: MODEL_NAME + fieldref: + fieldPath: data.MODEL_NAME + objref: + apiVersion: v1 + kind: ConfigMap + name: mlops-env - name: MODEL_DIR fieldref: fieldPath: data.MODEL_DIR @@ -24,6 +25,13 @@ vars: apiVersion: v1 kind: ConfigMap name: mlops-env +- name: NFS_MOUNT_PATH + fieldref: + fieldPath: data.NFS_MOUNT_PATH + objref: + apiVersion: v1 + kind: ConfigMap + name: mlops-env - name: REGISTRY fieldref: fieldPath: data.REGISTRY @@ -31,5 +39,3 @@ vars: apiVersion: v1 kind: ConfigMap name: mlops-env -configurations: -- mlops.yaml diff --git a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/single-node/mlops.env b/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/single-node/mlops.env index f771e3d1f..98079e978 100644 --- a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/single-node/mlops.env +++ b/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/single-node/mlops.env @@ -1,4 +1,6 @@ DATASET_DIR=/tf_dataset +MODEL_NAME=bert-large-fp32-training MODEL_DIR=/workspace/bert-large-fp32-training +NFS_MOUNT_PATH=/Users OUTPUT_DIR=/tmp/output REGISTRY=amr-registry.caas.intel.com/aipg-tf diff --git a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/single-node/pod-patch.yaml b/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/single-node/pod-patch.yaml index d60832a52..7e37ecd43 100644 --- a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/single-node/pod-patch.yaml +++ b/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/single-node/pod-patch.yaml @@ 
-1,7 +1,7 @@ apiVersion: v1 kind: Pod metadata: - name: bert-large-fp32-training + name: $(MODEL_NAME) spec: securityContext: runAsUser: 0 @@ -10,10 +10,15 @@ spec: containers: - name: single-node image: $(REGISTRY)/model-zoo:2.1.0-language-modeling-bert-fp32-training - imagePullPolicy: Always envFrom: - configMapRef: name: mlops-env workingDir: / command: - $(MODEL_DIR)/examples/fp32_training_single_node.sh + volumeMounts: + - name: datasets + mountPath: $(DATASET_DIR) + readOnly: true + - name: users + mountPath: $(NFS_MOUNT_PATH) diff --git a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/single-node/pod.yaml b/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/single-node/pod.yaml deleted file mode 100644 index e3fb44c47..000000000 --- a/examples/language_modeling/tensorflow/bert_large/training/fp32/k8s/mlops/single-node/pod.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: bert-large-fp32-training -spec: - serviceAccountName: model-service - containers: - - name: single-node - image: $(REGISTRY)/model-zoo:2.1.0-language-modeling-bert-large-fp32-training - imagePullPolicy: Always - command: - - $(MODEL_DIR)/bert-large-fp32-training/examples/fp32_training_single_node.sh - volumeMounts: - - name: datasets - mountPath: $(DATASET_DIR) - readOnly: true - - name: users - mountPath: /Users - volumes: - - name: users - persistentVolumeClaim: - claimName: users-pvc - - name: datasets - persistentVolumeClaim: - claimName: datasets-pvc