Skip to content

Commit

Permalink
Merge branch 'kdkasrav/k8s-improvements' into 'develop'
Browse files Browse the repository at this point in the history
k8s kustomize refactoring and improvements

See merge request intelai/models!218
  • Loading branch information
Kasravi, Kam D committed Jul 1, 2020
2 parents c792b90 + 5f4c1e1 commit 755d932
Show file tree
Hide file tree
Showing 37 changed files with 159 additions and 302 deletions.
28 changes: 28 additions & 0 deletions examples/common/KubernetesMlOps.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Kubernetes MLOps

## Using NFS as storage

Configuring the examples to use NFS as storage requires specifying these values in the `mlops.env` file located in the tree shown below:

```
examples
└── common
    └── k8s
        └── mlops
            ├── base
            │   └── mlops.env
            ├── multi-node
            └── single-node
```

The NFS related values within mlops.env are shown below:

```
NFS_PATH=/exported_users
NFS_MOUNT_PATH=/home
NFS_SERVER=0.0.0.0
```

These values should reflect your own NFS setup. NFS_PATH and NFS_SERVER are NFS server values and can typically be found in /etc/mtab.
NFS_MOUNT_PATH is an NFS client option indicating where the exported file system is mounted.

5 changes: 5 additions & 0 deletions examples/common/k8s/mlops/base/mlops.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
DATASET_DIR=/dataset
NFS_PATH=/exported_users
NFS_MOUNT_PATH=/home
NFS_SERVER=0.0.0.0
OUTPUT_DIR=/workspace/output
File renamed without changes.
8 changes: 8 additions & 0 deletions examples/common/k8s/mlops/multi-node/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Kustomization for the shared multi-node MLOps layer.
# Builds on the sibling ../base directory and adds the MPIJob manifest.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

# Parent kustomization this layer extends.
bases:
  - ../base

# Manifests contributed by this layer.
resources:
  - mpi-job.yaml

# Transformer configuration file loaded alongside this kustomization
# (presumably the varReference list for MPIJob fields; verify in mlops.yaml).
configurations:
  - mlops.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
varReference:
- kind: MPIJob
path: metadata/name
- kind: MPIJob
path: spec/mpiReplicaSpecs/Launcher/template/spec/containers/image
- kind: MPIJob
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apiVersion: kubeflow.org/v1alpha2
kind: MPIJob
metadata:
name: bert-large-fp32-training
name: $(MODEL_NAME)
spec:
slotsPerWorker: 1
cleanPodPolicy: Running
Expand All @@ -13,17 +13,8 @@ spec:
serviceAccountName: model-service
containers:
- name: mpi-launcher
image: $(REGISTRY)/model-zoo:2.1.0-language-modeling-bert-large-fp32-training
imagePullPolicy: Always
workingDir: /
command:
- $(MODEL_DIR)/examples/fp32_training_multi_node.sh
volumeMounts:
- name: datasets
mountPath: $(DATASET_DIR)
readOnly: true
- name: users
mountPath: /Users
volumes:
- name: datasets
persistentVolumeClaim:
Expand All @@ -38,15 +29,8 @@ spec:
serviceAccountName: model-service
containers:
- name: mpi-worker
image: $(REGISTRY)/model-zoo:2.1.0-language-modeling-bert-large-fp32-training
imagePullPolicy: Always
workingDir: /
volumeMounts:
- name: datasets
mountPath: $(DATASET_DIR)
readOnly: true
- name: users
mountPath: /Users
volumes:
- name: datasets
persistentVolumeClaim:
Expand Down
8 changes: 8 additions & 0 deletions examples/common/k8s/mlops/single-node/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Kustomization for the shared single-node MLOps layer.
# Builds on the sibling ../base directory and adds the Pod manifest.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

# Parent kustomization this layer extends.
bases:
  - ../base

# Manifests contributed by this layer.
resources:
  - pod.yaml

# Transformer configuration file loaded alongside this kustomization
# (presumably the varReference list for Pod fields; verify in mlops.yaml).
configurations:
  - mlops.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
varReference:
- kind: PersistentVolume
path: spec/hostPath/path
- kind: Pod
path: metadata/name
- kind: Pod
path: spec/containers/image
- kind: Pod
Expand Down
16 changes: 16 additions & 0 deletions examples/common/k8s/mlops/single-node/pod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Generic single-node Pod template shared by the model examples.
# Nesting is restored here: without indentation every key below would parse
# as a top-level field instead of living under metadata/spec.
apiVersion: v1
kind: Pod
metadata:
  # Replaced by kustomize variable substitution with the MODEL_NAME var.
  name: $(MODEL_NAME)
spec:
  serviceAccountName: model-service
  containers:
    # NOTE(review): no image, command or volumeMounts are set here; they are
    # presumably supplied by each model's overlay patch. Confirm in the overlay.
    - name: single-node
      imagePullPolicy: Always
  volumes:
    # Volumes are backed by PersistentVolumeClaims referenced by name.
    - name: users
      persistentVolumeClaim:
        claimName: users-pvc
    - name: datasets
      persistentVolumeClaim:
        claimName: datasets-pvc

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,13 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
bases:
- ../base
resources:
- mpi-job.yaml
images:
- name: model-zoo
newName: model-zoo
newTag: 2.1.0-image-recognition-resnet50v1-5-fp32-training
- ../../../common/k8s/mlops/multi-node
configMapGenerator:
- name: mlops-env
env: mlops.env
Expand All @@ -17,19 +11,31 @@ generatorOptions:
patchesStrategicMerge:
- mpi-job-patch.yaml
vars:
- name: MODEL_NAME
fieldref:
fieldPath: data.MODEL_NAME
objref:
apiVersion: v1
kind: ConfigMap
name: mlops-env
- name: MODEL_DIR
fieldref:
fieldPath: data.MODEL_DIR
objref:
apiVersion: v1
kind: ConfigMap
name: mlops-env
- name: NFS_MOUNT_PATH
fieldref:
fieldPath: data.NFS_MOUNT_PATH
objref:
apiVersion: v1
kind: ConfigMap
name: mlops-env
- name: REGISTRY
fieldref:
fieldPath: data.REGISTRY
objref:
apiVersion: v1
kind: ConfigMap
name: mlops-env
configurations:
- mlops.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
DATASET_DIR=/tf_dataset
MODEL_NAME=resnet50v1_5_fp32_training
MODEL_DIR=/workspace/resnet50v1_5_fp32_training
NFS_MOUNT_PATH=/Users
OUTPUT_DIR=/tmp/output
REGISTRY=amr-registry.caas.intel.com/aipg-tf
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apiVersion: kubeflow.org/v1alpha2
kind: MPIJob
metadata:
name: resnet50v1-5-fp32-training
name: $(MODEL_NAME)
spec:
mpiReplicaSpecs:
Launcher:
Expand All @@ -10,23 +10,21 @@ spec:
containers:
- name: mpi-launcher
image: $(REGISTRY)/model-zoo:2.1.0-image-recognition-resnet50v1-5-fp32-training
imagePullPolicy: Always
securityContext:
runAsUser: 0
runAsGroup: 0
fsGroup: 0
envFrom:
- configMapRef:
name: mlops-env
workingDir: /
command:
- $(MODEL_DIR)/examples/fp32_training_multi_node.sh
volumeMounts:
- name: datasets
mountPath: $(DATASET_DIR)
readOnly: true
- name: users
mountPath: /Users
mountPath: $(NFS_MOUNT_PATH)
Worker:
template:
spec:
Expand All @@ -46,4 +44,4 @@ spec:
mountPath: $(DATASET_DIR)
readOnly: true
- name: users
mountPath: /Users
mountPath: $(NFS_MOUNT_PATH)

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,13 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
bases:
- ../base
resources:
- pod.yaml
images:
- name: model-zoo
newName: model-zoo
newTag: 2.1.0-image-recognition-resnet50v1-5-fp32-training
- ../../../common/k8s/mlops/single-node
configMapGenerator:
- name: mlops-env
env: mlops.env
Expand All @@ -17,19 +11,31 @@ generatorOptions:
patchesStrategicMerge:
- pod-patch.yaml
vars:
- name: MODEL_NAME
fieldref:
fieldPath: data.MODEL_NAME
objref:
apiVersion: v1
kind: ConfigMap
name: mlops-env
- name: MODEL_DIR
fieldref:
fieldPath: data.MODEL_DIR
objref:
apiVersion: v1
kind: ConfigMap
name: mlops-env
- name: NFS_MOUNT_PATH
fieldref:
fieldPath: data.NFS_MOUNT_PATH
objref:
apiVersion: v1
kind: ConfigMap
name: mlops-env
- name: REGISTRY
fieldref:
fieldPath: data.REGISTRY
objref:
apiVersion: v1
kind: ConfigMap
name: mlops-env
configurations:
- mlops.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
DATASET_DIR=/tf_dataset
MODEL_NAME=resnet50v1_5_fp32_training
MODEL_DIR=/workspace/resnet50v1_5_fp32_training
NFS_MOUNT_PATH=/Users
OUTPUT_DIR=/tmp/output
REGISTRY=amr-registry.caas.intel.com/aipg-tf

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apiVersion: v1
kind: Pod
metadata:
name: resnet50v1-5-fp32-training
name: $(MODEL_NAME)
spec:
securityContext:
runAsUser: 0
Expand All @@ -16,4 +16,10 @@ spec:
name: mlops-env
workingDir: /
command:
- $(MODEL_DIR)/examples/fp32_training_multi_node.sh
- $(MODEL_DIR)/examples/fp32_training_demo.sh
volumeMounts:
- name: datasets
mountPath: $(DATASET_DIR)
readOnly: true
- name: users
mountPath: $(NFS_MOUNT_PATH)
Loading

0 comments on commit 755d932

Please sign in to comment.