From a2eabdeb326556c6d12770e3ce0d5c351de8682c Mon Sep 17 00:00:00 2001 From: Abolfazl Shahbazi Date: Sat, 12 Dec 2020 15:08:46 -0800 Subject: [PATCH] Checkout remaining changes from develop --- benchmarks/README.md | 1 + .../common/tensorflow/container_init.sh | 8 +- benchmarks/common/tensorflow/start.sh | 45 +++- .../tensorflow/mobilenet_v1/README.md | 14 ++ .../inference/bfloat16/__init__.py | 17 ++ .../inference/bfloat16/config.json | 6 + .../inference/bfloat16/model_init.py | 104 +++++++++ .../mobilenet_v1/inference/fp32/model_init.py | 7 +- .../tensorflow/bert_base/README.md | 3 +- .../inference/bfloat16/model_init.py | 5 +- .../bert_base/inference/fp32/model_init.py | 5 +- .../tensorflow/ssd-mobilenet/README.md | 14 ++ .../inference/bfloat16/__init__.py | 17 ++ .../inference/bfloat16/config.json | 7 + .../inference/bfloat16/model_init.py | 65 ++++++ .../inference/fp32/model_init.py | 2 +- .../recommendation/tensorflow/ncf/README.md | 100 +++++++- .../tensorflow/wide_deep_large_ds/README.md | 218 +++++++++++++----- .../inference/fp32/model_init.py | 8 +- .../inference/int8/model_init.py | 8 +- ...intel-python-dlrm-bf16-training.Dockerfile | 37 --- .../dl/pytorch/ipex-pytorch.md | 52 ----- .../resnet50v1_5/inference/fp32/README.md | 17 -- .../inference/fp32/mlops/serving/.gitignore | 2 + .../inference/fp32/mlops/serving/Krmfile | 198 ++++++++++++++++ .../inference/fp32/mlops/serving/service.yaml | 18 ++ .../training/fp32/.docs/quickstart.md | 2 +- .../resnet50v1_5/training/fp32/README.md | 23 +- .../training/fp32/mlops/multi-node/.gitignore | 2 + .../fp32/mlops/single-node/.gitignore | 1 + .../training/fp32/.docs/kubernetes.md | 2 + .../training/fp32/.docs/quickstart.md | 2 +- .../bert_large/training/fp32/README.md | 21 +- .../training/fp32/mlops/multi-node/.gitignore | 2 + .../multi-node/user-allocated-pvc/Krmfile | 30 +-- .../mlops/multi-node/user-mounted-nfs/Krmfile | 30 +-- .../fp32/mlops/single-node/.gitignore | 2 + .../single-node/user-allocated-pvc/Krmfile | 62 ++--- .../user-allocated-pvc/config-map.yaml | 3 +- .../single-node/user-allocated-pvc/pod.yaml | 34 --- .../single-node/user-mounted-nfs/Krmfile | 44 +--- .../user-mounted-nfs/config-map.yaml | 3 +- .../single-node/user-mounted-nfs/pod.yaml | 38 +-- .../rfcn/inference/fp32/.docs/kubernetes.md | 5 +- .../rfcn/inference/fp32/.docs/quickstart.md | 4 +- .../tensorflow/rfcn/inference/fp32/README.md | 26 +-- .../inference/fp32/mlops/pipeline/.gitignore | 2 + .../mlops/pipeline/user-allocated-pvc/Krmfile | 34 +-- .../user-allocated-pvc/config-map.yaml | 161 +++++++++++++ .../user-allocated-pvc/kustomization.yaml | 3 - .../user-allocated-pvc/serving_accuracy.yaml | 27 ++- .../mlops/pipeline/user-mounted-nfs/Krmfile | 34 +-- .../pipeline/user-mounted-nfs/config-map.yaml | 161 +++++++++++++ .../user-mounted-nfs/kustomization.yaml | 3 - .../user-mounted-nfs/serving_accuracy.yaml | 27 ++- .../inference/fp32/mlops/serving/.gitignore | 2 + .../mlops/serving/user-allocated-pvc/pod.yaml | 10 + .../mlops/serving/user-mounted-nfs/pod.yaml | 10 + .../training/fp32/README.md | 17 -- .../training/fp32/mlops/pipeline/.gitignore | 3 + .../mlops/pipeline/user-allocated-pvc/Krmfile | 12 +- .../user-allocated-pvc/kustomization.yaml | 3 - .../user-allocated-pvc/train_and_serve.yaml | 1 - .../mlops/pipeline/user-mounted-nfs/Krmfile | 12 +- .../user-mounted-nfs/kustomization.yaml | 3 - .../user-mounted-nfs/train_and_serve.yaml | 1 - .../fp32/mlops/single-node/.gitignore | 3 + .../single-node/user-allocated-pvc/Krmfile | 12 +- 
.../single-node/user-mounted-nfs/Krmfile | 12 +- .../bert_large/inference/export_classifier.py | 1 + .../bert_large/inference/generic_ops.py | 2 +- .../bert_large/inference/run_classifier.py | 10 + .../training/bfloat16/generic_ops.py | 2 +- .../bert_large/training/fp32/generic_ops.py | 2 +- .../rfcn/inference/fp32/coco_mAP.sh | 9 +- .../tensorflow_benchmarks_tf2.0.patch | 2 +- .../training/bfloat16/benchmark-tf-2.0.diff | 2 +- .../training/fp32/benchmark-tf-2.0.diff | 2 +- .../dataset/preprocess_csv_tfrecords.py | 20 +- .../tf_model_args/tf_mobilenet_v1_args.json | 10 +- .../tf_model_args/tf_ssd_mobilenet_args.json | 4 +- .../tf_wide_deep_large_ds_args.json | 4 +- .../k8s/bert-large-fp32-training-k8s_spec.yml | 2 - .../resnet50v1-5-fp32-inference-k8s_spec.yml | 2 - .../resnet50v1-5-fp32-training-k8s_spec.yml | 2 - .../k8s/rfcn-fp32-inference-k8s_spec.yml | 2 - ...e-deep-large-ds-fp32-training-k8s_spec.yml | 2 - tox.ini | 2 +- 88 files changed, 1308 insertions(+), 641 deletions(-) create mode 100644 benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/bfloat16/__init__.py create mode 100644 benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/bfloat16/config.json create mode 100644 benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/bfloat16/model_init.py create mode 100644 benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/bfloat16/__init__.py create mode 100644 benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/bfloat16/config.json create mode 100644 benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/bfloat16/model_init.py delete mode 100644 dockerfiles/model_containers/intel-python-dlrm-bf16-training.Dockerfile delete mode 100644 docs/container_portal/dl/pytorch/ipex-pytorch.md create mode 100644 k8s/image_recognition/tensorflow/resnet50v1_5/inference/fp32/mlops/serving/.gitignore create mode 100644 k8s/image_recognition/tensorflow/resnet50v1_5/inference/fp32/mlops/serving/Krmfile create mode 100644 k8s/image_recognition/tensorflow/resnet50v1_5/inference/fp32/mlops/serving/service.yaml create mode 100644 k8s/image_recognition/tensorflow/resnet50v1_5/training/fp32/mlops/multi-node/.gitignore create mode 100644 k8s/image_recognition/tensorflow/resnet50v1_5/training/fp32/mlops/single-node/.gitignore create mode 100644 k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/multi-node/.gitignore create mode 100644 k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/.gitignore create mode 100644 k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/.gitignore create mode 100644 k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/serving/.gitignore create mode 100644 k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/.gitignore create mode 100644 k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/single-node/.gitignore diff --git a/benchmarks/README.md b/benchmarks/README.md index ce33a0190..333face28 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -23,6 +23,7 @@ dependencies to be installed: | Image Recognition | TensorFlow | [ResNet 50](https://arxiv.org/pdf/1512.03385.pdf) | Inference | Model Containers: [Int8](https://software.intel.com/content/www/us/en/develop/articles/containers/resnet50-int8-inference-tensorflow-container.html) [FP32](https://software.intel.com/content/www/us/en/develop/articles/containers/resnet50-fp32-inference-tensorflow-container.html)
Model Packages: [Int8](https://software.intel.com/content/www/us/en/develop/articles/containers/resnet50-int8-inference-tensorflow-model.html) [FP32](https://software.intel.com/content/www/us/en/develop/articles/containers/resnet50-fp32-inference-tensorflow-model.html) | [Int8](image_recognition/tensorflow/resnet50/README.md#int8-inference-instructions) [FP32](image_recognition/tensorflow/resnet50/README.md#fp32-inference-instructions) | | Image Recognition | TensorFlow | [ResNet 50v1.5](https://github.com/tensorflow/models/tree/master/official/resnet) | Inference | Model Containers: [Int8](https://software.intel.com/content/www/us/en/develop/articles/containers/resnet50v1-5-int8-inference-tensorflow-container.html) [FP32](https://software.intel.com/content/www/us/en/develop/articles/containers/resnet50v1-5-fp32-inference-container.html) [BFloat16**](https://software.intel.com/content/www/us/en/develop/articles/containers/resnet50v1-5-bfloat16-inference-tensorflow-container.html)
Model Packages: [Int8](https://software.intel.com/content/www/us/en/develop/articles/containers/resnet50v1-5-int8-inference-tensorflow-model.html) [FP32](https://software.intel.com/content/www/us/en/develop/articles/containers/resnet50v1-5-fp32-inference-model-package.html) [BFloat16**](https://software.intel.com/content/www/us/en/develop/articles/containers/resnet50v1-5-bfloat16-inference-tensorflow-model.html) | [Int8](image_recognition/tensorflow/resnet50v1_5/README.md#int8-inference-instructions) [FP32](image_recognition/tensorflow/resnet50v1_5/README.md#fp32-inference-instructions) [BFloat16**](image_recognition/tensorflow/resnet50v1_5/README.md#bfloat16-inference-instructions) | | Image Recognition | TensorFlow | [ResNet 50v1.5](https://github.com/tensorflow/models/tree/master/official/resnet) | Training | Model Containers: [FP32](https://software.intel.com/content/www/us/en/develop/articles/containers/resnet50v1-5-fp32-training-tensorflow-container.html) [BFloat16**](https://software.intel.com/content/www/us/en/develop/articles/containers/resnet50v1-5-bfloat16-training-tensorflow-container.html)
Model Packages: [FP32](https://software.intel.com/content/www/us/en/develop/articles/containers/resnet50v1-5-fp32-training-tensorflow-model.html) [BFloat16**](https://software.intel.com/content/www/us/en/develop/articles/containers/resnet50v1-5-bfloat16-training-tensorflow-model.html) | [FP32](image_recognition/tensorflow/resnet50v1_5/README.md#fp32-training-instructions) [BFloat16**](image_recognition/tensorflow/resnet50v1_5/README.md#bfloat16-training-instructions) | +| Image Segmentation | TensorFlow | [3D U-Net](https://arxiv.org/pdf/1606.06650.pdf) | Inference | | [FP32](image_segmentation/tensorflow/3d_unet/README.md) | | Image Segmentation | TensorFlow | [UNet](https://arxiv.org/pdf/1606.06650.pdf) | Inference | Model Containers: [FP32](https://software.intel.com/content/www/us/en/develop/articles/containers/unet-fp32-inference-tensorflow-container.html)
Model Packages: [FP32](https://software.intel.com/content/www/us/en/develop/articles/containers/unet-fp32-inference-tensorflow-model.html) | [FP32](image_segmentation/tensorflow/unet/README.md#fp32-inference-instructions) | | Image Segmentation | TensorFlow | [MaskRCNN](https://arxiv.org/abs/1703.06870) | Inference | Model Containers: [FP32](https://software.intel.com/content/www/us/en/develop/articles/containers/mask-rcnn-fp32-inference-tensorflow-container.html)
Model Packages: [FP32](https://software.intel.com/content/www/us/en/develop/articles/containers/mask-rcnn-fp32-inference-tensorflow-model.html) | [FP32](image_segmentation/tensorflow/maskrcnn/README.md#fp32-training-instructions) | | Language Modeling | TensorFlow | [BERT](https://arxiv.org/pdf/1810.04805.pdf) | Inference | Model Containers: [FP32](https://software.intel.com/content/www/us/en/develop/articles/containers/bert-large-fp32-inference-tensorflow-container.html)
Model Packages: [FP32](https://software.intel.com/content/www/us/en/develop/articles/containers/bert-large-fp32-inference-tensorflow-model.html) | [FP32](language_modeling/tensorflow/bert_large/README.md#fp32-inference-instructions) [BFloat16**](language_modeling/tensorflow/bert_large/README.md#bfloat16-inference-instructions) | diff --git a/benchmarks/common/tensorflow/container_init.sh b/benchmarks/common/tensorflow/container_init.sh index 0913eee0b..ff0bce322 100755 --- a/benchmarks/common/tensorflow/container_init.sh +++ b/benchmarks/common/tensorflow/container_init.sh @@ -17,5 +17,9 @@ # This file includes runtime installs for model containers -apt-get install numactl -y - +if (( $(id -u) == 0 )); then + apt-get install numactl -y +else + echo "Please run as root" + exit 1 +fi diff --git a/benchmarks/common/tensorflow/start.sh b/benchmarks/common/tensorflow/start.sh index 242df6cf6..83f90c378 100644 --- a/benchmarks/common/tensorflow/start.sh +++ b/benchmarks/common/tensorflow/start.sh @@ -461,6 +461,20 @@ function add_calibration_arg() { echo "${calibration_arg}" } +# 3D UNet model +function 3d_unet() { + if [[ ${PRECISION} == "fp32" ]] && [[ ${MODE} == "inference" ]]; then + if [[ ${NOINSTALL} != "True" ]]; then + pip install -r "${MOUNT_BENCHMARK}/${USE_CASE}/${FRAMEWORK}/${MODEL_NAME}/requirements.txt" + fi + export PYTHONPATH=${PYTHONPATH}:${MOUNT_INTELAI_MODELS_SOURCE}/inference/fp32 + PYTHONPATH=${PYTHONPATH} CMD=${CMD} run_model + else + echo "${PRECISION} ${MODE} is not supported for ${MODEL_NAME}" + exit 1 + fi +} + #BERT model function bert() { if [ ${PRECISION} == "fp32" ]; then @@ -725,7 +739,7 @@ function maskrcnn() { # mobilenet_v1 model function mobilenet_v1() { - if [ ${PRECISION} == "fp32" ]; then + if [ ${PRECISION} == "fp32" ] || [ ${PRECISION} == "bfloat16" ]; then CMD="${CMD} $(add_arg "--input_height" ${input_height}) $(add_arg "--input_width" ${input_width}) \ $(add_arg "--warmup_steps" ${warmup_steps}) $(add_arg "--steps" ${steps}) \ $(add_arg "--input_layer" ${input_layer}) $(add_arg "--output_layer" ${output_layer})" @@ -767,10 +781,24 @@ function mtcc() { # NCF model function ncf() { - if [ ${PRECISION} == "fp32" ]; then - # For nfc, if dataset location is empty, script downloads dataset at given location. + if [[ -n "${clean}" ]]; then + CMD="${CMD} --clean" + fi + + # NCF supports different datasets including ml-1m and ml-20m. + if [[ -n "${dataset}" && ${dataset} != "" ]]; then + CMD="${CMD} --dataset=${dataset}" + fi + + if [[ -n "${te}" && ${te} != "" ]]; then + CMD="${CMD} -te=${te}" + fi + + if [ ${PRECISION} == "fp32" -o ${PRECISION} == "bfloat16" ]; then + # For ncf, if dataset location is empty, script downloads dataset at given location. if [ ! 
-d "${DATASET_LOCATION}" ]; then - mkdir -p /dataset + mkdir -p ./dataset + CMD="${CMD} --data-location=./dataset" fi export PYTHONPATH=${PYTHONPATH}:${MOUNT_EXTERNAL_MODELS_SOURCE} @@ -895,7 +923,7 @@ function rfcn() { # SSD-MobileNet model function ssd_mobilenet() { - if [ ${PRECISION} == "fp32" ]; then + if [ ${PRECISION} == "fp32" ] || [ ${PRECISION} == "bfloat16" ]; then if [ ${BATCH_SIZE} != "-1" ]; then echo "Warning: SSD-MobileNet FP32 inference script does not use the batch_size arg" fi @@ -1287,9 +1315,6 @@ function wide_deep_large_ds() { if [ "${kmp_block_time}" != None ] ; then CMD="${CMD} --kmp_block_time=${kmp_block_time}" fi - if [ "${kmp_affinity}" != None ]; then - CMD="${CMD} --kmp_affinity=${kmp_affinity}" - fi if [ "${kmp_settings}" != None ]; then CMD="${CMD} --kmp_settings=${kmp_settings}" fi @@ -1306,7 +1331,9 @@ function wide_deep_large_ds() { LOGFILE=${OUTPUT_DIR}/${LOG_FILENAME} MODEL_NAME=$(echo ${MODEL_NAME} | tr 'A-Z' 'a-z') -if [ ${MODEL_NAME} == "bert" ]; then +if [ ${MODEL_NAME} == "3d_unet" ]; then + 3d_unet +elif [ ${MODEL_NAME} == "bert" ]; then bert elif [ ${MODEL_NAME} == "dcgan" ]; then dcgan diff --git a/benchmarks/image_recognition/tensorflow/mobilenet_v1/README.md b/benchmarks/image_recognition/tensorflow/mobilenet_v1/README.md index d87edc065..51d5886ee 100644 --- a/benchmarks/image_recognition/tensorflow/mobilenet_v1/README.md +++ b/benchmarks/image_recognition/tensorflow/mobilenet_v1/README.md @@ -4,6 +4,7 @@ This document has instructions for how to run MobileNet V1 for the following modes/precisions: * [Int8 inference](#int8-inference-instructions) * [FP32 inference](#fp32-inference-instructions) +* [BFloat16 inference](#bfloat16-inference-instructions) Instructions and scripts for model training are coming later. @@ -279,3 +280,16 @@ $ wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/mob Ran inference with batch size 100 Log location outside container: {--output-dir value}/benchmark_mobilenet_v1_inference_fp32_20190110_211648.log ``` + +# BFloat16 Inference Instructions + +MobileNet v1 BFloat16 inference depends on Auto-Mixed-Precision to convert graph from FP32 to BFloat16 online. +Before evaluating MobileNet v1 BFloat16 inference, please set the following environment variables: + +``` +export TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_INFERLIST_REMOVE=BiasAdd \ +export TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_DENYLIST_REMOVE=Softmax \ +export TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_ALLOWLIST_ADD=BiasAdd,Softmax +``` + +The instructions are the same as FP32 inference instructions above, except one needs to change the `--precision=fp32` to `--precision=bfloat16` in the above commands. diff --git a/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/bfloat16/__init__.py b/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/bfloat16/__init__.py new file mode 100644 index 000000000..199f25228 --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/bfloat16/__init__.py @@ -0,0 +1,17 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2020 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/bfloat16/config.json b/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/bfloat16/config.json new file mode 100644 index 000000000..f0b327528 --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/bfloat16/config.json @@ -0,0 +1,6 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1 + } +} diff --git a/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/bfloat16/model_init.py b/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/bfloat16/model_init.py new file mode 100644 index 000000000..779e18f0d --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/bfloat16/model_init.py @@ -0,0 +1,104 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2020 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import argparse +import os +from common.base_model_init import BaseModelInitializer +from common.base_model_init import set_env_var + + +class ModelInitializer(BaseModelInitializer): + """ Model initializer for MobileNet V1 BFloat16 inference """ + + def __init__(self, args, custom_args=[], platform_util=None): + super(ModelInitializer, self).__init__(args, custom_args, platform_util) + + # use default batch size if -1 + if self.args.batch_size == -1: + self.args.batch_size = 128 + + # Set KMP env vars, if they haven't already been set + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) + + # set num_inter_threads and num_intra_threads (override inter threads to 2) + self.set_num_inter_intra_threads(num_inter_threads=2) + + script_name = "accuracy.py" if self.args.accuracy_only \ + else "benchmark.py" + script_path = os.path.join( + self.args.intelai_models, self.args.mode, script_name) + self.command_prefix = "{} {}".format(self.python_exe, script_path) + + if self.args.socket_id != -1: + self.command_prefix = "numactl --cpunodebind={} -l {}".format( + str(self.args.socket_id), self.command_prefix) + + set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) + + self.parse_args() + + if not self.args.accuracy_only: + # add args for the benchmark script + script_args_list = [ + "input_graph", "input_height", "input_width", "batch_size", + "input_layer", "output_layer", "num_inter_threads", + "num_intra_threads", "warmup_steps", "steps", "precision"] + self.command_prefix = self.add_args_to_command( + self.command_prefix, script_args_list) + else: + # add args for the accuracy script + script_args_list = [ + "input_graph", "data_location", "input_height", "input_width", + "batch_size", "input_layer", "output_layer", + "num_inter_threads", "num_intra_threads", "precision"] + self.command_prefix = self.add_args_to_command( + self.command_prefix, script_args_list) + + def parse_args(self): + if self.custom_args is None: + return + + parser = argparse.ArgumentParser() + parser.add_argument( + "--input_height", default=224, + dest='input_height', type=int, help="input height") + parser.add_argument( + "--input_width", default=224, + dest='input_width', type=int, help="input width") + parser.add_argument( + "--warmup_steps", dest="warmup_steps", + help="number of warmup steps", + type=int, default=10) + parser.add_argument( + "--steps", dest="steps", + help="number of steps", + type=int, default=50) + parser.add_argument( + "--input_layer", dest="input_layer", + help="name of input layer", + type=str, default="input") + parser.add_argument( + "--output_layer", dest="output_layer", + help="name of output layer", + type=str, default="MobilenetV1/Predictions/Reshape_1") + + self.args = parser.parse_args(self.custom_args, namespace=self.args) + + def run(self): + self.run_command(self.command_prefix) diff --git a/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/fp32/model_init.py b/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/fp32/model_init.py index 73141a0c1..71d3af1eb 100644 --- a/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/fp32/model_init.py +++ b/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/fp32/model_init.py @@ -44,8 +44,7 @@ def __init__(self, args, custom_args=[], platform_util=None): script_name = "accuracy.py" if self.args.accuracy_only \ else "benchmark.py" script_path = os.path.join( - self.args.intelai_models, self.args.mode, 
self.args.precision,
- script_name)
+ self.args.intelai_models, self.args.mode, script_name)
 self.command_prefix = "{} {}".format(self.python_exe, script_path)
 
 if self.args.socket_id != -1:
@@ -61,7 +60,7 @@ def __init__(self, args, custom_args=[], platform_util=None):
 script_args_list = [
 "input_graph", "input_height", "input_width", "batch_size",
 "input_layer", "output_layer", "num_inter_threads",
- "num_intra_threads", "warmup_steps", "steps"]
+ "num_intra_threads", "warmup_steps", "steps", "precision"]
 self.command_prefix = self.add_args_to_command(
 self.command_prefix, script_args_list)
 else:
@@ -69,7 +68,7 @@ def __init__(self, args, custom_args=[], platform_util=None):
 script_args_list = [
 "input_graph", "data_location", "input_height", "input_width",
 "batch_size", "input_layer", "output_layer",
- "num_inter_threads", "num_intra_threads"]
+ "num_inter_threads", "num_intra_threads", "precision"]
 self.command_prefix = self.add_args_to_command(
 self.command_prefix, script_args_list)
diff --git a/benchmarks/language_modeling/tensorflow/bert_base/README.md b/benchmarks/language_modeling/tensorflow/bert_base/README.md index 8ec82569c..32105e18b 100644 --- a/benchmarks/language_modeling/tensorflow/bert_base/README.md +++ b/benchmarks/language_modeling/tensorflow/bert_base/README.md
@@ -86,7 +86,8 @@ ${PYTHON} export_classifier.py \
 --bert_config_file=$BERT_BASE_DIR/bert_config.json \
 --output_dir=${OUTPUT_DIR} \
 --precision=fp32 \
- --saved_model=true
+ --saved_model=true \
+ --experimental_gelu=True # Disable this flag if your TensorFlow doesn't support it
 ```
 ## Inference
diff --git a/benchmarks/language_modeling/tensorflow/bert_base/inference/bfloat16/model_init.py b/benchmarks/language_modeling/tensorflow/bert_base/inference/bfloat16/model_init.py index 6f85af9ab..12ed01a9b 100644 --- a/benchmarks/language_modeling/tensorflow/bert_base/inference/bfloat16/model_init.py +++ b/benchmarks/language_modeling/tensorflow/bert_base/inference/bfloat16/model_init.py
@@ -51,6 +51,8 @@ def __init__(self, args, custom_args=[], platform_util=None):
 default='Classifier')
 arg_parser.add_argument("--max-seq-length", type=int, dest="max_seq_length", default=None)
 arg_parser.add_argument("--profile", dest="profile", default=None)
+ arg_parser.add_argument('--experimental-gelu', help=' [Experimental] Use experimental gelu op.',
+ dest="experimental_gelu", default="False")
 arg_parser.add_argument("--config-file", dest="bert_config_file", default="bert_config.json")
 arg_parser.add_argument("--vocab-file", dest="vocab_file", default="vocab.txt")
 arg_parser.add_argument('--task-name', help=' Task name for classifier', dest="task_name", default='MRPC')
@@ -111,7 +113,8 @@ def expand_data_path(path):
 " --do_eval=" + str(self.args.do_eval) + eoo + \
 " --vocab_file=" + str(self.args.vocab_file) + eoo + \
 " --data_dir=" + str(self.args.data_dir) + eoo + \
- " --eval_batch_size=" + str(self.args.batch_size)
+ " --eval_batch_size=" + str(self.args.batch_size) + \
+ " --experimental_gelu=" + str(self.args.experimental_gelu)
 
 if self.args.accuracy_only:
 model_args += " --mode=accuracy"
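Side note on the flag added above: `--experimental-gelu` is carried through the launcher as a plain string ("True"/"False"), not a parsed boolean. A minimal, self-contained sketch of that flow (only the flag name and default come from the diff above; everything else is illustrative):

```python
import argparse

# The flag is declared the same way as in model_init.py above; note the string default.
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('--experimental-gelu',
                        help=' [Experimental] Use experimental gelu op.',
                        dest="experimental_gelu", default="False")
args = arg_parser.parse_args(["--experimental-gelu", "True"])

# model_init.py forwards the value verbatim; the downstream run_classifier.py
# script is assumed to convert the string back into a boolean itself.
model_args = " --experimental_gelu=" + str(args.experimental_gelu)
print(model_args)  # -> " --experimental_gelu=True"
```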
diff --git a/benchmarks/language_modeling/tensorflow/bert_base/inference/fp32/model_init.py b/benchmarks/language_modeling/tensorflow/bert_base/inference/fp32/model_init.py index 6f85af9ab..87561c6ab 100644 --- a/benchmarks/language_modeling/tensorflow/bert_base/inference/fp32/model_init.py +++ b/benchmarks/language_modeling/tensorflow/bert_base/inference/fp32/model_init.py
@@ -51,6 +51,8 @@ def __init__(self, args, custom_args=[], platform_util=None):
 default='Classifier')
 arg_parser.add_argument("--max-seq-length", type=int, dest="max_seq_length", default=None)
 arg_parser.add_argument("--profile", dest="profile", default=None)
+ arg_parser.add_argument('--experimental-gelu', help=' [Experimental] Use experimental gelu op.',
+ dest="experimental_gelu", default="False")
 arg_parser.add_argument("--config-file", dest="bert_config_file", default="bert_config.json")
 arg_parser.add_argument("--vocab-file", dest="vocab_file", default="vocab.txt")
 arg_parser.add_argument('--task-name', help=' Task name for classifier', dest="task_name", default='MRPC')
@@ -111,7 +113,8 @@ def expand_data_path(path):
 " --do_eval=" + str(self.args.do_eval) + eoo + \
 " --vocab_file=" + str(self.args.vocab_file) + eoo + \
 " --data_dir=" + str(self.args.data_dir) + eoo + \
- " --eval_batch_size=" + str(self.args.batch_size)
+ " --eval_batch_size=" + str(self.args.batch_size) + eoo + \
+ " --experimental_gelu=" + str(self.args.experimental_gelu)
 
 if self.args.accuracy_only:
 model_args += " --mode=accuracy"
diff --git a/benchmarks/object_detection/tensorflow/ssd-mobilenet/README.md b/benchmarks/object_detection/tensorflow/ssd-mobilenet/README.md index 62e1be663..8919e2c72 100644 --- a/benchmarks/object_detection/tensorflow/ssd-mobilenet/README.md +++ b/benchmarks/object_detection/tensorflow/ssd-mobilenet/README.md
@@ -5,6 +5,7 @@ following modes/precisions:
 - [SSD-MobileNet](#ssd-mobilenet)
 - [Int8 Inference Instructions](#int8-inference-instructions)
 - [FP32 Inference Instructions](#fp32-inference-instructions)
+ - [BFloat16 Inference Instructions](#bfloat16-inference-instructions)
 Instructions and scripts for model training and inference
 for other precisions are coming later.
@@ -237,3 +238,16 @@ Below is a sample log file tail when testing accuracy:
 Ran inference with batch size -1
 Log location outside container: {--output-dir value}/benchmark_ssd-mobilenet_inference_fp32_20190123_225145.log
 ```
+
+# BFloat16 Inference Instructions
+
+SSD-MobileNet BFloat16 inference relies on Auto-Mixed-Precision to convert the graph from FP32 to BFloat16 at runtime.
+Before evaluating SSD-MobileNet BFloat16 inference, please set the following environment variables:
+
+```
+export TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_ALLOWLIST_ADD=BiasAdd,Relu6,Mul,AddV2
+export TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_INFERLIST_REMOVE=BiasAdd,AddV2,Mul
+export TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_CLEARLIST_REMOVE=Relu6
+```
+
+The instructions are the same as the FP32 inference instructions above, except that `--precision=fp32` should be changed to `--precision=bfloat16` in the commands.
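The three `TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_*` lists above can also be set from Python rather than the shell, provided it happens before TensorFlow is initialized. A minimal sketch using the SSD-MobileNet values from the README section above (the import ordering is the only subtlety):

```python
import os

# These must be in the environment before TensorFlow runs its
# auto-mixed-precision graph rewrite, so set them before importing it.
os.environ["TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_ALLOWLIST_ADD"] = "BiasAdd,Relu6,Mul,AddV2"
os.environ["TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_INFERLIST_REMOVE"] = "BiasAdd,AddV2,Mul"
os.environ["TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_CLEARLIST_REMOVE"] = "Relu6"

import tensorflow as tf  # noqa: E402 -- imported only after the lists are set
```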
diff --git a/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/bfloat16/__init__.py b/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/bfloat16/__init__.py new file mode 100644 index 000000000..199f25228 --- /dev/null +++ b/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/bfloat16/__init__.py
@@ -0,0 +1,17 @@
+#
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2020 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
diff --git a/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/bfloat16/config.json b/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/bfloat16/config.json new file mode 100644 index 000000000..273b45b40 --- /dev/null +++ b/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/bfloat16/config.json
@@ -0,0 +1,7 @@
+{
+ "optimization_parameters": {
+ "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0",
+ "KMP_BLOCKTIME": 1,
+ "KMP_SETTINGS": 1
+ }
+}
diff --git a/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/bfloat16/model_init.py b/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/bfloat16/model_init.py new file mode 100644 index 000000000..94e0685e2 --- /dev/null +++ b/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/bfloat16/model_init.py
@@ -0,0 +1,65 @@
+#
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2020 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+
+from common.base_model_init import BaseModelInitializer, set_env_var
+
+
+class ModelInitializer(BaseModelInitializer):
+ # SSD-MobileNet BFloat16 inference model initialization
+ args = None
+ custom_args = []
+
+ def __init__(self, args, custom_args=[], platform_util=None):
+ super(ModelInitializer, self).__init__(args, custom_args, platform_util)
+
+ # Set the num_inter_threads and num_intra_threads;
+ # if the user did not provide them, defaults based on the platform are used
+ self.set_num_inter_intra_threads(self.args.num_inter_threads,
+ self.args.num_intra_threads)
+
+ # Set KMP env vars, if they haven't already been set
+ config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json")
+ self.set_kmp_vars(config_file_path)
+
+ benchmark_script = os.path.join(self.args.intelai_models, self.args.mode,
+ "infer_detections.py")
+ self.command_prefix = self.get_command_prefix(self.args.socket_id) \
+ + "{} {}".format(self.python_exe, benchmark_script)
+ set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads)
+
+ self.command_prefix += " -g {0}".format(self.args.input_graph)
+ self.command_prefix += " -i 1000"
+ self.command_prefix += " -w 200"
+ self.command_prefix += " -a {0}".format(self.args.num_intra_threads)
+ self.command_prefix += " -e {0}".format(self.args.num_inter_threads)
+ self.command_prefix += " -p {0}".format(self.args.precision)
+ if self.args.data_location:
+ self.command_prefix += " -d {0}".format(self.args.data_location)
+
+ if self.args.accuracy_only:
+ self.command_prefix += " -r"
+ assert self.args.data_location, "accuracy mode requires --data-location"
+ else:
+ # Multi-batch accuracy check is not supported.
+ self.command_prefix += " -b {0}".format(self.args.batch_size)
+
+ def run(self):
+ # Run script from the tensorflow models research directory
+ self.run_command(self.command_prefix)
diff --git a/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/fp32/model_init.py b/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/fp32/model_init.py index d59e734f6..4bf58188d 100644 --- a/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/fp32/model_init.py +++ b/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/fp32/model_init.py
@@ -41,7 +41,7 @@ def __init__(self, args, custom_args=[], platform_util=None):
 self.set_kmp_vars(config_file_path)
 
 benchmark_script = os.path.join(self.args.intelai_models, self.args.mode,
- self.args.precision, "infer_detections.py")
+ "infer_detections.py")
 self.command_prefix = self.get_command_prefix(self.args.socket_id) \
 + "{} {}".format(self.python_exe, benchmark_script)
 set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads)
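For context, the initializers above do little more than assemble a flat command line and set OMP_NUM_THREADS before launching the benchmark script. A rough standalone equivalent of that pattern (all paths and values are placeholders; the real class resolves them through platform_util and launches via its own run_command helper):

```python
import os
import subprocess

# Placeholder values; the real initializer derives these from launch_benchmark.py args.
python_exe = "python"
benchmark_script = os.path.join("models", "inference", "infer_detections.py")
num_intra_threads, num_inter_threads = 28, 1

# One OpenMP thread per intra-op thread, mirroring set_env_var(...) above.
os.environ["OMP_NUM_THREADS"] = str(num_intra_threads)

cmd = (f"{python_exe} {benchmark_script}"
       f" -g ssd_mobilenet.pb -i 1000 -w 200"   # graph, steps, warmup steps
       f" -a {num_intra_threads} -e {num_inter_threads}"
       f" -p bfloat16 -b 1")                    # precision, batch size
subprocess.run(cmd, shell=True, check=False)    # stand-in for self.run_command(...)
```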
+ self.command_prefix += " -b {0}".format(self.args.batch_size) + + def run(self): + # Run script from the tensorflow models research directory + self.run_command(self.command_prefix) diff --git a/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/fp32/model_init.py b/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/fp32/model_init.py index d59e734f6..4bf58188d 100644 --- a/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/fp32/model_init.py +++ b/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/fp32/model_init.py @@ -41,7 +41,7 @@ def __init__(self, args, custom_args=[], platform_util=None): self.set_kmp_vars(config_file_path) benchmark_script = os.path.join(self.args.intelai_models, self.args.mode, - self.args.precision, "infer_detections.py") + "infer_detections.py") self.command_prefix = self.get_command_prefix(self.args.socket_id) \ + "{} {}".format(self.python_exe, benchmark_script) set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) diff --git a/benchmarks/recommendation/tensorflow/ncf/README.md b/benchmarks/recommendation/tensorflow/ncf/README.md index b14155c9c..a7a44a2d8 100644 --- a/benchmarks/recommendation/tensorflow/ncf/README.md +++ b/benchmarks/recommendation/tensorflow/ncf/README.md @@ -2,6 +2,8 @@ This document has instructions for how to run NCF for the following modes/precisions: +* [FP32 training](#fp32-training-instructions) +* [BFloat16 training](#bfloat16-training-instructions) * [FP32 inference](#fp32-inference-instructions) Instructions and scripts for model training and inference. @@ -10,12 +12,28 @@ Instructions and scripts for model training and inference. 1. Dataset +Support two datasets: ml-1m, ml-20m. It can be specified with flag `dataset=ml-1m` or `dataset=ml-20m`. This model uses official tensorflow models repo, where [ncf](https://github.com/tensorflow/models/tree/master/official/recommendation) model automatically downloads movielens ml-1m dataset as default if the `--data-location` flag is not set. -If you want to download movielens 1M dataset and provide that path to `--data-location`, check this [reference](https://grouplens.org/datasets/movielens/1m/) +If you want to download movielens 1M/20M dataset and provide that path to `--data-location`, check this [reference](https://grouplens.org/datasets/movielens/) -2. Clone the official `tensorflow/models` repository with tag `v1.11` and make a small change to `data_async_generation.py`, commenting out a line that causes a crash in the model script. Store the path to the current directory. +2. Clone the official `tensorflow/models` repository. +For training, please checkout with tag `r2.1_model_reference `: +``` +$ git clone https://github.com/tensorflow/models.git +$ cd models +$ git checkout r2.1_model_reference +``` + +For inference, please checkout with tag `v1.11`: +``` +$ git clone https://github.com/tensorflow/models.git +$ cd models +$ git checkout r2.1_model_reference +``` + +For inference, please checkout with tag `v1.11`: ``` $ MODEL_WORK_DIR=${MODEL_WORK_DIR:=`pwd`} $ pushd $MODEL_WORK_DIR @@ -23,7 +41,6 @@ $ pushd $MODEL_WORK_DIR $ git clone https://github.com/tensorflow/models.git tf_models $ cd tf_models $ git checkout v1.11 -$ sed -i.bak 's/atexit.register/# atexit.register/g' official/recommendation/data_async_generation.py ``` 3. Now clone `IntelAI/models` repository, then navigate to the `benchmarks` folder: @@ -34,7 +51,8 @@ $ git clone https://github.com/IntelAI/models.git $ cd models/benchmarks ``` -4. 
-4. Download and extract the pre-trained model.
+4. Download and extract the pre-trained model. Note that it only works with the ml-1m dataset.
+Skip this step if you are only running training.
 ```
 $ wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_5/ncf_fp32_pretrained_model.tar.gz
 $ tar -xzvf ncf_fp32_pretrained_model.tar.gz
@@ -42,10 +60,75 @@
 5. Run the `launch_benchmark.py` script with the appropriate parameters.
 * `--model-source-dir` - Path to official tensorflow models from step2.
-* `--checkpoint` - Path to checkpoint directory for the Pre-trained model from step4
+* `--checkpoint` - Path to checkpoint directory for the pre-trained model from step4. Checkpoints will be stored in this directory during training.
+
+For training, the suggested options are:
+* `--batch-size 98304`
+* `--precision fp32` or `--precision bfloat16`
+* `dataset=ml-20m` - the ml-20m dataset is suggested for training
+* `clean=1` - delete any files stored in `--checkpoint`. Disable this flag if you want to reuse the pre-trained model.
+* `te=12` - set the max epoch. NCF trains 6+ epochs to SOTA; this flag stops training when the specified epoch is reached. Set it to 1 if you only want to test performance.
+
+```
+$ python launch_benchmark.py \
+ --checkpoint /home//ncf_fp32_pretrained_model \
+ --model-source-dir /home//tensorflow/models \
+ --model-name ncf \
+ --framework tensorflow \
+ --mode training \
+ --precision bfloat16 \
+ --batch-size 98304 \
+ --num-inter-threads 2 \
+ --verbose \
+ --docker-image intelaipg/intel-optimized-tensorflow:1.14 \
+ -- dataset=ml-20m clean=1 te=12
+```
+
+NCF will train 6+ epochs to SOTA. The tail of the training log looks as below when trained to SOTA.
+HR: Hit Ratio; should be >= 0.635 with the ml-20m dataset.
+NDCG: Normalized Discounted Cumulative Gain.
+```
+I0122 12:00:14.874159 140303790921536 ncf_estimator_main.py:179] Iteration 6: HR = 0.6356, NDCG = 0.3787, Loss = 0.1567
+I0122 12:00:14.874222 140303790921536 model_helpers.py:53] Stop threshold of 0.635 was passed with metric value 0.635591685772.
+I0122 12:00:14.874658 140303790921536 mlperf_log.py:136] NCF_RAW_:::MLPv0.5.0 ncf 1579665614.874648094 (ncf_estimator_main.py:187) run_stop: {"success": true}
+NCF_RAW_:::MLPv0.5.0 ncf 1579665614.881932020 (ncf_estimator_main.py:193) run_final
+I0122 12:00:14.881944 140303790921536 mlperf_log.py:134] NCF_RAW_:::MLPv0.5.0 ncf 1579665614.881932020 (ncf_estimator_main.py:193) run_final
+```
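For readers unfamiliar with the two metrics in this log: for each user, evaluation ranks one held-out positive item among sampled negatives and then averages the per-user scores. An illustrative sketch of the standard definitions (the rank values are made up):

```python
import math

def hit_ratio(rank, k=10):
    # 1 if the held-out item lands in the top-k recommendations
    return 1.0 if rank <= k else 0.0

def ndcg(rank, k=10):
    # discounted gain: rewards ranking the held-out item nearer the top
    return 1.0 / math.log2(rank + 1) if rank <= k else 0.0

ranks = [1, 3, 12, 2]  # hypothetical rank of the true item for four users
print("HR@10  :", sum(hit_ratio(r) for r in ranks) / len(ranks))   # 0.75
print("NDCG@10:", sum(ndcg(r) for r in ranks) / len(ranks))
```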
-For batch inference, `--batch-size 256`, `--socket-id 0`, `--checkpoint` path from step5, `--model-source-dir` path from step2
+For batch inference, use `--batch-size 256` and `--socket-id 0`:
 
 ```
 $ python launch_benchmark.py \
@@ -75,7 +158,7 @@
 Average recommendations/sec across 23594 steps: 903932.8 (0.28381 msec/batch)
 ...
 ```
 
-For online inference, `--batch-size 1`, `--socket-id 0`, `--checkpoint` path from step5, `--model-source-dir` path from step2
+For online inference, use `--batch-size 1` and `--socket-id 0`:
 
 ```
 $ python launch_benchmark.py \
@@ -98,7 +181,8 @@
 The tail of online inference log, looks as below.
 Average recommendations/sec across 6040001 steps: 4573.0 (0.21920 msec/batch)
 ...
 ```
-For Accuracy, `--batch-size 256`, `--socket-id 0`, `--checkpoint` path from step5, `--model-source-dir` path from step2
+
+For Accuracy, use `--batch-size 256` and `--socket-id 0`:
 
 ```
 $ python launch_benchmark.py \
diff --git a/benchmarks/recommendation/tensorflow/wide_deep_large_ds/README.md b/benchmarks/recommendation/tensorflow/wide_deep_large_ds/README.md index dd75e2db4..0983917e9 100755 --- a/benchmarks/recommendation/tensorflow/wide_deep_large_ds/README.md +++ b/benchmarks/recommendation/tensorflow/wide_deep_large_ds/README.md
@@ -20,24 +20,25 @@ following modes/precisions:
 $ git clone https://github.com/IntelAI/models.git
 ```
 
-2. Download large Kaggle Display Advertising Challenge Dataset from
- http://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/
+2. Download large Kaggle Display Advertising Challenge Dataset
 
- Note: The dataset does not contain the eval.txt file required for measuring model accuracy. So, download the evaluation
- dataset for accuracy measurement from https://storage.googleapis.com/dataset-uploader/criteo-kaggle/large_version/eval.csv
- Download the train dataset from https://storage.googleapis.com/dataset-uploader/criteo-kaggle/large_version/train.csv
+ Download large Kaggle Display Advertising Challenge Dataset from http://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/
+
+ The evaluation dataset for accuracy measurement is not available at the above link; it can be downloaded from https://storage.googleapis.com/dataset-uploader/criteo-kaggle/large_version/eval.csv
+
+ Download the train dataset (in CSV format) from https://storage.googleapis.com/dataset-uploader/criteo-kaggle/large_version/train.csv
 
 3. Pre-process the downloaded dataset to tfrecords using [preprocess_csv_tfrecords.py](/models/recommendation/tensorflow/wide_deep_large_ds/dataset/preprocess_csv_tfrecords.py)
- Copy the eval.csv and test.csv into your current working directory (i.e. root of models repo) and launch
+ Copy the eval.csv and test.csv into your current working directory (i.e. root of models repo) and launch.
This preprocessing step requires the Pandas module to be installed.
 
- * Launch docker
+ * Launch docker
 ```
 $ cd $MODEL_WORK_DIR/models/
 $ docker run -it --privileged -u root:root \
 -w /models \
 --volume $MODEL_WORK_DIR:/models \
- intelaipg/intel-optimized-tensorflow:latest-prs-bdw \
+ intel/intel-optimized-tensorflow:1.15.2 \
 /bin/bash
 ```
@@ -45,16 +46,32 @@ following modes/precisions:
 ```
 apt-get install python-pandas
 pip install pandas
+ ```
+
+ * If the package fetch fails, first run:
+ ```
+ apt-get update
+ ```
+ * Now run the data preprocessing step:
+ ```
 cd models
 python models/recommendation/tensorflow/wide_deep_large_ds/dataset/preprocess_csv_tfrecords.py \
 --inputcsv-datafile eval.csv \
 --calibrationcsv-datafile train.csv \
 --outputfile-name preprocessed_eval
 ```
-
+
+ * Process the train dataset for model quantization
+ ```
+ python models/recommendation/tensorflow/wide_deep_large_ds/dataset/preprocess_csv_tfrecords.py \
+ --inputcsv-datafile train.csv \
+ --calibrationcsv-datafile eval.csv \
+ --outputfile-name preprocessed_train
+ ```
 * Process test dataset
+ If you have a test dataset without true labels, the following command can be used to generate the processed test set on which you can run inference. The test.txt file is in tab-separated values (TSV) format and must be converted into comma-separated values (CSV) format before pre-processing. On the docker console, run the commands below to pre-process the test dataset
 ```
 $ tr '\t' ',' < test.txt > test.csv
 $ python models/recommendation/tensorflow/wide_deep_large_ds/dataset/preprocess_csv_tfrecords.py \
 --inputcsv-datafile test.csv \
 --outputfile-name preprocessed_test
 ```
- Now preprocessed eval and test datasets will be stored as eval_preprocessed_eval.tfrecords and test_preprocessed_test.tfrecords in $MODEL_WORK_DIR/models/ directory
+ The preprocessed eval, train, and test datasets will be stored as eval_preprocessed_eval.tfrecords, train_preprocessed_train.tfrecords, and test_preprocessed_test.tfrecords, respectively, in the $MODEL_WORK_DIR/models/ directory
 
 4. Exit out of docker once the dataset pre-processing completes.
 ```
@@ -95,16 +112,16 @@ when calling `launch_benchmark.py` and the script will run without TCMalloc.
 --mode inference \
 --framework tensorflow \
 --batch-size 1000 \
- --socket-id 0 \
 --accuracy-only \
- --docker-image intel/intel-optimized-tensorflow:2.3.0 \
+ --docker-image intel/intel-optimized-tensorflow:1.15.2 \
 --in-graph $MODEL_WORK_DIR/wide_deep_int8_pretrained_model.pb \
 --data-location $MODEL_WORK_DIR/models/eval_preprocessed_eval.tfrecords
 ```
 
 3. Run Performance test
 
- * Running in online inference mode, set `--batch-size 1`
+
+ * To run in online inference mode and measure latency, set `--batch-size 1`
 
 ```
 $ cd $MODEL_WORK_DIR/models/benchmarks
 
 $ python launch_benchmark.py \
 --model-name wide_deep_large_ds \
 --precision int8 \
 --mode inference \
 --framework tensorflow \
 --benchmark-only \
 --batch-size 1 \
- --socket-id 0 \
- --docker-image intel/intel-optimized-tensorflow:2.3.0 \
+ --docker-image intel/intel-optimized-tensorflow:1.15.2 \
 --in-graph $MODEL_WORK_DIR/wide_deep_int8_pretrained_model.pb \
 --data-location $MODEL_WORK_DIR/models/eval_preprocessed_eval.tfrecords \
- --num-intra-threads 1 --num-inter-threads 1 --num-cores 1 \
+ --num-intra-threads 1 --num-inter-threads 1 \
 -- num_omp_threads=1
 ```
- * Running in batch inference mode, set `--batch-size 512`
-
- Case 1 : Disabling `use_parallel_batches` option. In this case the batches are inferred in sequential order. By default `use_parallel_batches` is disabled. Kmp variables can also be set by using the arguments shown below.
-
+ * Running in batch inference mode, set `--batch-size 512` \
+ numactl is a utility that can be used to control the NUMA policy for processes or shared memory. To install numactl, run:
+ ```
+ apt install numactl
+ ```
+ By default numactl is disabled. In advanced cases, users can apply NUMA policy control on bare metal to specify the cores to run on, as shown in the command below. The commands below give the best performance on 28 cores. The hyperparameters num-intra-threads, num-inter-threads, num_omp_threads, etc. should be tuned for best performance.
+
+ Case 1 : Disabling `use_parallel_batches` option. In this case the batches are inferred in sequential order. By default `use_parallel_batches` is disabled. Kmp variables can also be set by using the arguments shown below or through the config file $MODEL_WORK_DIR/models/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/int8/config.json
+
+
+ ```
 $ cd $MODEL_WORK_DIR/models/benchmarks
 
- $ python launch_benchmark.py \
+ $ numactl --physcpubind=0-27 -m 0 python launch_benchmark.py \
 --model-name wide_deep_large_ds \
 --precision int8 \
 --mode inference \
 --framework tensorflow \
 --benchmark-only \
 --batch-size 512 \
- --socket-id 0 \
- --docker-image intel/intel-optimized-tensorflow:2.3.0 \
 --in-graph $MODEL_WORK_DIR/wide_deep_int8_pretrained_model.pb \
 --data-location $MODEL_WORK_DIR/models/eval_preprocessed_eval.tfrecords \
- --num-intra-threads 28 --num-inter-threads 1 --num-cores 28 \
- -- num_omp_threads=16 kmp_block_time=0 kmp_settings=1 kmp_affinity="noverbose,warnings,respect,granularity=core,none"
-
- ```
+ --num-intra-threads 28 --num-inter-threads 1 \
+ -- num_omp_threads=16 kmp_block_time=1 kmp_settings=1
+ ```
+ * The log file is saved to the value of `--output-dir`. The tail of the log output when the script completes
+ should look something like this:
+
+ ```
+ --------------------------------------------------
+ Total test records : 2000000
+ Batch size is : 512
+ Number of batches : 3907
+ Classification accuracy (%) : 77.636
+ Inference duration (seconds) : 3.1534
+ Average Latency (ms/batch) : 0.8285
+ Throughput is (records/sec) : 617992.762
+ --------------------------------------------------
+
+ Ran inference with batch size 512
+ Log location outside container: {--output-dir value}/benchmark_wide_deep_large_ds_inference_int8_20190225_061815.log
+ ```
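The summary fields in this log are related by simple arithmetic; a back-of-the-envelope version is below. It will not reproduce the log exactly, since the script's own accounting may exclude warm-up batches, but it shows how the numbers fit together:

```python
# Inputs shaped like the log above (values are illustrative).
total_records = 2_000_000
batch_size = 512
inference_duration_s = 3.1534

num_batches = -(-total_records // batch_size)        # ceiling division -> 3907
latency_ms_per_batch = inference_duration_s * 1000 / num_batches
throughput_records_per_s = total_records / inference_duration_s

print(num_batches, round(latency_ms_per_batch, 4), round(throughput_records_per_s, 3))
```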
+ Case 2 : Enabling `use_parallel_batches` option. In this case multiple batches are inferred in parallel. The number of batches to execute in parallel can be given by the argument num_parallel_batches.
 
+ ```
 $ cd $MODEL_WORK_DIR/models/benchmarks
 
- $ python launch_benchmark.py \
+ $ numactl --physcpubind=0-27 -m 0 python launch_benchmark.py \
 --model-name wide_deep_large_ds \
 --precision int8 \
 --mode inference \
 --framework tensorflow \
 --benchmark-only \
 --batch-size 512 \
- --socket-id 0 \
- --docker-image intel/intel-optimized-tensorflow:2.3.0 \
 --in-graph $MODEL_WORK_DIR/wide_deep_int8_pretrained_model.pb \
 --data-location $MODEL_WORK_DIR/models/eval_preprocessed_eval.tfrecords \
- --num-intra-threads 1 --num-inter-threads 28 --num-cores 28 \
- -- num_omp_threads=1 use_parallel_batches=True num_parallel_batches=28 kmp_block_time=0 kmp_settings=1 kmp_affinity="noverbose,warnings,respect,granularity=core,none"
- ```
+ --num-intra-threads 1 --num-inter-threads 28 \
+ -- num_omp_threads=1 use_parallel_batches=True num_parallel_batches=28 kmp_block_time=0 kmp_settings=1
+ ```
+
+ * The log file is saved to the value of `--output-dir`. The tail of the log output when the script completes
+ should look something like this:
+
+ ```
+ --------------------------------------------------
+ Total test records : 2000000
+ Batch size is : 512
+ Number of batches : 3907
+ Inference duration (seconds) : 1.7056
+ Average Latency (ms/batch) : 12.2259
+ Throughput is (records/sec) : 1172597.21
+ --------------------------------------------------
+ Ran inference with batch size 512
+ Log location outside container: {--output-dir value}/benchmark_wide_deep_large_ds_inference_int8_20190225_061815.log
+ ```
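Conceptually, `use_parallel_batches=True` with `num_parallel_batches=28` dispatches 28 batches concurrently (one per inter-op thread) instead of one at a time, which raises throughput at the cost of per-batch latency, as the two logs above show. A toy sketch of that dispatch pattern, with the actual inference call stubbed out:

```python
from concurrent.futures import ThreadPoolExecutor

def infer_batch(batch):
    # Stand-in for one forward pass over a 512-record batch.
    return len(batch)

batches = [[0] * 512 for _ in range(28)]           # num_parallel_batches=28
with ThreadPoolExecutor(max_workers=28) as pool:   # cf. --num-inter-threads 28
    results = list(pool.map(infer_batch, batches))
print(sum(results), "records inferred")
```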
 4. To return to where you started from:
 ```
 $ popd
 ```
@@ -186,13 +237,12 @@
 ```
 $ cd $MODEL_WORK_DIR/models/benchmarks
 
- $ python launch_benchmark.py
+ $ python launch_benchmark.py \
 --model-name wide_deep_large_ds \
 --precision fp32 \
 --mode inference \
 --framework tensorflow \
 --batch-size 1000 \
- --socket-id 0 \
 --accuracy-only \
 --docker-image intel/intel-optimized-tensorflow:2.3.0 \
 --in-graph $MODEL_WORK_DIR/wide_deep_fp32_pretrained_model.pb \
 --data-location $MODEL_WORK_DIR/models/eval_preprocessed_eval.tfrecords
 ```
 
 3. Run Performance test
 
- * Running in online inference mode, set `--batch-size 1`
+ * To run in online inference mode and measure latency, set `--batch-size 1`
 
 ```
 $ cd $MODEL_WORK_DIR/models/benchmarks
 
- $ python launch_benchmark.py
+ $ python launch_benchmark.py \
 --model-name wide_deep_large_ds \
 --precision fp32 \
 --mode inference \
 --framework tensorflow \
 --benchmark-only \
 --batch-size 1 \
- --socket-id 0 \
 --docker-image intel/intel-optimized-tensorflow:2.3.0 \
 --in-graph $MODEL_WORK_DIR/wide_deep_fp32_pretrained_model.pb \
 --data-location $MODEL_WORK_DIR/models/eval_preprocessed_eval.tfrecords \
- --num-intra-threads 1 --num-inter-threads 1 --num-cores 1 \
+ --num-intra-threads 1 --num-inter-threads 1 \
 -- num_omp_threads=1
 ```
 
 * Running in batch inference mode, set `--batch-size 512`
+ By default numactl is disabled. Users can specify it as shown in the command below. The commands below give the best performance on 28 cores. The hyperparameters num-intra-threads, num-inter-threads, num_omp_threads, etc. should be tuned for best performance.
+
+ Case 1 : Disabling `use_parallel_batches` option. Kmp variables can also be set by using the arguments shown below or through the config file $MODEL_WORK_DIR/models/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/fp32/config.json
 
- Case 1 : Disabling `use_parallel_batches` option
-
- ```
+ ```
 $ cd $MODEL_WORK_DIR/models/benchmarks
 
- $ python launch_benchmark.py
+ $ numactl --physcpubind=0-27 -m 0 python launch_benchmark.py \
 --model-name wide_deep_large_ds \
 --precision fp32 \
 --mode inference \
 --framework tensorflow \
 --benchmark-only \
 --batch-size 512 \
- --socket-id 0 \
- --docker-image intel/intel-optimized-tensorflow:2.3.0 \
 --in-graph $MODEL_WORK_DIR/wide_deep_fp32_pretrained_model.pb \
 --data-location $MODEL_WORK_DIR/models/eval_preprocessed_eval.tfrecords \
- --num-intra-threads 28 --num-inter-threads 1 --num-cores 28 \
- -- num_omp_threads=20 kmp_block_time=0 kmp_settings=1 kmp_affinity="noverbose,warnings,respect,granularity=core,none"
- ```
+ --num-intra-threads 28 --num-inter-threads 1 \
+ -- num_omp_threads=20 kmp_block_time=1 kmp_settings=1
+ ```
+ ```
+ --------------------------------------------------
+ Total test records : 2000000
+ Batch size is : 512
+ Number of batches : 3907
+ Classification accuracy (%) : 77.6693
+ No of correct predictions : 1553386
+ Inference duration (seconds) : 6.4442
+ Average Latency (ms/batch) : 1.6931
+ Throughput is (records/sec) : 302410.809
+ --------------------------------------------------
+ ```
 
 Case 2 : Enabling `use_parallel_batches` option.
 
- ```
- cd /home//models/benchmarks
+ ```
+ cd $MODEL_WORK_DIR/models/benchmarks
 
- python launch_benchmark.py
+ numactl --physcpubind=0-27 -m 0 python launch_benchmark.py \
 --model-name wide_deep_large_ds \
 --precision fp32 \
 --mode inference \
 --framework tensorflow \
 --benchmark-only \
 --batch-size 512 \
- --socket-id 0 \
- --docker-image intel/intel-optimized-tensorflow:2.3.0 \
 --in-graph $MODEL_WORK_DIR/wide_deep_fp32_pretrained_model.pb \
 --data-location $MODEL_WORK_DIR/models/eval_preprocessed_eval.tfrecords \
- --num-intra-threads 1 --num-inter-threads 28 --num-cores 28 \
- -- num_omp_threads=1 use_parallel_batches=True num_parallel_batches=28 kmp_block_time=0 kmp_settings=1 kmp_affinity="noverbose,warnings,respect,granularity=core,none"
- ```
+ --num-intra-threads 1 --num-inter-threads 28 \
+ -- num_omp_threads=1 use_parallel_batches=True num_parallel_batches=28 kmp_block_time=0 kmp_settings=1
+ ```
+
+ * The log file is saved to the value of `--output-dir`. The tail of the log output when the script completes
+ should look something like this:
+ ```
+ --------------------------------------------------
+ Total test records : 2000000
+ Batch size is : 512
+ Number of batches : 3907
+ Classification accuracy (%) : 77.6693
+ No of correct predictions : 1553386
+ Inference duration (seconds) : 3.4655
+ Average Latency (ms/batch) : 24.8406
+ Throughput is (records/sec) : 577120.456
+ --------------------------------------------------
+ Ran inference with batch size 512
+ Log location outside container: {--output-dir value}/benchmark_wide_deep_large_ds_inference_fp32_20190225_062206.log
+ ```
+
 4. To return to where you started from:
 ```
 $ popd
 ```
 
 ## FP32 Training Instructions
-1. Download large Kaggle Display Advertising Challenge Dataset from
- http://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/
+1. Download large Kaggle Display Advertising Challenge Dataset
 
- Download the large version of train dataset from https://storage.googleapis.com/dataset-uploader/criteo-kaggle/large_version/train.csv
-
- Download the large version of evaluation dataset from https://storage.googleapis.com/dataset-uploader/criteo-kaggle/large_version/eval.csv
-
+ Download large Kaggle Display Advertising Challenge Dataset from http://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/
+ The evaluation dataset for accuracy measurement is not available at the above link; it can be downloaded from https://storage.googleapis.com/dataset-uploader/criteo-kaggle/large_version/eval.csv
+
+ Download the large version of the train dataset (in CSV format) from https://storage.googleapis.com/dataset-uploader/criteo-kaggle/large_version/train.csv
+
 2. Train Wide and Deep model by providing location of train.csv, eval.csv
 
- * Train the model (The model will be trained for 10 epochs if -- steps is not specified)
+ * Train the model (The model will be trained for 10 epochs if -- steps is not specified). The --output-dir arg can be used to specify the directory where checkpoints and the saved model will be written.
 ```
+ cd $MODEL_WORK_DIR/models/benchmarks
+
 $ python launch_benchmark.py --model-name wide_deep_large_ds \
 --precision fp32 \
 --mode training \
 --framework tensorflow \
 --batch-size 512 \
- --data-location /root/dataset \
+ --data-location $MODEL_WORK_DIR \
 --docker-image intel/intel-optimized-tensorflow:2.3.0
 ```
+ Once the training completes successfully, the paths of the checkpoint files and saved_model.pb will be printed as shown below
+ ```
+ INFO:tensorflow:SavedModel written to: home//temp-1602670603/saved_model.pb
+ Using TensorFlow version 2.3.0
+ Begin training and evaluation
+ Saving model checkpoints to /home//model_WIDE_AND_DEEP_1602670581
+ ****Computing statistics of train dataset*****
+ estimator built
+ fit done
+ evaluate done
+ Model exported to home/
+ ```
+
+
\ No newline at end of file
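For orientation, the model trained here joins a linear ("wide") part and a DNN ("deep") part trained jointly, which matches the "estimator built" line in the log. A skeletal sketch with placeholder feature columns (the real script derives its columns, 13 numeric and 26 categorical, from the Criteo data; the layer sizes below are illustrative):

```python
import tensorflow as tf

# Placeholder feature columns standing in for the Criteo features.
wide_columns = [tf.feature_column.categorical_column_with_hash_bucket(
    "c1", hash_bucket_size=1000)]
deep_columns = [tf.feature_column.numeric_column("i1")]

estimator = tf.estimator.DNNLinearCombinedClassifier(
    model_dir="model_WIDE_AND_DEEP",      # cf. the checkpoint path in the log above
    linear_feature_columns=wide_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[1024, 512, 256])    # illustrative layer sizes
```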
diff --git a/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/fp32/model_init.py b/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/fp32/model_init.py index 97a3baec0..93b2e193c 100755 --- a/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/fp32/model_init.py +++ b/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/fp32/model_init.py
@@ -43,9 +43,6 @@ def parse_args(self):
 parser.add_argument('--kmp_block_time', dest='kmp_block_time',
 help='number of kmp block time.',
 type=str, default=None)
- parser.add_argument('--kmp_affinity', dest='kmp_affinity',
- help='kmp affinity value',
- type=str, default=None)
 parser.add_argument('--kmp_settings', dest='kmp_settings',
 help='kmp settings',
 type=str, default=None)
@@ -53,7 +50,7 @@
 namespace=self.args)
 config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json")
 self.set_kmp_vars(config_file_path, kmp_settings=str(self.args.kmp_settings),
- kmp_blocktime=str(self.args.kmp_block_time), kmp_affinity=str(self.args.kmp_affinity))
+ kmp_blocktime=str(self.args.kmp_block_time))
 
 def run_benchmark(self):
 enable_parallel_batches = getattr(self.args, 'use_parallel_batches')
@@ -66,9 +63,6 @@ def run_benchmark(self):
 else:
 benchmark_script = os.path.join(self.args.intelai_models, self.args.mode, "inference.py")
 command_prefix = self.get_command_prefix(-1)
- if self.args.socket_id != -1 and self.args.num_cores != -1:
- command_prefix = command_prefix + " numactl
diff --git a/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/fp32/model_init.py b/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/fp32/model_init.py
index 97a3baec0..93b2e193c 100755
--- a/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/fp32/model_init.py
+++ b/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/fp32/model_init.py
@@ -43,9 +43,6 @@ def parse_args(self):
         parser.add_argument('--kmp_block_time', dest='kmp_block_time',
                             help='number of kmp block time.',
                             type=str, default=None)
-        parser.add_argument('--kmp_affinity', dest='kmp_affinity',
-                            help='kmp affinity value',
-                            type=str, default=None)
         parser.add_argument('--kmp_settings', dest='kmp_settings',
                             help='kmp settings',
                             type=str, default=None)
@@ -53,7 +50,7 @@ def parse_args(self):
                                            namespace=self.args)
         config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json")
         self.set_kmp_vars(config_file_path, kmp_settings=str(self.args.kmp_settings),
-                          kmp_blocktime=str(self.args.kmp_block_time), kmp_affinity=str(self.args.kmp_affinity))
+                          kmp_blocktime=str(self.args.kmp_block_time))

 def run_benchmark(self):
     enable_parallel_batches = getattr(self.args, 'use_parallel_batches')
@@ -66,9 +63,6 @@ def run_benchmark(self):
         else:
             benchmark_script = os.path.join(self.args.intelai_models, self.args.mode, "inference.py")
         command_prefix = self.get_command_prefix(-1)
-        if self.args.socket_id != -1 and self.args.num_cores != -1:
-            command_prefix = command_prefix + " numactl --physcpubind=0-{} --membind={} ".\
-                format(str(int(self.args.num_cores) - 1), self.args.socket_id)
         cmd_prefix = command_prefix + self.python_exe + " " + benchmark_script
         cmd = self.add_args_to_command(cmd_prefix, script_args_list)
         self.run_command(cmd)
diff --git a/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/int8/model_init.py b/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/int8/model_init.py
index 812d60c12..0976a1a73 100755
--- a/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/int8/model_init.py
+++ b/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/int8/model_init.py
@@ -43,9 +43,6 @@ def parse_args(self):
         parser.add_argument('--kmp_block_time', dest='kmp_block_time',
                             help='number of kmp block time.',
                             type=str, default=None)
-        parser.add_argument('--kmp_affinity', dest='kmp_affinity',
-                            help='kmp affinity value',
-                            type=str, default=None)
         parser.add_argument('--kmp_settings', dest='kmp_settings',
                             help='kmp settings',
                             type=str, default=None)
@@ -53,7 +50,7 @@ def parse_args(self):
                                            namespace=self.args)
         config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json")
         self.set_kmp_vars(config_file_path, kmp_settings=str(self.args.kmp_settings),
-                          kmp_blocktime=str(self.args.kmp_block_time), kmp_affinity=str(self.args.kmp_affinity))
+                          kmp_blocktime=str(self.args.kmp_block_time))

 def run_benchmark(self):
     enable_parallel_batches = getattr(self.args, 'use_parallel_batches')
@@ -66,9 +63,6 @@ def run_benchmark(self):
         else:
             benchmark_script = os.path.join(self.args.intelai_models, self.args.mode, "inference.py")
         command_prefix = self.get_command_prefix(-1)
-        if self.args.socket_id != -1 and self.args.num_cores != -1:
-            command_prefix = command_prefix + " numactl --physcpubind=0-{} --membind={} ".\
-                format(str(int(self.args.num_cores) - 1), self.args.socket_id)
         cmd_prefix = command_prefix + self.python_exe + " " + benchmark_script
         cmd = self.add_args_to_command(cmd_prefix, script_args_list)
         self.run_command(cmd)
diff --git a/dockerfiles/model_containers/intel-python-dlrm-bf16-training.Dockerfile b/dockerfiles/model_containers/intel-python-dlrm-bf16-training.Dockerfile
deleted file mode 100644
index cdf297074..000000000
--- a/dockerfiles/model_containers/intel-python-dlrm-bf16-training.Dockerfile
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) 2020 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-#
-# THIS IS A GENERATED DOCKERFILE.
-#
-# This file was assembled from multiple pieces, whose use is documented
-# throughout. Please refer to the TensorFlow dockerfiles documentation
-# for more information.
- -ARG INTEL_PYTHON_TAG=latest -FROM intelpython/intelpython3_core:$INTEL_PYTHON_TAG - -RUN conda install -y -c intel/label/oneapibeta pytorch - -RUN conda install -y -c intel/label/oneapibeta intel-extension-for-pytorch - -RUN conda install -y -c intel/label/oneapibeta torch_ccl -ARG PYTHON_VERSION=3.7 -ENV LD_LIBRARY_PATH="/opt/conda/lib/python${PYTHON_VERSION}/site-packages/ccl/lib/:${LD_LIBRARY_PATH}" - -RUN python -m pip install onnx && \ - python -m pip install -e git+https://github.com/mlperf/logging@0.7.0-rc2#egg=logging && \ - conda install -y -c intel scikit-learn && \ - conda install -c conda-forge gperftools && \ - conda clean -a \ diff --git a/docs/container_portal/dl/pytorch/ipex-pytorch.md b/docs/container_portal/dl/pytorch/ipex-pytorch.md deleted file mode 100644 index c7a3d1153..000000000 --- a/docs/container_portal/dl/pytorch/ipex-pytorch.md +++ /dev/null @@ -1,52 +0,0 @@ - -# Intel IPEX container - -## Pull Command - -``` -docker pull intel/intel-optimized-pytorch:1.5.0-rc3-ipex-latest -``` - -## Tags & Pull Commands for Other Versions - -| OS | Target | Version | Size | Updated | Pull Command | -| --- | ------ | ------- | ---- | ------- | ------------ | -| ubuntu:20.04 | | 1.5.0-rc3-ipex-latest | 708.77MB | | `docker pull intel/intel-optimized-pytorch:1.5.0-rc3-ipex-latest` | - -## Description -Intel® Extensions for PyTorch extends the original PyTorch framework by creating extensions that optimize performance of Deep Learning models. This container contains PyTorch v1.5.1, Intel® Extensions for Pytorch and, IPEX version 0.2. -## Documentation and Sources - -- [Docker Repo]() -- [Main Github](https://github.com/intel/intel-extension-for-pytorch) -- [Readme](https://github.com/intel/intel-extension-for-pytorch/blob/master/docker/README.md) -- [Dockerfile](https://github.com/intel/intel-extension-for-pytorch/blob/master/docker/Dockerfile) - -## License Agreement -LEGAL NOTICE: By accessing, downloading or using this software and any required dependent software (the “Software Package”), -you agree to the terms and conditions of the software license agreements for the Software Package, which may also include notices, -disclaimers, or license terms for third party software included with the Software Package.  -Please refer to the [license file](https://github.com/intel/intel-extension-for-pytorch/blob/master/LICENSE.txt) for additional details. - -## Metadata -This is for internal use on the Intel® oneContainer Portal. - -- SEO Keyword: [PyTorch] -- Search/Browser Title: [Intel® Extension for PyTorch\*] -- Search Description: [This container contains PyTorch\*, Intel® Extension for PyTorch\*, and torchvision 0.6.] -- Short Title: [Intel® Extension for PyTorch\*] -- Short Description: [This container contains PyTorch\*, Intel® Extension for PyTorch\*, and torchvision 0.6.] 
-- Intel Keywords: [PyTorch, Extension, Deep Learning] -- OS Tags (choose at least one): - - [ ] Ubuntu\* -- Platform Tags (can choose multiple): - - [ ] CPU -- Use Case Tags (can choose multiple): - - [ ] AI Inference - - [ ] AI Training - - [ ] Cloud Computing -- App Domain Tags (can choose multiple): - - [ ] Artificial Intelligence - - [ ] Data Center - - [ ] High Performance Computing (HPC) - - [ ] Cloud & Edge Computing diff --git a/k8s/image_recognition/tensorflow/resnet50v1_5/inference/fp32/README.md b/k8s/image_recognition/tensorflow/resnet50v1_5/inference/fp32/README.md index 3485a6e4c..dd17f8f1e 100644 --- a/k8s/image_recognition/tensorflow/resnet50v1_5/inference/fp32/README.md +++ b/k8s/image_recognition/tensorflow/resnet50v1_5/inference/fp32/README.md @@ -209,23 +209,6 @@ deployment, and other resources using the following commands: kubectl delete -f serving.yaml ``` - -## TroubleShooting - -- Pod doesn't start. Status is ErrImagePull.
- Docker recently implemented rate limits.
- See this [note](https://thenewstack.io/docker-hub-limits-what-they-are-and-how-to-route-around-them/) about rate limits and work-arounds. - -- Argo workflow steps do not execute.
- Error from `argo get ` is 'failed to save outputs: Failed to establish pod watch: timed out waiting for the condition'.
- See this argo [issue](https://github.com/argoproj/argo/issues/4186). This is due to the workflow running as non-root.
- Devops will need to change the workflow-executor to k8sapi as described [here](https://github.com/argoproj/argo/blob/master/docs/workflow-executors.md). - -- MpiOperator can't create workers. Error is '/bin/sh: /etc/hosts: Permission denied'. This is due to a bug in mpi-operator in the 'latest' container image - when the workers run as non-root. See this [issue](https://github.com/kubeflow/mpi-operator/issues/288).
- Use the container images: mpioperator/mpi-operator:v02.3 and mpioperator/kubectl-delivery:v0.2.3. - - ## License diff --git a/k8s/image_recognition/tensorflow/resnet50v1_5/inference/fp32/mlops/serving/.gitignore b/k8s/image_recognition/tensorflow/resnet50v1_5/inference/fp32/mlops/serving/.gitignore new file mode 100644 index 000000000..c1adae73d --- /dev/null +++ b/k8s/image_recognition/tensorflow/resnet50v1_5/inference/fp32/mlops/serving/.gitignore @@ -0,0 +1,2 @@ +Makefile +serving.yaml diff --git a/k8s/image_recognition/tensorflow/resnet50v1_5/inference/fp32/mlops/serving/Krmfile b/k8s/image_recognition/tensorflow/resnet50v1_5/inference/fp32/mlops/serving/Krmfile new file mode 100644 index 000000000..44407c8c4 --- /dev/null +++ b/k8s/image_recognition/tensorflow/resnet50v1_5/inference/fp32/mlops/serving/Krmfile @@ -0,0 +1,198 @@ +apiVersion: config.k8s.io/v1alpha1 +kind: Krmfile +openAPI: + definitions: + io.k8s.cli.setters.MODEL_NAME: + description: model name + x-k8s-cli: + setter: + name: MODEL_NAME + value: resnet50v1_5 + isSet: true + setBy: model-builder + io.k8s.cli.setters.MODEL_SERVING_NAME: + description: model serving name + x-k8s-cli: + setter: + name: MODEL_SERVING_NAME + value: resnet50v1-5-fp32-inference + isSet: true + setBy: model-builder + io.k8s.cli.setters.MODEL_SERVING_LABEL: + description: selector label + x-k8s-cli: + setter: + name: MODEL_SERVING_LABEL + value: resnet50v1-5-fp32-server + isSet: true + setBy: model-builder + io.k8s.cli.setters.MODEL_SERVING_IMAGE_NAME: + description: image name + x-k8s-cli: + setter: + name: MODEL_SERVING_IMAGE_NAME + value: intel/intel-optimized-tensorflow-serving + isSet: true + setBy: model-builder + io.k8s.cli.setters.MODEL_SERVING_IMAGE_VERSION: + description: image tag + x-k8s-cli: + setter: + name: MODEL_SERVING_IMAGE_VERSION + value: 2.3.0 + isSet: true + setBy: model-builder + io.k8s.cli.setters.REGISTRY: + description: image location + x-k8s-cli: + setter: + name: REGISTRY + value: docker.io + isSet: true + setBy: model-builder + io.k8s.cli.substitutions.IMAGE: + description: image name + x-k8s-cli: + substitution: + name: IMAGE + pattern: ${REGISTRY}/${MODEL_SERVING_IMAGE_NAME}:${MODEL_SERVING_IMAGE_VERSION}${IMAGE_SUFFIX} + values: + - marker: ${REGISTRY} + ref: '#/definitions/io.k8s.cli.setters.REGISTRY' + - marker: ${MODEL_SERVING_IMAGE_NAME} + ref: '#/definitions/io.k8s.cli.setters.MODEL_SERVING_IMAGE_NAME' + - marker: ${MODEL_SERVING_IMAGE_VERSION} + ref: '#/definitions/io.k8s.cli.setters.MODEL_SERVING_IMAGE_VERSION' + - marker: ${IMAGE_SUFFIX} + ref: '#/definitions/io.k8s.cli.setters.IMAGE_SUFFIX' + io.k8s.cli.setters.MODEL_DIR: + description: mounted model directory + x-k8s-cli: + setter: + name: MODEL_DIR + value: /models + isSet: true + setBy: model-builder + required: true + io.k8s.cli.setters.MODEL_BASE_NAME: + description: base directory name + x-k8s-cli: + setter: + name: MODEL_BASE_NAME + value: savedmodels + isSet: true + setBy: model-builder + required: true + io.k8s.cli.substitutions.MODEL_BASE_PATH: + description: model base path + x-k8s-cli: + substitution: + name: MODEL_BASE_PATH + pattern: ${MODEL_DIR}/${MODEL_BASE_NAME} + values: + - marker: ${MODEL_DIR} + ref: '#/definitions/io.k8s.cli.setters.MODEL_DIR' + - marker: ${MODEL_BASE_NAME} + ref: '#/definitions/io.k8s.cli.setters.MODEL_BASE_NAME' + io.k8s.cli.setters.MODEL_PORT: + type: integer + description: model container port + x-k8s-cli: + setter: + name: MODEL_PORT + value: "8500" + isSet: true + setBy: model-builder + required: true + 
io.k8s.cli.setters.MODEL_SERVICE_PORT: + type: integer + description: model service port + x-k8s-cli: + setter: + name: MODEL_SERVICE_PORT + value: "8501" + isSet: true + setBy: model-builder + required: true + io.k8s.cli.setters.REPLICAS: + type: integer + description: number of replicas + x-k8s-cli: + setter: + name: REPLICAS + value: "3" + isSet: true + setBy: model-builder + required: true + io.k8s.cli.setters.IMAGE_SUFFIX: + description: appended to image name + x-k8s-cli: + setter: + name: IMAGE_SUFFIX + value: "" + isSet: true + setBy: model-builder + io.k8s.cli.setters.USER_ID: + type: integer + description: process owner id + x-k8s-cli: + setter: + name: USER_ID + value: "0" + isSet: true + setBy: model-builder + required: true + io.k8s.cli.substitutions.GROUP_ID_VALUE: + x-k8s-cli: + substitution: + name: GROUP_ID_VALUE + pattern: "${GROUP_ID}" + values: + - marker: ${GROUP_ID} + ref: '#/definitions/io.k8s.cli.setters.GROUP_ID' + io.k8s.cli.substitutions.USER_ID_VALUE: + x-k8s-cli: + substitution: + name: USER_ID_VALUE + pattern: "${USER_ID}" + values: + - marker: ${USER_ID} + ref: '#/definitions/io.k8s.cli.setters.USER_ID' + io.k8s.cli.setters.GROUP_NAME: + description: process group name + x-k8s-cli: + setter: + name: GROUP_NAME + value: root + isSet: true + setBy: model-builder + required: true + io.k8s.cli.setters.USER_NAME: + description: process owner name + x-k8s-cli: + setter: + name: USER_NAME + value: root + isSet: true + setBy: model-builder + required: true + io.k8s.cli.setters.FS_ID: + type: integer + description: owner id of mounted volumes + x-k8s-cli: + setter: + name: FS_ID + value: "0" + isSet: true + setBy: model-builder + required: true + io.k8s.cli.setters.GROUP_ID: + type: integer + description: process group id + x-k8s-cli: + setter: + name: GROUP_ID + value: "0" + isSet: true + setBy: model-builder + required: true diff --git a/k8s/image_recognition/tensorflow/resnet50v1_5/inference/fp32/mlops/serving/service.yaml b/k8s/image_recognition/tensorflow/resnet50v1_5/inference/fp32/mlops/serving/service.yaml new file mode 100644 index 000000000..d95293332 --- /dev/null +++ b/k8s/image_recognition/tensorflow/resnet50v1_5/inference/fp32/mlops/serving/service.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Service +metadata: + name: resnet50v1-5-fp32-inference # {"$openapi":"MODEL_SERVING_NAME"} + labels: + app: resnet50v1-5-fp32-server # {"$openapi":"MODEL_SERVING_LABEL"} +spec: + # comment or delete the following line if you want to use a LoadBalancer + type: NodePort # https://kubernetes.io/docs/concepts/services-networking/service/#nodeport + # if your cluster supports it, uncomment the following to automatically create + # an external load-balanced IP for the frontend service. 
+  # type: LoadBalancer
+  ports:
+  - protocol: TCP
+    port: 8501 # {"$openapi":"MODEL_SERVICE_PORT"}
+    targetPort: 8500 # {"$openapi":"MODEL_PORT"}
+  selector:
+    app: resnet50v1-5-fp32-server # {"$openapi":"MODEL_SERVING_LABEL"}
diff --git a/k8s/image_recognition/tensorflow/resnet50v1_5/training/fp32/.docs/quickstart.md b/k8s/image_recognition/tensorflow/resnet50v1_5/training/fp32/.docs/quickstart.md
index d04b209ed..30c9b4689 100644
--- a/k8s/image_recognition/tensorflow/resnet50v1_5/training/fp32/.docs/quickstart.md
+++ b/k8s/image_recognition/tensorflow/resnet50v1_5/training/fp32/.docs/quickstart.md
@@ -3,7 +3,7 @@
 | Script name | Description |
 |-------------|-------------|
-| [`launch_benchmark.py`](mlops/single-node/user-mounted-nfs/pod.yaml#L18) | Executes a short run using small batch sizes and a limited number of steps to demonstrate the training flow |
+| [`fp32_training_demo.sh`](mlops/single-node/fp32_training_demo.sh) | Executes a short run using small batch sizes and a limited number of steps to demonstrate the training flow |

 These quickstart scripts can be run in the following environment:
 * [Kubernetes](#kubernetes)
diff --git a/k8s/image_recognition/tensorflow/resnet50v1_5/training/fp32/README.md b/k8s/image_recognition/tensorflow/resnet50v1_5/training/fp32/README.md
index c2e4da1de..fd88ae6e8 100644
--- a/k8s/image_recognition/tensorflow/resnet50v1_5/training/fp32/README.md
+++ b/k8s/image_recognition/tensorflow/resnet50v1_5/training/fp32/README.md
@@ -26,7 +26,7 @@ ImageNet dataset in the TF records format.
 | Script name | Description |
 |-------------|-------------|
-| [`launch_benchmark.py`](mlops/single-node/user-mounted-nfs/pod.yaml#L18) | Executes a short run using small batch sizes and a limited number of steps to demonstrate the training flow |
+| [`fp32_training_demo.sh`](mlops/single-node/fp32_training_demo.sh) | Executes a short run using small batch sizes and a limited number of steps to demonstrate the training flow |

 These quickstart scripts can be run in the following environment:
 * [Kubernetes](#kubernetes)
@@ -104,8 +104,8 @@ using kustomize's [cfg set](https://github.com/kubernetes-sigs/kustomize/blob/ma
 | GROUP_NAME | root | process group name |
 | NFS_PATH | /nfs | nfs path |
 | NFS_SERVER | 0.0.0.0 | nfs server |
 | PVC_NAME | workdisk | pvc name |
 | PVC_PATH | /pvc | pvc path |
 | USER_ID | 0 | process owner id |
 | USER_NAME | root | process owner name |

@@ -282,23 +282,6 @@ Removing the pod and related resources is done by running:
 kubectl delete -f .yaml
 ```
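For readers unfamiliar with how `kustomize cfg set` edits these files: each parameter in the table above corresponds to a field tagged with an OpenAPI setter comment such as `# {"$openapi":"PVC_NAME"}`, and the tool rewrites the field value while leaving the marker in place. The sketch below is a toy Python illustration of that substitution idea, not kustomize's implementation.

```
# Toy illustration (not kustomize's implementation): rewrite the value of a
# field tagged with an OpenAPI setter comment, leaving the marker in place.
def set_setter_value(yaml_text, setter, new_value):
    marker = '# {"$openapi":"%s"}' % setter
    out = []
    for line in yaml_text.splitlines():
        if marker in line and ":" in line:
            key = line.split(":", 1)[0]  # keeps indentation intact
            out.append("%s: %s %s" % (key, new_value, marker))
        else:
            out.append(line)
    return "\n".join(out)

doc = 'claimName: workdisk # {"$openapi":"PVC_NAME"}'
print(set_setter_value(doc, "PVC_NAME", "my-claim"))
# -> claimName: my-claim # {"$openapi":"PVC_NAME"}
```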
-
-## TroubleShooting
-
-- Pod doesn't start. Status is ErrImagePull.
- Docker recently implemented rate limits.
- See this [note](https://thenewstack.io/docker-hub-limits-what-they-are-and-how-to-route-around-them/) about rate limits and work-arounds. - -- Argo workflow steps do not execute.
- Error from `argo get ` is 'failed to save outputs: Failed to establish pod watch: timed out waiting for the condition'.
- See this argo [issue](https://github.com/argoproj/argo/issues/4186). This is due to the workflow running as non-root.
- Devops will need to change the workflow-executor to k8sapi as described [here](https://github.com/argoproj/argo/blob/master/docs/workflow-executors.md). - -- MpiOperator can't create workers. Error is '/bin/sh: /etc/hosts: Permission denied'. This is due to a bug in mpi-operator in the 'latest' container image - when the workers run as non-root. See this [issue](https://github.com/kubeflow/mpi-operator/issues/288).
- Use the container images: mpioperator/mpi-operator:v02.3 and mpioperator/kubectl-delivery:v0.2.3. - - ## License diff --git a/k8s/image_recognition/tensorflow/resnet50v1_5/training/fp32/mlops/multi-node/.gitignore b/k8s/image_recognition/tensorflow/resnet50v1_5/training/fp32/mlops/multi-node/.gitignore new file mode 100644 index 000000000..1bc5a4d64 --- /dev/null +++ b/k8s/image_recognition/tensorflow/resnet50v1_5/training/fp32/mlops/multi-node/.gitignore @@ -0,0 +1,2 @@ +Makefile +*.yaml diff --git a/k8s/image_recognition/tensorflow/resnet50v1_5/training/fp32/mlops/single-node/.gitignore b/k8s/image_recognition/tensorflow/resnet50v1_5/training/fp32/mlops/single-node/.gitignore new file mode 100644 index 000000000..f3c7a7c5d --- /dev/null +++ b/k8s/image_recognition/tensorflow/resnet50v1_5/training/fp32/mlops/single-node/.gitignore @@ -0,0 +1 @@ +Makefile diff --git a/k8s/language_modeling/tensorflow/bert_large/training/fp32/.docs/kubernetes.md b/k8s/language_modeling/tensorflow/bert_large/training/fp32/.docs/kubernetes.md index 6a9bae8fb..715dfa9de 100644 --- a/k8s/language_modeling/tensorflow/bert_large/training/fp32/.docs/kubernetes.md +++ b/k8s/language_modeling/tensorflow/bert_large/training/fp32/.docs/kubernetes.md @@ -58,6 +58,8 @@ The distributed training algorithm is handled by [mpirun](https://www.open-mpi.o In a terminal, `cd` to the multi-node directory. Each use case under this directory has parameters that can be changed using kustomize's [cfg set](https://github.com/kubernetes-sigs/kustomize/blob/master/cmd/config/docs/commands/set.md) +##### [Mlops](https://en.wikipedia.org/wiki/MLOps) + ###### User mounted nfs and user allocated pvc parameter values | NAME | VALUE | DESCRIPTION | diff --git a/k8s/language_modeling/tensorflow/bert_large/training/fp32/.docs/quickstart.md b/k8s/language_modeling/tensorflow/bert_large/training/fp32/.docs/quickstart.md index b670333c3..23e2f1288 100644 --- a/k8s/language_modeling/tensorflow/bert_large/training/fp32/.docs/quickstart.md +++ b/k8s/language_modeling/tensorflow/bert_large/training/fp32/.docs/quickstart.md @@ -3,7 +3,7 @@ | Script name | Description | |-------------|-------------| -| [`launch_benchmark.py`](mlops/single-node/user-mounted-nfs/pod.yaml#L18) | This script is used by the single node Kubernetes job to run bert classifier inference. | +| [`fp32_training_single_node.sh`](mlops/single-node/fp32_training_single_node.sh) | This script is used by the single node Kubernetes job to run bert classifier inference. | These quickstart scripts can be run in the following environment: * [Kubernetes](#kubernetes) diff --git a/k8s/language_modeling/tensorflow/bert_large/training/fp32/README.md b/k8s/language_modeling/tensorflow/bert_large/training/fp32/README.md index 53f8f68b1..a2895d2b4 100644 --- a/k8s/language_modeling/tensorflow/bert_large/training/fp32/README.md +++ b/k8s/language_modeling/tensorflow/bert_large/training/fp32/README.md @@ -50,7 +50,7 @@ that directory when running bert fine tuning using the SQuAD data. | Script name | Description | |-------------|-------------| -| [`launch_benchmark.py`](mlops/single-node/user-mounted-nfs/pod.yaml#L18) | This script is used by the single node Kubernetes job to run bert classifier inference. | +| [`fp32_training_single_node.sh`](mlops/single-node/fp32_training_single_node.sh) | This script is used by the single node Kubernetes job to run bert classifier inference. 
| These quickstart scripts can be run in the following environment:
 * [Kubernetes](#kubernetes)

@@ -116,6 +116,8 @@ The distributed training algorithm is handled by [mpirun](https://www.open-mpi.o
 In a terminal, `cd` to the multi-node directory. Each use case under this directory has parameters that can be changed
 using kustomize's [cfg set](https://github.com/kubernetes-sigs/kustomize/blob/master/cmd/config/docs/commands/set.md)

+##### [Mlops](https://en.wikipedia.org/wiki/MLOps)
+
 ###### User mounted nfs and user allocated pvc parameter values

 | NAME | VALUE | DESCRIPTION |
@@ -304,23 +306,6 @@ Removing the pod and related resources is done by running:
 kubectl delete -f .yaml
 ```
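The multi-node section above notes that distributed training is coordinated by mpirun. The sketch below shows the generic bootstrap that each mpirun-launched worker performs under a Horovod-style setup; it is a hedged illustration of the pattern, not the model zoo's actual training entry point.

```
import horovod.tensorflow as hvd

# Each process launched by mpirun initializes Horovod, learns its rank and
# the world size, and can then shard data or scale the learning rate.
hvd.init()
print("worker %d of %d" % (hvd.rank(), hvd.size()))
```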
-
-## TroubleShooting
-
-- Pod doesn't start. Status is ErrImagePull.
- Docker recently implemented rate limits.
- See this [note](https://thenewstack.io/docker-hub-limits-what-they-are-and-how-to-route-around-them/) about rate limits and work-arounds. - -- Argo workflow steps do not execute.
- Error from `argo get ` is 'failed to save outputs: Failed to establish pod watch: timed out waiting for the condition'.
- See this argo [issue](https://github.com/argoproj/argo/issues/4186). This is due to the workflow running as non-root.
- Devops will need to change the workflow-executor to k8sapi as described [here](https://github.com/argoproj/argo/blob/master/docs/workflow-executors.md). - -- MpiOperator can't create workers. Error is '/bin/sh: /etc/hosts: Permission denied'. This is due to a bug in mpi-operator in the 'latest' container image - when the workers run as non-root. See this [issue](https://github.com/kubeflow/mpi-operator/issues/288).
- Use the container images: mpioperator/mpi-operator:v02.3 and mpioperator/kubectl-delivery:v0.2.3. - - ## License diff --git a/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/multi-node/.gitignore b/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/multi-node/.gitignore new file mode 100644 index 000000000..1bc5a4d64 --- /dev/null +++ b/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/multi-node/.gitignore @@ -0,0 +1,2 @@ +Makefile +*.yaml diff --git a/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/multi-node/user-allocated-pvc/Krmfile b/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/multi-node/user-allocated-pvc/Krmfile index 51ab213d7..d28011c1f 100644 --- a/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/multi-node/user-allocated-pvc/Krmfile +++ b/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/multi-node/user-allocated-pvc/Krmfile @@ -42,7 +42,7 @@ openAPI: x-k8s-cli: setter: name: BERT_BASE_DIR - value: bert_official/MRPC/uncased_L-12_H-768_A-12 + value: bert_official/MRPC isSet: true required: true setBy: model-builder @@ -97,24 +97,14 @@ openAPI: x-k8s-cli: substitution: name: IMAGE - pattern: ${REGISTRY}/intel/language-modeling:${IMAGE_VERSION}-${MODEL_NAME}${IMAGE_SUFFIX} + pattern: ${REGISTRY}/intel/language-modeling:tf-2.3.0-imz-2.2.0-${MODEL_NAME}${IMAGE_SUFFIX} values: - marker: ${REGISTRY} ref: '#/definitions/io.k8s.cli.setters.REGISTRY' - - marker: ${IMAGE_VERSION} - ref: '#/definitions/io.k8s.cli.setters.IMAGE_VERSION' - marker: ${MODEL_NAME} ref: '#/definitions/io.k8s.cli.setters.MODEL_NAME' - marker: ${IMAGE_SUFFIX} ref: '#/definitions/io.k8s.cli.setters.IMAGE_SUFFIX' - io.k8s.cli.setters.IMAGE_VERSION: - description: image version - x-k8s-cli: - setter: - name: IMAGE_VERSION - value: tf-2.3.0-imz-2.2.0 - isSet: true - setBy: model-builder io.k8s.cli.substitutions.GROUP_ID_VALUE: x-k8s-cli: substitution: @@ -156,6 +146,14 @@ openAPI: isSet: true required: true setBy: model-builder + io.k8s.cli.substitutions.PYTHONPATH: + x-k8s-cli: + substitution: + name: PYTHONPATH + pattern: ${MODEL_DIR}/models/language_modeling/tensorflow/bert_large/training:$PYTHONPATH + values: + - marker: ${MODEL_DIR} + ref: '#/definitions/io.k8s.cli.setters.MODEL_DIR' io.k8s.cli.substitutions.MODEL_SCRIPT: x-k8s-cli: substitution: @@ -231,11 +229,3 @@ openAPI: isSet: true required: true setBy: model-builder - io.k8s.cli.substitutions.PYTHONPATH: - x-k8s-cli: - substitution: - name: PYTHONPATH - pattern: ${MODEL_DIR}/benchmarks:${MODEL_DIR}/models/language_modeling/tensorflow/bert_large/training:$PYTHONPATH - values: - - marker: ${MODEL_DIR} - ref: '#/definitions/io.k8s.cli.setters.MODEL_DIR' diff --git a/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/multi-node/user-mounted-nfs/Krmfile b/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/multi-node/user-mounted-nfs/Krmfile index 3349ca778..797838f71 100644 --- a/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/multi-node/user-mounted-nfs/Krmfile +++ b/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/multi-node/user-mounted-nfs/Krmfile @@ -42,7 +42,7 @@ openAPI: x-k8s-cli: setter: name: BERT_BASE_DIR - value: bert_official/MRPC/uncased_L-12_H-768_A-12 + value: bert_official/MRPC isSet: true required: true setBy: model-builder @@ -97,24 +97,14 @@ openAPI: x-k8s-cli: substitution: name: IMAGE - pattern: 
${REGISTRY}/intel/language-modeling:${IMAGE_VERSION}-${MODEL_NAME}${IMAGE_SUFFIX} + pattern: ${REGISTRY}/intel/language-modeling:tf-2.3.0-imz-2.2.0-${MODEL_NAME}${IMAGE_SUFFIX} values: - marker: ${REGISTRY} ref: '#/definitions/io.k8s.cli.setters.REGISTRY' - - marker: ${IMAGE_VERSION} - ref: '#/definitions/io.k8s.cli.setters.IMAGE_VERSION' - marker: ${MODEL_NAME} ref: '#/definitions/io.k8s.cli.setters.MODEL_NAME' - marker: ${IMAGE_SUFFIX} ref: '#/definitions/io.k8s.cli.setters.IMAGE_SUFFIX' - io.k8s.cli.setters.IMAGE_VERSION: - description: image version - x-k8s-cli: - setter: - name: IMAGE_VERSION - value: tf-2.3.0-imz-2.2.0 - isSet: true - setBy: model-builder io.k8s.cli.substitutions.GROUP_ID_VALUE: x-k8s-cli: substitution: @@ -156,6 +146,14 @@ openAPI: isSet: true required: true setBy: model-builder + io.k8s.cli.substitutions.PYTHONPATH: + x-k8s-cli: + substitution: + name: PYTHONPATH + pattern: ${MODEL_DIR}/models/language_modeling/tensorflow/bert_large/training:$PYTHONPATH + values: + - marker: ${MODEL_DIR} + ref: '#/definitions/io.k8s.cli.setters.MODEL_DIR' io.k8s.cli.substitutions.MODEL_SCRIPT: x-k8s-cli: substitution: @@ -213,11 +211,3 @@ openAPI: ref: '#/definitions/io.k8s.cli.setters.MODEL_NAME' - marker: ${OUTPUT_DIR} ref: '#/definitions/io.k8s.cli.setters.OUTPUT_DIR' - io.k8s.cli.substitutions.PYTHONPATH: - x-k8s-cli: - substitution: - name: PYTHONPATH - pattern: ${MODEL_DIR}/benchmarks:${MODEL_DIR}/models/language_modeling/tensorflow/bert_large/training:$PYTHONPATH - values: - - marker: ${MODEL_DIR} - ref: '#/definitions/io.k8s.cli.setters.MODEL_DIR' diff --git a/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/.gitignore b/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/.gitignore new file mode 100644 index 000000000..1bc5a4d64 --- /dev/null +++ b/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/.gitignore @@ -0,0 +1,2 @@ +Makefile +*.yaml diff --git a/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-allocated-pvc/Krmfile b/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-allocated-pvc/Krmfile index 0f4988c4a..063417c7e 100644 --- a/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-allocated-pvc/Krmfile +++ b/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-allocated-pvc/Krmfile @@ -63,27 +63,18 @@ openAPI: isSet: true setBy: model-builder io.k8s.cli.substitutions.IMAGE: + description: image name x-k8s-cli: substitution: name: IMAGE - pattern: ${REGISTRY}/intel/language-modeling:${IMAGE_VERSION}-${MODEL_NAME}${IMAGE_SUFFIX} + pattern: ${REGISTRY}/intel/language-modeling:tf-2.3.0-imz-2.2.0-${MODEL_NAME}${IMAGE_SUFFIX} values: - marker: ${REGISTRY} ref: '#/definitions/io.k8s.cli.setters.REGISTRY' - - marker: ${IMAGE_VERSION} - ref: '#/definitions/io.k8s.cli.setters.IMAGE_VERSION' - marker: ${MODEL_NAME} ref: '#/definitions/io.k8s.cli.setters.MODEL_NAME' - marker: ${IMAGE_SUFFIX} ref: '#/definitions/io.k8s.cli.setters.IMAGE_SUFFIX' - io.k8s.cli.setters.IMAGE_VERSION: - description: image version - x-k8s-cli: - setter: - name: IMAGE_VERSION - value: tf-2.3.0-imz-2.2.0 - isSet: true - setBy: model-builder io.k8s.cli.substitutions.GROUP_ID_VALUE: x-k8s-cli: substitution: @@ -115,6 +106,24 @@ openAPI: value: fp32_training_single_node.sh isSet: true setBy: model-builder + io.k8s.cli.setters.NFS_PATH: + description: nfs path + x-k8s-cli: + setter: + name: NFS_PATH + value: /nfs + 
isSet: true + required: true + setBy: model-builder + io.k8s.cli.setters.NFS_SERVER: + description: nfs server + x-k8s-cli: + setter: + name: NFS_SERVER + value: 0.0.0.0 + isSet: true + required: true + setBy: model-builder io.k8s.cli.substitutions.COMMAND: x-k8s-cli: substitution: @@ -203,34 +212,3 @@ openAPI: isSet: true required: true setBy: model-builder - io.k8s.cli.setters.CHECKPOINT_DIR: - description: checkpoint directory - x-k8s-cli: - setter: - name: CHECKPOINT_DIR - value: /checkpoints - isSet: true - required: true - setBy: model-builder - io.k8s.cli.substitutions.CHECKPOINT_PATH: - x-k8s-cli: - substitution: - name: CHECKPOINT_PATH - pattern: ${PVC_PATH}/${USER_NAME}/${MODEL_NAME}/${CHECKPOINT_DIR} - values: - - marker: ${PVC_PATH} - ref: '#/definitions/io.k8s.cli.setters.PVC_PATH' - - marker: ${USER_NAME} - ref: '#/definitions/io.k8s.cli.setters.USER_NAME' - - marker: ${MODEL_NAME} - ref: '#/definitions/io.k8s.cli.setters.MODEL_NAME' - - marker: ${CHECKPOINT_DIR} - ref: '#/definitions/io.k8s.cli.setters.CHECKPOINT_DIR' - io.k8s.cli.substitutions.PYTHONPATH: - x-k8s-cli: - substitution: - name: PYTHONPATH - pattern: ${MODEL_DIR}/benchmarks:${MODEL_DIR}/models/language_modeling/tensorflow/bert_large/training:$PYTHONPATH - values: - - marker: ${MODEL_DIR} - ref: '#/definitions/io.k8s.cli.setters.MODEL_DIR' diff --git a/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-allocated-pvc/config-map.yaml b/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-allocated-pvc/config-map.yaml index c9a83fd1a..a6c2edbe8 100644 --- a/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-allocated-pvc/config-map.yaml +++ b/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-allocated-pvc/config-map.yaml @@ -3,7 +3,6 @@ apiVersion: v1 metadata: name: bert-large-fp32-training # {"$openapi":"MODEL_NAME"} data: - CHECKPOINT_DIR: /pvc/root/bert-large-fp32-training/checkpoints # {"$openapi":"CHECKPOINT_PATH"} BERT_BASE_DIR: bert_official/MRPC/uncased_L-12_H-768_A-12 # {"$openapi":"BERT_BASE_DIR"} DATASET_DIR: /datasets # {"$openapi":"DATASET_DIR"} FS_ID: "0" # {"$openapi":"FS_ID_VALUE"} @@ -13,6 +12,6 @@ data: LD_LIBRARY_PATH: /usr/local/lib:/usr/local/lib/mpirun:$LD_LIBRARY_PATH MODEL_DIR: /workspace/bert-large-fp32-training # {"$openapi":"MODEL_DIR"} OUTPUT_DIR: /pvc/root/bert-large-fp32-training/output # {"$openapi":"OUTPUT_PATH"} - PYTHONPATH: /workspace/bert-large-fp32-training/benchmarks:/workspace/bert-large-fp32-training/models/language_modeling/tensorflow/bert_large/training:$PYTHONPATH # {"$openapi":"PYTHONPATH"} + PYTHONPATH: /workspace/bert-large-fp32-training/models/language_modeling/tensorflow/bert_large/training:$PYTHONPATH # {"$openapi":"PYTHONPATH"} USER_ID: "0" # {"$openapi":"USER_ID_VALUE"} USER_NAME: root # {"$openapi":"USER_NAME"} diff --git a/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-allocated-pvc/pod.yaml b/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-allocated-pvc/pod.yaml index ee292352a..e1ffc23b9 100644 --- a/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-allocated-pvc/pod.yaml +++ b/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-allocated-pvc/pod.yaml @@ -8,44 +8,10 @@ spec: runAsUser: 0 # {"$openapi":"USER_ID"} runAsGroup: 0 # {"$openapi":"GROUP_ID"} fsGroup: 0 # {"$openapi":"FS_ID"} - initContainers: - - name: 
create-output-dir - image: docker.io/intel/language-modeling:tf-2.3.0-imz-2.2.0-bert-large-fp32-training # {"$openapi":"IMAGE"} - imagePullPolicy: IfNotPresent - workingDir: /workspace/bert-large-fp32-training # {"$openapi":"MODEL_DIR"} - command: - - mkdir - args: - - -p - - $(OUTPUT_DIR) - envFrom: - - configMapRef: - name: bert-large-fp32-training # {"$openapi":"MODEL_NAME"} - volumeMounts: - - name: pvc-path - mountPath: /pvc # {"$openapi":"PVC_PATH"} - - name: create-checkpoint-dir - image: docker.io/intel/language-modeling:tf-2.3.0-imz-2.2.0-bert-large-fp32-training # {"$openapi":"IMAGE"} - imagePullPolicy: IfNotPresent - workingDir: /workspace/bert-large-fp32-training # {"$openapi":"MODEL_DIR"} - command: - - mkdir - args: - - -p - - $(CHECKPOINT_DIR) - envFrom: - - configMapRef: - name: bert-large-fp32-training # {"$openapi":"MODEL_NAME"} - volumeMounts: - - name: pvc-path - mountPath: /pvc # {"$openapi":"PVC_PATH"} containers: - name: single-node image: docker.io/intel/language-modeling:tf-2.3.0-imz-2.2.0-bert-large-fp32-training # {"$openapi":"IMAGE"} workingDir: /workspace/bert-large-fp32-training # {"$openapi":"MODEL_DIR"} - envFrom: - - configMapRef: - name: bert-large-fp32-training # {"$openapi":"MODEL_NAME"} command: - python args: # {"$openapi":"COMMAND"} diff --git a/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-mounted-nfs/Krmfile b/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-mounted-nfs/Krmfile index d04e11f30..63a532558 100644 --- a/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-mounted-nfs/Krmfile +++ b/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-mounted-nfs/Krmfile @@ -63,27 +63,18 @@ openAPI: isSet: true setBy: model-builder io.k8s.cli.substitutions.IMAGE: + description: image name x-k8s-cli: substitution: name: IMAGE - pattern: ${REGISTRY}/intel/language-modeling:${IMAGE_VERSION}-${MODEL_NAME}${IMAGE_SUFFIX} + pattern: ${REGISTRY}/intel/language-modeling:tf-2.3.0-imz-2.2.0-${MODEL_NAME}${IMAGE_SUFFIX} values: - marker: ${REGISTRY} ref: '#/definitions/io.k8s.cli.setters.REGISTRY' - - marker: ${IMAGE_VERSION} - ref: '#/definitions/io.k8s.cli.setters.IMAGE_VERSION' - marker: ${MODEL_NAME} ref: '#/definitions/io.k8s.cli.setters.MODEL_NAME' - marker: ${IMAGE_SUFFIX} ref: '#/definitions/io.k8s.cli.setters.IMAGE_SUFFIX' - io.k8s.cli.setters.IMAGE_VERSION: - description: image version - x-k8s-cli: - setter: - name: IMAGE_VERSION - value: tf-2.3.0-imz-2.2.0 - isSet: true - setBy: model-builder io.k8s.cli.substitutions.GROUP_ID_VALUE: x-k8s-cli: substitution: @@ -203,34 +194,3 @@ openAPI: isSet: true setBy: model-builder required: true - io.k8s.cli.setters.CHECKPOINT_DIR: - description: checkpoint directory - x-k8s-cli: - setter: - name: CHECKPOINT_DIR - value: /checkpoints - isSet: true - required: true - setBy: model-builder - io.k8s.cli.substitutions.CHECKPOINT_PATH: - x-k8s-cli: - substitution: - name: CHECKPOINT_PATH - pattern: ${NFS_PATH}/${USER_NAME}/${MODEL_NAME}/${CHECKPOINT_DIR} - values: - - marker: ${NFS_PATH} - ref: '#/definitions/io.k8s.cli.setters.NFS_PATH' - - marker: ${USER_NAME} - ref: '#/definitions/io.k8s.cli.setters.USER_NAME' - - marker: ${MODEL_NAME} - ref: '#/definitions/io.k8s.cli.setters.MODEL_NAME' - - marker: ${CHECKPOINT_DIR} - ref: '#/definitions/io.k8s.cli.setters.CHECKPOINT_DIR' - io.k8s.cli.substitutions.PYTHONPATH: - x-k8s-cli: - substitution: - name: PYTHONPATH - pattern: 
${MODEL_DIR}/benchmarks:${MODEL_DIR}/models/language_modeling/tensorflow/bert_large/training:$PYTHONPATH - values: - - marker: ${MODEL_DIR} - ref: '#/definitions/io.k8s.cli.setters.MODEL_DIR' diff --git a/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-mounted-nfs/config-map.yaml b/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-mounted-nfs/config-map.yaml index 41cc850bb..46e5f107a 100644 --- a/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-mounted-nfs/config-map.yaml +++ b/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-mounted-nfs/config-map.yaml @@ -3,7 +3,6 @@ apiVersion: v1 metadata: name: bert-large-fp32-training # {"$openapi":"MODEL_NAME"} data: - CHECKPOINT_DIR: /nfs/root/bert-large-fp32-training/checkpoints # {"$openapi":"CHECKPOINT_PATH"} BERT_BASE_DIR: bert_official/MRPC/uncased_L-12_H-768_A-12 # {"$openapi":"BERT_BASE_DIR"} DATASET_DIR: /datasets # {"$openapi":"DATASET_DIR"} FS_ID: "0" # {"$openapi":"FS_ID_VALUE"} @@ -13,6 +12,6 @@ data: LD_LIBRARY_PATH: /usr/local/lib:/usr/local/lib/mpirun:$LD_LIBRARY_PATH MODEL_DIR: /workspace/bert-large-fp32-training # {"$openapi":"MODEL_DIR"} OUTPUT_DIR: /nfs/root/bert-large-fp32-training/output # {"$openapi":"OUTPUT_PATH"} - PYTHONPATH: /workspace/bert-large-fp32-training/benchmarks:/workspace/bert-large-fp32-training/models/language_modeling/tensorflow/bert_large/training:$PYTHONPATH # {"$openapi":"PYTHONPATH"} + PYTHONPATH: /workspace/bert-large-fp32-training/models/language_modeling/tensorflow/bert_large/training:$PYTHONPATH # {"$openapi":"PYTHONPATH"} USER_ID: "0" # {"$openapi":"USER_ID_VALUE"} USER_NAME: root # {"$openapi":"USER_NAME"} diff --git a/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-mounted-nfs/pod.yaml b/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-mounted-nfs/pod.yaml index 21b2e02c8..775c24899 100644 --- a/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-mounted-nfs/pod.yaml +++ b/k8s/language_modeling/tensorflow/bert_large/training/fp32/mlops/single-node/user-mounted-nfs/pod.yaml @@ -8,44 +8,10 @@ spec: runAsUser: 0 # {"$openapi":"USER_ID"} runAsGroup: 0 # {"$openapi":"GROUP_ID"} fsGroup: 0 # {"$openapi":"FS_ID"} - initContainers: - - name: create-output-dir - image: docker.io/intel/language-modeling:tf-2.3.0-imz-2.2.0-bert-large-fp32-training # {"$openapi":"IMAGE"} - imagePullPolicy: IfNotPresent - workingDir: /workspace/bert-large-fp32-training # {"$openapi":"MODEL_DIR"} - command: - - mkdir - args: - - -p - - $(OUTPUT_DIR) - envFrom: - - configMapRef: - name: bert-large-fp32-training # {"$openapi":"MODEL_NAME"} - volumeMounts: - - name: nfs-path - mountPath: /nfs # {"$openapi":"NFS_PATH"} - - name: create-checkpoint-dir - image: docker.io/intel/language-modeling:tf-2.3.0-imz-2.2.0-bert-large-fp32-training # {"$openapi":"IMAGE"} - imagePullPolicy: IfNotPresent - workingDir: /workspace/bert-large-fp32-training # {"$openapi":"MODEL_DIR"} - command: - - mkdir - args: - - -p - - $(CHECKPOINT_DIR) - envFrom: - - configMapRef: - name: bert-large-fp32-training # {"$openapi":"MODEL_NAME"} - volumeMounts: - - name: nfs-path - mountPath: /nfs # {"$openapi":"NFS_PATH"} containers: - name: single-node image: docker.io/intel/language-modeling:tf-2.3.0-imz-2.2.0-bert-large-fp32-training # {"$openapi":"IMAGE"} workingDir: /workspace/bert-large-fp32-training # {"$openapi":"MODEL_DIR"} - envFrom: - - 
configMapRef: - name: bert-large-fp32-training # {"$openapi":"MODEL_NAME"} command: - python args: # {"$openapi":"COMMAND"} @@ -76,8 +42,8 @@ spec: - name: datasets mountPath: /datasets # {"$openapi":"DATASET_DIR"} readOnly: true - - name: nfs-path - mountPath: /nfs # {"$openapi":"NFS_PATH"} + - name: pvc-path + mountPath: /pvc # {"$openapi":"PVC_PATH"} volumes: - name: datasets hostPath: diff --git a/k8s/object_detection/tensorflow/rfcn/inference/fp32/.docs/kubernetes.md b/k8s/object_detection/tensorflow/rfcn/inference/fp32/.docs/kubernetes.md index f864b6244..5431d417c 100644 --- a/k8s/object_detection/tensorflow/rfcn/inference/fp32/.docs/kubernetes.md +++ b/k8s/object_detection/tensorflow/rfcn/inference/fp32/.docs/kubernetes.md @@ -175,6 +175,9 @@ The parameters that can be changed within the pipeline are shown in the table be | PVC_NAME | workdisk | pvc name | | PVC_PATH | /pvc | pvc path | | OUTPUT_DIR | output | output dir basename | +| PREPROCESS_DIR | /workspace/preprocess-coco-val | container preprocess directory | +| PREPROCESS_NAME | preprocess-coco-val | preprocess image part | +| PREPROCESS_SCRIPT | preprocess_coco_val.sh | preprocess script | | USER_ID | 0 | process owner id | | USER_NAME | root | process owner name | @@ -196,7 +199,7 @@ kustomize cfg set . PVC_NAME -R In both use cases, the user should change the values below so the pod is deployed with the user's identity[^3]. -[^3]: In order for the argo workflow to run as a non root user it must set the WorkflowExecutor to be k8sapi, otherwise the workflow will fail with "Got permission denied while trying to connect to the Docker daemon socket at unix:///var/run/docker.sock". See argo issues [2239](https://github.com/argoproj/argo/issues/2239),[4186](https://github.com/argoproj/argo/issues/4186). Setting argo's WorkflowExecutor to k8sapi is described [here](https://argoproj.github.io/argo/workflow-executors/). This must be performed by devops. +[^3]: In order for the argo workflow to run as a non root user it must set the WorkflowExecutor to be k8sapi, otherwise the workflow will fail with "Got permission denied while trying to connect to the Docker daemon socket at unix:///var/run/docker.sock". See argo issue [2239](https://github.com/argoproj/argo/issues/2239). Setting argo's WorkflowExecutor to k8sapi is described [here](https://argoproj.github.io/argo/workflow-executors/). This must be performed by devops. ``` kustomize cfg set . FS_ID -R diff --git a/k8s/object_detection/tensorflow/rfcn/inference/fp32/.docs/quickstart.md b/k8s/object_detection/tensorflow/rfcn/inference/fp32/.docs/quickstart.md index d61cb3270..4e6cd77dc 100644 --- a/k8s/object_detection/tensorflow/rfcn/inference/fp32/.docs/quickstart.md +++ b/k8s/object_detection/tensorflow/rfcn/inference/fp32/.docs/quickstart.md @@ -3,8 +3,8 @@ | Script name | Description | |-------------|-------------| -| [`fp32_inference.sh`](mlops/serving/user-mounted-nfs/pod.yaml#L16) | Runs inference on a directory of raw images for 500 steps and outputs performance metrics. | -| [`fp32_accuracy.sh`](mlops/pipeline/user-mounted-nfs/serving_accuracy.yaml#L49) | Processes the TF records to run inference and check accuracy on the results. | +| [`fp32_inference.sh`](mlops/serving/user-mounted-nfs/config-map.yaml#L117) | Runs inference on a directory of raw images for 500 steps and outputs performance metrics. | +| [`fp32_accuracy.sh`](mlops/pipeline/user-mounted-nfs/config-map.yaml#L117) | Processes the TF records to run inference and check accuracy on the results. 
| These quickstart scripts can be run in the following environment:
 * [Kubernetes](#kubernetes)

@@ -211,6 +211,9 @@ The parameters that can be changed within the pipeline are shown in the table be
 | PVC_NAME | workdisk | pvc name |
 | PVC_PATH | /pvc | pvc path |
 | OUTPUT_DIR | output | output dir basename |
+| PREPROCESS_DIR | /workspace/preprocess-coco-val | container preprocess directory |
+| PREPROCESS_NAME | preprocess-coco-val | preprocess image part |
+| PREPROCESS_SCRIPT | preprocess_coco_val.sh | preprocess script |
 | USER_ID | 0 | process owner id |
 | USER_NAME | root | process owner name |

@@ -232,7 +235,7 @@ kustomize cfg set . PVC_NAME -R
 In both use cases, the user should change the values below so the pod is deployed with the user's
 identity[^3].

-[^3]: In order for the argo workflow to run as a non root user it must set the WorkflowExecutor to be k8sapi, otherwise the workflow will fail with "Got permission denied while trying to connect to the Docker daemon socket at unix:///var/run/docker.sock". See argo issues [2239](https://github.com/argoproj/argo/issues/2239),[4186](https://github.com/argoproj/argo/issues/4186). Setting argo's WorkflowExecutor to k8sapi is described [here](https://argoproj.github.io/argo/workflow-executors/). This must be performed by devops.
+[^3]: In order for the argo workflow to run as a non root user it must set the WorkflowExecutor to be k8sapi, otherwise the workflow will fail with "Got permission denied while trying to connect to the Docker daemon socket at unix:///var/run/docker.sock". See argo issue [2239](https://github.com/argoproj/argo/issues/2239). Setting argo's WorkflowExecutor to k8sapi is described [here](https://argoproj.github.io/argo/workflow-executors/). This must be performed by devops.

 ```
 kustomize cfg set . FS_ID -R
@@ -288,23 +291,6 @@ kubectl delete -f object_detection.yaml

 See the [Advanced Options for Model Packages and Containers](/quickstart/common/ModelPackagesAdvancedOptions.md)
 document for more advanced use cases.
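Before launching the accuracy step of the pipeline, it can be useful to sanity-check the TF records written by the preprocessing step. The sketch below is illustrative only; the record file name and path are assumptions based on the preprocessing script's output, not values guaranteed by the pipeline.

```
import tensorflow as tf

# Hypothetical location of the validation TF records produced by the
# preprocessing step; adjust to wherever your pipeline stores them.
records_path = "/pvc/preprocessed/coco_val.record"

# Count the serialized examples to confirm preprocessing produced output.
count = sum(1 for _ in tf.data.TFRecordDataset(records_path))
print("records:", count)
```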
-
-## TroubleShooting
-
-- Pod doesn't start. Status is ErrImagePull.
- Docker recently implemented rate limits.
- See this [note](https://thenewstack.io/docker-hub-limits-what-they-are-and-how-to-route-around-them/) about rate limits and work-arounds. - -- Argo workflow steps do not execute.
- Error from `argo get ` is 'failed to save outputs: Failed to establish pod watch: timed out waiting for the condition'.
- See this argo [issue](https://github.com/argoproj/argo/issues/4186). This is due to the workflow running as non-root.
- Devops will need to change the workflow-executor to k8sapi as described [here](https://github.com/argoproj/argo/blob/master/docs/workflow-executors.md). - -- MpiOperator can't create workers. Error is '/bin/sh: /etc/hosts: Permission denied'. This is due to a bug in mpi-operator in the 'latest' container image - when the workers run as non-root. See this [issue](https://github.com/kubeflow/mpi-operator/issues/288).
- Use the container images: mpioperator/mpi-operator:v02.3 and mpioperator/kubectl-delivery:v0.2.3. - - ## License diff --git a/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/.gitignore b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/.gitignore new file mode 100644 index 000000000..1bc5a4d64 --- /dev/null +++ b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/.gitignore @@ -0,0 +1,2 @@ +Makefile +*.yaml diff --git a/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-allocated-pvc/Krmfile b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-allocated-pvc/Krmfile index e9217173f..bf94be592 100644 --- a/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-allocated-pvc/Krmfile +++ b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-allocated-pvc/Krmfile @@ -61,32 +61,14 @@ openAPI: x-k8s-cli: substitution: name: IMAGE - pattern: ${REGISTRY}/intel/object-detection:${IMAGE_VERSION}-${MODEL_NAME}${IMAGE_SUFFIX} + pattern: ${REGISTRY}/intel/object-detection:tf-2.3.0-imz-2.2.0-${MODEL_NAME}${IMAGE_SUFFIX} values: - marker: ${REGISTRY} ref: '#/definitions/io.k8s.cli.setters.REGISTRY' - - marker: ${IMAGE_VERSION} - ref: '#/definitions/io.k8s.cli.setters.IMAGE_VERSION' - marker: ${MODEL_NAME} ref: '#/definitions/io.k8s.cli.setters.MODEL_NAME' - marker: ${IMAGE_SUFFIX} ref: '#/definitions/io.k8s.cli.setters.IMAGE_SUFFIX' - io.k8s.cli.setters.IMAGE_VERSION: - description: image version - x-k8s-cli: - setter: - name: IMAGE_VERSION - value: tf-2.3.0-imz-2.2.0 - isSet: true - setBy: model-builder - io.k8s.cli.setters.PREPROCESS_IMAGE_VERSION: - description: image version - x-k8s-cli: - setter: - name: PREPROCESS_IMAGE_VERSION - value: tf-1.15.2-imz-2.2.0 - isSet: true - setBy: model-builder io.k8s.cli.setters.IMAGE_SUFFIX: description: appended to image name x-k8s-cli: @@ -105,6 +87,14 @@ openAPI: ref: '#/definitions/io.k8s.cli.setters.MODEL_DIR' - marker: ${MODEL_SCRIPT} ref: '#/definitions/io.k8s.cli.setters.MODEL_SCRIPT' + io.k8s.cli.substitutions.MODEL_CONFIGMAP_NAME: + x-k8s-cli: + substitution: + name: MODEL_CONFIGMAP_NAME + pattern: mlops-scripts-${MODEL_NAME} + values: + - marker: ${MODEL_NAME} + ref: '#/definitions/io.k8s.cli.setters.MODEL_NAME' io.k8s.cli.setters.MODEL_DIR: description: container model directory x-k8s-cli: @@ -176,13 +166,11 @@ openAPI: description: image name x-k8s-cli: substitution: - name: PREPROCESS_IMAGE - pattern: ${REGISTRY}/intel/object-detection:${PREPROCESS_IMAGE_VERSION}-${PREPROCESS_NAME} + name: IMAGE + pattern: ${REGISTRY}/intel/object-detection:tf-1.15.2-imz-2.2.0-${PREPROCESS_NAME} values: - marker: ${REGISTRY} ref: '#/definitions/io.k8s.cli.setters.REGISTRY' - - marker: ${PREPROCESS_IMAGE_VERSION} - ref: '#/definitions/io.k8s.cli.setters.PREPROCESS_IMAGE_VERSION' - marker: ${PREPROCESS_NAME} ref: '#/definitions/io.k8s.cli.setters.PREPROCESS_NAME' - marker: ${IMAGE_SUFFIX} diff --git a/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-allocated-pvc/config-map.yaml b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-allocated-pvc/config-map.yaml index d1d16b300..2bb629c30 100644 --- a/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-allocated-pvc/config-map.yaml +++ b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-allocated-pvc/config-map.yaml @@ -12,3 +12,164 @@ data: USER_ID: "0" # {"$openapi":"USER_ID_VALUE"} USER_NAME: root # 
{"$openapi":"USER_NAME"} VAL_IMAGE_DIR: /datasets/val2017 # {"$openapi":"VAL_IMAGE_DIR"} + preprocess_coco_val.sh: | + #!/usr/bin/env bash + # + # Copyright (c) 2020 Intel Corporation + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + # + + # This script preprocesses the validation images for the COCO Dataset to create + # TF records files. The raw validation images and annotations must be downloaded + # prior to running this script (https://cocodataset.org/#download). + # + # The following vars need to be set: + # VAL_IMAGE_DIR: Points to the raw validation images (extracted from val2017.zip) + # ANNOTATIONS_DIR: Points to the annotations (extracted from annotations_trainval2017.zip) + # OUTPUT_DIR: Path where the TF records file will be written + # + # This is intended to be used with the create_coco_tf_record.py script from the + # TensorFlow Model Garden, commit 1efe98bb8e8d98bbffc703a90d88df15fc2ce906. + # + # NOTE: This pre-processes the validation images only + + # If the DATASET_DIR is set, then ensure it exists and set paths for the images and annotations + if [[ ! -z "${DATASET_DIR}" ]]; then + if [[ ! -d "${DATASET_DIR}" ]]; then + echo "ERROR: The specified DATASET_DIR ($DATASET_DIR) does not exist." + exit 1 + fi + + VAL_IMAGE_DIR=${DATASET_DIR}/val2017 + ANNOTATIONS_DIR=${DATASET_DIR}/annotations + fi + + # Verify that the a directory exists for the raw validation images + if [[ ! -d "${VAL_IMAGE_DIR}" ]]; then + echo "ERROR: The VAL_IMAGE_DIR (${VAL_IMAGE_DIR}) does not exist. This var needs to point to the raw coco validation images." + exit 1 + fi + + # Verify that the a directory exists for the annotations + if [[ ! -d "${ANNOTATIONS_DIR}" ]]; then + echo "ERROR: The ANNOTATIONS_DIR (${ANNOTATIONS_DIR}) does not exist. This var needs to point to the coco annotations directory." + exit 1 + fi + + # Verify that we have the path to the tensorflow/models code + if [[ ! -d "${TF_MODELS_DIR}" ]]; then + echo "ERROR: The TF_MODELS_DIR var needs to be defined to point to a clone of the tensorflow/models git repo" + exit 1 + fi + + # Create the output directory in case it doesn't already exist + mkdir -p ${OUTPUT_DIR} + + # Checkout the specified branch for the tensorflow/models code + if [[ ! -z "${TF_MODELS_BRANCH}" ]]; then + cd ${TF_MODELS_DIR} + git checkout ${TF_MODELS_BRANCH} + fi + + # Set the PYTHONPATH + cd ${TF_MODELS_DIR}/research + export PYTHONPATH=$PYTHONPATH:`pwd`:`pwd`/slim + + # Create empty dir and json for train/test image preprocessing, so that we don't require + # the user to also download train/test images when all that's needed is validation images. 
+    EMPTY_DIR=${OUTPUT_DIR}/empty_dir
+    EMPTY_ANNOTATIONS=${OUTPUT_DIR}/empty.json
+    mkdir -p ${EMPTY_DIR}
+    echo "{ \"images\": {}, \"categories\": {}}" > ${EMPTY_ANNOTATIONS}
+
+    cd ${TF_MODELS_DIR}/research/object_detection/dataset_tools
+    python create_coco_tf_record.py --logtostderr \
+      --train_image_dir="${EMPTY_DIR}" \
+      --val_image_dir="${VAL_IMAGE_DIR}" \
+      --test_image_dir="${EMPTY_DIR}" \
+      --train_annotations_file="${EMPTY_ANNOTATIONS}" \
+      --val_annotations_file="${ANNOTATIONS_DIR}/instances_val2017.json" \
+      --testdev_annotations_file="${EMPTY_ANNOTATIONS}" \
+      --output_dir="${OUTPUT_DIR}"
+
+    # remove dummy directory and annotations file
+    rm -rf ${EMPTY_DIR}
+    rm -rf ${EMPTY_ANNOTATIONS}
+
+    # since we only grab the validation dataset, the TF records files for train
+    # and test images are size 0. Delete those to prevent confusion.
+    rm -f ${OUTPUT_DIR}/coco_testdev.record
+    rm -f ${OUTPUT_DIR}/coco_train.record
+
+    echo "TF records in the output directory:"
+    ls -l ${OUTPUT_DIR}
+  fp32_accuracy.sh: |
+    #!/usr/bin/env bash
+    #
+    # Copyright (c) 2020 Intel Corporation
+    #
+    # Licensed under the Apache License, Version 2.0 (the "License");
+    # you may not use this file except in compliance with the License.
+    # You may obtain a copy of the License at
+    #
+    #     http://www.apache.org/licenses/LICENSE-2.0
+    #
+    # Unless required by applicable law or agreed to in writing, software
+    # distributed under the License is distributed on an "AS IS" BASIS,
+    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    # See the License for the specific language governing permissions and
+    # limitations under the License.
+    #
+
+    if [ -z "${OUTPUT_DIR}" ]; then
+      echo "The required environment variable OUTPUT_DIR has not been set"
+      exit 1
+    fi
+
+    # Create the output directory in case it doesn't already exist
+    mkdir -p ${OUTPUT_DIR}
+
+    if [ -z "${DATASET_DIR}" ]; then
+      echo "The required environment variable DATASET_DIR has not been set"
+      exit 1
+    fi
+
+    if [ -z "${TF_MODELS_DIR}" ]; then
+      echo "The required environment variable TF_MODELS_DIR has not been set"
+      exit 1
+    fi
+
+    # Untar pretrained model files
+    pretrained_model_dir="${OUTPUT_DIR}/pretrained_model/rfcn_resnet101_coco_2018_01_28"
+    if [ !
-d "${pretrained_model_dir}" ]; then + mkdir -p ${OUTPUT_DIR}/pretrained_model + tar -C ${OUTPUT_DIR}/pretrained_model/ -xvf pretrained_model/rfcn_fp32_model.tar.gz + chmod -R u+w ${OUTPUT_DIR}/pretrained_model/ + fi + FROZEN_GRAPH="${pretrained_model_dir}/frozen_inference_graph.pb" + + source "$(dirname $0)/common/utils.sh" + _command python benchmarks/launch_benchmark.py \ + --model-name rfcn \ + --mode inference \ + --precision fp32 \ + --framework tensorflow \ + --model-source-dir ${TF_MODELS_DIR} \ + --data-location ${DATASET_DIR} \ + --in-graph ${FROZEN_GRAPH} \ + --batch-size 1 \ + --accuracy-only \ + --output-dir ${OUTPUT_DIR} \ + $@ \ + -- split="${OUTPUT_DIR}/accuracy_message" diff --git a/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-allocated-pvc/kustomization.yaml b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-allocated-pvc/kustomization.yaml index 213b180d6..a1c21752f 100644 --- a/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-allocated-pvc/kustomization.yaml +++ b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-allocated-pvc/kustomization.yaml @@ -2,8 +2,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - config-map.yaml -- service-account.yaml -- role.yaml -- role-binding.yaml - serving_accuracy.yaml namespace: default diff --git a/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-allocated-pvc/serving_accuracy.yaml b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-allocated-pvc/serving_accuracy.yaml index 0d66410c2..f5bb78f01 100644 --- a/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-allocated-pvc/serving_accuracy.yaml +++ b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-allocated-pvc/serving_accuracy.yaml @@ -4,7 +4,6 @@ metadata: name: rfcn-fp32-inference-wf # {"$openapi":"WORKFLOW_NAME"} spec: entrypoint: rfcn-fp32-inference # {"$openapi":"MODEL_NAME"} - name: rfcn-fp32-inference # {"$openapi":"MODEL_NAME"} templates: - name: rfcn-fp32-inference # {"$openapi":"MODEL_NAME"} steps: @@ -30,6 +29,9 @@ spec: - name: coco-raw-data mountPath: /datasets # {"$openapi":"DATASET_DIR"} readOnly: true + - name: mlops-scripts + mountPath: /workspace/preprocess-coco-val/scripts/preprocess_coco_val.sh # {"$openapi":"PREPROCESS_COMMAND"} + subPath: preprocess_coco_val.sh # {"$openapi":"PREPROCESS_SCRIPT"} volumes: - name: coco-raw-data hostPath: @@ -37,6 +39,13 @@ spec: - name: pvc-path persistentVolumeClaim: claimName: workdisk # {"$openapi":"PVC_NAME"} + - name: mlops-scripts + configMap: + name: mlops-scripts-rfcn-fp32-inference # {"$openapi":"MODEL_CONFIGMAP_NAME"} + defaultMode: 0770 + items: + - key: preprocess_coco_val.sh # {"$openapi":"PREPROCESS_SCRIPT"} + path: preprocess_coco_val.sh # {"$openapi":"PREPROCESS_SCRIPT"} - name: rfcn-fp32-accuracy securityContext: runAsUser: 0 # {"$openapi":"USER_ID"} @@ -51,15 +60,19 @@ spec: - configMapRef: name: rfcn-fp32-inference # {"$openapi":"MODEL_NAME"} volumeMounts: - - name: coco-raw-data - mountPath: /datasets # {"$openapi":"DATASET_DIR"} - readOnly: true - name: pvc-path mountPath: /pvc # {"$openapi":"PVC_PATH"} + - name: mlops-scripts + mountPath: /workspace/rfcn-fp32-inference/quickstart/fp32_accuracy.sh # {"$openapi":"MODEL_COMMAND"} + subPath: fp32_accuracy.sh # {"$openapi":"MODEL_SCRIPT"} volumes: - - name: coco-raw-data - hostPath: - path: /datasets # {"$openapi":"DATASET_DIR"} - name: pvc-path 
persistentVolumeClaim:
claimName: workdisk # {"$openapi":"PVC_NAME"}
+ - name: mlops-scripts
+ configMap:
+ name: rfcn-fp32-inference # {"$openapi":"MODEL_NAME"}
+ defaultMode: 0770
+ items:
+ - key: fp32_accuracy.sh # {"$openapi":"MODEL_SCRIPT"}
+ path: fp32_accuracy.sh # {"$openapi":"MODEL_SCRIPT"}
diff --git a/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-mounted-nfs/Krmfile b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-mounted-nfs/Krmfile
index fce8478d6..4f049a437 100644
--- a/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-mounted-nfs/Krmfile
+++ b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-mounted-nfs/Krmfile
@@ -61,24 +61,14 @@ openAPI:
x-k8s-cli:
substitution:
name: IMAGE
- pattern: ${REGISTRY}/intel/object-detection:${IMAGE_VERSION}-${MODEL_NAME}${IMAGE_SUFFIX}
+ pattern: ${REGISTRY}/intel/object-detection:tf-2.3.0-imz-2.2.0-${MODEL_NAME}${IMAGE_SUFFIX}
values:
- marker: ${REGISTRY}
ref: '#/definitions/io.k8s.cli.setters.REGISTRY'
- - marker: ${IMAGE_VERSION}
- ref: '#/definitions/io.k8s.cli.setters.IMAGE_VERSION'
- marker: ${MODEL_NAME}
ref: '#/definitions/io.k8s.cli.setters.MODEL_NAME'
- marker: ${IMAGE_SUFFIX}
ref: '#/definitions/io.k8s.cli.setters.IMAGE_SUFFIX'
- io.k8s.cli.setters.IMAGE_VERSION:
- description: image version
- x-k8s-cli:
- setter:
- name: IMAGE_VERSION
- value: tf-2.3.0-imz-2.2.0
- isSet: true
- setBy: model-builder
io.k8s.cli.setters.IMAGE_SUFFIX:
description: appended to image name
x-k8s-cli:
@@ -97,6 +87,14 @@ openAPI:
ref: '#/definitions/io.k8s.cli.setters.MODEL_DIR'
- marker: ${MODEL_SCRIPT}
ref: '#/definitions/io.k8s.cli.setters.MODEL_SCRIPT'
+ io.k8s.cli.substitutions.MODEL_CONFIGMAP_NAME:
+ x-k8s-cli:
+ substitution:
+ name: MODEL_CONFIGMAP_NAME
+ pattern: mlops-scripts-${MODEL_NAME}
+ values:
+ - marker: ${MODEL_NAME}
+ ref: '#/definitions/io.k8s.cli.setters.MODEL_NAME'
io.k8s.cli.setters.MODEL_DIR:
description: container model directory
x-k8s-cli:
@@ -178,25 +176,15 @@ openAPI:
name: PREPROCESS_DIR
value: /workspace/preprocess-coco-val
setBy: model-builder
- io.k8s.cli.setters.PREPROCESS_IMAGE_VERSION:
- description: image version
- x-k8s-cli:
- setter:
- name: PREPROCESS_IMAGE_VERSION
- value: tf-1.15.2-imz-2.2.0
- isSet: true
- setBy: model-builder
io.k8s.cli.substitutions.PREPROCESS_IMAGE:
description: image name
x-k8s-cli:
substitution:
- name: PREPROCESS_IMAGE
- pattern: ${REGISTRY}/intel/object-detection:${PREPROCESS_IMAGE_VERSION}-${PREPROCESS_NAME}
+ name: PREPROCESS_IMAGE
+ pattern: ${REGISTRY}/intel/object-detection:tf-1.15.2-imz-2.2.0-${PREPROCESS_NAME}
values:
- marker: ${REGISTRY}
ref: '#/definitions/io.k8s.cli.setters.REGISTRY'
- - marker: ${PREPROCESS_IMAGE_VERSION}
- ref: '#/definitions/io.k8s.cli.setters.PREPROCESS_IMAGE_VERSION'
- marker: ${PREPROCESS_NAME}
ref: '#/definitions/io.k8s.cli.setters.PREPROCESS_NAME'
- marker: ${IMAGE_SUFFIX}
diff --git a/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-mounted-nfs/config-map.yaml b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-mounted-nfs/config-map.yaml
index d1e11fb7b..6818975c6 100644
--- a/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-mounted-nfs/config-map.yaml
+++ b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-mounted-nfs/config-map.yaml
@@ -12,3 +12,164 @@ data:
USER_ID: "0" # {"$openapi":"USER_ID_VALUE"}
USER_NAME: root # {"$openapi":"USER_NAME"}
VAL_IMAGE_DIR: /datasets/val2017 #
{"$openapi":"VAL_IMAGE_DIR"} + preprocess_coco_val.sh: | + #!/usr/bin/env bash + # + # Copyright (c) 2020 Intel Corporation + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + # + + # This script preprocesses the validation images for the COCO Dataset to create + # TF records files. The raw validation images and annotations must be downloaded + # prior to running this script (https://cocodataset.org/#download). + # + # The following vars need to be set: + # VAL_IMAGE_DIR: Points to the raw validation images (extracted from val2017.zip) + # ANNOTATIONS_DIR: Points to the annotations (extracted from annotations_trainval2017.zip) + # OUTPUT_DIR: Path where the TF records file will be written + # + # This is intended to be used with the create_coco_tf_record.py script from the + # TensorFlow Model Garden, commit 1efe98bb8e8d98bbffc703a90d88df15fc2ce906. + # + # NOTE: This pre-processes the validation images only + + # If the DATASET_DIR is set, then ensure it exists and set paths for the images and annotations + if [[ ! -z "${DATASET_DIR}" ]]; then + if [[ ! -d "${DATASET_DIR}" ]]; then + echo "ERROR: The specified DATASET_DIR ($DATASET_DIR) does not exist." + exit 1 + fi + + VAL_IMAGE_DIR=${DATASET_DIR}/val2017 + ANNOTATIONS_DIR=${DATASET_DIR}/annotations + fi + + # Verify that the a directory exists for the raw validation images + if [[ ! -d "${VAL_IMAGE_DIR}" ]]; then + echo "ERROR: The VAL_IMAGE_DIR (${VAL_IMAGE_DIR}) does not exist. This var needs to point to the raw coco validation images." + exit 1 + fi + + # Verify that the a directory exists for the annotations + if [[ ! -d "${ANNOTATIONS_DIR}" ]]; then + echo "ERROR: The ANNOTATIONS_DIR (${ANNOTATIONS_DIR}) does not exist. This var needs to point to the coco annotations directory." + exit 1 + fi + + # Verify that we have the path to the tensorflow/models code + if [[ ! -d "${TF_MODELS_DIR}" ]]; then + echo "ERROR: The TF_MODELS_DIR var needs to be defined to point to a clone of the tensorflow/models git repo" + exit 1 + fi + + # Create the output directory in case it doesn't already exist + mkdir -p ${OUTPUT_DIR} + + # Checkout the specified branch for the tensorflow/models code + if [[ ! -z "${TF_MODELS_BRANCH}" ]]; then + cd ${TF_MODELS_DIR} + git checkout ${TF_MODELS_BRANCH} + fi + + # Set the PYTHONPATH + cd ${TF_MODELS_DIR}/research + export PYTHONPATH=$PYTHONPATH:`pwd`:`pwd`/slim + + # Create empty dir and json for train/test image preprocessing, so that we don't require + # the user to also download train/test images when all that's needed is validation images. 
+ EMPTY_DIR=${OUTPUT_DIR}/empty_dir
+ EMPTY_ANNOTATIONS=${OUTPUT_DIR}/empty.json
+ mkdir -p ${EMPTY_DIR}
+ echo "{ \"images\": {}, \"categories\": {}}" > ${EMPTY_ANNOTATIONS}
+
+ cd ${TF_MODELS_DIR}/research/object_detection/dataset_tools
+ python create_coco_tf_record.py --logtostderr \
+ --train_image_dir="${EMPTY_DIR}" \
+ --val_image_dir="${VAL_IMAGE_DIR}" \
+ --test_image_dir="${EMPTY_DIR}" \
+ --train_annotations_file="${EMPTY_ANNOTATIONS}" \
+ --val_annotations_file="${ANNOTATIONS_DIR}/instances_val2017.json" \
+ --testdev_annotations_file="${EMPTY_ANNOTATIONS}" \
+ --output_dir="${OUTPUT_DIR}"
+
+ # remove dummy directory and annotations file
+ rm -rf ${EMPTY_DIR}
+ rm -rf ${EMPTY_ANNOTATIONS}
+
+ # since we only grab the validation dataset, the TF records files for train
+ # and test images are size 0. Delete those to prevent confusion.
+ rm -f ${OUTPUT_DIR}/coco_testdev.record
+ rm -f ${OUTPUT_DIR}/coco_train.record
+
+ echo "TF records in the output directory:"
+ ls -l ${OUTPUT_DIR}
+ fp32_accuracy.sh: |
+ #!/usr/bin/env bash
+ #
+ # Copyright (c) 2020 Intel Corporation
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ if [ -z "${OUTPUT_DIR}" ]; then
+ echo "The required environment variable OUTPUT_DIR has not been set"
+ exit 1
+ fi
+
+ # Create the output directory in case it doesn't already exist
+ mkdir -p ${OUTPUT_DIR}
+
+ if [ -z "${DATASET_DIR}" ]; then
+ echo "The required environment variable DATASET_DIR has not been set"
+ exit 1
+ fi
+
+ if [ -z "${TF_MODELS_DIR}" ]; then
+ echo "The required environment variable TF_MODELS_DIR has not been set"
+ exit 1
+ fi
+
+ # Untar pretrained model files
+ pretrained_model_dir="${OUTPUT_DIR}/pretrained_model/rfcn_resnet101_coco_2018_01_28"
+ if [ !
-d "${pretrained_model_dir}" ]; then + mkdir -p ${OUTPUT_DIR}/pretrained_model + tar -C ${OUTPUT_DIR}/pretrained_model/ -xvf pretrained_model/rfcn_fp32_model.tar.gz + chmod -R u+w ${OUTPUT_DIR}/pretrained_model/ + fi + FROZEN_GRAPH="${pretrained_model_dir}/frozen_inference_graph.pb" + + source "$(dirname $0)/common/utils.sh" + _command python benchmarks/launch_benchmark.py \ + --model-name rfcn \ + --mode inference \ + --precision fp32 \ + --framework tensorflow \ + --model-source-dir ${TF_MODELS_DIR} \ + --data-location ${DATASET_DIR} \ + --in-graph ${FROZEN_GRAPH} \ + --batch-size 1 \ + --accuracy-only \ + --output-dir ${OUTPUT_DIR} \ + $@ \ + -- split="${OUTPUT_DIR}/accuracy_message" diff --git a/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-mounted-nfs/kustomization.yaml b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-mounted-nfs/kustomization.yaml index 213b180d6..a1c21752f 100644 --- a/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-mounted-nfs/kustomization.yaml +++ b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-mounted-nfs/kustomization.yaml @@ -2,8 +2,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - config-map.yaml -- service-account.yaml -- role.yaml -- role-binding.yaml - serving_accuracy.yaml namespace: default diff --git a/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-mounted-nfs/serving_accuracy.yaml b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-mounted-nfs/serving_accuracy.yaml index 9569b6e15..557c40a6a 100644 --- a/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-mounted-nfs/serving_accuracy.yaml +++ b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/pipeline/user-mounted-nfs/serving_accuracy.yaml @@ -4,7 +4,6 @@ metadata: name: rfcn-fp32-inference-wf # {"$openapi":"WORKFLOW_NAME"} spec: entrypoint: rfcn-fp32-inference # {"$openapi":"MODEL_NAME"} - serviceAccountName: rfcn-fp32-inference # {"$openapi":"MODEL_NAME"} templates: - name: rfcn-fp32-inference # {"$openapi":"MODEL_NAME"} steps: @@ -30,6 +29,9 @@ spec: - name: coco-raw-data mountPath: /datasets # {"$openapi":"DATASET_DIR"} readOnly: true + - name: mlops-scripts + mountPath: /workspace/preprocess-coco-val/scripts/preprocess_coco_val.sh # {"$openapi":"PREPROCESS_COMMAND"} + subPath: preprocess_coco_val.sh # {"$openapi":"PREPROCESS_SCRIPT"} volumes: - name: coco-raw-data hostPath: @@ -38,6 +40,13 @@ spec: nfs: server: 0.0.0.0 # {"$openapi":"NFS_SERVER"} path: /nfs # {"$openapi":"NFS_PATH"} + - name: mlops-scripts + configMap: + name: rfcn-fp32-inference # {"$openapi":"MODEL_NAME"} + defaultMode: 0770 + items: + - key: preprocess_coco_val.sh # {"$openapi":"PREPROCESS_SCRIPT"} + path: preprocess_coco_val.sh # {"$openapi":"PREPROCESS_SCRIPT"} - name: rfcn-fp32-accuracy securityContext: runAsUser: 0 # {"$openapi":"USER_ID"} @@ -54,14 +63,18 @@ spec: volumeMounts: - name: nfs-path mountPath: /nfs # {"$openapi":"NFS_PATH"} - - name: coco-raw-data - mountPath: /datasets # {"$openapi":"DATASET_DIR"} - readOnly: true + - name: mlops-scripts + mountPath: /workspace/rfcn-fp32-inference/quickstart/fp32_accuracy.sh # {"$openapi":"MODEL_COMMAND"} + subPath: fp32_accuracy.sh # {"$openapi":"MODEL_SCRIPT"} volumes: - - name: coco-raw-data - hostPath: - path: /datasets # {"$openapi":"DATASET_DIR"} - name: nfs-path nfs: server: 0.0.0.0 # {"$openapi":"NFS_SERVER"} path: /nfs # {"$openapi":"NFS_PATH"} + - name: 
mlops-scripts + configMap: + name: rfcn-fp32-inference # {"$openapi":"MODEL_NAME"} + defaultMode: 0770 + items: + - key: fp32_accuracy.sh # {"$openapi":"MODEL_SCRIPT"} + path: fp32_accuracy.sh # {"$openapi":"MODEL_SCRIPT"} diff --git a/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/serving/.gitignore b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/serving/.gitignore new file mode 100644 index 000000000..1bc5a4d64 --- /dev/null +++ b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/serving/.gitignore @@ -0,0 +1,2 @@ +Makefile +*.yaml diff --git a/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/serving/user-allocated-pvc/pod.yaml b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/serving/user-allocated-pvc/pod.yaml index 75a717ad2..8564ef366 100644 --- a/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/serving/user-allocated-pvc/pod.yaml +++ b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/serving/user-allocated-pvc/pod.yaml @@ -23,6 +23,9 @@ spec: readOnly: true - name: pvc-path mountPath: /pvc # {"$openapi":"PVC_PATH"} + - name: mlops-scripts + mountPath: /workspace/rfcn-fp32-inference/quickstart/fp32_inference.sh # {"$openapi":"COMMAND"} + subPath: fp32_inference.sh # {"$openapi":"MODEL_SCRIPT"} volumes: - name: datasets hostPath: @@ -30,4 +33,11 @@ spec: - name: pvc-path persistentVolumeClaim: claimName: workdisk # {"$openapi":"PVC_NAME"} + - name: mlops-scripts + configMap: + name: mlops-scripts + defaultMode: 0770 + items: + - key: fp32_inference.sh # {"$openapi":"MODEL_SCRIPT"} + path: fp32_inference.sh # {"$openapi":"MODEL_SCRIPT"} restartPolicy: OnFailure diff --git a/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/serving/user-mounted-nfs/pod.yaml b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/serving/user-mounted-nfs/pod.yaml index 69b35188d..dadac7331 100644 --- a/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/serving/user-mounted-nfs/pod.yaml +++ b/k8s/object_detection/tensorflow/rfcn/inference/fp32/mlops/serving/user-mounted-nfs/pod.yaml @@ -23,6 +23,9 @@ spec: readOnly: true - name: nfs-path mountPath: /nfs # {"$openapi":"NFS_PATH"} + - name: mlops-scripts + mountPath: /workspace/rfcn-fp32-inference/quickstart/fp32_inference.sh # {"$openapi":"COMMAND"} + subPath: fp32_inference.sh # {"$openapi":"MODEL_SCRIPT"} volumes: - name: datasets hostPath: @@ -31,4 +34,11 @@ spec: nfs: server: 0.0.0.0 # {"$openapi":"NFS_SERVER"} path: /nfs # {"$openapi":"NFS_PATH"} + - name: mlops-scripts + configMap: + name: rfcn-fp32-inference # {"$openapi":"MODEL_NAME"} + defaultMode: 0770 + items: + - key: fp32_inference.sh # {"$openapi":"MODEL_SCRIPT"} + path: fp32_inference.sh # {"$openapi":"MODEL_SCRIPT"} restartPolicy: OnFailure diff --git a/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/README.md b/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/README.md index a42d693a1..37c58e4c0 100644 --- a/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/README.md +++ b/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/README.md @@ -341,23 +341,6 @@ deployment, and other resources using the following commands: kubectl delete -f .yaml ``` - -## TroubleShooting - -- Pod doesn't start. Status is ErrImagePull.
- Docker recently implemented rate limits.
- See this [note](https://thenewstack.io/docker-hub-limits-what-they-are-and-how-to-route-around-them/) about rate limits and work-arounds. - -- Argo workflow steps do not execute.
- Error from `argo get ` is 'failed to save outputs: Failed to establish pod watch: timed out waiting for the condition'.
- See this argo [issue](https://github.com/argoproj/argo/issues/4186). This is due to the workflow running as non-root.
- Devops will need to change the workflow-executor to k8sapi as described [here](https://github.com/argoproj/argo/blob/master/docs/workflow-executors.md). - -- MpiOperator can't create workers. Error is '/bin/sh: /etc/hosts: Permission denied'. This is due to a bug in mpi-operator in the 'latest' container image - when the workers run as non-root. See this [issue](https://github.com/kubeflow/mpi-operator/issues/288).
- Use the container images: mpioperator/mpi-operator:v02.3 and mpioperator/kubectl-delivery:v0.2.3. - - ## License diff --git a/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/.gitignore b/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/.gitignore new file mode 100644 index 000000000..8916ab561 --- /dev/null +++ b/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/.gitignore @@ -0,0 +1,3 @@ +Makefile +user-allocated-pvc.yaml +user-mounted-nfs.yaml diff --git a/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-allocated-pvc/Krmfile b/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-allocated-pvc/Krmfile index 4f47aae74..5bfe29dca 100644 --- a/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-allocated-pvc/Krmfile +++ b/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-allocated-pvc/Krmfile @@ -92,24 +92,14 @@ openAPI: x-k8s-cli: substitution: name: IMAGE - pattern: ${REGISTRY}/intel/recommendation:${IMAGE_VERSION}-${MODEL_NAME}${IMAGE_SUFFIX} + pattern: ${REGISTRY}/intel/recommendation:tf-2.3.0-imz-2.2.0-${MODEL_NAME}${IMAGE_SUFFIX} values: - marker: ${REGISTRY} ref: '#/definitions/io.k8s.cli.setters.REGISTRY' - - marker: ${IMAGE_VERSION} - ref: '#/definitions/io.k8s.cli.setters.IMAGE_VERSION' - marker: ${MODEL_NAME} ref: '#/definitions/io.k8s.cli.setters.MODEL_NAME' - marker: ${IMAGE_SUFFIX} ref: '#/definitions/io.k8s.cli.setters.IMAGE_SUFFIX' - io.k8s.cli.setters.IMAGE_VERSION: - description: image version - x-k8s-cli: - setter: - name: IMAGE_VERSION - value: tf-2.3.0-imz-2.2.0 - isSet: true - setBy: model-builder io.k8s.cli.setters.IMAGE_SUFFIX: description: appended to image name x-k8s-cli: diff --git a/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-allocated-pvc/kustomization.yaml b/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-allocated-pvc/kustomization.yaml index be32ca989..d41dc04fb 100644 --- a/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-allocated-pvc/kustomization.yaml +++ b/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-allocated-pvc/kustomization.yaml @@ -2,8 +2,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - config-map.yaml -- service-account.yaml -- role.yaml -- role-binding.yaml - train_and_serve.yaml namespace: default diff --git a/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-allocated-pvc/train_and_serve.yaml b/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-allocated-pvc/train_and_serve.yaml index 7eeb424c9..0c9adf0f4 100644 --- a/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-allocated-pvc/train_and_serve.yaml +++ b/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-allocated-pvc/train_and_serve.yaml @@ -4,7 +4,6 @@ metadata: name: wide-deep-large-ds-fp32-train-serve-wf # {"$openapi":"WORKFLOW_NAME"} spec: entrypoint: wide-deep-large-ds-fp32-training # {"$openapi":"MODEL_NAME"} - serviceAccountName: wide-deep-large-ds-fp32-training # {"$openapi":"MODEL_NAME"} arguments: parameters: - name: DEPLOYMENT_NAME diff --git a/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-mounted-nfs/Krmfile 
b/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-mounted-nfs/Krmfile index 2861a4126..a66424417 100644 --- a/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-mounted-nfs/Krmfile +++ b/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-mounted-nfs/Krmfile @@ -92,24 +92,14 @@ openAPI: x-k8s-cli: substitution: name: IMAGE - pattern: ${REGISTRY}/intel/recommendation:${IMAGE_VERSION}-${MODEL_NAME}${IMAGE_SUFFIX} + pattern: ${REGISTRY}/intel/recommendation:tf-2.3.0-imz-2.2.0-${MODEL_NAME}${IMAGE_SUFFIX} values: - marker: ${REGISTRY} ref: '#/definitions/io.k8s.cli.setters.REGISTRY' - - marker: ${IMAGE_VERSION} - ref: '#/definitions/io.k8s.cli.setters.IMAGE_VERSION' - marker: ${MODEL_NAME} ref: '#/definitions/io.k8s.cli.setters.MODEL_NAME' - marker: ${IMAGE_SUFFIX} ref: '#/definitions/io.k8s.cli.setters.IMAGE_SUFFIX' - io.k8s.cli.setters.IMAGE_VERSION: - description: image version - x-k8s-cli: - setter: - name: IMAGE_VERSION - value: tf-2.3.0-imz-2.2.0 - isSet: true - setBy: model-builder io.k8s.cli.setters.IMAGE_SUFFIX: description: appended to image name x-k8s-cli: diff --git a/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-mounted-nfs/kustomization.yaml b/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-mounted-nfs/kustomization.yaml index be32ca989..d41dc04fb 100644 --- a/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-mounted-nfs/kustomization.yaml +++ b/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-mounted-nfs/kustomization.yaml @@ -2,8 +2,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - config-map.yaml -- service-account.yaml -- role.yaml -- role-binding.yaml - train_and_serve.yaml namespace: default diff --git a/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-mounted-nfs/train_and_serve.yaml b/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-mounted-nfs/train_and_serve.yaml index 5c2b8c42c..5bcd437d5 100644 --- a/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-mounted-nfs/train_and_serve.yaml +++ b/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/pipeline/user-mounted-nfs/train_and_serve.yaml @@ -4,7 +4,6 @@ metadata: name: wide-deep-large-ds-fp32-train-serve-wf # {"$openapi":"WORKFLOW_NAME"} spec: entrypoint: wide-deep-large-ds-fp32-training # {"$openapi":"MODEL_NAME"} - serviceAccountName: wide-deep-large-ds-fp32-training # {"$openapi":"MODEL_NAME"} arguments: parameters: - name: DEPLOYMENT_NAME diff --git a/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/single-node/.gitignore b/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/single-node/.gitignore new file mode 100644 index 000000000..8916ab561 --- /dev/null +++ b/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/single-node/.gitignore @@ -0,0 +1,3 @@ +Makefile +user-allocated-pvc.yaml +user-mounted-nfs.yaml diff --git a/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/single-node/user-allocated-pvc/Krmfile b/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/single-node/user-allocated-pvc/Krmfile index b2b1c8f83..151665650 100644 --- a/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/single-node/user-allocated-pvc/Krmfile 
+++ b/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/single-node/user-allocated-pvc/Krmfile @@ -79,24 +79,14 @@ openAPI: x-k8s-cli: substitution: name: IMAGE - pattern: ${REGISTRY}/intel/recommendation:${IMAGE_VERSION}-${MODEL_NAME}${IMAGE_SUFFIX} + pattern: ${REGISTRY}/intel/recommendation:tf-2.3.0-imz-2.2.0-${MODEL_NAME}${IMAGE_SUFFIX} values: - marker: ${REGISTRY} ref: '#/definitions/io.k8s.cli.setters.REGISTRY' - - marker: ${IMAGE_VERSION} - ref: '#/definitions/io.k8s.cli.setters.IMAGE_VERSION' - marker: ${MODEL_NAME} ref: '#/definitions/io.k8s.cli.setters.MODEL_NAME' - marker: ${IMAGE_SUFFIX} ref: '#/definitions/io.k8s.cli.setters.IMAGE_SUFFIX' - io.k8s.cli.setters.IMAGE_VERSION: - description: image version - x-k8s-cli: - setter: - name: IMAGE_VERSION - value: tf-2.3.0-imz-2.2.0 - isSet: true - setBy: model-builder io.k8s.cli.substitutions.GROUP_ID_VALUE: x-k8s-cli: substitution: diff --git a/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/single-node/user-mounted-nfs/Krmfile b/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/single-node/user-mounted-nfs/Krmfile index c6769abde..c988fa46b 100644 --- a/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/single-node/user-mounted-nfs/Krmfile +++ b/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/mlops/single-node/user-mounted-nfs/Krmfile @@ -79,24 +79,14 @@ openAPI: x-k8s-cli: substitution: name: IMAGE - pattern: ${REGISTRY}/intel/recommendation:${IMAGE_VERSION}-${MODEL_NAME}${IMAGE_SUFFIX} + pattern: ${REGISTRY}/intel/recommendation:tf-2.3.0-imz-2.2.0-${MODEL_NAME}${IMAGE_SUFFIX} values: - marker: ${REGISTRY} ref: '#/definitions/io.k8s.cli.setters.REGISTRY' - - marker: ${IMAGE_VERSION} - ref: '#/definitions/io.k8s.cli.setters.IMAGE_VERSION' - marker: ${MODEL_NAME} ref: '#/definitions/io.k8s.cli.setters.MODEL_NAME' - marker: ${IMAGE_SUFFIX} ref: '#/definitions/io.k8s.cli.setters.IMAGE_SUFFIX' - io.k8s.cli.setters.IMAGE_VERSION: - description: image version - x-k8s-cli: - setter: - name: IMAGE_VERSION - value: tf-2.3.0-imz-2.2.0 - isSet: true - setBy: model-builder io.k8s.cli.substitutions.GROUP_ID_VALUE: x-k8s-cli: substitution: diff --git a/models/language_modeling/tensorflow/bert_large/inference/export_classifier.py b/models/language_modeling/tensorflow/bert_large/inference/export_classifier.py index f85e3c7e8..5be0be5d4 100644 --- a/models/language_modeling/tensorflow/bert_large/inference/export_classifier.py +++ b/models/language_modeling/tensorflow/bert_large/inference/export_classifier.py @@ -50,6 +50,7 @@ def __init__(self, use_one_hot_embeddings = False bert_config = BertConfig.from_json_file(bert_config) + bert_config.experimental_gelu = FLAGS.experimental_gelu if FLAGS.precision: bert_config.precision = FLAGS.precision diff --git a/models/language_modeling/tensorflow/bert_large/inference/generic_ops.py b/models/language_modeling/tensorflow/bert_large/inference/generic_ops.py index 689dbc87d..4cce30a77 100755 --- a/models/language_modeling/tensorflow/bert_large/inference/generic_ops.py +++ b/models/language_modeling/tensorflow/bert_large/inference/generic_ops.py @@ -97,7 +97,7 @@ def gelu(x): `x` with the GELU activation applied. 
""" if _use_experimental_gelu: - return tf.nn.gelu(x) + return tf.nn.gelu(features=x, approximate=True) else: x = i_cast(x) cdf = 0.5 * (1.0 + tf.tanh( diff --git a/models/language_modeling/tensorflow/bert_large/inference/run_classifier.py b/models/language_modeling/tensorflow/bert_large/inference/run_classifier.py index e8bf37aff..f2c15730a 100644 --- a/models/language_modeling/tensorflow/bert_large/inference/run_classifier.py +++ b/models/language_modeling/tensorflow/bert_large/inference/run_classifier.py @@ -143,8 +143,16 @@ "The profile output will be generated in the output_dir") flags.DEFINE_string("precision", "fp32", "[Optional] TensorFlow training precision.") + flags.DEFINE_string("frozen_graph_path", None, "path of frozen graph.") +flags.DEFINE_bool("experimental_gelu", False, + "[Optional] If true, use experimental gelu op in model." + " Be careful this flag will crash model with incompatible TF.") + +flags.DEFINE_bool("optimized_softmax", False, + "[Optional] If true, use optimized softmax op in model.") + class LoggerHook(tf.estimator.SessionRunHook): """ Logs runtime. """ @@ -894,6 +902,8 @@ def main(_): "At least one of `do_train`, `do_eval` or `do_predict' must be True.") bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + bert_config.experimental_gelu = FLAGS.experimental_gelu + if FLAGS.precision: bert_config.precision = FLAGS.precision diff --git a/models/language_modeling/tensorflow/bert_large/training/bfloat16/generic_ops.py b/models/language_modeling/tensorflow/bert_large/training/bfloat16/generic_ops.py index a4ba570b9..33f4ed988 100755 --- a/models/language_modeling/tensorflow/bert_large/training/bfloat16/generic_ops.py +++ b/models/language_modeling/tensorflow/bert_large/training/bfloat16/generic_ops.py @@ -97,7 +97,7 @@ def gelu(x): `x` with the GELU activation applied. """ if _use_experimental_gelu: - return tf.nn.gelu(x) + return tf.nn.gelu(features=x, approximate=True) else: x = i_cast(x) cdf = 0.5 * (1.0 + tf.tanh( diff --git a/models/language_modeling/tensorflow/bert_large/training/fp32/generic_ops.py b/models/language_modeling/tensorflow/bert_large/training/fp32/generic_ops.py index b9fd2b4a1..ad2baab3f 100755 --- a/models/language_modeling/tensorflow/bert_large/training/fp32/generic_ops.py +++ b/models/language_modeling/tensorflow/bert_large/training/fp32/generic_ops.py @@ -87,7 +87,7 @@ def gelu(x): `x` with the GELU activation applied. 
""" if _use_experimental_gelu : - return tf.nn.gelu(x) + return tf.nn.gelu(features=x, approximate=True) else: x = i_cast(x) cdf = 0.5 * (1.0 + tf.tanh( diff --git a/models/object_detection/tensorflow/rfcn/inference/fp32/coco_mAP.sh b/models/object_detection/tensorflow/rfcn/inference/fp32/coco_mAP.sh index b82043847..d52f04398 100755 --- a/models/object_detection/tensorflow/rfcn/inference/fp32/coco_mAP.sh +++ b/models/object_detection/tensorflow/rfcn/inference/fp32/coco_mAP.sh @@ -31,6 +31,7 @@ fi export PYTHONPATH=$PYTHONPATH:${TF_MODELS_ROOT}/research:${TF_MODELS_ROOT}/research/slim:${TF_MODELS_ROOT}/research/object_detection +echo "PWD=$PWD" echo "SPLIT=${SPLIT}" echo "FROZEN_GRAPH=${FROZEN_GRAPH}" echo "TF_RECORD_FILES=${TF_RECORD_FILES}" @@ -49,15 +50,15 @@ mkdir -p ${SPLIT}_eval_metrics echo " label_map_path: '${TF_MODELS_ROOT}/research/object_detection/data/mscoco_label_map.pbtxt' tf_record_input_reader: { input_path: '${SPLIT}_detections.tfrecord' } -" > ${SPLIT}_eval_metrics/${SPLIT}_input_config.pbtxt +" > ${SPLIT}_eval_metrics/$(basename ${SPLIT})_input_config.pbtxt echo " metrics_set: 'coco_detection_metrics' -" > ${SPLIT}_eval_metrics/${SPLIT}_eval_config.pbtxt +" > ${SPLIT}_eval_metrics/$(basename ${SPLIT})_eval_config.pbtxt python -m object_detection.metrics.offline_eval_map_corloc \ --eval_dir=${SPLIT}_eval_metrics \ - --eval_config_path=${SPLIT}_eval_metrics/${SPLIT}_eval_config.pbtxt \ - --input_config_path=${SPLIT}_eval_metrics/${SPLIT}_input_config.pbtxt + --eval_config_path=${SPLIT}_eval_metrics/$(basename ${SPLIT})_eval_config.pbtxt \ + --input_config_path=${SPLIT}_eval_metrics/$(basename ${SPLIT})_input_config.pbtxt diff --git a/models/object_detection/tensorflow/ssd-resnet34/inference/tensorflow_benchmarks_tf2.0.patch b/models/object_detection/tensorflow/ssd-resnet34/inference/tensorflow_benchmarks_tf2.0.patch index dd35c6136..c6fc9211c 100644 --- a/models/object_detection/tensorflow/ssd-resnet34/inference/tensorflow_benchmarks_tf2.0.patch +++ b/models/object_detection/tensorflow/ssd-resnet34/inference/tensorflow_benchmarks_tf2.0.patch @@ -7,7 +7,7 @@ index 56d8c88..dddf57d 100644 import tensorflow as tf -from tensorflow.contrib.all_reduce.python import all_reduce -+from tensorflow.python.distribute import all_reduce ++from tensorflow.python.distribute.v1 import all_reduce from tensorflow.python.framework import device as pydev from tensorflow.python.framework import ops from tensorflow.python.ops import collective_ops diff --git a/models/object_detection/tensorflow/ssd-resnet34/training/bfloat16/benchmark-tf-2.0.diff b/models/object_detection/tensorflow/ssd-resnet34/training/bfloat16/benchmark-tf-2.0.diff index e7e66c1a3..a665db498 100644 --- a/models/object_detection/tensorflow/ssd-resnet34/training/bfloat16/benchmark-tf-2.0.diff +++ b/models/object_detection/tensorflow/ssd-resnet34/training/bfloat16/benchmark-tf-2.0.diff @@ -89,7 +89,7 @@ index 56d8c88..72e34ea 100644 import tensorflow as tf -from tensorflow.contrib.all_reduce.python import all_reduce -+from tensorflow.python.distribute import all_reduce ++from tensorflow.python.distribute.v1 import all_reduce +#from tensorflow.contrib.all_reduce.python import all_reduce from tensorflow.python.framework import device as pydev from tensorflow.python.framework import ops diff --git a/models/object_detection/tensorflow/ssd-resnet34/training/fp32/benchmark-tf-2.0.diff b/models/object_detection/tensorflow/ssd-resnet34/training/fp32/benchmark-tf-2.0.diff index e7e66c1a3..a665db498 100644 --- 
a/models/object_detection/tensorflow/ssd-resnet34/training/fp32/benchmark-tf-2.0.diff
+++ b/models/object_detection/tensorflow/ssd-resnet34/training/fp32/benchmark-tf-2.0.diff
@@ -89,7 +89,7 @@ index 56d8c88..72e34ea 100644
import tensorflow as tf
-from tensorflow.contrib.all_reduce.python import all_reduce
+from tensorflow.python.distribute.v1 import all_reduce
+#from tensorflow.contrib.all_reduce.python import all_reduce
from tensorflow.python.framework import device as pydev
from tensorflow.python.framework import ops
diff --git a/models/recommendation/tensorflow/wide_deep_large_ds/dataset/preprocess_csv_tfrecords.py b/models/recommendation/tensorflow/wide_deep_large_ds/dataset/preprocess_csv_tfrecords.py
index b0854a263..51bab1d6e 100644
--- a/models/recommendation/tensorflow/wide_deep_large_ds/dataset/preprocess_csv_tfrecords.py
+++ b/models/recommendation/tensorflow/wide_deep_large_ds/dataset/preprocess_csv_tfrecords.py
@@ -27,7 +27,20 @@
import argparse
import numpy as np
import tensorflow as tf
-tf.enable_eager_execution()
+def version_is_less_than(a, b):
+    a_parts = a.split('.')
+    b_parts = b.split('.')
+
+    for i in range(len(a_parts)):
+        if int(a_parts[i]) < int(b_parts[i]):
+            print('{} < {}, version_is_less_than() returning True'.format(
+                a_parts[i], b_parts[i]))
+            return True
+    return False
+print("TensorFlow version {}".format(tf.__version__))
+required_tf_version = '2.0.0'
+if version_is_less_than(tf.__version__, required_tf_version):
+    tf.compat.v1.enable_eager_execution()
parser = argparse.ArgumentParser()
parser.add_argument('--inputcsv-datafile', type=str,
help='full path of data file e.g. eval.csv',
@@ -117,8 +130,7 @@
print('max list',max_list)
print('range list',range_list)
-
-with tf.python_io.TFRecordWriter(output_file) as writer:
+with tf.io.TFRecordWriter(output_file) as writer:
print('*****Processing data******')
for row in csv:
no_of_rows = no_of_rows+1
@@ -137,7 +149,7 @@
new_categorical_list.append("")
else:
new_categorical_list.append(new_categorical_dict[i])
- hash_values = tf.string_to_hash_bucket_fast(
+ hash_values = tf.strings.to_hash_bucket_fast(
new_categorical_list, 1000).numpy()
new_numerical_dict = dict(zip(NUMERIC_COLUMNS2, normalized_vals))
example = tf.train.Example()
diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_mobilenet_v1_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_mobilenet_v1_args.json
index 763620059..6e81d2a0f 100644
--- a/tests/unit/common/tensorflow/tf_model_args/tf_mobilenet_v1_args.json
+++ b/tests/unit/common/tensorflow/tf_model_args/tf_mobilenet_v1_args.json
@@ -1,19 +1,19 @@
[
{ "_comment": "mobilenet_v1_fp32_accuracy",
"input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=-1 --accuracy-only --verbose --checkpoint=/checkpoints --in-graph=/in_graph/mobilenet_v1_1.0_224_frozen.pb --data-location=/dataset",
- "output": "python /workspace/intelai_models/inference/fp32/accuracy.py --batch_size=100 --data_location=/dataset --num_intra_threads=56 --num_inter_threads=2 --input_graph=/in_graph/mobilenet_v1_1.0_224_frozen.pb --input_height=224 --input_width=224 --input_layer=input --output_layer=MobilenetV1/Predictions/Reshape_1"},
+ "output": "python /workspace/intelai_models/inference/accuracy.py --precision=fp32 --batch_size=100
--data_location=/dataset --num_intra_threads=56 --num_inter_threads=2 --input_graph=/in_graph/mobilenet_v1_1.0_224_frozen.pb --input_height=224 --input_width=224 --input_layer=input --output_layer=MobilenetV1/Predictions/Reshape_1"}, { "_comment": "mobilenet_v1_fp32_latency", - "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id 0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset", - "output": "numactl --cpunodebind=0 -l python /workspace/intelai_models/inference/fp32/benchmark.py --batch_size=1 --num_intra_threads=28 --num_inter_threads=2 --input_height=224 --input_width=224 --warmup_steps=10 --steps=50 --input_layer=input --output_layer=MobilenetV1/Predictions/Reshape_1"}, + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id 0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset", + "output": "numactl --cpunodebind=0 -l python /workspace/intelai_models/inference/benchmark.py --precision=fp32 --batch_size=100 --num_intra_threads=28 --num_inter_threads=2 --input_height=224 --input_width=224 --warmup_steps=10 --steps=50 --input_layer=input --output_layer=MobilenetV1/Predictions/Reshape_1"}, { "_comment": "mobilenet_v1_fp32_throughput", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id 0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset", - "output": "numactl --cpunodebind=0 -l python /workspace/intelai_models/inference/fp32/benchmark.py --batch_size=100 --num_intra_threads=28 --num_inter_threads=2 --input_height=224 --input_width=224 --warmup_steps=10 --steps=50 --input_layer=input --output_layer=MobilenetV1/Predictions/Reshape_1"}, + "output": "numactl --cpunodebind=0 -l python /workspace/intelai_models/inference/benchmark.py --precision=fp32 --batch_size=100 --num_intra_threads=28 --num_inter_threads=2 --input_height=224 --input_width=224 --warmup_steps=10 --steps=50 --input_layer=input --output_layer=MobilenetV1/Predictions/Reshape_1"}, { "_comment": "mobilenet_v1_fp32_dummy_data_output-dir", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --checkpoint=/checkpoints", - "output": "numactl --cpunodebind=0 -l python /workspace/intelai_models/inference/fp32/benchmark.py --batch_size=100 --num_intra_threads=28 --num_inter_threads=2 --input_height=224 --input_width=224 --warmup_steps=10 --steps=50 --input_layer=input --output_layer=MobilenetV1/Predictions/Reshape_1"}, + "output": "numactl --cpunodebind=0 -l python /workspace/intelai_models/inference/benchmark.py --precision=fp32 --batch_size=100 --num_intra_threads=28 --num_inter_threads=2 
--input_height=224 --input_width=224 --warmup_steps=10 --steps=50 --input_layer=input --output_layer=MobilenetV1/Predictions/Reshape_1"}, { "_comment": "mobilenet_v1_int8_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=int8 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/models_mobilenetv1_int8_pretrained_model.pb --data-location=/dataset --input_height=224 --input_width=224", diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_ssd_mobilenet_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_ssd_mobilenet_args.json index f0de3e022..a5762431c 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_ssd_mobilenet_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_ssd_mobilenet_args.json @@ -1,11 +1,11 @@ [ { "_comment": "ssd_mobilenet_fp32_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-mobilenet --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --socket-id=0 --accuracy-only --verbose --in-graph=/in_graph/frozen_inference_graph.pb --benchmark-dir=/workspace/benchmarks --data-location=/dataset", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/infer_detections.py -g /in_graph/frozen_inference_graph.pb -i 1000 -w 200 -a 28 -e 1 -d /dataset -r"}, + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/infer_detections.py -g /in_graph/frozen_inference_graph.pb -i 1000 -w 200 -a 28 -e 1 -d /dataset -r"}, { "_comment": "ssd_mobilenet_fp32", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-mobilenet --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --socket-id=0 --benchmark-only --verbose --in-graph=/in_graph/frozen_inference_graph.pb --data-location=/dataset", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/infer_detections.py -g /in_graph/frozen_inference_graph.pb -i 1000 -w 200 -a 28 -e 1 -d /dataset -b -1"}, + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/infer_detections.py -g /in_graph/frozen_inference_graph.pb -i 1000 -w 200 -a 28 -e 1 -d /dataset -b -1"}, { "_comment": "ssd_mobilenet_int8_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-mobilenet --precision=int8 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/ssdmobilenet_int8_pretrained_model.pb --data-location=/dataset", diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_wide_deep_large_ds_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_wide_deep_large_ds_args.json index c4e5e65f9..dc938ff52 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_wide_deep_large_ds_args.json +++ 
b/tests/unit/common/tensorflow/tf_model_args/tf_wide_deep_large_ds_args.json @@ -5,7 +5,7 @@ { "_comment": "wide_deep_large_int8_28_cores", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=28 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_int8_pretrained_model.pb --data-location=/dataset", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --physcpubind=0-27 --membind=0 python /workspace/intelai_models/inference/inference.py --batch_size=1 --data_location=/dataset --input_graph=/in_graph/wide_deep_int8_pretrained_model.pb"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/inference.py --batch_size=1 --data_location=/dataset --input_graph=/in_graph/wide_deep_int8_pretrained_model.pb"}, { "_comment": "wide_deep_large_int8_latency", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_int8_pretrained_model.pb --data-location=/dataset", @@ -21,7 +21,7 @@ { "_comment": "wide_deep_large_fp32_28_cores", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=28 --batch-size=512 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_fp32_pretrained_model.pb --data-location=/dataset", - "output": "numactl --physcpubind=0-27 --membind=0 python /workspace/intelai_models/inference/inference.py --batch_size=512 --data_location=/dataset --input_graph=/in_graph/wide_deep_fp32_pretrained_model.pb"}, + "output": "python /workspace/intelai_models/inference/inference.py --batch_size=512 --data_location=/dataset --input_graph=/in_graph/wide_deep_fp32_pretrained_model.pb"}, { "_comment": "wide_deep_large_fp32_throughput", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=512 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_fp32_pretrained_model.pb --data-location=/dataset", diff --git a/tools/docker/specs/k8s/bert-large-fp32-training-k8s_spec.yml b/tools/docker/specs/k8s/bert-large-fp32-training-k8s_spec.yml index 566c8355f..b5a38451d 100644 --- a/tools/docker/specs/k8s/bert-large-fp32-training-k8s_spec.yml +++ b/tools/docker/specs/k8s/bert-large-fp32-training-k8s_spec.yml @@ -30,8 +30,6 @@ slice_sets: uri: models/k8s/language_modeling/tensorflow/bert_large/training/fp32/.docs/quickstart.md - name: Kubernetes uri: 
models/k8s/language_modeling/tensorflow/bert_large/training/fp32/.docs/kubernetes.md - - name: Trouble Shooting - uri: models/k8s/language_modeling/tensorflow/bert_large/training/fp32/.docs/troubleshooting.md - name: License link uri: models/k8s/language_modeling/tensorflow/bert_large/training/fp32/.docs/license.md args: diff --git a/tools/docker/specs/k8s/resnet50v1-5-fp32-inference-k8s_spec.yml b/tools/docker/specs/k8s/resnet50v1-5-fp32-inference-k8s_spec.yml index 1d310c385..845c8f801 100644 --- a/tools/docker/specs/k8s/resnet50v1-5-fp32-inference-k8s_spec.yml +++ b/tools/docker/specs/k8s/resnet50v1-5-fp32-inference-k8s_spec.yml @@ -37,8 +37,6 @@ slice_sets: uri: models/k8s/image_recognition/tensorflow/resnet50v1_5/inference/fp32/.docs/quickstart.md - name: Kubernetes uri: models/k8s/image_recognition/tensorflow/resnet50v1_5/inference/fp32/.docs/kubernetes.md - - name: Trouble Shooting - uri: models/k8s/image_recognition/tensorflow/resnet50v1_5/inference/fp32/.docs/troubleshooting.md - name: License link uri: models/k8s/image_recognition/tensorflow/resnet50v1_5/inference/fp32/.docs/license.md runtime: diff --git a/tools/docker/specs/k8s/resnet50v1-5-fp32-training-k8s_spec.yml b/tools/docker/specs/k8s/resnet50v1-5-fp32-training-k8s_spec.yml index 8de7bd794..90abb26de 100644 --- a/tools/docker/specs/k8s/resnet50v1-5-fp32-training-k8s_spec.yml +++ b/tools/docker/specs/k8s/resnet50v1-5-fp32-training-k8s_spec.yml @@ -30,8 +30,6 @@ slice_sets: uri: models/k8s/image_recognition/tensorflow/resnet50v1_5/training/fp32/.docs/quickstart.md - name: Kubernetes uri: models/k8s/image_recognition/tensorflow/resnet50v1_5/training/fp32/.docs/kubernetes.md - - name: Trouble Shooting - uri: models/k8s/image_recognition/tensorflow/resnet50v1_5/training/fp32/.docs/troubleshooting.md - name: License link uri: models/k8s/image_recognition/tensorflow/resnet50v1_5/training/fp32/.docs/license.md args: diff --git a/tools/docker/specs/k8s/rfcn-fp32-inference-k8s_spec.yml b/tools/docker/specs/k8s/rfcn-fp32-inference-k8s_spec.yml index e3a240a3f..d8205b6b0 100644 --- a/tools/docker/specs/k8s/rfcn-fp32-inference-k8s_spec.yml +++ b/tools/docker/specs/k8s/rfcn-fp32-inference-k8s_spec.yml @@ -30,8 +30,6 @@ slice_sets: uri: models/k8s/object_detection/tensorflow/rfcn/inference/fp32/.docs/quickstart.md - name: Kubernetes uri: models/k8s/object_detection/tensorflow/rfcn/inference/fp32/.docs/kubernetes.md - - name: Trouble Shooting - uri: models/k8s/object_detection/tensorflow/rfcn/inference/fp32/.docs/troubleshooting.md - name: License link uri: models/k8s/object_detection/tensorflow/rfcn/inference/fp32/.docs/license.md args: diff --git a/tools/docker/specs/k8s/wide-deep-large-ds-fp32-training-k8s_spec.yml b/tools/docker/specs/k8s/wide-deep-large-ds-fp32-training-k8s_spec.yml index aaa6e4a20..ead4d605d 100644 --- a/tools/docker/specs/k8s/wide-deep-large-ds-fp32-training-k8s_spec.yml +++ b/tools/docker/specs/k8s/wide-deep-large-ds-fp32-training-k8s_spec.yml @@ -37,8 +37,6 @@ slice_sets: uri: models/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/.docs/quickstart.md - name: Kubernetes uri: models/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/.docs/kubernetes.md - - name: Trouble Shooting - uri: models/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/.docs/troubleshooting.md - name: License link uri: models/k8s/recommendation/tensorflow/wide_deep_large_ds/training/fp32/.docs/license.md runtime: diff --git a/tox.ini b/tox.ini index 1613d0961..5fc5fdbb9 100644 --- a/tox.ini +++ 
b/tox.ini @@ -21,7 +21,7 @@ addopts = -p no:warnings --cov=benchmarks/ --cov-config=tox.ini - --cov-fail-under=72 + --cov-fail-under=71 --cov-report xml:test_data/coverage.xml --cov-report html:test_data/coverage_html_report --cov-report term-missing