diff --git a/.gitignore b/.gitignore
index daa78d1cf7..db57aefa57 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,3 +65,18 @@ hps_tf/merlin_hps.egg-info/
 hps_torch/_skbuild
 hps_torch/dist/
 hps_torch/merlin_hps.egg-info/
+.gitlab/
+ci/benchmark/
+ci/common/
+ci/draco-oci/
+ci/dracorno/
+ci/integration_test/
+ci/post_test/
+ci/selene/
+ci/utest/
+ci/benchmark.yml
+ci/common.yml
+ci/release.yml
+ci/rules.gitlab_ci.yml
+ci/template.yml
+.gitlab-ci.yml
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
deleted file mode 100644
index 7960ca8305..0000000000
--- a/.gitlab-ci.yml
+++ /dev/null
@@ -1,521 +0,0 @@
-include:
-# - remote: 'https://gitlab.com/yesolutions/gitlab-ci-templates/raw/main/templates/pre-commit-autofix.yaml'
-  - project: "dl/devops/gitlab-ci-slurm"
-    ref: master
-    file: "/.gitlab-ci.yml"
-  - /ci/common.yml
-  - /ci/template.yml
-  - /ci/benchmark.yml
-  - /ci/rules.gitlab_ci.yml
-
-nightly_build_all:
-  extends: .build_nightly
-  variables:
-    REMOTE_REPO: https://github.com/NVIDIA-Merlin/Merlin.git
-    DST_IMAGE: ${IMAGE_ALL}
-    DOCKER_FILE: dockerfile.ctr
-    BUILD_ARGS: --build-arg HUGECTR_DEV_MODE=true
-
-nightly_build_sok_tf2:
-  extends: .build_nightly
-  variables:
-    REMOTE_REPO: https://github.com/NVIDIA-Merlin/Merlin.git
-    DST_IMAGE: ${IMAGE_SOK_TF2}
-    DOCKER_FILE: dockerfile.tf
-    BUILD_ARGS: --build-arg HUGECTR_DEV_MODE=true
-
-nightly_build_sok_tf1:
-  extends: .build_nightly_tf1
-  variables:
-    DST_IMAGE: ${IMAGE_SOK_TF1}
-    DOCKER_FILE: Dockerfile.sok1
-    BUILD_ARGS: --build-arg HUGECTR_DEV_MODE=true
-
-nightly_build_unified_container.tf:
-  extends: .build_nightly
-  variables:
-    REMOTE_REPO: https://github.com/NVIDIA-Merlin/Merlin.git
-    DST_IMAGE: ${CI_REGISTRY}/dl/hugectr/hugectr:unified.tf.latest
-    DOCKER_FILE: dockerfile.tf
-    BUILD_ARGS: --build-arg _CI_JOB_TOKEN=${RD_CI_JOB_TOKEN} --build-arg _HUGECTR_REPO=gitlab-master.nvidia.com/dl/hugectr/hugectr.git --build-arg HUGECTR_VER=${CI_COMMIT_BRANCH}
-
-nightly_build_unified_container.ctr:
-  extends: .build_nightly
-  variables:
-    REMOTE_REPO: https://github.com/NVIDIA-Merlin/Merlin.git
-    DST_IMAGE: ${CI_REGISTRY}/dl/hugectr/hugectr:unified.ctr.latest
-    DOCKER_FILE: dockerfile.ctr
-    BUILD_ARGS: --build-arg _CI_JOB_TOKEN=${RD_CI_JOB_TOKEN} --build-arg _HUGECTR_REPO=gitlab-master.nvidia.com/dl/hugectr/hugectr.git --build-arg HUGECTR_VER=${CI_COMMIT_BRANCH} --build-arg _HUGECTR_BACKEND_REPO=gitlab-master.nvidia.com/dl/hugectr/hugectr_inference_backend.git --build-arg HUGECTR_BACKEND_VER=hugectr_performance_test
-
-nightly_build_optimized:
-  extends: .build_nightly
-  variables:
-    REMOTE_REPO: https://${RD_CI_JOB_TOKEN}gitlab-master.nvidia.com/dl/mlperf/optimized.git
-    DST_IMAGE: ${IMAGE_OPTIMIZED}
-    DOCKER_FILE: Dockerfile
-    BUILD_ARGS: --build-arg RELEASE=false --build-arg FROM_IMAGE_NAME=gitlab-master.nvidia.com:5005/dl/dgx/pytorch:master-py3-devel
-    OPTIMIZED: 1
-  rules:
-    - if: $NIGHTLY_OPTIMIZED == "1"
-      when: always
-    - when: never
-
-build_optimized:
-  extends: .build_nightly
-  variables:
-    REMOTE_REPO: https://${RD_CI_JOB_TOKEN}gitlab-master.nvidia.com/dl/mlperf/optimized.git
-    DST_IMAGE: ${IMAGE_OPTIMIZED}.${CI_PIPELINE_ID}
-    DOCKER_FILE: Dockerfile
-    BUILD_ARGS: --build-arg FROM_IMAGE_NAME=gitlab-master.nvidia.com:5005/dl/dgx/pytorch:master-py3-devel
-    OPTIMIZED: 1
-  rules:
-    - if: $NIGHTLY_OPTIMIZED == "1"
-      when: always
-    - when: never
-
-### Stage: build
-format_check_python:
-  extends: .python_format
-  variables:
-    EXCLUDE: "third_party|docs|notebooks|tutorial"
-
-format_check_clang:
-  extends: .clang_format
-  variables:
-    EXCLUDE: ./third_party
-    STYLE: file
-    EXECUTABLE: clang-format14
-    EXTENSIONS: "h,hpp,cpp,cu,cuh"
-
-codespell_check:
-  extends: .codespell_check
-  variables:
-    PRE_COM_IMAGE: registry.gitlab.com/yesolutions/docker-pre-commit
-
-build_train_single_node:
-  extends: .build_hugectr
-  variables:
-    FROM_IMAGE: ${IMAGE_ALL}
-    DST_IMAGE: $TRAIN_IMAGE_VERSIONED
-    CMAKE_OPTION: "-DCMAKE_BUILD_TYPE=Release -DKEY_HIT_RATIO=ON -DSM=\"60;61;70;75;80;90\" -DCLANGFORMAT=OFF"
-    BUILD_HUGECTR: 1
-    BUILD_HUGECTR2ONNX: 1
-
-build_train_single_node_latest:
-  extends: .build_hugectr_daily
-  variables:
-    FROM_IMAGE: ${MERLIN_NIGHTLY_DEVEL}
-    DST_IMAGE: $TRAIN_IMAGE_VERSIONED_LATEST
-    CMAKE_OPTION: "-DCMAKE_BUILD_TYPE=Release -DKEY_HIT_RATIO=ON -DSM=\"60;61;70;75;80;90\" -DCLANGFORMAT=OFF"
-    BUILD_HUGECTR: 1
-    BUILD_HUGECTR2ONNX: 1
-
-build_train_single_node_with_hdfs_minimal:
-  extends: .build_hugectr
-  variables:
-    FROM_IMAGE: ${IMAGE_ALL}
-    DST_IMAGE: $TRAIN_IMAGE_VERSIONED_WITH_HDFS_MINI
-    CMAKE_OPTION: "-DCMAKE_BUILD_TYPE=Release -DKEY_HIT_RATIO=ON -DSM=\"60;61;70;75;80;90\" -DCLANGFORMAT=OFF -DENABLE_HDFS=MINIMAL"
-    BUILD_HUGECTR: 1
-    BUILD_HUGECTR2ONNX: 1
-
-build_train_single_node_with_hdfs_full:
-  extends: .build_hugectr_daily
-  variables:
-    FROM_IMAGE: ${IMAGE_ALL}
-    DST_IMAGE: $TRAIN_IMAGE_VERSIONED_WITH_HDFS
-    CMAKE_OPTION: "-DCMAKE_BUILD_TYPE=Release -DKEY_HIT_RATIO=ON -DSM=\"60;61;70;75;80;90\" -DCLANGFORMAT=OFF -DENABLE_HDFS=ON"
-    BUILD_HUGECTR: 1
-    BUILD_HUGECTR2ONNX: 1
-
-build_train_single_node_with_s3:
-  extends: .build_hugectr
-  variables:
-    FROM_IMAGE: ${IMAGE_ALL}
-    DST_IMAGE: ${TRAIN_IMAGE_VERSIONED_WITH_S3}
-    CMAKE_OPTION: "-DCMAKE_BUILD_TYPE=Release -DKEY_HIT_RATIO=ON -DSM=\"60;61;70;75;80;90\" -DCLANGFORMAT=OFF -DENABLE_S3=ON"
-    BUILD_HUGECTR: 1
-    BUILD_HUGECTR2ONNX: 1
-
-build_train_single_node_with_gcs:
-  extends: .build_hugectr
-  variables:
-    FROM_IMAGE: ${IMAGE_ALL}
-    DST_IMAGE: ${TRAIN_IMAGE_VERSIONED_WITH_GCS}
-    CMAKE_OPTION: "-DCMAKE_BUILD_TYPE=Release -DKEY_HIT_RATIO=ON -DSM=\"60;61;70;75;80;90\" -DCLANGFORMAT=OFF -DENABLE_GCS=ON"
-    BUILD_HUGECTR: 1
-    BUILD_HUGECTR2ONNX: 1
-
-build_train_multi_node:
-  extends: .build_hugectr
-  variables:
-    FROM_IMAGE: ${IMAGE_ALL}
-    DST_IMAGE: $TRAIN_IMAGE_MULTINODE_VERSIONED
-    CMAKE_OPTION: "-DCMAKE_BUILD_TYPE=Release -DKEY_HIT_RATIO=ON -DENABLE_MULTINODES=ON -DSM=\"60;61;70;75;80;90\" -DCLANGFORMAT=OFF"
-    BUILD_HUGECTR: 1
-
-build_train_inference:
-  extends: .build_hugectr
-  variables:
-    FROM_IMAGE: ${IMAGE_ALL}
-    DST_IMAGE: $TRAIN_INFER_IMAGE_VERSIONED
-    CMAKE_OPTION: "-DENABLE_INFERENCE=ON -DCMAKE_BUILD_TYPE=Release -DSM=\"60;61;70;75;80;90\" -DCLANGFORMAT=OFF"
-    BUILD_HUGECTR: 1
-
-### Stage: test
-build_inference:
-  extends: .build_hugectr
-  variables:
-    FROM_IMAGE: ${IMAGE_ALL}
-    DST_IMAGE: $INFER_IMAGE_VERSIONED
-    CMAKE_OPTION: "-DENABLE_INFERENCE=ON -DCMAKE_BUILD_TYPE=Release -DSM=\"70;75;80;90\" -DCLANGFORMAT=OFF"
-    BUILD_HUGECTR: 1
-    BUILD_HPS_BACKEND: 1
-    HUGECTR_BACKEND_VER: main
-    TRITON_BRANCH: ${TARGET_TRITON_BRANCH}
-
-build_sok_tf2:
-  extends: .build_sok
-  variables:
-    FROM_IMAGE: ${IMAGE_ALL}
-    DST_IMAGE: $SOK_IMAGE_VERSIONED_TF2
-    CMAKE_OPTION: "-DSM=\"60;61;70;75;80;90\""
-    BUILD_SOK: 1
-
-build_sok_tf1:
-  extends: .build_sok
-  variables:
-    FROM_IMAGE: ${IMAGE_SOK_TF1}
-    DST_IMAGE: $SOK_IMAGE_VERSIONED_TF1
-    CMAKE_OPTION: "-DSM=\"60;61;70;75;80;90\""
-    BUILD_SOK: 1
-
-build_hugectr_hps_trt_plugin:
-  extends: .build_hugectr
-  variables:
-    FROM_IMAGE: ${IMAGE_ALL}
-    DST_IMAGE: $HUGECTR_TRT_IMAGE_VERSIONED
-    CMAKE_OPTION:
"-DCMAKE_BUILD_TYPE=Release -DKEY_HIT_RATIO=ON -DSM=\"60;61;70;75;80;90\" -DCLANGFORMAT=OFF" - BUILD_HUGECTR: 1 - BUILD_HUGECTR2ONNX: 1 - BUILD_TRT_PLUGIN: 1 - TRT_CMAKE_OPTION: "-DSM=\"70;75;80;90\"" - #BUILD_HPS_BACKEND: 1 - #HUGECTR_BACKEND_VER: main - #TRITON_BRANCH: ${TRITON_BRANCH} - -build_tf_hps_trt_plugin: - extends: .build_hugectr - variables: - FROM_IMAGE: ${IMAGE_ALL} - DST_IMAGE: $TF_TRT_IMAGE_VERSIONED - BUILD_TF_PLUGIN: 1 - BUILD_TRT_PLUGIN: 1 - TRT_CMAKE_OPTION: "-DSM=\"70;75;80;90\"" - #BUILD_HPS_BACKEND: 1 - #HUGECTR_BACKEND_VER: main - #TRITON_BRANCH: r22.11 - -build_pytorch_hps_trt_plugin: - extends: .build_hugectr - variables: - FROM_IMAGE: ${IMAGE_ALL} - DST_IMAGE: $PYTORCH_TRT_IMAGE_VERSIONED - BUILD_TORCH_PLUGIN: 1 - BUILD_TRT_PLUGIN: 1 - TRT_CMAKE_OPTION: "-DSM=\"70;75;80;90\"" - #BUILD_HPS_BACKEND: 1 - #HUGECTR_BACKEND_VER: main - #TRITON_BRANCH: r22.11 - -# Check cluster busy or not -check_cluster_status: - extends: .trigger:rules:selene - stage: pre_test - tags: - - nvidia.com/cuda.driver.major=470 - - $BUILD_TAG - script: - - docker login -u ${CI_PRIVATE_USER} -p "${CI_PRIVATE_KEY}" "${CI_REGISTRY}" - - docker pull ${CONT} - - RC=0 - - docker run -d --rm --name cluster_idle_${CI_PIPELINE_ID} ${EXTRA_DOCKER_RUN_ARGS} ${CONT} sleep infinity - - docker exec cluster_idle_${CI_PIPELINE_ID} bash -cx "python get_selene_runner_status.py --quota ${SELENE_QUEUE_QUOTA} --token \"${CLUSTER_TOKEN}\" " || RC=$? - - echo "$RC" - - echo "NEW_CI_CONCURRENT_ID=${CI_CONCURRENT_ID}" >> other_param.env - - if [[ $RC == 0 ]]; then - echo "Run jobs in draco-oci cluster!"; - cp ./ci/draco-oci/ci.yml ./test-ci.yml; - echo "NEW_SBATCH_OTHER_PARAMS=" >> other_param.env; - else - echo "Run jobs in other cluster!"; - cp ./ci/dracorno/ci.yml ./test-ci.yml; - echo "NEW_SBATCH_OTHER_PARAMS=--nv-meta ml-model.hugectr --gpus-per-node=8" >> other_param.env; - fi - - cat other_param.env - artifacts: - paths: - - ./test-ci.yml - reports: - dotenv: other_param.env - variables: - CONT: gitlab-master.nvidia.com:5005/dl/hugectr/hugectr/emma:get_selene_status_new - allow_failure: false - timeout: 15 minutes - -trigger_test_pipeline: - extends: .trigger:rules:selene - stage: - test - needs: - - check_cluster_status - trigger: - include: - - artifact: test-ci.yml - job: check_cluster_status - strategy: depend - variables: - PARENT_SOURCE: ${CI_PIPELINE_SOURCE} - PARENT_PIPELINE_ID: ${CI_PIPELINE_ID} - GCS_ACCESS_FILE: ${GCS_ACCESS_FILE} - PARENT_GCS_ACCESS_FILE: ${GCS_ACCESS_FILE} - SBATCH_OTHER_PARAMS: ${NEW_SBATCH_OTHER_PARAMS} - -criteo_multi_node: - extends: .cluster_test_job_daily - needs: - - build_train_multi_node - variables: - CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED - MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT} - CI_SLURM_TIME: "00:15:00" - CI_SLURM_NODES: 2 - SLURM_JOB_NUM_NODES: 2 - TEST_CMD: ./ci/integration_test/criteo/criteo_multi_node.sub - -dlrm_dcnv2_benchmark_8node: - extends: .cluster_test_job_daily - needs: - - build_train_multi_node - variables: - CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED - MOUNTS: /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/dlrm/datasets/criteo_multihot_raw:/data,/lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/dlrm/datasets/criteo_multihot_raw:/data_val - CI_SLURM_TIME: "02:00:00" - CI_SLURM_NODES: 8 - SLURM_JOB_NUM_NODES: 8 - TEST_CMD: ./ci/integration_test/dlrm/train_dcnv2_8node.sub - -wdl_multi_gpu: - extends: .cluster_test_job_daily # test on selene needs to extend .cluster_test_job - needs: - - 
-    - build_train_single_node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED # image name
-    MOUNTS: ${DRACO_OCI_DATASET_NEW_CRITEO}:${DATASET_MOUNT} # mount
-    CI_SLURM_TIME: "00:15:00" # estimated job time; shorter requests get higher priority
-    TEST_CMD: ./ci/integration_test/wdl/wdl_daily.sub
-
-deepfm_multi_gpu:
-  extends: .cluster_test_job_daily
-  needs:
-    - build_train_single_node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT}
-    CI_SLURM_TIME: "00:15:00"
-    TEST_CMD: ./ci/integration_test/deepfm/deepfm_daily.sub
-
-dcn_multi_node:
-  extends: .cluster_test_job_daily
-  needs:
-    - build_train_multi_node
-  variables:
-    CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED
-    MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT}
-    CI_SLURM_TIME: "01:00:00"
-    CI_SLURM_NODES: 4
-    SLURM_JOB_NUM_NODES: 4
-    TEST_CMD: ./ci/integration_test/dcn/dcn_multi_node.sub
-
-py_low_level:
-  extends: .cluster_test_job_daily
-  needs:
-    - build_train_single_node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: /raid:/raid,${DRACO_OCI_DATASET_NEW_CRITEO}:${NEW_CRITEO_MOUNT}
-    CI_SLURM_TIME: "01:00:00"
-    TEST_CMD: ./ci/integration_test/py_interface/py_low_level.sub
-
-ebc_single_node:
-  extends: .cluster_test_job_daily
-  needs:
-    - build_train_single_node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_DATASET_NEW_CRITEO}:${DATASET_MOUNT},/raid:/raid
-    CI_SLURM_TIME: "02:00:00"
-    TEST_CMD: ./ci/integration_test/ebc/ebc.sub
-
-py_multi_node:
-  extends: .cluster_test_job_daily
-  needs:
-    - build_train_multi_node
-  variables:
-    CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED
-    MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT}
-    CI_SLURM_TIME: "00:15:00"
-    CI_SLURM_NODES: 4
-    SLURM_JOB_NUM_NODES: 4
-    TEST_CMD: ./ci/integration_test/py_interface/py_multi_node.sub
-
-inference_benchmark:
-  extends: .cluster_test_job_daily
-  needs:
-    - build_inference
-  before_script:
-    - export BZ=1
-    - export MIXED_PRECISION=FP32
-  variables:
-    CONT: $INFER_IMAGE_VERSIONED
-    MOUNTS: /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/inference/dlrm_regression/dlrm/1:/model/dlrm/1,/lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/keynote_inference/perf_data:/perf_data
-    WORKDIR: /workdir
-    CI_SLURM_TIME: "00:15:00"
-    TEST_CMD: ./ci/benchmark/inference_benchmark/run.sub
-
-inference_ps_test:
-  extends: .computelab_test_job_daily
-  allow_failure: false
-  stage: test
-  needs:
-    - build_inference
-  script:
-    - export CONT=${INFER_IMAGE_VERSIONED}
-    - bash ./ci/integration_test/inference/ps_test.sh
-
-inference_embedding_cache_update_test:
-  extends: .computelab_test_job_daily
-  allow_failure: false
-  stage: test
-  needs:
-    - build_inference
-  script:
-    - export CONT=${INFER_IMAGE_VERSIONED}
-    - bash ./ci/integration_test/inference/embedding_cache_update_test.sh
-
-# HDFS backend test
-hdfs_backend_test:
-  extends: .computelab_test_job_daily
-  needs:
-    - build_train_single_node_with_hdfs_full
-  script:
-    - export CONT=${TRAIN_IMAGE_VERSIONED_WITH_HDFS}
-    - bash ./ci/integration_test/hdfs/hdfs_backend_test.sh
-
-continuous_training_inference:
-  extends: .test_local
-  variables:
-    CONT: ${UNIFIED_CTR_LATEST}
-    MOUNTS: -v /opt/ci/demo:/scripts -v /opt/ci/wdl_infer:/wdl_infer
-    CMD: "apt update -y --fix-missing && apt install bc && cd /scripts && bash run_continuouse_test.sh"
-
-# NVT regression
-e2e_nvt_regression_test:
-  extends: .cluster_test_job_daily
-  needs:
-    - build_train_single_node_latest
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED_LATEST
-    MOUNTS:
-      /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/criteo_1TB/day_1:/workdir/tools/day_1,/lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/inference/nvt_regression:/workdir/samples/din/raw_data,/lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/criteo_1TB/day_0:/dir/to/criteo/day_0
-    CI_SLURM_TIME: "01:00:00"
-    TEST_CMD: ./ci/integration_test/nvt/nvt_regression_test.sub
-
-nb_hps_demo:
-  extends: .cluster_test_job_daily
-  needs:
-    - build_train_single_node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    CI_SLURM_TIME: "00:45:00"
-    TEST_CMD: ./ci/integration_test/notebooks/hps_demo.sub
-
-test_sok_pypi:
-  extends: .cluster_test_job_daily
-  needs:
-    - build_sok_tf2
-  variables:
-    CONT: $SOK_IMAGE_VERSIONED_TF2
-    CI_SLURM_TIME: "00:30:00"
-    TEST_CMD: ./ci/integration_test/sok/test_sok_pypi.sub
-
-wdl_check:
-  # Push logs to GitLab
-  extends: .cluster_post_test_job_daily
-  needs:
-    - wdl_multi_gpu
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_LOGDIR}/wdl_multi_gpu:/logs
-    CI_SLURM_TIME: "00:15:00"
-    TEST_CMD: ./ci/post_test/check_wdl.sub
-
-inference_benchmark_check:
-  extends: .cluster_post_test_job_daily
-  needs:
-    - inference_benchmark
-    - build_train_single_node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_LOGDIR}/inference_benchmark:/logs
-    CI_SLURM_TIME: "00:15:00"
-    TEST_CMD: ./ci/post_test/check_inference_benchmark.sub
-
-inference_cpu_memory_usage:
-  extends: .cluster_test_job_daily
-  needs:
-    - build_inference
-  before_script:
-    - export BZ=1
-    - export MIXED_PRECISION=FP32
-    - mkdir -p ${DRACO_OCI_LOGDIR}/inference_cpu_memory
-  variables:
-    CONT: $INFER_IMAGE_VERSIONED
-    MOUNTS: /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/inference/dlrm_regression/dlrm/1:/model/dlrm/1,${DRACO_OCI_LOGDIR}/inference_cpu_memory:/logs
-    WORKDIR: /workdir
-    CI_SLURM_TIME: "00:30:00"
-    TEST_CMD: ./ci/benchmark/hps_memory_check/run.sub
-
-inference_CPU_Memory_check:
-  extends: .cluster_post_test_job_daily
-  needs:
-    - inference_cpu_memory_usage
-    - build_train_single_node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_LOGDIR}/inference_cpu_memory:/logs
-    CI_SLURM_TIME: "00:15:00"
-    TEST_CMD: ./ci/post_test/check_cpu_usage.sub
-
-dlrm_dcnv2_8node_check:
-  # Push logs to GitLab
-  extends: .cluster_post_test_job_daily
-  needs:
-    - dlrm_dcnv2_benchmark_8node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_LOGDIR}/dlrm_dcnv2_benchmark_8node:/logs
-    CI_SLURM_TIME: "00:15:00"
-    TEST_CMD: ./ci/post_test/check_dcnv2_dlrm_8node.sub
-
-# rm_logs:
-#   extends: .cluster_test_job
-#   variables:
-#     GPFSFOLDER: "$LOGDIR"
-#     GIT_CLONE_PATH: /lustre/fsw/devtech/hpc-hugectr/hugectr-ci/$CI_CONCURRENT_ID/$CI_PROJECT_NAME
-#     CONT: $TRAIN_IMAGE_VERSIONED
-#     MOUNTS: /lustre/fsw/devtech:/logs
-#     SLURM_ACCOUNT: devtech
-#     WALLTIME: "00:15:00"
-#     DGXNNODES: 1
-#     TEST_CMD: ./ci/common/clean_logs.sub
diff --git a/.gitlab/issue_templates/Bug Report.md b/.gitlab/issue_templates/Bug Report.md
deleted file mode 100644
index 97f114eb1c..0000000000
--- a/.gitlab/issue_templates/Bug Report.md
+++ /dev/null
@@ -1,28 +0,0 @@
-## REMOVE THIS SECTION WHEN FILING ISSUE
-**NOTE:** Make the issue title clear enough and prefix it with `[BUG]`
-
-**NOTE:** Label the issue with `bug` at least.
-
-
-## Bug Report
-**Describe the bug**
-- A clear but concise description of what the bug is.
-
-**Reproduction Steps**
-- Provide step-by-step instructions for reproducing the issue; this helps us resolve it more efficiently.
-If you have a simple reproducer, e.g., a short Python script or a standalone C++ program, that is helpful as well. Refer to [this guide](http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) to write a good bug report.
-
-**Expected behavior**
-- A clear and concise description of what you expected to happen.
-
-**Screenshots**
-- (If applicable) add screenshots to help explain your problem.
-
-**Environment (please complete the following information):**
- - OS: [e.g. Ubuntu xx.yy]
- - Graphics card: [e.g. a single NVIDIA V100 or NVIDIA DGX A100]
 - CUDA version: [e.g. CUDA 11.x]
 - Container: [e.g. NGC, devel_all, etc.]
-
-**Additional context**
-- Add any other context about the problem here.
diff --git a/.gitlab/issue_templates/Documentation Request.md b/.gitlab/issue_templates/Documentation Request.md
deleted file mode 100644
index 62b8a9fa74..0000000000
--- a/.gitlab/issue_templates/Documentation Request.md
+++ /dev/null
@@ -1,11 +0,0 @@
-## REMOVE THIS SECTION WHEN FILING ISSUE
-**NOTE:** Label the issue with `feature::doc` and/or `component::*` at least.
-
-## Documentation Request
-
-**Describe the problem**
-- A clear but concise description of what documentation should be enhanced or added.
-
-**Describe the steps**
-- (For a problem) describe how to reproduce the problem or inconvenience.
-- (For new documentation) describe how to navigate/search the new documentation.
diff --git a/.gitlab/issue_templates/Epic.md b/.gitlab/issue_templates/Epic.md
deleted file mode 100644
index 8738f3aa22..0000000000
--- a/.gitlab/issue_templates/Epic.md
+++ /dev/null
@@ -1,22 +0,0 @@
-## REMOVE THIS SECTION WHEN FILING ISSUE
-**NOTE:** Make the issue title clear enough and prefix it with `[RMP]`
-
-**NOTE:** The `feature in roadmap` label is for an epic and its child stories.
-
-**NOTE:** Label the issue with `feature::*` and/or `component::*` at least.
-
-**NOTE:** If the description is empty or the user story is ill-defined, the issue will be marked as `status::Needs Definition`.
-
-**NOTE:** If there are any GitLab issues which prevent this issue from being started/completed, link them as `is blocked by`.
-
-**NOTE:** The milestone of an epic is the same as that of its final child issue.
-
-## Feature in Roadmap
-**Problem Definition**
-- An epic or `[RMP]` feature is a collection of user stories. Describe their common problem and goal at a high level.
-
-**User stories**
-- List the child stories or GitLab issues that make up this epic.
-
-**Design Document**
-- (If applicable)
diff --git a/.gitlab/issue_templates/Feature Request.md b/.gitlab/issue_templates/Feature Request.md
deleted file mode 100644
index 5c77eb3a3c..0000000000
--- a/.gitlab/issue_templates/Feature Request.md
+++ /dev/null
@@ -1,27 +0,0 @@
-## REMOVE THIS SECTION WHEN FILING ISSUE
-**NOTE:** If it is a feature request related to a specific customer, use `[Customer Requirement - XX, YY]` as its title prefix.
-
-**NOTE:** If the description is empty or the user story is ill-defined, the issue will be marked as `status::Needs Definition`. The title should be clear enough as well.
-
-**NOTE:** Label the issue with `feature::*` and/or `component::*` at least.
-
-**NOTE:** If there are any GitLab issues which prevent this issue from being started/completed, link them as `is blocked by`.
-
-## Feature Request
-**User Story**
-- A clear but concise requirement definition from **a user's perspective**. A user can be a customer, a team member, the MLPerf team, another Merlin component, etc. Make the story independent and small if possible while specifying the **problem** and **goal** clearly.
-  - Example 1. As our customers XX and YY put their data in HDFS, I will extend the HugeCTR DataReader to load the HDFS-resident dataset.
-  - Example 2. Because the MLPerf vx.y submission uses the model XX, which includes the layer YY, we'd like to implement a GPU-accelerated YY layer.
-
-**Use Cases**
-- (If applicable) A pseudo-code (Python or C++) level description of how it is used or interacts with other components.
-
-**Test Cases**
-- (If applicable) Describe the unit and integration tests while providing the test data, input parameters, test steps, and expected results.
-
-
-**Design Document**
-- (If applicable)
-
-**Task List**
-- Provide the atomic tasks required to complete the issue.
diff --git a/ci/benchmark.yml b/ci/benchmark.yml
deleted file mode 100644
index cf11ddf7cd..0000000000
--- a/ci/benchmark.yml
+++ /dev/null
@@ -1,5 +0,0 @@
-include:
-  - local: /ci/benchmark/sok/ci.yml
-  - local: /ci/benchmark/inference_benchmark/ci.yml
-  - local: /ci/benchmark/train_benchmark/ci.yml
-  - local: /ci/benchmark/hps_backend_benchmark/ci.yml
diff --git a/ci/benchmark/147gb_model_benchmark/run.sub b/ci/benchmark/147gb_model_benchmark/run.sub
deleted file mode 100644
index b2ce5c38de..0000000000
--- a/ci/benchmark/147gb_model_benchmark/run.sub
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash
-
-srun --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx "bash /workdir/ci/benchmark/147gb_model_benchmark/test.sh"
\ No newline at end of file
diff --git a/ci/benchmark/147gb_model_benchmark/test.sh b/ci/benchmark/147gb_model_benchmark/test.sh
deleted file mode 100644
index f3c6a9ff0e..0000000000
--- a/ci/benchmark/147gb_model_benchmark/test.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/usr/bin/env bash
-
-mkdir -p /147gb_model_benchmark/model_repo
-
-cd /147gb_model_benchmark
-
-cp /model_repo/light.json ./
-
-cp /model_repo/dynamic_build.py ./
-
-cp /model_repo/*.onnx ./
-
-cp -r /model_repo/dynamic*trt ./model_repo
-
-python3 dynamic_build.py
-
-mv dynamic_1fc_lite.trt model_repo/dynamic_1fc_lite_hps_trt/1
-
-mv dynamic_3fc_lite.trt model_repo/dynamic_3fc_lite_hps_trt/1
-
-mv dynamic_dlrm.trt model_repo/dynamic_dlrm_hps_trt/1
-
-LD_PRELOAD=/usr/local/hps_trt/lib/libhps_plugin.so tritonserver --model-repository=model_repo --load-model=dynamic_1fc_lite_hps_trt --load-model=dynamic_3fc_lite_hps_trt --load-model=dynamic_dlrm_hps_trt --model-control-mode=explicit &
-
-while [[ $(curl -v localhost:8000/v2/health/ready 2>&1 | grep "OK" | wc -l) -eq 0 ]]; do
-  sleep 10;
-done
-
-echo "Successfully launched the Triton server for all models"
-
-batch_size=(256 1024 4096 16384)
-
-model_name=("dynamic_1fc_lite_hps_trt" "dynamic_3fc_lite_hps_trt" "dynamic_dlrm_hps_trt")
-
-for b in ${batch_size[*]};
-do
-  for m in ${model_name[*]};
-  do
-    echo $b $m
-    perf_analyzer -m ${m} -u localhost:8000 --input-data /model_repo/perf_data/${b}.json --shape categorical_features:${b},26 --shape numerical_features:${b},13
-  done
-done
diff --git a/ci/benchmark/hps_backend_benchmark/ci.yml b/ci/benchmark/hps_backend_benchmark/ci.yml
deleted file mode 100644
index 74fddcbfd8..0000000000
--- a/ci/benchmark/hps_backend_benchmark/ci.yml
+++ /dev/null
@@ -1,31 +0,0 @@
-hps_backend--256:
-  extends: .hps_backend_benchmark
-hps_backend--1024: - extends: .hps_backend_benchmark -hps_backend--2048: - extends: .hps_backend_benchmark -hps_backend--8192: - extends: .hps_backend_benchmark -hps_backend--131072: - extends: .hps_backend_benchmark - -check_hps_backend_result: - extends: - - collect_benchmark_result - - .benchmark:rules:weekly - variables: - GPFSFOLDER: $LOGDIR/hps_backend_result - TEST_CMD: ./ci/post_test/check_hps_backend_result.sub - -hps_database_backend: - extends: - - .selene_luna_job - - .benchmark:rules:weekly - stage: hps_benchmark - variables: - GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE} - CONT: ${UNIFIED_CTR_LATEST} - SLURM_ACCOUNT: coreai_devtech_hugectr - WALLTIME: "00:30:00" - DGXNNODES: 1 - TEST_CMD: ./ci/benchmark/hps_backend_benchmark/hps_database_backend.sub diff --git a/ci/benchmark/hps_backend_benchmark/hps_database_backend.sub b/ci/benchmark/hps_backend_benchmark/hps_database_backend.sub deleted file mode 100644 index 2a8c14587a..0000000000 --- a/ci/benchmark/hps_backend_benchmark/hps_database_backend.sub +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -srun --ntasks=1 --container-image="${CONT}" bash -cx " \ - cd /usr/local/hugectr/bin && \ - mpirun -np 1 --allow-run-as-root db_bench --db_type hashmap --no_test_insert_evict --no_test_upsert && \ - mpirun -np 1 --allow-run-as-root db_bench --db_type hashmap --no_test_insert_evict --no_test_fetch && \ - mpirun -np 1 --allow-run-as-root db_bench --db_type hashmap --no_test_upsert --no_test_fetch" diff --git a/ci/benchmark/hps_backend_benchmark/run.sub b/ci/benchmark/hps_backend_benchmark/run.sub deleted file mode 100644 index c7626095a8..0000000000 --- a/ci/benchmark/hps_backend_benchmark/run.sub +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -: "${BZ:?BZ not set}" -CAT_SHAPE=$[BZ*26] - -srun --container-mounts="${MOUNTS}" --container-image="${CONT}" bash -cx "\ - tritonserver --model-repository=/hps_backend_benchmark/dlrm_model_repo/ --backend-config=hps,ps=/hps_backend_benchmark/dlrm_model_repo/hps_lookup/dlrm.json --load-model=hps_lookup --model-control-mode=explicit & \ - sleep 100 && \ - curl -v localhost:8000/v2/health/ready && \ - perf_analyzer -m hps_lookup -u localhost:8000 --input-data /hps_backend_benchmark/perf_data/${BZ}.json --shape KEYS:${CAT_SHAPE} --shape NUMKEYS:1 --metrics-interval 10000 --collect-metrics \ -" diff --git a/ci/benchmark/hps_memory_check/run.sub b/ci/benchmark/hps_memory_check/run.sub deleted file mode 100644 index ac188113f5..0000000000 --- a/ci/benchmark/hps_memory_check/run.sub +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - - -srun --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx "bash /workdir/ci/benchmark/hps_memory_check/test.sh" diff --git a/ci/benchmark/hps_memory_check/test.sh b/ci/benchmark/hps_memory_check/test.sh deleted file mode 100644 index 043169b951..0000000000 --- a/ci/benchmark/hps_memory_check/test.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash - -python3 ${WORKDIR}/ci/common/generate_inference_config.py --config_template ${WORKDIR}/ci/common/config_pbtxt_template.txt --ps_template ${WORKDIR}/ci/common/ps_template.json --batchsize 64 --mixed_precision false --ec_type dynamic --config_output /model/dlrm/config.pbtxt --ps_output /model/ps.json - -tritonserver --model-repository=/model/ --load-model=dlrm --log-verbose=1 --model-control-mode=explicit --backend-directory=/usr/local/hugectr/backends --backend-config=hps,ps=/model/ps.json > /dev/null 2> /dev/null & -echo > /logs/cpu_dynamic_mem.log -while [[ $(curl -v localhost:8000/v2/health/ready 2>&1 | grep "OK" | wc 
-l) -eq 0 ]]; do - (top -d 1 -n 10 -b | grep triton) >> /logs/cpu_dynamic_mem.log -done -kill -s 9 `pgrep tritonserver` -sleep 10; - -python3 ${WORKDIR}/ci/common/generate_inference_config.py --config_template ${WORKDIR}/ci/common/config_pbtxt_template.txt --ps_template ${WORKDIR}/ci/common/ps_template.json --batchsize 64 --mixed_precision false --ec_type uvm --config_output /model/dlrm/config.pbtxt --ps_output /model/ps.json -tritonserver --model-repository=/model/ --load-model=dlrm --log-verbose=1 --model-control-mode=explicit --backend-directory=/usr/local/hugectr/backends --backend-config=hps,ps=/model/ps.json > /dev/null 2> /dev/null & -echo > /logs/cpu_uvm_mem.log -while [[ $(curl -v localhost:8000/v2/health/ready 2>&1 | grep "OK" | wc -l) -eq 0 ]]; do - (top -d 1 -n 10 -b | grep triton) >> /logs/cpu_uvm_mem.log -done -kill -s 9 `pgrep tritonserver` -sleep 10; - -python3 ${WORKDIR}/ci/common/generate_inference_config.py --config_template ${WORKDIR}/ci/common/config_pbtxt_template.txt --ps_template ${WORKDIR}/ci/common/ps_template.json --batchsize 64 --mixed_precision false --ec_type static --config_output /model/dlrm/config.pbtxt --ps_output /model/ps.json -tritonserver --model-repository=/model/ --load-model=dlrm --log-verbose=1 --model-control-mode=explicit --backend-directory=/usr/local/hugectr/backends --backend-config=hps,ps=/model/ps.json > /dev/null 2> /dev/null & -echo > /logs/cpu_static_mem.log -while [[ $(curl -v localhost:8000/v2/health/ready 2>&1 | grep "OK" | wc -l) -eq 0 ]]; do - (top -d 1 -n 10 -b | grep triton) >> /logs/cpu_static_mem.log -done -kill -s 9 `pgrep tritonserver` - - - - diff --git a/ci/benchmark/hps_plugin_benchmark/run.sub b/ci/benchmark/hps_plugin_benchmark/run.sub deleted file mode 100644 index 935ee849f9..0000000000 --- a/ci/benchmark/hps_plugin_benchmark/run.sub +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -srun --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx "bash /workdir/ci/benchmark/hps_plugin_benchmark/test.sh" diff --git a/ci/benchmark/hps_plugin_benchmark/test.sh b/ci/benchmark/hps_plugin_benchmark/test.sh deleted file mode 100644 index fe1d7eb8fc..0000000000 --- a/ci/benchmark/hps_plugin_benchmark/test.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash - -mkdir -p /hps_plugin_benchmark - -cd /hps_plugin_benchmark - -python3 /workdir/docs/source/hierarchical_parameter_server/hps_dlrm_benchmark_scripts/create_tf_models.py - -python3 /workdir/docs/source/hierarchical_parameter_server/hps_dlrm_benchmark_scripts/create_trt_engines.py - -cp -r /model_repo ./ - -mv dlrm_tf_saved_model model_repo/native_tf/1/model.savedmodel - -mv hps_plugin_dlrm_tf_saved_model model_repo/tf_with_hps/1/model.savedmodel - -mv fp32_hps_plugin_dlrm.trt model_repo/fp32_trt_with_hps/1 - -mv fp16_hps_plugin_dlrm.trt model_repo/fp16_trt_with_hps/1 - -LD_PRELOAD=/usr/local/hps_trt/lib/libhps_plugin.so:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/merlin_hps-1.0.0-py${PYTHON_VERSION}-linux-x86_64.egg/hierarchical_parameter_server/lib/libhierarchical_parameter_server.so tritonserver --model-repository=model_repo --load-model=native_tf --load-model=tf_with_hps --load-model=fp32_trt_with_hps --load-model=fp16_trt_with_hps --model-control-mode=explicit & - -while [[ $(curl -v localhost:8000/v2/health/ready 2>&1 | grep "OK" | wc -l) -eq 0 ]]; do - sleep 10; -done - -echo "Successfully launching the Triton server for all models" - -batch_size=(32 1024 16384) - -model_name=("native_tf" "tf_with_hps" "fp32_trt_with_hps" "fp16_trt_with_hps") - -for b in 
${batch_size[*]}; -do - for m in ${model_name[*]}; - do - echo $b $m - perf_analyzer -m ${m} -u localhost:8000 --input-data /perf_data/${b}.json --shape categorical_features:${b},26 --shape numerical_features:${b},13 - done -done diff --git a/ci/benchmark/hps_tf_fuse_table_benchmark/run.sub b/ci/benchmark/hps_tf_fuse_table_benchmark/run.sub deleted file mode 100644 index 61dec2266b..0000000000 --- a/ci/benchmark/hps_tf_fuse_table_benchmark/run.sub +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -srun --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx "bash /workdir/ci/benchmark/hps_tf_fuse_table_benchmark/test.sh" \ No newline at end of file diff --git a/ci/benchmark/hps_tf_fuse_table_benchmark/test.sh b/ci/benchmark/hps_tf_fuse_table_benchmark/test.sh deleted file mode 100644 index 9fc4f64b8b..0000000000 --- a/ci/benchmark/hps_tf_fuse_table_benchmark/test.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash - -mkdir -p /hps_tf_fuse_table_benchmark - -cd /hps_tf_fuse_table_benchmark - -cp -r /model_repo ./ - -cp -r /model_repo/8_table.json ./ - -cp -r /model_repo/embeddings ./ - -LD_PRELOAD=/usr/local/lib/python${PYTHON_VERSION}/dist-packages/merlin_hps-1.0.0-py${PYTHON_VERSION}-linux-x86_64.egg/hierarchical_parameter_server/lib/libhierarchical_parameter_server.so tritonserver --model-repository=model_repo --load-model=8_static_table_unfused --load-model=8_static_table_autofused --load-model=8_dynamic_table_unfused --load-model=8_dynamic_table_autofused --model-control-mode=explicit & - -while [[ $(curl -v localhost:8000/v2/health/ready 2>&1 | grep "OK" | wc -l) -eq 0 ]]; do - sleep 10; -done - -echo "Successfully launching the Triton server for all models" - -batch_size=(256 1024 4096 16384) - -model_name=("8_static_table_unfused" "8_static_table_autofused" "8_dynamic_table_unfused" "8_dynamic_table_autofused") - -for b in ${batch_size[*]}; -do - for m in ${model_name[*]}; - do - echo $b $m - perf_analyzer -m ${m} -u localhost:8000 --input-data /perf_data/${b}.json --shape input_1:${b},10 --shape input_2:${b},10 --shape input_3:${b},10 --shape input_4:${b},10 --shape input_5:${b},10 --shape input_6:${b},10 --shape input_7:${b},10 --shape input_8:${b},10 - done -done diff --git a/ci/benchmark/hps_torch_fuse_table_benchmark/run.sub b/ci/benchmark/hps_torch_fuse_table_benchmark/run.sub deleted file mode 100644 index f717f9076b..0000000000 --- a/ci/benchmark/hps_torch_fuse_table_benchmark/run.sub +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -srun --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx "bash /workdir/ci/benchmark/hps_torch_fuse_table_benchmark/test.sh" \ No newline at end of file diff --git a/ci/benchmark/hps_torch_fuse_table_benchmark/test.sh b/ci/benchmark/hps_torch_fuse_table_benchmark/test.sh deleted file mode 100644 index 511d462d5c..0000000000 --- a/ci/benchmark/hps_torch_fuse_table_benchmark/test.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash - -mkdir -p /hps_torch_fuse_table_benchmark - -cd /hps_torch_fuse_table_benchmark - -cp -r /model_repo ./ - -cp -r /model_repo/8_table.json ./ - -cp -r /model_repo/embeddings ./ - -LD_PRELOAD=/usr/local/lib/python${PYTHON_VERSION}/dist-packages/merlin_hps-0.0.0-py${PYTHON_VERSION}-linux-x86_64.egg/hps_torch/lib/libhps_torch.so tritonserver --model-repository=model_repo --load-model=8_static_table_autofused --load-model=8_static_table_unfused --load-model=8_dynamic_table_autofused --load-model=8_dynamic_table_unfused --model-control-mode=explicit & - -while [[ $(curl -v 
localhost:8000/v2/health/ready 2>&1 | grep "OK" | wc -l) -eq 0 ]]; do - sleep 10; -done - -echo "Successfully launching the Triton server for all models" - -batch_size=(256 1024 4096 16384) - -model_name=("8_static_table_unfused" "8_static_table_autofused" "8_dynamic_table_unfused" "8_dynamic_table_autofused") - -for b in ${batch_size[*]}; -do - for m in ${model_name[*]}; - do - echo $b $m - perf_analyzer -m ${m} -u localhost:8000 --input-data /perf_data/${b}.json --shape input_1:8,${b},10 - done -done diff --git a/ci/benchmark/inference_benchmark/ci.yml b/ci/benchmark/inference_benchmark/ci.yml deleted file mode 100644 index 790b142876..0000000000 --- a/ci/benchmark/inference_benchmark/ci.yml +++ /dev/null @@ -1,8 +0,0 @@ -infernece--256xFP32: - extends: .inference_benchmark -infernece--1024xFP32: - extends: .inference_benchmark -#infernece--8192xFP16: -# extends: .inference_benchmark -#infernece--131072xFP16: -# extends: .inference_benchmark diff --git a/ci/benchmark/inference_benchmark/run.sub b/ci/benchmark/inference_benchmark/run.sub deleted file mode 100644 index cd964dac95..0000000000 --- a/ci/benchmark/inference_benchmark/run.sub +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -: "${BZ:?BZ not set}" -: "${MIXED_PRECISION:? MIXED_PRECISION}" -if [ ${MIXED_PRECISION} == "FP16" ];then - export MIXED_PRECISION="True"; -else - export MIXED_PRECISION="False" -fi -export CATCOLUMN=$(( $BZ * 26 )) -export DES=$(( 13 * $BZ )) -export ROWINDEX=$(( 26 * $BZ + 1 )) -export WORKDIR=${WORKDIR} - -srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" --container-workdir="${WORKDIR}" bash -cx "chmod +x ./ci/benchmark/inference_benchmark/test.sh && ./ci/benchmark/inference_benchmark/test.sh" diff --git a/ci/benchmark/inference_benchmark/test.sh b/ci/benchmark/inference_benchmark/test.sh deleted file mode 100644 index b24f473eea..0000000000 --- a/ci/benchmark/inference_benchmark/test.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash - -python3 ${WORKDIR}/ci/common/generate_inference_config.py --config_template ${WORKDIR}/ci/common/config_pbtxt_template.txt --ps_template ${WORKDIR}/ci/common/ps_template.json --batchsize ${BZ} --mixed_precision ${MIXED_PRECISION} --config_output /model/dlrm/config.pbtxt --ps_output /model/ps.json - -tritonserver --model-repository=/model/ --load-model=dlrm --log-verbose=1 --model-control-mode=explicit --backend-directory=/usr/local/hugectr/backends --backend-config=hps,ps=/model/ps.json > /dev/null 2> /dev/null & -#tritonserver --model-repository=/model/ --load-model=dlrm --log-verbose=1 --model-control-mode=explicit --backend-directory=/usr/local/hugectr/backends --backend-config=hugectr,ps=/model/ps.json & - -while [[ $(curl -v localhost:8000/v2/health/ready 2>&1 | grep "OK" | wc -l) -eq 0 ]]; do - sleep 10; -done -perf_analyzer -m dlrm -u localhost:8000 --input-data /perf_data/${BZ}.json --shape KEYS:${CATCOLUMN} --shape NUMKEYS:1 diff --git a/ci/benchmark/sok/ci.yml b/ci/benchmark/sok/ci.yml deleted file mode 100644 index dcffc37b6f..0000000000 --- a/ci/benchmark/sok/ci.yml +++ /dev/null @@ -1,9 +0,0 @@ - -sok--65536x1: - extends: - - .sok_benchmark - - .benchmark:rules:weekly -sok--65536x8: - extends: - - .sok_benchmark - - .benchmark:rules:biweekly diff --git a/ci/benchmark/sok/sok_dlrm.sub b/ci/benchmark/sok/sok_dlrm.sub deleted file mode 100644 index 39cb17d5a0..0000000000 --- a/ci/benchmark/sok/sok_dlrm.sub +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash - -: "${BZ:?BZ not set}" -: "${GPU_NUM:?GPU_NUM not set}" - 
-srun --ntasks="${SLURM_JOB_NUM_NODES}" bash -c "echo -n 'Clearing cache on ' && hostname && sync && sudo /sbin/sysctl vm.drop_caches=3" - -if [ $GPU_NUM == 1 ];then - srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - cd /hugectr/test/sok_perf_test/dlrm; - python3 main.py \ - --global_batch_size=${BZ} \ - --train_file_pattern=\"/dataset/train/*.csv\" \ - --test_file_pattern=\"/dataset/test/*.csv\" \ - --embedding_layer=\"SOK\" \ - --embedding_vec_size=32 \ - --bottom_stack 512 256 32 \ - --top_stack 1024 1024 512 256 1 \ - --distribute_strategy=\"mirrored\" \ - --gpu_num=1" -else - srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - cd /hugectr/test/sok_perf_test/dlrm; - mpirun -np ${GPU_NUM} --allow-run-as-root --oversubscribe \ - python3 main.py \ - --global_batch_size=${BZ} \ - --train_file_pattern=\"/dataset/train/*.csv\" \ - --test_file_pattern=\"/dataset/test/*.csv\" \ - --embedding_layer=\"SOK\" \ - --embedding_vec_size=32 \ - --bottom_stack 512 256 32 \ - --top_stack 1024 1024 512 256 1 \ - --distribute_strategy=\"multiworker\"" -fi diff --git a/ci/benchmark/train_benchmark/benchmark_train.py b/ci/benchmark/train_benchmark/benchmark_train.py deleted file mode 100644 index cc7609a09e..0000000000 --- a/ci/benchmark/train_benchmark/benchmark_train.py +++ /dev/null @@ -1,950 +0,0 @@ -import hugectr -import json -import sys -import argparse -import os -from mpi4py import MPI - -comm = MPI.COMM_WORLD -rank = comm.Get_rank() - - -def create_wdl(solver): - dataset_path = os.getenv("WDL_DATA_PATH") - if not os.path.exists("./wdl_data_parquet"): - os.symlink(dataset_path, "./wdl_data_parquet", target_is_directory=True) - reader = hugectr.DataReaderParams( - data_reader_type=hugectr.DataReaderType_t.Parquet, - source=["./wdl_data_parquet/train/_file_list.txt"], - eval_source="./wdl_data_parquet/val/_file_list.txt", - check_type=hugectr.Check_t.Non, - slot_size_array=[ - 203750, - 18573, - 14082, - 7020, - 18966, - 4, - 6382, - 1246, - 49, - 185920, - 71354, - 67346, - 11, - 2166, - 7340, - 60, - 4, - 934, - 15, - 204208, - 141572, - 199066, - 60940, - 9115, - 72, - 34, - 278899, - 355877, - ], - ) - optimizer = hugectr.CreateOptimizer( - optimizer_type=hugectr.Optimizer_t.SGD, - update_type=hugectr.Update_t.Local, - atomic_update=True, - ) - model = hugectr.Model(solver, reader, optimizer) - model.add( - hugectr.Input( - label_dim=1, - label_name="label", - dense_dim=13, - dense_name="dense", - data_reader_sparse_param_array=[ - hugectr.DataReaderSparseParam("deep_data", 1, True, 26), - hugectr.DataReaderSparseParam("wide_data", 1, True, 2), - ], - ) - ) - model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, - workspace_size_per_gpu_in_mb=75, - embedding_vec_size=1, - combiner="sum", - sparse_embedding_name="sparse_embedding2", - bottom_name="wide_data", - optimizer=optimizer, - ) - ) - model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, - workspace_size_per_gpu_in_mb=1074, - embedding_vec_size=16, - combiner="sum", - sparse_embedding_name="sparse_embedding1", - bottom_name="deep_data", - optimizer=optimizer, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Reshape, - bottom_names=["sparse_embedding1"], - top_names=["reshape1"], - leading_dim=416, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Reshape, - bottom_names=["sparse_embedding2"], - 
top_names=["reshape_wide"], - leading_dim=2, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.ReduceSum, - bottom_names=["reshape_wide"], - top_names=["reshape2"], - axis=1, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Concat, - bottom_names=["reshape1", "dense"], - top_names=["concat1"], - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["concat1"], - top_names=["fc1"], - num_output=1024, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc1"], top_names=["relu1"] - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Dropout, - bottom_names=["relu1"], - top_names=["dropout1"], - dropout_rate=0.5, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["dropout1"], - top_names=["fc2"], - num_output=1024, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc2"], top_names=["relu2"] - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Dropout, - bottom_names=["relu2"], - top_names=["dropout2"], - dropout_rate=0.5, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["dropout2"], - top_names=["fc3"], - num_output=1, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Add, bottom_names=["fc3", "reshape2"], top_names=["add1"] - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss, - bottom_names=["add1", "label"], - top_names=["loss"], - ) - ) - return model - - -def create_dcn(solver): - dataset_path = os.getenv("DCN_DATA_PATH") - if not os.path.exists("./dcn_data"): - os.symlink(dataset_path, "./dcn_data", target_is_directory=True) - os.chdir("./dcn_data") - reader = hugectr.DataReaderParams( - data_reader_type=hugectr.DataReaderType_t.Parquet, - source=["./_file_list.txt"], - eval_source="./_file_list.txt", - check_type=hugectr.Check_t.Non, - slot_size_array=[ - 203931, - 18598, - 14092, - 7012, - 18977, - 4, - 6385, - 1245, - 49, - 186213, - 71328, - 67288, - 11, - 2168, - 7338, - 61, - 4, - 932, - 15, - 204515, - 141526, - 199433, - 60919, - 9137, - 71, - 34, - ], - ) - optimizer = hugectr.CreateOptimizer( - optimizer_type=hugectr.Optimizer_t.SGD, - update_type=hugectr.Update_t.Local, - atomic_update=True, - ) - model = hugectr.Model(solver, reader, optimizer) - model.add( - hugectr.Input( - label_dim=1, - label_name="label", - dense_dim=13, - dense_name="dense", - data_reader_sparse_param_array=[hugectr.DataReaderSparseParam("data1", 2, False, 26)], - ) - ) - model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, - workspace_size_per_gpu_in_mb=300, - embedding_vec_size=16, - combiner="sum", - sparse_embedding_name="sparse_embedding1", - bottom_name="data1", - optimizer=optimizer, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Reshape, - bottom_names=["sparse_embedding1"], - top_names=["reshape1"], - leading_dim=416, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Concat, - bottom_names=["reshape1", "dense"], - top_names=["concat1"], - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.MultiCross, - bottom_names=["concat1"], - top_names=["multicross1"], - num_layers=6, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["concat1"], - 
top_names=["fc1"], - num_output=1024, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc1"], top_names=["relu1"] - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Dropout, - bottom_names=["relu1"], - top_names=["dropout1"], - dropout_rate=0.5, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["dropout1"], - top_names=["fc2"], - num_output=1024, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc2"], top_names=["relu2"] - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Dropout, - bottom_names=["relu2"], - top_names=["dropout2"], - dropout_rate=0.5, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Concat, - bottom_names=["dropout2", "multicross1"], - top_names=["concat2"], - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["concat2"], - top_names=["fc3"], - num_output=1, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss, - bottom_names=["fc3", "label"], - top_names=["loss"], - ) - ) - return model - - -def create_deepfm(solver): - dataset_path = os.getenv("DCN_DATA_PATH") - if not os.path.exists("./dcn_data"): - os.symlink(dataset_path, "./dcn_data", target_is_directory=True) - os.chdir("./dcn_data") - reader = hugectr.DataReaderParams( - data_reader_type=hugectr.DataReaderType_t.Parquet, - source=["./_file_list.txt"], - eval_source="./_file_list.txt", - check_type=hugectr.Check_t.Non, - slot_size_array=[ - 203931, - 18598, - 14092, - 7012, - 18977, - 4, - 6385, - 1245, - 49, - 186213, - 71328, - 67288, - 11, - 2168, - 7338, - 61, - 4, - 932, - 15, - 204515, - 141526, - 199433, - 60919, - 9137, - 71, - 34, - ], - ) - optimizer = hugectr.CreateOptimizer( - optimizer_type=hugectr.Optimizer_t.SGD, - update_type=hugectr.Update_t.Local, - atomic_update=True, - ) - model = hugectr.Model(solver, reader, optimizer) - model.add( - hugectr.Input( - label_dim=1, - label_name="label", - dense_dim=13, - dense_name="dense", - data_reader_sparse_param_array=[hugectr.DataReaderSparseParam("data1", 2, False, 26)], - ) - ) - model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, - workspace_size_per_gpu_in_mb=300, - embedding_vec_size=11, - combiner="sum", - sparse_embedding_name="sparse_embedding1", - bottom_name="data1", - optimizer=optimizer, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Reshape, - bottom_names=["sparse_embedding1"], - top_names=["reshape1"], - leading_dim=11, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Slice, - bottom_names=["reshape1"], - top_names=["slice11", "slice12"], - ranges=[(0, 10), (10, 11)], - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Reshape, - bottom_names=["slice11"], - top_names=["reshape2"], - leading_dim=260, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Reshape, - bottom_names=["slice12"], - top_names=["reshape3"], - leading_dim=26, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.WeightMultiply, - bottom_names=["dense"], - top_names=["weight_multiply1"], - weight_dims=[13, 10], - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.WeightMultiply, - bottom_names=["dense"], - top_names=["weight_multiply2"], - weight_dims=[13, 1], - ) - ) - model.add( - hugectr.DenseLayer( 
- layer_type=hugectr.Layer_t.Concat, - bottom_names=["reshape2", "weight_multiply1"], - top_names=["concat1"], - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["concat1"], - top_names=["fc1"], - num_output=400, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc1"], top_names=["relu1"] - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Dropout, - bottom_names=["relu1"], - top_names=["dropout1"], - dropout_rate=0.5, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["dropout1"], - top_names=["fc2"], - num_output=400, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc2"], top_names=["relu2"] - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Dropout, - bottom_names=["relu2"], - top_names=["dropout2"], - dropout_rate=0.5, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["dropout2"], - top_names=["fc3"], - num_output=400, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc3"], top_names=["relu3"] - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Dropout, - bottom_names=["relu3"], - top_names=["dropout3"], - dropout_rate=0.5, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["dropout3"], - top_names=["fc4"], - num_output=1, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.FmOrder2, - bottom_names=["concat1"], - top_names=["fmorder2"], - out_dim=10, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.ReduceSum, - bottom_names=["fmorder2"], - top_names=["reducesum1"], - axis=1, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Concat, - bottom_names=["reshape3", "weight_multiply2"], - top_names=["concat2"], - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.ReduceSum, - bottom_names=["concat2"], - top_names=["reducesum2"], - axis=1, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Add, - bottom_names=["fc4", "reducesum1", "reducesum2"], - top_names=["add"], - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss, - bottom_names=["add", "label"], - top_names=["loss"], - ) - ) - return model - - -def create_din(solver): - reader = hugectr.DataReaderParams( - data_reader_type=hugectr.DataReaderType_t.Parquet, - source=["./din_data/train/_file_list.txt"], - eval_source="./din_data/valid/_file_list.txt", - check_type=hugectr.Check_t.Non, - num_workers=1, - slot_size_array=[ - 192403, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 63001, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 801, - ], - ) - optimizer = hugectr.CreateOptimizer( - optimizer_type=hugectr.Optimizer_t.Adam, - update_type=hugectr.Update_t.Global, - beta1=0.9, - beta2=0.999, - epsilon=0.000000001, - ) - model = hugectr.Model(solver, reader, optimizer) - model.add( - hugectr.Input( - label_dim=1, - label_name="label", - dense_dim=0, - dense_name="dense", - data_reader_sparse_param_array=[ - hugectr.DataReaderSparseParam("UserID", 1, True, 1), - hugectr.DataReaderSparseParam("GoodID", 1, True, 11), - hugectr.DataReaderSparseParam("CateID", 1, True, 11), - ], - ) - ) - - model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, - 
workspace_size_per_gpu_in_mb=84, - embedding_vec_size=18, - combiner="sum", - sparse_embedding_name="sparse_embedding_user", - bottom_name="UserID", - optimizer=optimizer, - ) - ) - model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, - workspace_size_per_gpu_in_mb=72, - embedding_vec_size=18, - combiner="sum", - sparse_embedding_name="sparse_embedding_good", - bottom_name="GoodID", - optimizer=optimizer, - ) - ) - model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, - workspace_size_per_gpu_in_mb=30, - embedding_vec_size=18, - combiner="sum", - sparse_embedding_name="sparse_embedding_cate", - bottom_name="CateID", - optimizer=optimizer, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.FusedReshapeConcat, - bottom_names=["sparse_embedding_good", "sparse_embedding_cate"], - top_names=["FusedReshapeConcat_item_his_em", "FusedReshapeConcat_item"], - ) - ) - - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Slice, - bottom_names=["FusedReshapeConcat_item"], - top_names=["item1", "item2"], - ranges=[(0, 36), (0, 36)], - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Slice, - bottom_names=["FusedReshapeConcat_item_his_em"], - top_names=["item_his1", "item_his2", "item_his3", "item_his4", "item_his5"], - ranges=[(0, 36), (0, 36), (0, 36), (0, 36), (0, 36)], - ) - ) - - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Scale, - bottom_names=["item1"], - top_names=["Scale_item"], - axis=1, - factor=10, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Slice, - bottom_names=["Scale_item"], - top_names=["Scale_item1", "Scale_item2", "Scale_item3"], - ranges=[(0, 36), (0, 36), (0, 36)], - ) - ) - - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Sub, - bottom_names=["Scale_item1", "item_his1"], - top_names=["sub_ih"], - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.ElementwiseMultiply, - bottom_names=["Scale_item2", "item_his2"], - top_names=["ElementWiseMul_i"], - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Concat, - bottom_names=["Scale_item3", "item_his3", "sub_ih", "ElementWiseMul_i"], - top_names=["concat_i_h"], - ) - ) - - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["concat_i_h"], - top_names=["fc_att_i2"], - num_output=40, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["fc_att_i2"], - top_names=["fc_att_i3"], - num_output=1, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Reshape, - bottom_names=["fc_att_i3"], - top_names=["reshape_score"], - leading_dim=10, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Softmax, - bottom_names=["reshape_score"], - top_names=["softmax_att_i"], - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Scale, - bottom_names=["softmax_att_i"], - top_names=["Scale_i"], - axis=0, - factor=36, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Reshape, - bottom_names=["item_his4"], - top_names=["reshape_item_his"], - leading_dim=360, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.ElementwiseMultiply, # matmul - bottom_names=["Scale_i", "reshape_item_his"], - top_names=["ElementwiseMul_ih"], - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.ReduceSum, - 
bottom_names=["ElementwiseMul_ih"], - top_names=["reduce_ih"], - axis=1, - ) - ) - - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Reshape, - bottom_names=["item_his5"], - top_names=["reshape_his"], - leading_dim=36, - time_step=10, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.ReduceMean, - bottom_names=["reshape_his"], - top_names=["reduce_item_his"], - axis=1, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Reshape, - bottom_names=["reduce_item_his"], - top_names=["reshape_reduce_item_his"], - leading_dim=36, - ) - ) - - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Reshape, - bottom_names=["sparse_embedding_user"], - top_names=["reshape_user"], - leading_dim=18, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Concat, - bottom_names=[ - "reshape_user", - "reshape_reduce_item_his", - "reduce_ih", - "item2", - ], - top_names=["concat_din_i"], - ) - ) - # build_fcn_net - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["concat_din_i"], - top_names=["fc_din_i1"], - num_output=200, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.PReLU_Dice, - bottom_names=["fc_din_i1"], - top_names=["dice_1"], - elu_alpha=0.2, - eps=1e-8, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["dice_1"], - top_names=["fc_din_i2"], - num_output=80, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.PReLU_Dice, - bottom_names=["fc_din_i2"], - top_names=["dice_2"], - elu_alpha=0.2, - eps=1e-8, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["dice_2"], - top_names=["fc3"], - num_output=1, - ) - ) - model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss, - bottom_names=["fc3", "label"], - top_names=["loss"], - ) - ) - return model - - -def multi_node_test(): - parser = argparse.ArgumentParser() - parser.add_argument("--benchmark", type=str, required=True) - parser.add_argument("--batchsize_per_gpu", type=int, required=True) - parser.add_argument("--node_num", type=int, required=True, default=1) - parser.add_argument("--gpu_num", type=int, required=True, default=1) - parser.add_argument("--use_mixed_precision", action="store_true", default=False) - args = parser.parse_args() - - vvgpu = [[g for g in range(args.gpu_num)] for _ in range(args.node_num)] - batchsize = args.batchsize_per_gpu * args.node_num * args.gpu_num - - args.i64_input_key = True - if args.use_mixed_precision: - args.scaler = 1024 - else: - args.scaler = 1 - if args.benchmark.lower() == "din": - args.i64_input_key = False - - solver = hugectr.CreateSolver( - max_eval_batches=1, # we dont evaluate - batchsize_eval=args.gpu_num * args.node_num, # we dont evaluate - batchsize=batchsize, - vvgpu=vvgpu, - lr=1e-3, - i64_input_key=args.i64_input_key, - use_mixed_precision=args.use_mixed_precision, - scaler=args.scaler, - ) - - if args.benchmark.lower() == "wdl": - model = create_wdl(solver) - if args.benchmark.lower() == "din": - model = create_din(solver) - if args.benchmark.lower() == "dcn": - model = create_dcn(solver) - if args.benchmark.lower() == "deepfm": - model = create_deepfm(solver) - - model.compile() - model.summary() - - model.fit( - max_iter=2000, - display=200, - eval_interval=3000, # benchmark we dont want evaluate - snapshot=3000, # benchmark we dont want snapshot - ) - - -if __name__ == "__main__": - 
diff --git a/ci/benchmark/train_benchmark/ci.yml b/ci/benchmark/train_benchmark/ci.yml
deleted file mode 100644
index e767f9cb4a..0000000000
--- a/ci/benchmark/train_benchmark/ci.yml
+++ /dev/null
@@ -1,110 +0,0 @@
-train--wdl--1x1x256xFP32:
-  extends:
-    - .train_benchmark
-    - .benchmark:rules:weekly
-  stage: wdl_benchmark
-train--wdl--1x1x8192xFP16:
-  extends:
-    - .train_benchmark
-    - .benchmark:rules:weekly
-  stage: wdl_benchmark
-
-train--wdl--1x4x2048xFP32:
-  extends:
-    - .train_benchmark
-    - .benchmark:rules:biweekly
-  stage: wdl_benchmark
-train--wdl--1x4x2048xFP16:
-  extends:
-    - .train_benchmark
-    - .benchmark:rules:biweekly
-  stage: wdl_benchmark
-
-train--wdl--2x8x4096xFP32:
-  extends:
-    - .train_benchmark
-    - .benchmark:rules:monthly
-  stage: wdl_benchmark
-train--wdl--4x8x4096xFP16:
-  extends:
-    - .train_benchmark
-    - .benchmark:rules:monthly
-  stage: wdl_benchmark
-
-train--dcn--1x1x256xFP16:
-  extends:
-    - .train_benchmark
-    - .benchmark:rules:weekly
-  stage: dcn_benchmark
-train--dcn--1x1x8192xFP32:
-  extends:
-    - .train_benchmark
-    - .benchmark:rules:weekly
-  stage: dcn_benchmark
-
-train--dcn--1x8x2048xFP32:
-  extends:
-    - .train_benchmark
-    - .benchmark:rules:biweekly
-  stage: dcn_benchmark
-train--dcn--1x8x2048xFP16:
-  extends:
-    - .train_benchmark
-    - .benchmark:rules:biweekly
-  stage: dcn_benchmark
-
-train--dcn--2x8x4096xFP32:
-  extends:
-    - .train_benchmark
-    - .benchmark:rules:monthly
-  stage: dcn_benchmark
-train--dcn--4x8x4096xFP16:
-  extends:
-    - .train_benchmark
-    - .benchmark:rules:monthly
-  stage: dcn_benchmark
-
-train--deepfm--1x1x256xFP32:
-  extends:
-    - .train_benchmark
-    - .benchmark:rules:weekly
-  stage: deepfm_benchmark
-train--deepfm--1x1x8192xFP16:
-  extends:
-    - .train_benchmark
-    - .benchmark:rules:weekly
-  stage: deepfm_benchmark
-
-train--deepfm--1x2x1024xFP32:
-  extends:
-    - .train_benchmark
-    - .benchmark:rules:biweekly
-  stage: deepfm_benchmark
-train--deepfm--1x2x1024xFP16:
-  extends:
-    - .train_benchmark
-    - .benchmark:rules:biweekly
-  stage: deepfm_benchmark
-
-train--deepfm--2x8x8192xFP32:
-  extends:
-    - .train_benchmark
-    - .benchmark:rules:monthly
-  stage: deepfm_benchmark
-train--deepfm--4x8x8192xFP16:
-  extends:
-    - .train_benchmark
-    - .benchmark:rules:monthly
-  stage: deepfm_benchmark
-
-train--dlrm--1x8x6912xFP16:
-  extends:
-    - .train_benchmark
-    - .benchmark:rules:biweekly
-  stage: dlrm_benchmark
-
-train--dlrm--14x8x640xFP16:
-  extends:
-    - .train_benchmark
-    - .benchmark:rules:monthly
-  stage: dlrm_benchmark
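The job names above encode their launch geometry as train--<model>--<nodes>x<gpus>x<batchsize_per_gpu>x<precision>, so train--dlrm--14x8x640xFP16 means 14 nodes, 8 GPUs per node, 640 samples per GPU, in FP16. A small sketch of a parser for that convention; the dataclass and function are ours, not part of the deleted CI:

import re
from dataclasses import dataclass

@dataclass
class BenchmarkJob:
    model: str
    node_num: int
    gpu_num: int
    batchsize_per_gpu: int
    precision: str

# Hypothetical helper mirroring the naming convention of the deleted ci.yml.
def parse_job_name(name: str) -> BenchmarkJob:
    m = re.fullmatch(r"train--(\w+)--(\d+)x(\d+)x(\d+)x(FP16|FP32)", name)
    if m is None:
        raise ValueError(f"not a benchmark job name: {name}")
    model, nodes, gpus, bs, prec = m.groups()
    return BenchmarkJob(model, int(nodes), int(gpus), int(bs), prec)

assert parse_job_name("train--dlrm--14x8x640xFP16").node_num == 14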
diff --git a/ci/benchmark/train_benchmark/run.sub b/ci/benchmark/train_benchmark/run.sub
deleted file mode 100644
index 3df33865c4..0000000000
--- a/ci/benchmark/train_benchmark/run.sub
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-
-set +x
-
-: "${BENCHMARK:?BENCHMARK not set}"
-: "${BZ_PER_GPU:?BZ_PER_GPU not set}"
-: "${NODE_NUM:?NODE_NUM not set}"
-: "${GPU_NUM:?GPU_NUM not set}"
-: "${MIXED_PRECISION:?MIXED_PRECISION not set}"
-if [ "${MIXED_PRECISION}" == "FP16" ]; then
-    MIXED_PRECISION_FLAG="--use_mixed_precision";
-fi
-
-export WDL_DATA_PATH=${NEW_CRITEO_MOUNT}/wdl_data_parquet
-export DCN_DATA_PATH=${OLD_CRITEO_MOUNT}/dcn_parquet
-
-srun --ntasks="${NODE_NUM}" bash -c "echo -n 'Clearing cache on ' && hostname && sync && sudo /sbin/sysctl vm.drop_caches=3"
-
-if [[ ${BENCHMARK} == "dlrm" && $NODE_NUM -gt 1 ]]; then
-    srun --ntasks="${NODE_NUM}" --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
-        cd /raid/datasets/criteo/mlperf/40m.limit_preshuffled/ && \
-        numactl --membind=1,3,5,7 python3 /hugectr/samples/dlrm/dgx_a100_14x8x640.py"
-elif [[ ${BENCHMARK} == "dlrm" && $NODE_NUM -eq 1 ]]; then
-    srun --ntasks="${NODE_NUM}" --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
-        cd /raid/datasets/criteo/mlperf/40m.limit_preshuffled/ && \
-        numactl --membind=1,3,5,7 python3 /hugectr/samples/dlrm/dgx_a100.py"
-else
-    srun --ntasks="${NODE_NUM}" --container-image="${CONT}" --container-mounts="${MOUNTS}" $USE_SHARP bash -cx " \
-        cd /hugectr/ci/benchmark/train_benchmark && \
-        python3 ./benchmark_train.py \
-            --benchmark ${BENCHMARK} \
-            --batchsize_per_gpu ${BZ_PER_GPU} \
-            --node_num ${NODE_NUM} \
-            --gpu_num ${GPU_NUM} \
-            ${MIXED_PRECISION_FLAG}"
-fi
diff --git a/ci/common.yml b/ci/common.yml
deleted file mode 100644
index b2a4b39493..0000000000
--- a/ci/common.yml
+++ /dev/null
@@ -1,66 +0,0 @@
-variables:
-  IMAGE_ALL: ${BASE_DEVEL_REGISTRY}:devel_all
-  MERLIN_NIGHTLY_DEVEL: ${BASE_DEVEL_REGISTRY}:merlin_nightly
-  IMAGE_SOK_TF2: ${BASE_DEVEL_REGISTRY}:devel_embedding_tf2
-  IMAGE_SOK_TF1: ${BASE_DEVEL_REGISTRY}:devel_embedding_tf1
-  IMAGE_OPTIMIZED: gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_optimized
-  IMAGE_PYTORCH: ${BASE_DEVEL_REGISTRY}:devel_pytorch
-  DATASET: /lustre/fsw/devtech/hpc-hugectr/criteo_kaggle
-  DATASET_MOUNT: /dataset/criteo_kaggle
-  DIN_DATASET: /lustre/fsw/devtech/hpc-hugectr/din
-  BST_DATASET: /lustre/fsw/devtech/hpc-hugectr/bst
-  NCF_DATASET: /lustre/fsw/devtech/hpc-hugectr/movie_len
-  MMOE_DATASET: /lustre/fsw/devtech/hpc-hugectr/mmoe_data
-  DIN_DATASET_MOUNT: /etc/workspace/din
-  BST_DATASET_MOUNT: /etc/workspace/bst
-  NCF_DATASET_MOUNT: /etc/workspace/ncf_data
-  MMOE_DATASET_MOUNT: /etc/workspace/mmoe_data
-  DATASET_CRITEO_SELENE: /lustre/fsw/devtech/hpc-hugectr/criteo_kaggle
-  DATASET_CRITEO_CIRCE: /gpfs/fs1/minseokl/datasets/criteo_kaggle
-  DATASET_NEW_CRITEO_SELENE: /lustre/fsw/devtech/hpc-hugectr/new-criteo-dataset
-  DATASET_NEW_CRITEO_CIRCE: /gpfs/fs1/minseokl/datasets/new-criteo-dataset
-  CRITEO_MOUNT: /etc/workspace/criteo_kaggle
-  NEW_CRITEO_MOUNT: /etc/workspace/new_criteo_kaggle
-  DLRM_MOUNT: /etc/workspace/dataset
-  GIT_CLONE_PATH_SELENE: /lustre/fsw/devtech/hpc-hugectr/hugectr-ci/$CI_CONCURRENT_ID/$CI_PROJECT_NAME
-  TRAIN_IMAGE_VERSIONED: ${CI_REGISTRY}/dl/hugectr/hugectr:train.$CI_PIPELINE_ID
-  TRAIN_IMAGE_VERSIONED_LATEST: ${CI_REGISTRY}/dl/hugectr/hugectr:train.latest.$CI_PIPELINE_ID
-  TRAIN_IMAGE_VERSIONED_WITH_HDFS: ${CI_REGISTRY}/dl/hugectr/hugectr:train.with_hdfs.$CI_PIPELINE_ID
-  TRAIN_IMAGE_VERSIONED_WITH_HDFS_MINI: ${CI_REGISTRY}/dl/hugectr/hugectr:train.with_hdfs_mini.$CI_PIPELINE_ID
-  TRAIN_IMAGE_VERSIONED_WITH_S3: ${CI_REGISTRY}/dl/hugectr/hugectr:train.with_s3.${CI_PIPELINE_ID}
-  TRAIN_IMAGE_VERSIONED_WITH_GCS: ${CI_REGISTRY}/dl/hugectr/hugectr:train.with_gcs.${CI_PIPELINE_ID}
-  TRAIN_IMAGE_MULTINODE_VERSIONED: ${CI_REGISTRY}/dl/hugectr/hugectr:train.multinode.${CI_PIPELINE_ID}
-  TRAIN_INFER_IMAGE_VERSIONED: ${CI_REGISTRY}/dl/hugectr/hugectr:train.infer.${CI_PIPELINE_ID}
-  INFER_IMAGE_VERSIONED: ${CI_REGISTRY}/dl/hugectr/hugectr:infer.${CI_PIPELINE_ID}
-  SOK_IMAGE_VERSIONED_TF2: "${CI_REGISTRY}/dl/hugectr/hugectr:sok_tf2_${CI_PIPELINE_ID}"
-  SOK_IMAGE_VERSIONED_TF1: "${CI_REGISTRY}/dl/hugectr/hugectr:sok_tf1_${CI_PIPELINE_ID}"
-  TF_PLUGIN_IMAGE_VERSIONED: ${CI_REGISTRY}/dl/hugectr/hugectr:tf_plugin.${CI_PIPELINE_ID}
-  HUGECTR_TRT_IMAGE_VERSIONED: ${CI_REGISTRY}/dl/hugectr/hugectr:hugectr_trt_plugin.${CI_PIPELINE_ID}
-  TF_TRT_IMAGE_VERSIONED: ${CI_REGISTRY}/dl/hugectr/hugectr:tf_trt_plugin.${CI_PIPELINE_ID}
-  PYTORCH_TRT_IMAGE_VERSIONED: ${CI_REGISTRY}/dl/hugectr/hugectr:pytorch_trt_plugin.${CI_PIPELINE_ID}
-  UNIFIED_CTR_LATEST: ${BASE_DEVEL_REGISTRY}:unified.ctr.latest
-  UNIFIED_TF_LATEST: ${BASE_DEVEL_REGISTRY}:unified.tf.latest
-  UNIFIED_TORCH_LATEST: ${BASE_DEVEL_REGISTRY}:unified.torch.latest
-  LOGDIR: /lustre/fsw/devtech/hpc-hugectr/hugectr-ci/${CI_PIPELINE_ID}
-  RD_CI_JOB_TOKEN: gitlab-ci-token:${CI_JOB_TOKEN}@
-  EXTRA_DOCKER_RUN_ARGS: "--label RUNNER_ID=${RUNNER_ID}"
-  # Variables for child pipeline on Dracorno cluster
-  DRACO_DATASET: ${DATA_PREFIX}/fs1/projects/gpu_compute/datasets/hugectr-ci/criteo_kaggle
-  DRACO_WDL_PARQUET_DATASET: ${DATA_PREFIX}/fs1/projects/gpu_compute/datasets/hugectr-ci/
-  DRACO_CRITEO_PARQUET_DATASET: ${DATA_PREFIX}/fs1/projects/gpu_compute/datasets/hugectr-ci/
-  DRACO_DIN_DATASET: ${DATA_PREFIX}/fs1/projects/gpu_compute/datasets/hugectr-ci/hpc-hugectr/din
-  # TODO Change DRACO NCF
-  DRACO_NCF_DATASET: ${DATA_PREFIX}/fs1/projects/gpu_compute/datasets/hugectr-ci/hpc-hugectr/ncf_data
-  DRACO_DATASET_NEW_CRITEO: ${DATA_PREFIX}/fs1/projects/gpu_compute/datasets/hugectr-ci/new_criteo_kaggle
-  DRACO_MMOE_DATASET: ${DATA_PREFIX}/fs1/projects/gpu_compute/datasets/hugectr-ci/mmoe_data
-  DRACO_BST_DATASET: ${DATA_PREFIX}/fs1/projects/gpu_compute/datasets/hugectr-ci/bst
-  DRACO_CLUSTER: "dracorno"
-  DRACO_LOGDIR: ${DATA_PREFIX}/fs1/projects/gpu_compute/users/svcnvdlfw/hugectr_ci/${PARENT_PIPELINE_ID}
-  # DRACO-OCI cluster
-  DRACO_OCI_DATASET: /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/criteo_kaggle
-  DRACO_OCI_DIN_DATASET: /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/din
-  DRACO_OCI_NCF_DATASET: /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/movie_len
-  DRACO_OCI_DATASET_NEW_CRITEO: /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/new-criteo-dataset
-  DRACO_OCI_MMOE_DATASET: /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/mmoe_data
-  DRACO_OCI_BST_DATASET: /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/bst
-  DRACO_OCI_LOGDIR: /lustre/fsw/portfolios/coreai/users/svcnvdlfw/hugectr_ci/${PARENT_PIPELINE_ID}
diff --git a/ci/common/clean_logs.sub b/ci/common/clean_logs.sub
deleted file mode 100644
index 89db427cd6..0000000000
--- a/ci/common/clean_logs.sub
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
-    rm -rf "
\ No newline at end of file
diff --git a/ci/common/config_pbtxt_template.txt b/ci/common/config_pbtxt_template.txt
deleted file mode 100644
index a08fed7538..0000000000
--- a/ci/common/config_pbtxt_template.txt
+++ /dev/null
@@ -1,81 +0,0 @@
-name: "dlrm"
-backend: "hps"
-max_batch_size:%%batchsize,
-input [
-  {
-    name: "KEYS"
-    data_type: TYPE_INT64
-    dims: [ -1 ]
-  },
-  {
-    name: "NUMKEYS"
-    data_type: TYPE_INT32
-    dims: [ -1 ]
-  }
-]
-output [
-  {
-    name: "OUTPUT0"
-    data_type: TYPE_FP32
-    dims: [ -1 ]
-  }
-]
-instance_group [
-  {
-    count: 1
-    kind: KIND_GPU
-    gpus: [0]
-  }
-]
-
-parameters [
-  {
-    key: "config"
-    value: { string_value: "/model/dlrm/1/dlrm.json" }
-  },
-  {
-    key: "gpucache"
-    value: { string_value: "true" }
-  },
-  {
-    key: "hit_rate_threshold"
-    value: { string_value: "1.1" }
-  },
-  {
-    key: "gpucacheper"
-    value: { string_value: "0.5" }
-  },
-  {
-    key: "label_dim"
-    value: { string_value: "1" }
-  },
-  {
-    key: "mixed_precision"
-    value: { string_value: "%%mixed_precision" }
-  },
-
-  {
-    key: "slots"
-    value: { string_value: "26" }
-  },
-  {
-    key: "cat_feature_num"
-    value: { string_value: "26" }
-  },
-  {
-    key: "des_feature_num"
-    value: { string_value: "13" }
-  },
-  {
-    key: "max_nnz"
-    value: { string_value: "2" }
-  },
-  {
-    key: "embedding_vector_size"
-    value: { string_value: "128" }
-  },
-  {
-    key: "embeddingkey_long_type"
-    value: { string_value: "true" }
-  }
-]
diff --git a/ci/common/generate_inference_config.py b/ci/common/generate_inference_config.py
deleted file mode 100644
index 28991fad44..0000000000
--- a/ci/common/generate_inference_config.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from argparse import ArgumentParser
-import json
-
-parser = ArgumentParser()
-parser.add_argument("--config_template", type=str)
-parser.add_argument("--ps_template", type=str)
-parser.add_argument("--batchsize", type=str)
-parser.add_argument("--mixed_precision", type=str)
-parser.add_argument("--ec_type", type=str, default="dynamic")
-parser.add_argument("--config_output", type=str)
-parser.add_argument("--ps_output", type=str)
-args = parser.parse_args()
-
-with open(args.config_template, "r") as f:
-    config_pbtxt_template = f.readlines()
-    config_pbtxt_template = "".join(config_pbtxt_template)
-
-config_pbtxt = config_pbtxt_template.replace("%%batchsize", args.batchsize).replace(
-    "%%mixed_precision", args.mixed_precision
-)
-with open(args.config_output, "w") as f:
-    f.write(config_pbtxt)
-
-with open(args.ps_template, "r") as f:
-    ps_json_template = json.load(f)
-
-
-def str2bool(v):
-    # Tuple membership, not `in ("true")`: a bare ("true") is a plain string,
-    # so the test would match any substring of "true" instead of the word itself.
-    return v.lower() in ("true",)
-
-
-ps_json_template["models"][0]["max_batch_size"] = int(args.batchsize)
-ps_json_template["models"][0]["mixed_precision"] = str2bool(args.mixed_precision)
-ps_json_template["models"][0]["embedding_cache_type"] = args.ec_type
-
-with open(args.ps_output, "w") as f:
-    json.dump(ps_json_template, f)
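The generator above is driven entirely by CLI flags, substituting %%batchsize and %%mixed_precision in the pbtxt template and patching the first model entry of the PS template. A minimal sketch of how a CI step might invoke it; the input and output paths here are illustrative placeholders, not taken from the deleted scripts:

import subprocess

# Illustrative invocation; only the flag names are taken from the script itself.
subprocess.run(
    [
        "python3", "ci/common/generate_inference_config.py",
        "--config_template", "ci/common/config_pbtxt_template.txt",
        "--ps_template", "ci/common/ps_template.json",
        "--batchsize", "64",
        "--mixed_precision", "true",
        "--ec_type", "dynamic",
        "--config_output", "/model/dlrm/config.pbtxt",
        "--ps_output", "/model/dlrm/ps.json",
    ],
    check=True,
)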
"deployed_device_list":[0], - "max_batch_size":1, - "mixed_precision":true, - "default_value_for_each_table":[0.0], - "cache_refresh_percentage_per_iteration":0.2, - "hit_rate_threshold":1.1, - "gpucacheper":0.5, - "maxnum_catfeature_query_per_table_per_sample":[26], - "embedding_vecsize_per_table":[128], - "gpucache":true - } - ] -} - diff --git a/ci/draco-oci/ci.yml b/ci/draco-oci/ci.yml deleted file mode 100644 index 1b2ec27ff8..0000000000 --- a/ci/draco-oci/ci.yml +++ /dev/null @@ -1,443 +0,0 @@ -include: - - project: "dl/devops/gitlab-ci-slurm" - ref: master - file: "/.gitlab-ci.yml" - - /ci/common.yml - - /ci/template.yml - - /ci/rules.gitlab_ci.yml - -## Stage: test -# unit test -utests: - extends: .draco_oci_test_job - needs: - - pipeline: $PARENT_PIPELINE_ID - job: build_train_single_node - variables: - CONT: ${TRAIN_IMAGE_VERSIONED} - MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT},${DRACO_OCI_PREFIX}/inference:/hugectr/test/utest/ - CI_SLURM_TIME: "02:00:00" - TEST_CMD: ./ci/utest/utest.sub - -utests_embedding_collection: - extends: .draco_oci_test_job - needs: - - pipeline: $PARENT_PIPELINE_ID - job: build_train_single_node - variables: - CONT: $TRAIN_IMAGE_VERSIONED - MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT},${DRACO_OCI_PREFIX}/inference/:/hugectr/test/utest/,/raid:/raid,/lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/dlrm/datasets/criteo_multihot_raw:/data - CI_SLURM_TIME: "01:30:00" - TEST_CMD: ./ci/utest/utest_embedding_collection.sub - -utest_core23: - extends: .draco_oci_test_job - needs: - - pipeline: $PARENT_PIPELINE_ID - job: build_train_single_node - variables: - CONT: ${TRAIN_IMAGE_VERSIONED} - MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT},${DRACO_OCI_PREFIX}/inference:/hugectr/test/utest/ - TEST_CMD: ./ci/utest/utest_core23.sub - -utests_layer_1: - extends: .draco_oci_test_job - needs: - - pipeline: $PARENT_PIPELINE_ID - job: build_train_single_node - variables: - CONT: ${TRAIN_IMAGE_VERSIONED} - MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT},${DRACO_OCI_PREFIX}/inference:/hugectr/test/utest/ - TEST_CMD: ./ci/utest/utest_layer_1.sub - -utests_layer_2: - extends: .draco_oci_test_job - needs: - - pipeline: $PARENT_PIPELINE_ID - job: build_train_single_node - variables: - CONT: ${TRAIN_IMAGE_VERSIONED} - MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT},${DRACO_OCI_PREFIX}/inference:/hugectr/test/utest/ - TEST_CMD: ./ci/utest/utest_layer_2.sub - -utests_embedding: - extends: .draco_oci_test_job - needs: - - pipeline: $PARENT_PIPELINE_ID - job: build_train_single_node - variables: - CONT: $TRAIN_IMAGE_VERSIONED - MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT},${DRACO_OCI_PREFIX}/inference/:/hugectr/test/utest/ - CI_SLURM_TIME: "01:00:00" - TEST_CMD: ./ci/utest/utest_embedding.sub - -utests_hybrid_e2e: - extends: .draco_oci_test_job - needs: - - pipeline: $PARENT_PIPELINE_ID - job: build_train_single_node - variables: - CONT: $TRAIN_IMAGE_VERSIONED - MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT},${DRACO_OCI_PREFIX}/inference/:/hugectr/test/utest/ - TEST_CMD: ./ci/utest/utest_hybrid_e2e.sub - -utests_hps: - extends: .draco_oci_test_job - needs: - - pipeline: $PARENT_PIPELINE_ID - job: build_train_inference - variables: - CONT: $TRAIN_INFER_IMAGE_VERSIONED - MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT},${DRACO_OCI_PREFIX}/inference/:/hugectr/test/utest/ - TEST_CMD: ./ci/utest/utest_hps.sub - -criteo: - extends: .draco_oci_test_job - needs: - - pipeline: $PARENT_PIPELINE_ID - job: build_train_single_node - variables: - CONT: 
diff --git a/ci/draco-oci/ci.yml b/ci/draco-oci/ci.yml
deleted file mode 100644
index 1b2ec27ff8..0000000000
--- a/ci/draco-oci/ci.yml
+++ /dev/null
@@ -1,443 +0,0 @@
-include:
-  - project: "dl/devops/gitlab-ci-slurm"
-    ref: master
-    file: "/.gitlab-ci.yml"
-  - /ci/common.yml
-  - /ci/template.yml
-  - /ci/rules.gitlab_ci.yml
-
-## Stage: test
-# unit test
-utests:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    CONT: ${TRAIN_IMAGE_VERSIONED}
-    MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT},${DRACO_OCI_PREFIX}/inference:/hugectr/test/utest/
-    CI_SLURM_TIME: "02:00:00"
-    TEST_CMD: ./ci/utest/utest.sub
-
-utests_embedding_collection:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT},${DRACO_OCI_PREFIX}/inference/:/hugectr/test/utest/,/raid:/raid,/lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/dlrm/datasets/criteo_multihot_raw:/data
-    CI_SLURM_TIME: "01:30:00"
-    TEST_CMD: ./ci/utest/utest_embedding_collection.sub
-
-utest_core23:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    CONT: ${TRAIN_IMAGE_VERSIONED}
-    MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT},${DRACO_OCI_PREFIX}/inference:/hugectr/test/utest/
-    TEST_CMD: ./ci/utest/utest_core23.sub
-
-utests_layer_1:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    CONT: ${TRAIN_IMAGE_VERSIONED}
-    MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT},${DRACO_OCI_PREFIX}/inference:/hugectr/test/utest/
-    TEST_CMD: ./ci/utest/utest_layer_1.sub
-
-utests_layer_2:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    CONT: ${TRAIN_IMAGE_VERSIONED}
-    MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT},${DRACO_OCI_PREFIX}/inference:/hugectr/test/utest/
-    TEST_CMD: ./ci/utest/utest_layer_2.sub
-
-utests_embedding:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT},${DRACO_OCI_PREFIX}/inference/:/hugectr/test/utest/
-    CI_SLURM_TIME: "01:00:00"
-    TEST_CMD: ./ci/utest/utest_embedding.sub
-
-utests_hybrid_e2e:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT},${DRACO_OCI_PREFIX}/inference/:/hugectr/test/utest/
-    TEST_CMD: ./ci/utest/utest_hybrid_e2e.sub
-
-utests_hps:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_inference
-  variables:
-    CONT: $TRAIN_INFER_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT},${DRACO_OCI_PREFIX}/inference/:/hugectr/test/utest/
-    TEST_CMD: ./ci/utest/utest_hps.sub
-
-criteo:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT}
-    CI_SLURM_TIME: "00:15:00"
-    TEST_CMD: ./ci/integration_test/criteo/criteo.sub
-
-dcn:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT}
-    CI_SLURM_TIME: "00:40:00"
-    TEST_CMD: ./ci/integration_test/dcn/dcn.sub
-
-dcn_8gpu:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT}
-    CI_SLURM_TIME: "01:00:00"
-    TEST_CMD: ./ci/integration_test/dcn/dcn_8gpu.sub
-
-dlrm_dcnv2_benchmark_1node:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_multi_node
-  variables:
-    CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED
-    MOUNTS: /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/dlrm/datasets/criteo_multihot_raw:/data,/lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/dlrm/datasets/criteo_multihot_raw:/data_val
-    CI_SLURM_TIME: "02:00:00"
-    TEST_CMD: ./ci/integration_test/dlrm/train_dcnv2_1node.sub
-
-wdl:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_DATASET_NEW_CRITEO}:${DATASET_MOUNT}
-    CI_SLURM_TIME: "00:15:00"
-    TEST_CMD: ./ci/integration_test/wdl/wdl.sub
-
-deepfm:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT}
-    CI_SLURM_TIME: "00:15:00"
-    TEST_CMD: ./ci/integration_test/deepfm/deepfm.sub
-
-mmoe:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_MMOE_DATASET}:${MMOE_DATASET_MOUNT}
-    CI_SLURM_TIME: "00:15:00"
-    TEST_CMD: ./ci/integration_test/mmoe/mmoe.sub
-
-inference_hps:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT},${DRACO_OCI_PREFIX}/inference/:/hugectr/test/utest/
-    CI_SLURM_TIME: "01:30:00"
-    TEST_CMD: ./ci/integration_test/inference/inference_hps.sub
-
-embedding_cache_perf:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    TEST_CMD: ./ci/integration_test/inference/embedding_cache_perf_test.sub
-
-din_single_node:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_DIN_DATASET}:${DIN_DATASET_MOUNT}
-    CI_SLURM_TIME: "00:15:00"
-    TEST_CMD: ./ci/integration_test/din/din.sub
-
-bst_single_node:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_BST_DATASET}:${BST_DATASET_MOUNT}
-    CI_SLURM_TIME: "00:15:00"
-    TEST_CMD: ./ci/integration_test/bst/bst.sub
-
-py_single_node:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT},${DRACO_OCI_DATASET_NEW_CRITEO}:${NEW_CRITEO_MOUNT}
-    CI_SLURM_TIME: "00:30:00"
-    TEST_CMD: ./ci/integration_test/py_interface/py_single_node.sub
-
-ebc_multi_node:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_multi_node
-  variables:
-    CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED
-    MOUNTS: ${DRACO_OCI_DATASET_NEW_CRITEO}:${DATASET_MOUNT},/raid:/raid
-    CI_SLURM_TIME: "02:00:00"
-    CI_SLURM_NODES: 2
-    SLURM_JOB_NUM_NODES: 2
-    TEST_CMD: ./ci/integration_test/ebc/ebc.sub
-
-ebc_utest_multi_node:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_multi_node
-  variables:
-    CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED
-    MOUNTS: /raid:/raid
-    CI_SLURM_TIME: "01:30:00"
-    CI_SLURM_NODES: 2
-    SLURM_JOB_NUM_NODES: 2
-    TEST_CMD: ./ci/integration_test/ebc/utest.multinode.sub
-
-hugectr2onnx:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_DATASET_NEW_CRITEO}:${NEW_CRITEO_MOUNT},${DRACO_OCI_DIN_DATASET}:${DIN_DATASET_MOUNT},${DRACO_OCI_NCF_DATASET}:${NCF_DATASET_MOUNT},${DRACO_OCI_MMOE_DATASET}:${MMOE_DATASET_MOUNT}
-    CI_SLURM_TIME: "01:00:00"
-    TEST_CMD: ./ci/integration_test/hugectr2onnx/hugectr2onnx.sub
-
-hps_tf_plugin:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_tf_hps_trt_plugin
-  variables:
-    CONT: $TF_TRT_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT}
-    TEST_CMD: ./ci/integration_test/hps/hps_tf.sub
-
-hps_torch_plugin:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_pytorch_hps_trt_plugin
-  variables:
-    CONT: $PYTORCH_TRT_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT}
-    TEST_CMD: ./ci/integration_test/hps/hps_torch.sub
-
-sparse_operation_kit_ut-TF2:
-  extends: .draco_oci_sok_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_sok_tf2
-  variables:
-    CONT: $SOK_IMAGE_VERSIONED_TF2
-    MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT}
-    CI_SLURM_TIME: "02:00:00"
-    TEST_CMD: ./ci/integration_test/sok/sok_tf2_unit.sub
-
-sparse_operation_kit_ut-TF1:
-  extends: .draco_oci_sok_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_sok_tf1
-  variables:
-    CONT: $SOK_IMAGE_VERSIONED_TF1
-    MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT}
-    CI_SLURM_TIME: "02:00:00"
-    TEST_CMD: ./ci/integration_test/sok/sok_tf1_unit.sub
-
-hps_trt_in_hugectr:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_hugectr_hps_trt_plugin
-  variables:
-    CONT: $HUGECTR_TRT_IMAGE_VERSIONED
-    TEST_CMD: ./ci/integration_test/hps/hps_trt_in_merlin_hugectr.sub
-
-hps_trt_in_tf:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_tf_hps_trt_plugin
-  variables:
-    CONT: $TF_TRT_IMAGE_VERSIONED
-    TEST_CMD: ./ci/integration_test/hps/hps_trt_in_merlin_tf.sub
-
-hps_trt_in_pytorch:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_pytorch_hps_trt_plugin
-  variables:
-    CONT: $PYTORCH_TRT_IMAGE_VERSIONED
-    TEST_CMD: ./ci/integration_test/hps/hps_trt_in_merlin_pytorch.sub
-
-hps_plugin_benchmark:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_tf_hps_trt_plugin
-  variables:
-    CONT: $TF_TRT_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_PREFIX}/hps_tf_benchmark/hps_plugin_ci_model_repo:/model_repo,${DRACO_OCI_PREFIX}/hps_tf_benchmark/perf_data:/perf_data
-    CI_SLURM_TIME: "00:45:00"
-    TEST_CMD: ./ci/benchmark/hps_plugin_benchmark/run.sub
-
-s3_backend_test:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node_with_s3
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED_WITH_S3
-    TEST_CMD: ./ci/integration_test/s3/s3_backend_test.sub
-
-gcs_backend_test:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node_with_gcs
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED_WITH_GCS
-    TEST_CMD: ./ci/integration_test/gcs/gcs_backend_test.sub
-
-hps_torch_fuse_table_benchmark:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_pytorch_hps_trt_plugin
-  variables:
-    CONT: $PYTORCH_TRT_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_PREFIX}/hps_torch_fuse_table_benchmark/ci_model_repo:/model_repo,${DRACO_OCI_PREFIX}/hps_torch_fuse_table_benchmark/perf_data:/perf_data
-    CI_SLURM_TIME: "00:45:00"
-    TEST_CMD: ./ci/benchmark/hps_torch_fuse_table_benchmark/run.sub
-
-hps_tf_fuse_table_benchmark:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_tf_hps_trt_plugin
-  variables:
-    CONT: $TF_TRT_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_PREFIX}/hps_tf_fuse_table_benchmark/ci_model_repo:/model_repo,${DRACO_OCI_PREFIX}/hps_tf_fuse_table_benchmark/perf_data:/perf_data
-    CI_SLURM_TIME: "00:45:00"
-    TEST_CMD: ./ci/benchmark/hps_tf_fuse_table_benchmark/run.sub
-
-hps_plugin_benchmark_check:
-  extends: .draco_oci_post_test_job
-  needs:
-    - hps_plugin_benchmark
-  variables:
-    CONT: $TF_TRT_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_LOGDIR}/hps_plugin_benchmark:/logs
-    CI_SLURM_TIME: "00:15:00"
-    TEST_CMD: ./ci/post_test/check_hps_plugin_benchmark.sub
-
-dlrm_dcnv2_1node_check:
-  # Push logs to gitlab
-  extends: .draco_oci_post_test_job
-  needs:
-    - dlrm_dcnv2_benchmark_1node
-  variables:
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_LOGDIR}/dlrm_dcnv2_1node:/logs
-    CI_SLURM_TIME: "00:15:00"
-    TEST_CMD: ./ci/post_test/check_dcnv2_dlrm_1node.sub
-
-
-hps_torch_fuse_table_benchmark_check:
-  extends: .draco_oci_post_test_job
-  needs:
-    - hps_torch_fuse_table_benchmark
-  variables:
-    CONT: $PYTORCH_TRT_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_LOGDIR}/hps_torch_fuse_table_benchmark:/logs
-    CI_SLURM_TIME: "00:15:00"
-    TEST_CMD: ./ci/post_test/check_hps_torch_fuse_table_benchmark.sub
-
-hps_tf_fuse_table_benchmark_check:
-  extends: .draco_oci_post_test_job
-  needs:
-    - hps_tf_fuse_table_benchmark
-  variables:
-    CONT: $TF_TRT_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_LOGDIR}/hps_tf_fuse_table_benchmark:/logs
-    CI_SLURM_TIME: "00:15:00"
-    TEST_CMD: ./ci/post_test/check_hps_tf_fuse_table_benchmark.sub
-
-147gb_model_benchmark:
-  extends: .draco_oci_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_tf_hps_trt_plugin
-  variables:
-    CONT: $TF_TRT_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_PREFIX}/hps_tf_benchmark/147gb_ci_model_repo:/model_repo
-    CI_SLURM_TIME: "00:45:00"
-    TEST_CMD: ./ci/benchmark/147gb_model_benchmark/run.sub
-
-147gb_model_benchmark_check:
-  extends: .draco_oci_post_test_job
-  needs:
-    - 147gb_model_benchmark
-  variables:
-    CONT: $TF_TRT_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_OCI_LOGDIR}/147gb_model_benchmark:/logs
-    CI_SLURM_TIME: "00:15:00"
-    TEST_CMD: ./ci/post_test/check_147gb_model_benchmark.sub
-
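Every job above passes MOUNTS as a comma-separated list of host:container pairs, which the cluster's container runtime splits before bind-mounting. A hedged sketch of that splitting; the helper is ours, and in reality the parsing happens inside the Slurm container plugin, not in this repository:

def parse_mounts(spec: str) -> list[tuple[str, str]]:
    # "h1:c1,h2:c2" -> [("h1", "c1"), ("h2", "c2")]; splits on the first ":"
    # only, so container paths must not themselves contain ":".
    pairs = []
    for entry in filter(None, spec.split(",")):
        host, _, container = entry.partition(":")
        pairs.append((host, container or host))
    return pairs

assert parse_mounts("/lustre/data:/dataset,/raid:/raid") == [
    ("/lustre/data", "/dataset"),
    ("/raid", "/raid"),
]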
diff --git a/ci/dracorno/ci.yml b/ci/dracorno/ci.yml
deleted file mode 100644
index 9be1f833e7..0000000000
--- a/ci/dracorno/ci.yml
+++ /dev/null
@@ -1,264 +0,0 @@
-include:
-  - project: "dl/devops/gitlab-ci-slurm"
-    ref: master
-    file: "/.gitlab-ci.yml"
-  - /ci/common.yml
-  - /ci/template.yml
-  - /ci/rules.gitlab_ci.yml
-
-
-utests_layer_1:
-  extends: .dracorno_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $DRACO_LOGDIR/utests_layer_1
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_DATASET}:${DATASET_MOUNT},${DATA_PREFIX}/fs1/projects/gpu_compute/datasets/hugectr-ci/hpc-hugectr/inference:/hugectr/test/utest/
-    TEST_CMD: ./ci/utest/utest_layer_1.sub
-
-utests_layer_2:
-  extends: .dracorno_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $DRACO_LOGDIR/utests_layer_2
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_DATASET}:${DATASET_MOUNT},${DATA_PREFIX}/fs1/projects/gpu_compute/datasets/hugectr-ci/hpc-hugectr/inference:/hugectr/test/utest/
-    TEST_CMD: ./ci/utest/utest_layer_2.sub
-
-# Comment out due to OOM error on Dracorno
-#utests_embedding:
-#  extends: .dracorno_test_job
-#  needs:
-#    - pipeline: $PARENT_PIPELINE_ID
-#      job: build_train_single_node
-#  variables:
-#    GPFSFOLDER: $DRACO_LOGDIR/utests_embedding
-#    CONT: $TRAIN_IMAGE_VERSIONED
-#    MOUNTS: ${DRACO_DATASET}:${DATASET_MOUNT},${DATA_PREFIX}/fs1/projects/gpu_compute/datasets/hugectr-ci/hpc-hugectr/inference:/hugectr/test/utest/
-#    TEST_CMD: ./ci/utest/utest_embedding.sub
-
-criteo:
-  extends: .dracorno_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $DRACO_LOGDIR/criteo
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_CRITEO_PARQUET_DATASET}:${DATASET_MOUNT}
-    TEST_CMD: ./ci/integration_test/criteo/criteo.sub
-
-dcn:
-  extends: .dracorno_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $DRACO_LOGDIR/dcn
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_DATASET}:${DATASET_MOUNT}
-    TEST_CMD: ./ci/integration_test/dcn/dcn.sub
-
-# Comment out due to unknown hang
-#dcn_8gpu:
-#  extends: .dracorno_test_job
-#  needs:
-#    - pipeline: $PARENT_PIPELINE_ID
-#      job: build_train_single_node
-#  variables:
-#    GPFSFOLDER: $DRACO_LOGDIR/dcn_8gpu
-#    CONT: $TRAIN_IMAGE_VERSIONED
-#    MOUNTS: ${DRACO_DATASET}:${DATASET_MOUNT}
-#    TEST_CMD: ./ci/integration_test/dcn/dcn_8gpu.sub
-
-wdl:
-  extends: .dracorno_test_job  # test on selene needs to extend .dracorno_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $DRACO_LOGDIR/wdl  # log dir, usually $LOGDIR + job name
-    CONT: $TRAIN_IMAGE_VERSIONED  # image name
-    MOUNTS: ${DRACO_WDL_PARQUET_DATASET}:${DATASET_MOUNT}  # node num
-    TEST_CMD: ./ci/integration_test/wdl/wdl.sub
-
-deepfm:
-  extends: .dracorno_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $DRACO_LOGDIR/deepfm
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_DATASET}:${DATASET_MOUNT}
-    TEST_CMD: ./ci/integration_test/deepfm/deepfm.sub
-
-mmoe:
-  extends: .dracorno_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $DRACO_LOGDIR/mmoe
-    #GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_MMOE_DATASET}:${MMOE_DATASET_MOUNT}
-    WALLTIME: "00:15:00"
-    TEST_CMD: ./ci/integration_test/mmoe/mmoe.sub
-
-inference_hps:
-  extends: .dracorno_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_inference
-  variables:
-    GPFSFOLDER: $DRACO_LOGDIR/inference_hps
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_DATASET}:${DATASET_MOUNT},${DATA_PREFIX}/fs1/projects/gpu_compute/datasets/hugectr-ci/hpc-hugectr/inference:/hugectr/test/utest/
-    TEST_CMD: ./ci/integration_test/inference/inference_hps.sub
-
-din_single_node:
-  extends: .dracorno_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $DRACO_LOGDIR/din_single_node
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_DIN_DATASET}:${DIN_DATASET_MOUNT}
-    TEST_CMD: ./ci/integration_test/din/din.sub
-
-bst_single_node:
-  extends: .dracorno_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $DRACO_LOGDIR/bst_single_node
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_BST_DATASET}:${BST_DATASET_MOUNT}
-    TEST_CMD: ./ci/integration_test/bst/bst.sub
-
-py_single_node:
-  extends: .dracorno_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $DRACO_LOGDIR/py_single_node
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_DATASET}:${DATASET_MOUNT},${DRACO_DATASET_NEW_CRITEO}:${NEW_CRITEO_MOUNT}
-    TEST_CMD: ./ci/integration_test/py_interface/py_single_node.sub
-
-hugectr2onnx:
-  extends: .dracorno_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $DRACO_LOGDIR/hugectr2onnx
-    #GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_DATASET_NEW_CRITEO}:/etc/workspace/new_criteo_kaggle,${DRACO_DIN_DATASET}:/etc/workspace/din,${DRACO_NCF_DATASET}:/etc/workspace/ncf_data,${DRACO_MMOE_DATASET}:/etc/workspace/mmoe_data
-    TEST_CMD: ./ci/integration_test/hugectr2onnx/hugectr2onnx.sub
-
-ebc_multi_node:
-  extends: .dracorno_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_multi_node
-  variables:
-    GPFSFOLDER: $DRACO_LOGDIR/ebc_multi_node
-    CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED
-    MOUNTS: ${DRACO_DATASET_NEW_CRITEO}:${DATASET_MOUNT}
-    WALLTIME: "01:00:00"
-    DGXNNODES: 2
-    TEST_CMD: ./ci/integration_test/ebc/ebc.sub
-
-### Stage: test
-hps_tf_plugin:
-  extends: .dracorno_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_tf_hps_trt_plugin
-  variables:
-    GPFSFOLDER: $DRACO_LOGDIR/hps_tf
-    CONT: $TF_TRT_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_DATASET}:${DATASET_MOUNT}
-    TEST_CMD: ./ci/integration_test/hps/hps_tf.sub
-
-# hps_torch_plugin
-hps_torch_plugin:
-  extends: .dracorno_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_pytorch_hps_trt_plugin
-  variables:
-    GPFSFOLDER: $DRACO_LOGDIR/hps_torch
-    CONT: $PYTORCH_TRT_IMAGE_VERSIONED
-    MOUNTS: ${DRACO_DATASET}:${DATASET_MOUNT}
-    TEST_CMD: ./ci/integration_test/hps/hps_torch.sub
-
-s3_backend_test:
-  extends: .dracorno_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node_with_s3
-  variables:
-    GPFSFOLDER: $DRACO_LOGDIR/s3_backend_test
-    CONT: $TRAIN_IMAGE_VERSIONED_WITH_S3
-    DGXNNODES: 1
-    TEST_CMD: ./ci/integration_test/s3/s3_backend_test.sub
-
-hps_plugin_benchmark:
-  extends: .dracorno_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_tf_hps_trt_plugin
-  variables:
-    GPFSFOLDER: $DRACO_LOGDIR/hps_plugin_benchmark
-    CONT: $TF_TRT_IMAGE_VERSIONED
-    MOUNTS: ${DATA_PREFIX}/fs1/projects/gpu_compute/datasets/hugectr-ci/hpc-hugectr/hps_tf_benchmark/hps_plugin_ci_model_repo:/model_repo,${DATA_PREFIX}/fs1/projects/gpu_compute/datasets/hugectr-ci/hpc-hugectr/hps_tf_benchmark/perf_data:/perf_data
-    WALLTIME: "00:45:00"
-    TEST_CMD: ./ci/benchmark/hps_plugin_benchmark/run.sub
-
-147gb_model_benchmark:
-  extends: .dracorno_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_tf_hps_trt_plugin
-  variables:
-    GPFSFOLDER: $DRACO_LOGDIR/147gb_model_benchmark
-    CONT: $TF_TRT_IMAGE_VERSIONED
-    MOUNTS: ${DATA_PREFIX}/fs1/projects/gpu_compute/datasets/hugectr-ci/hpc-hugectr/hps_tf_benchmark/147gb_ci_model_repo:/model_repo
-    WALLTIME: "00:45:00"
-    TEST_CMD: ./ci/benchmark/147gb_model_benchmark/run.sub
-
-#SOK ut tests
-sparse_operation_kit_ut-TF1:
-  extends:
-    - .dracorno_sok_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_sok_tf1
-  variables:
-    GPFSFOLDER: $DRACO_LOGDIR/sparse_operation_kit-tf1
-    CONT: $SOK_IMAGE_VERSIONED_TF1
-    MOUNTS: ${DRACO_DATASET}:${DATASET_MOUNT}
-    TEST_CMD: ./ci/integration_test/sok/sok_tf1_unit.sub
-
-sparse_operation_kit_ut-TF2:
-  extends:
-    - .dracorno_sok_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_sok_tf2
-  variables:
-    GPFSFOLDER: $DRACO_LOGDIR/sparse_operation_kit-tf2
-    CONT: $SOK_IMAGE_VERSIONED_TF2
-    MOUNTS: ${DRACO_DATASET}:${DATASET_MOUNT}
-    TEST_CMD: ./ci/integration_test/sok/sok_tf2_unit.sub
diff --git a/ci/integration_test/bst/bst.sub b/ci/integration_test/bst/bst.sub
deleted file mode 100644
index fdcad25dce..0000000000
--- a/ci/integration_test/bst/bst.sub
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
-    cd /etc/workspace/bst && \
-    python3 /workdir/test/pybind_test/bst_fp32_1gpu.py"
-
diff --git a/ci/integration_test/criteo/criteo.sub b/ci/integration_test/criteo/criteo.sub
deleted file mode 100644
index fc161a14b0..0000000000
--- a/ci/integration_test/criteo/criteo.sub
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
-    cd ${DATASET_MOUNT}/criteo_parquet && \
-    mkdir /workdir/export_predictions_criteo_1gpu/ && \
-    python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/criteo_1gpu.json && \
-    python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/criteo_8gpu.json && \
-    cd /dataset/criteo_kaggle/criteo_parquet_multi_slots && \
-    python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/criteo_parquet_multi_slots_1gpu.json && \
-    python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/criteo_parquet_multi_slots_8gpu.json"
\ No newline at end of file
diff --git a/ci/integration_test/criteo/criteo_multi_node.sub b/ci/integration_test/criteo/criteo_multi_node.sub
deleted file mode 100644
index 9d55b18d17..0000000000
--- a/ci/integration_test/criteo/criteo_multi_node.sub
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-
-#TODO: add restriction of NCCL_IB_HCA for draco-oci, may need remove it for other cluster
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
-    export NCCL_IB_HCA=\"=mlx5_1\" && \
-    cd /dataset/criteo_kaggle/criteo_parquet && \
-    python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/criteo_2node_4gpu.json"
diff --git a/ci/integration_test/dcn/dcn.sub b/ci/integration_test/dcn/dcn.sub
deleted file mode 100644
index f33623526c..0000000000
--- a/ci/integration_test/dcn/dcn.sub
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
-    cd /dataset/criteo_kaggle/dcn_parquet && \
-    python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/dcn_fp16_1gpu.json && \
-    python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/dcn_1gpu.json && \
-    python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_distributed_1gpu.json && \
-    python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_localized_1gpu.json && \
-    python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_localized_2gpu.json && \
-    python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/dcnv2_parquet_distributed_1gpu.json && \
-    python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/dcnv2_parquet_localized_1gpu.json && \
-    cd /dataset/criteo_kaggle/dcn_parquet_vec_column && \
-    python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_distributed_1gpu.json && \
-    python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_localized_1gpu.json"
diff --git a/ci/integration_test/dcn/dcn_8gpu.sub b/ci/integration_test/dcn/dcn_8gpu.sub
deleted file mode 100644
index 427d97575a..0000000000
--- a/ci/integration_test/dcn/dcn_8gpu.sub
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
-    cd /dataset/criteo_kaggle/dcn_parquet && \
-    python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_distributed_8gpu.json && \
-    python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_localized_8gpu.json && \
-    cd /dataset/criteo_kaggle/dcn_parquet_vec_column && \
-    python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_distributed_8gpu.json && \
-    python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_localized_8gpu.json"
diff --git a/ci/integration_test/dcn/dcn_multi_node.sub b/ci/integration_test/dcn/dcn_multi_node.sub
deleted file mode 100644
index 863596f304..0000000000
--- a/ci/integration_test/dcn/dcn_multi_node.sub
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-set -e
-
-#TODO: add restriction of NCCL_IB_HCA for draco-oci, may need remove it for other cluster
-srun --ntasks=4 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
-    export NCCL_IB_HCA=\"=mlx5_1\" && \
-    cd /dataset/criteo_kaggle/dcn_parquet && \
-    python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_distributed_4node_2gpu.json"
-
-
-srun --ntasks=2 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
-    export NCCL_IB_HCA=\"=mlx5_1\" && \
-    cd /dataset/criteo_kaggle/dcn_parquet && \
-    python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_localized_2node_4gpu.json"
diff --git a/ci/integration_test/deepfm/deepfm.sub b/ci/integration_test/deepfm/deepfm.sub
deleted file mode 100644
index 462d210a39..0000000000
--- a/ci/integration_test/deepfm/deepfm.sub
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
-    cd /dataset/criteo_kaggle/dcn_parquet && \
-    python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/deepfm_1gpu.json && \
-    python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/deepfm_fp16_1gpu.json"
diff --git a/ci/integration_test/deepfm/deepfm_daily.sub b/ci/integration_test/deepfm/deepfm_daily.sub
deleted file mode 100644
index b92b63f2eb..0000000000
--- a/ci/integration_test/deepfm/deepfm_daily.sub
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
-    cd /dataset/criteo_kaggle/dcn_parquet && \
-    python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/deepfm_8gpu.json"
diff --git a/ci/integration_test/din/din.sub b/ci/integration_test/din/din.sub
deleted file mode 100644
index a44f7884bc..0000000000
--- a/ci/integration_test/din/din.sub
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
-    cd /etc/workspace/din && \
-    python3 /workdir/test/pybind_test/din_fp32_1gpu.py &&
-    python3 /workdir/test/pybind_test/din_matmul_fp32_1gpu.py"
-
-# srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
-#     cd /etc/workspace/din && \
-#     python3 /workdir/test/pybind_test/din_fp32_2gpu.py"
\ No newline at end of file
diff --git a/ci/integration_test/dlrm/train_dcnv2_1node.sub b/ci/integration_test/dlrm/train_dcnv2_1node.sub
deleted file mode 100644
index 72b4b290bb..0000000000
--- a/ci/integration_test/dlrm/train_dcnv2_1node.sub
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-set -ex
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" bash -c "echo -n 'Clearing cache on ' && hostname && sync && sudo /sbin/sysctl vm.drop_caches=3"
-
-srun --mpi=pmix --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
-    cd /workdir/samples/dlrm && \
-    curl --header \"PRIVATE-TOKEN: ${GIT_TOKEN}\" \"${REPO_LINK}\" -o optimized.tar && \
-    tar xvf optimized.tar --strip-components=3 -C . && \
-    pip install -r requirements.txt && \
-    source ${DLRMV2_1NODE_CONFIG} && \
-    python3 ${COMMAND}"
diff --git a/ci/integration_test/dlrm/train_dcnv2_8node.sub b/ci/integration_test/dlrm/train_dcnv2_8node.sub
deleted file mode 100644
index 0d6e4706b5..0000000000
--- a/ci/integration_test/dlrm/train_dcnv2_8node.sub
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-set -ex
-
-srun --mpi=pmix --ntasks="${SLURM_JOB_NUM_NODES}" bash -c "echo -n 'Clearing cache on ' && hostname && sync && sudo /sbin/sysctl vm.drop_caches=3"
-
-srun --mpi=pmix --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
-    cd /workdir/samples/dlrm && \
-    curl --header \"PRIVATE-TOKEN: ${GIT_TOKEN}\" \"${REPO_LINK}\" -o optimized.tar && \
-    tar xvf optimized.tar --strip-components=3 -C . && \
-    pip install -r requirements.txt && \
-    source ${DLRMV2_8NODE_CONFIG} && \
-    python3 ${COMMAND}"
diff --git a/ci/integration_test/ebc/ebc.sub b/ci/integration_test/ebc/ebc.sub
deleted file mode 100644
index 520ec1b9d1..0000000000
--- a/ci/integration_test/ebc/ebc.sub
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/bin/bash
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
-    cd /dataset/criteo_kaggle/ && \
-    python3 /workdir/test/embedding_collection_test/dlrm_train_ftrl.py --shard_plan round_robin"
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
-    cd /dataset/criteo_kaggle/ && \
-    python3 /workdir/test/embedding_collection_test/dlrm_train_ftrl.py --shard_plan uniform"
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
-    cd /dataset/criteo_kaggle/ && \
-    python3 /workdir/test/embedding_collection_test/dlrm_train_ftrl.py --shard_plan hybrid"
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
-    cd /dataset/criteo_kaggle/ && \
-    python3 /workdir/test/embedding_collection_test/dlrm_train_ftrl.py --shard_plan hybrid --grouped_allreduce"
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" \
-    env NCCL_LAUNCH_MODE=GROUP bash -cx " \
-    cd /dataset/criteo_kaggle/ && \
-    python3 /workdir/test/embedding_collection_test/dlrm_train_ftrl.py --shard_plan round_robin --use_dynamic_hash_table"
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" \
-    env NCCL_LAUNCH_MODE=GROUP bash -cx " \
-    cd /dataset/criteo_kaggle/ && \
-    python3 /workdir/test/embedding_collection_test/dlrm_train_ftrl.py --shard_plan uniform --use_dynamic_hash_table"
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" \
-    env NCCL_LAUNCH_MODE=GROUP bash -cx " \
-    cd /dataset/criteo_kaggle/ && \
-    python3 /workdir/test/embedding_collection_test/dlrm_train_ftrl.py --shard_plan round_robin --use_dynamic_hash_table --optimizer ftrl"
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" \
-    env NCCL_LAUNCH_MODE=GROUP bash -cx " \
-    cd /dataset/criteo_kaggle/ && \
-    python3 /workdir/test/embedding_collection_test/dlrm_train_ftrl.py --shard_plan round_robin --optimizer ftrl"
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" \
-    env NCCL_LAUNCH_MODE=GROUP bash -cx " \
-    cd /dataset/criteo_kaggle/ && \
-    python3 /workdir/test/embedding_collection_test/dlrm_train_ftrl.py --shard_plan uniform --use_dynamic_hash_table --optimizer ftrl"
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" \
-    env NCCL_LAUNCH_MODE=GROUP bash -cx " \
-    cd /dataset/criteo_kaggle/ && \
-    python3 /workdir/test/embedding_collection_test/dlrm_train_ftrl.py --shard_plan uniform --optimizer ftrl"
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" \
-    env NCCL_LAUNCH_MODE=GROUP bash -cx " \
-    cd /dataset/criteo_kaggle/ && \
-    python3 /workdir/test/embedding_collection_test/dlrm_train_ftrl.py --shard_plan round_robin --use_mixed_precision"
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" \
-    env NCCL_LAUNCH_MODE=GROUP bash -cx " \
-    cd /dataset/criteo_kaggle/ && \
-    python3 /workdir/test/embedding_collection_test/dlrm_train_ftrl.py --shard_plan uniform --use_mixed_precision"
diff --git a/ci/integration_test/ebc/utest.multinode.sub b/ci/integration_test/ebc/utest.multinode.sub
deleted file mode 100644
index c16c36e130..0000000000
--- a/ci/integration_test/ebc/utest.multinode.sub
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
-    cd /workdir/build/bin && \
-    ./embedding_collection_test --gtest_filter=test_embedding_collection.utest_2node"
diff --git a/ci/integration_test/gcs/gcs_backend_test.sub b/ci/integration_test/gcs/gcs_backend_test.sub
deleted file mode 100644
index da521f0ec1..0000000000
--- a/ci/integration_test/gcs/gcs_backend_test.sub
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-echo "${GCS_ACCESS_FILE}" > ./gcs_credential
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="./gcs_credential:/hugectr/gcs_credential" bash -cx "\
-    export GOOGLE_APPLICATION_CREDENTIALS=/hugectr/gcs_credential && \
-    cd /workdir/build/bin && \
-    ./gcs_backend_test"
diff --git a/ci/integration_test/hdfs/hdfs_backend_test.sh b/ci/integration_test/hdfs/hdfs_backend_test.sh
deleted file mode 100755
index 520892ee01..0000000000
--- a/ci/integration_test/hdfs/hdfs_backend_test.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-echo $(pwd)
-set -ex
-
-docker login -u ${CI_PRIVATE_USER} -p "${CI_PRIVATE_KEY}" "${CI_REGISTRY}"
-ID=$(docker run --gpus all -d -u root ${CONT} bash -cx "\
-ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa && \
-cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
-/etc/init.d/ssh start && \
-hdfs namenode -format && \
-bash /opt/hadoop/sbin/start-dfs.sh && \
-cd /workdir/build/bin && \
-./hdfs_backend_test && \
-./file_loader_test")
-
-docker logs -f $ID
-exitCode=$(docker wait $ID)
-docker rm $ID
-exit $exitCode
diff --git a/ci/integration_test/hps/hps_tf.sub b/ci/integration_test/hps/hps_tf.sub
deleted file mode 100644
index 6af222cb75..0000000000
--- a/ci/integration_test/hps/hps_tf.sub
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-
-srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
-    cd /workdir/hps_tf/test/unit && \
-    pytest && \
-    cd /workdir/hps_tf/test/integration && \
-    pytest test_naive_dnn_hps.py -s && \
-    pytest test_multi_table_sparse_input_hps.py -s && \
-    pytest test_lazy_initialization_hps.py -s && \
-    pytest test_multi_gpu_hps.py -s && \
-    pytest test_tf_context_stream.py -s && \
-    pytest test_hps_table_fusion.py -s"
-
-# Workaround for a known TensorFlow shutdown issue
-sleep 10
-EXITCODE=`sacct -j "${SLURM_JOBID}" -n --format=exitcode | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g'`
-echo "Job exit code: ${EXITCODE}"
-
-if [[ ${EXITCODE} -eq 6 && ${CI_JOB_NAME} == hps_tf* ]]; then
-    echo "Rerunning the job because the exit code is 6 and this is an hps_tf job."
-    srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
-        cd /workdir/hps_tf/test/unit && \
-        pytest && \
-        cd /workdir/hps_tf/test/integration && \
-        pytest test_naive_dnn_hps.py -s && \
-        pytest test_multi_table_sparse_input_hps.py -s && \
-        pytest test_lazy_initialization_hps.py -s && \
-        pytest test_multi_gpu_hps.py -s && \
-        pytest test_tf_context_stream.py -s && \
-        pytest test_hps_table_fusion.py -s"
-    # Take the last job step's exit code as the job exit code.
-    sed -i 's/sort -r -u | head -1/tail -1/g' ${JOBSCRIPTSDIR}/mlperf-ci/jobexitcode.sh
-    echo "Rerun job finished!";
-fi
-
diff --git a/ci/integration_test/hps/hps_torch.sub b/ci/integration_test/hps/hps_torch.sub
deleted file mode 100644
index 13aa18387e..0000000000
--- a/ci/integration_test/hps/hps_torch.sub
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
-    cd /workdir/hps_torch/test/unit && \
-    pytest -s && \
-    cd /workdir/hps_torch/test/integration && \
-    pytest test_hps_table_fusion.py -s"
-
-# Workaround for a known TensorFlow shutdown issue
-sleep 10
-EXITCODE=`sacct -j "${SLURM_JOBID}" -n --format=exitcode | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g'`
-echo "Job exit code: ${EXITCODE}"
-
-if [[ ${EXITCODE} -eq 6 && ${CI_JOB_NAME} == hps_torch* ]]; then
-    echo "Rerunning the job because the exit code is 6 and this is an hps_torch job."
-    srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
-        cd /workdir/hps_torch/test/unit && \
-        pytest -s && \
-        cd /workdir/hps_torch/test/integration && \
-        pytest test_hps_table_fusion.py -s"
-    # Take the last job step's exit code as the job exit code.
-    sed -i 's/sort -r -u | head -1/tail -1/g' ${JOBSCRIPTSDIR}/mlperf-ci/jobexitcode.sh
-    echo "Rerun job finished!";
-fi
-
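Both HPS plugin jobs above rerun their test block once when the aggregated Slurm exit code is 6, a known benign TensorFlow teardown failure. A minimal Python sketch of that retry pattern; the runner function is ours and does not exist in the deleted CI:

import subprocess

RETRYABLE_EXIT_CODE = 6  # known benign TF shutdown failure, per the deleted scripts

def run_with_one_retry(cmd: list[str]) -> int:
    # Run once; retry a single time only for the known-benign exit code.
    rc = subprocess.call(cmd)
    if rc == RETRYABLE_EXIT_CODE:
        rc = subprocess.call(cmd)
    return rc

Limiting the retry to one specific exit code keeps genuine test failures from being masked by the workaround.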
diff --git a/ci/integration_test/hps/hps_trt_in_merlin_hugectr.sub b/ci/integration_test/hps/hps_trt_in_merlin_hugectr.sub
deleted file mode 100644
index 37bbaa97bd..0000000000
--- a/ci/integration_test/hps/hps_trt_in_merlin_hugectr.sub
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash
-
-srun --ntasks=1 --container-image="${CONT}" bash -cx " \
-    cd /workdir/hps_trt/test/unit && \
-    pytest test_hps.py -s && \
-    cd /workdir/hps_trt/test/integration && \
-    pytest test_for_hugectr_train.py -s && \
-    pytest test_for_hugectr.py -s"
diff --git a/ci/integration_test/hps/hps_trt_in_merlin_pytorch.sub b/ci/integration_test/hps/hps_trt_in_merlin_pytorch.sub
deleted file mode 100644
index af10f0bbae..0000000000
--- a/ci/integration_test/hps/hps_trt_in_merlin_pytorch.sub
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-
-srun --ntasks=1 --container-image="${CONT}" bash -cx " \
-    cd /workdir/hps_trt/test/unit && \
-    pytest test_hps.py -s && \
-    cd /workdir/hps_trt/test/integration && \
-    pytest test_for_pytorch.py -s"
diff --git a/ci/integration_test/hps/hps_trt_in_merlin_tf.sub b/ci/integration_test/hps/hps_trt_in_merlin_tf.sub
deleted file mode 100644
index bc0705748c..0000000000
--- a/ci/integration_test/hps/hps_trt_in_merlin_tf.sub
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-
-srun --ntasks=1 --container-image="${CONT}" bash -cx " \
-    cd /workdir/hps_trt/test/unit && \
-    pytest test_hps.py -s && \
-    cd /workdir/hps_trt/test/integration && \
-    pytest test_for_tf.py -s"
diff --git a/ci/integration_test/hugectr2onnx/hugectr2onnx.sub b/ci/integration_test/hugectr2onnx/hugectr2onnx.sub
deleted file mode 100644
index c7cc687878..0000000000
--- a/ci/integration_test/hugectr2onnx/hugectr2onnx.sub
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
-    mkdir -p /onnx_converter/onnx_models/ &&
-    mkdir -p /onnx_converter/hugectr_models/ &&
-    mkdir -p /onnx_converter/graph_files &&
-    python3 /workdir/test/onnx_converter_test/layer_type_test.py &&
-    cd ${NEW_CRITEO_MOUNT} &&
-    mpirun -np 1 --allow-run-as-root python3 /workdir/test/onnx_converter_test/train_scripts/dcn.py &&
-    mpirun -np 1 --allow-run-as-root python3 /workdir/test/onnx_converter_test/train_scripts/deepfm.py &&
-    mpirun -np 1 --allow-run-as-root python3 /workdir/test/onnx_converter_test/train_scripts/dlrm.py &&
-    mpirun -np 1 --allow-run-as-root python3 /workdir/test/onnx_converter_test/train_scripts/dlrm_mlp.py &&
-    mpirun -np 1 --allow-run-as-root python3 /workdir/test/onnx_converter_test/train_scripts/wdl.py &&
-    python3 /workdir/test/onnx_converter_test/hugectr2onnx_dcn_test.py &&
-    python3 /workdir/test/onnx_converter_test/hugectr2onnx_wdl_test.py &&
-    cd ${DIN_DATASET_MOUNT} &&
-    mpirun -np 1 --allow-run-as-root python3 /workdir/test/onnx_converter_test/train_scripts/din_parquet.py &&
-    mpirun -np 1 --allow-run-as-root python3 /workdir/test/onnx_converter_test/train_scripts/din_try.py &&
-    python3 /workdir/test/onnx_converter_test/hugectr2onnx_din_test.py &&
-    cd ${NCF_DATASET_MOUNT} &&
-    mpirun -np 1 --allow-run-as-root python3 /workdir/test/onnx_converter_test/train_scripts/ncf.py &&
-    mpirun -np 1 --allow-run-as-root python3 /workdir/test/onnx_converter_test/train_scripts/gmf.py &&
-    mpirun -np 1 --allow-run-as-root python3 /workdir/test/onnx_converter_test/train_scripts/neumf.py &&
-    python3 /workdir/test/onnx_converter_test/hugectr2onnx_ncf_test.py &&
-    cd ${MMOE_DATASET_MOUNT} &&
-    mpirun -np 1 --allow-run-as-root python3 /workdir/test/onnx_converter_test/train_scripts/mmoe_parquet.py &&
-    python3 /workdir/test/onnx_converter_test/hugectr2onnx_mmoe_test.py"
diff --git a/ci/integration_test/inference/embedding_cache_perf_test.sub b/ci/integration_test/inference/embedding_cache_perf_test.sub
deleted file mode 100644
index 1d0502cdeb..0000000000
--- a/ci/integration_test/inference/embedding_cache_perf_test.sub
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" bash -cx "\
-    cd /workdir/build/bin && \
-    ./ec_perf_test"
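The embedding-cache and parameter-server harnesses below bring up a three-node Redis cluster plus ZooKeeper and Kafka as detached sidecar containers, run the C++ test binary in a GPU container, and propagate its exit code via docker wait. A hedged Python sketch of that orchestration pattern; the image name and test command are placeholders, not the CI's actual values:

import subprocess
import sys

# Placeholder sidecar commands modeled on the deleted scripts.
SIDECARS = [
    "cd /usr/local/redis/7000 && ../src/redis-server redis.conf",
    "cd /usr/local/redis/7001 && ../src/redis-server redis.conf",
    "cd /usr/local/redis/7002 && ../src/redis-server redis.conf",
]

def docker_detached(image: str, cmd: str, rm: bool) -> str:
    args = ["docker", "run", "--net=host", "-u", "root", "-d"]
    if rm:
        args.append("--rm")
    out = subprocess.check_output(args + [image, "sh", "-c", cmd])
    return out.decode().strip()  # container ID

def run_test(image: str, test_cmd: str) -> int:
    for cmd in SIDECARS:
        docker_detached(image, cmd, rm=True)
    # The test container is kept (no --rm) so `docker wait` can read its exit code.
    cid = docker_detached(image, test_cmd, rm=False)
    subprocess.call(["docker", "logs", "-f", cid])
    rc = int(subprocess.check_output(["docker", "wait", cid]).decode().strip())
    subprocess.call(["docker", "rm", cid])
    return rc

if __name__ == "__main__":
    sys.exit(run_test("hugectr:devel_hps_thirdparties", "./parameter_server_test"))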
/mnt/nvdl/usr/aleliu/inference_ci/model_repository:/models -v /mnt/nvdl/usr/aleliu/inference_ci/:/hugectr/test/utest/ -u root -d ${CONT} bash -cx "cd /workdir/build/bin && mkdir -p /hugectr/Test_Data/rockdb && ./embedding_cache_test || exit 1") -docker logs -f $ID \ No newline at end of file diff --git a/ci/integration_test/inference/embedding_cache_update_test.sh b/ci/integration_test/inference/embedding_cache_update_test.sh deleted file mode 100644 index 06a75517df..0000000000 --- a/ci/integration_test/inference/embedding_cache_update_test.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -echo $(pwd) -set -x - -docker login -u ${CI_PRIVATE_USER} -p "${CI_PRIVATE_KEY}" "${CI_REGISTRY}" - -ID=$(docker run --rm --net=host -u root -d gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_hps_thirdparties sh -c "cd /usr/local/redis/7000 && ../src/redis-server redis.conf ") -docker logs $ID - -ID=$(docker run --rm --net=host -u root -d gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_hps_thirdparties sh -c "cd /usr/local/redis/7001 && ../src/redis-server redis.conf ") -docker logs $ID - -ID=$(docker run --rm --net=host -u root -d gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_hps_thirdparties sh -c "cd /usr/local/redis/7002 && ../src/redis-server redis.conf ") -docker logs $ID - -ID=$(docker run --rm --net=host -u root -d gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_hps_thirdparties sh -c "cd /usr/local/redis/src && echo yes | ./redis-cli --cluster create 127.0.0.1:7000 127.0.0.1:7001 127.0.0.1:7002 --cluster-replicas 0 ") -docker logs $ID - -ID=$(docker run --rm --net=host -u root -d gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_hps_thirdparties sh -c "export JAVA_HOME=/usr/local/jdk-16.0.2 && /usr/local/zookeeper/bin/zkServer.sh start && /usr/local/kafka/bin/kafka-server-start.sh /usr/local/kafka/config/server.properties ") -docker logs $ID - -ID=$(docker run --net=host --gpus=all -v /home/hugectr-ci/data/inference_ci/model_repository:/models -u root -d ${CONT} bash -cx "cd /workdir/build/bin && mkdir -p /hugectr/Test_Data/rockdb && ./embedding_cache_update_test || exit 1") -docker logs -f $ID -exitCode=$(docker wait $ID) -docker rm $ID -exit $exitCode - diff --git a/ci/integration_test/inference/inference_hps.sub b/ci/integration_test/inference/inference_hps.sub deleted file mode 100755 index 950f137499..0000000000 --- a/ci/integration_test/inference/inference_hps.sub +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - cd /dataset/criteo_kaggle/dcn && \ - python3 /workdir/test/inference/hps/lookup_session_test.py hps_lookup /hugectr/test/utest/wdl_test_files/wdl0_sparse_2000.model,/hugectr/test/utest/wdl_test_files/wdl1_sparse_2000.model /hugectr/test/utest/wdl_test_files/first_ten.csv && \ - pip install torchvision==0.17.2+cu121 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121 && \ - python3 /workdir/test/inference/hps/hpsdlpack.py hpsdlpack /hugectr/test/utest/wdl_test_files/wdl0_sparse_2000.model,/hugectr/test/utest/wdl_test_files/wdl1_sparse_2000.model /hugectr/test/utest/wdl_test_files/first_ten.csv" diff --git a/ci/integration_test/inference/ps_test.sh b/ci/integration_test/inference/ps_test.sh deleted file mode 100644 index 0e8133d80c..0000000000 --- a/ci/integration_test/inference/ps_test.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -echo $(pwd) -set -x - -docker login -u ${CI_PRIVATE_USER} -p "${CI_PRIVATE_KEY}" 
"${CI_REGISTRY}" - -ID=$(docker run --rm --net=host -u root -d gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_hps_thirdparties sh -c "cd /usr/local/redis/7000 && ../src/redis-server redis.conf ") -docker logs $ID - -ID=$(docker run --rm --net=host -u root -d gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_hps_thirdparties sh -c "cd /usr/local/redis/7001 && ../src/redis-server redis.conf ") -docker logs $ID - -ID=$(docker run --rm --net=host -u root -d gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_hps_thirdparties sh -c "cd /usr/local/redis/7002 && ../src/redis-server redis.conf ") -docker logs $ID - -ID=$(docker run --rm --net=host -u root -d gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_hps_thirdparties sh -c "cd /usr/local/redis/src && echo yes | ./redis-cli --cluster create 127.0.0.1:7000 127.0.0.1:7001 127.0.0.1:7002 --cluster-replicas 0 ") -docker logs $ID - -ID=$(docker run --rm --net=host -u root -d gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_hps_thirdparties sh -c "export JAVA_HOME=/usr/local/jdk-16.0.2 && /usr/local/zookeeper/bin/zkServer.sh start && /usr/local/kafka/bin/kafka-server-start.sh /usr/local/kafka/config/server.properties ") -docker logs $ID - -ID=$(docker run --gpus=all --net=host -v /home/hugectr-ci/data/inference_ci/model_repository:/models -u root -d ${CONT} bash -cx "cd /workdir/build/bin && mkdir -p /hugectr/Test_Data/rockdb && ./parameter_server_test || exit 1 ") -docker logs -f $ID -exitCode=$(docker wait $ID) -docker rm $ID -exit $exitCode - - -#docker run --rm --net=host -u root -d gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_hps_thirdparties sh -c "cd /usr/local/redis/7000 && ../src/redis-server redis.conf " -#docker run --rm --net=host -u root -d gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_hps_thirdparties sh -c "cd /usr/local/redis/7001 && ../src/redis-server redis.conf " -#docker run --rm --net=host -u root -d gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_hps_thirdparties sh -c "cd /usr/local/redis/7002 && ../src/redis-server redis.conf " -#docker run --rm --net=host -u root -d gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_hps_thirdparties sh -c "cd /usr/local/redis/src && echo yes | ./redis-cli --cluster create 127.0.0.1:7000 127.0.0.1:7001 127.0.0.1:7002 --cluster-replicas 0 "; -#docker run --rm --net=host -u root -d gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_hps_thirdparties sh -c "export JAVA_HOME=/usr/local/jdk-16.0.2 && /usr/local/zookeeper/bin/zkServer.sh start && /usr/local/kafka/bin/kafka-server-start.sh /usr/local/kafka/config/server.properties " -#sleep 5 -#docker run --gpus=all --rm -v /gpfs/fs1/yingcanw:/hugectr/ -v /gpfs/fs1/yingcanw/wdl_infer:/wdl_infer --net=host gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_train sh -cx "cd /hugectr/hugectr/infer_build && ./bin/parameter_server_test"; diff --git a/ci/integration_test/mmoe/mmoe.sub b/ci/integration_test/mmoe/mmoe.sub deleted file mode 100644 index 64a1856c3b..0000000000 --- a/ci/integration_test/mmoe/mmoe.sub +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - cd /etc/workspace/mmoe_data && \ - python3 /workdir/test/pybind_test/mmoe_test.py" diff --git a/ci/integration_test/notebooks/hps_demo.sub b/ci/integration_test/notebooks/hps_demo.sub deleted file mode 100644 index 39bb904e6e..0000000000 --- a/ci/integration_test/notebooks/hps_demo.sub +++ /dev/null @@ -1,6 +0,0 @@ 
-#!/bin/bash - -srun --ntasks=1 --container-image="${CONT}" bash -cx " \ - chmod +x /usr/local/hugectr/bin/* && \ - pip install torchvision==0.17.2+cu121 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121 && \ - cd /workdir/test/notebook_test && pytest hps_demo.py" diff --git a/ci/integration_test/notebooks/notebook_hugectr.sub b/ci/integration_test/notebooks/notebook_hugectr.sub deleted file mode 100644 index 31c702078f..0000000000 --- a/ci/integration_test/notebooks/notebook_hugectr.sub +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - chmod +x /usr/local/hugectr/bin/* && \ - cd /workdir/tools/ && \ - ln -s /workdir/tools/day_0 /workdir/test/notebook_test/day_0 && \ - bash preprocess.sh 0 wdl_data pandas 1 1 100 && \ - ln -s /workdir/tools/wdl_data /workdir/test/notebook_test/wdl_data && \ - sed -i 's/from mpi4py import MPI/#from mpi4py import MPI/g' /workdir/notebooks/multi_gpu_offline_inference.ipynb && \ - cd /workdir/test/notebook_test && pytest notebook_hugectr.py && \ - rm -rf /workdir/test/notebook_test/wdl_data /workdir/tools/wdl_data && cd /workdir/tools" diff --git a/ci/integration_test/nvt/nvt_regression_test.sub b/ci/integration_test/nvt/nvt_regression_test.sub deleted file mode 100644 index 5410bddb07..0000000000 --- a/ci/integration_test/nvt/nvt_regression_test.sub +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cex " \ - export PYTHONPATH=$PYTHONPATH:/usr/local/hugectr/lib && \ - cd /workdir/samples/din && \ - mkdir -p din_data && cd utils && \ - bash preprocess.sh && \ - cd /workdir/tools/ && \ - bash preprocess.sh 1 criteo_data nvt 1 0 0 && \ - sed -i 's/from mpi4py import MPI/#from mpi4py import MPI/g' /workdir/notebooks/hugectr_e2e_demo_with_nvtabular.ipynb && \ - cd /workdir/test/notebook_test && pytest e2e_test_with_nvt.py" diff --git a/ci/integration_test/py_interface/py_low_level.sub b/ci/integration_test/py_interface/py_low_level.sub deleted file mode 100644 index 7da3e229e9..0000000000 --- a/ci/integration_test/py_interface/py_low_level.sub +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -set -e - -srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - cd /etc/workspace/new_criteo_kaggle && \ - mkdir /workdir/tmp/ && \ - python3 /workdir/test/pybind_test/wdl_fp16_8gpu_export.py /workdir/test/scripts/wdl_fp16_8gpu.json /workdir/tmp/" diff --git a/ci/integration_test/py_interface/py_multi_node.sub b/ci/integration_test/py_interface/py_multi_node.sub deleted file mode 100644 index 7f75184342..0000000000 --- a/ci/integration_test/py_interface/py_multi_node.sub +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -set -e - -#TODO: add restriction of NCCL_IB_HCA for draco-oci, may need remove it for other cluster -srun --ntasks=4 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \ - export NCCL_IB_HCA=\"=mlx5_1\" && \ - cd /dataset/criteo_kaggle/dcn_parquet && \ - python3 /workdir/test/pybind_test/dcn_4node_2gpu.py /workdir/test/scripts/dcn_parquet_distributed_4node_2gpu.json" - -srun --ntasks=2 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \ - export NCCL_IB_HCA=\"=mlx5_1\" && \ - cd /dataset/criteo_kaggle/criteo_parquet && \ - python3 /workdir/test/pybind_test/criteo_2node_4gpu.py /workdir/test/scripts/criteo_2node_4gpu.json" diff --git a/ci/integration_test/py_interface/py_single_node.sub 
b/ci/integration_test/py_interface/py_single_node.sub deleted file mode 100644 index 1f34d35ac0..0000000000 --- a/ci/integration_test/py_interface/py_single_node.sub +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - cd /dataset/criteo_kaggle/dcn_parquet && \ - python3 /workdir/test/pybind_test/dcn_1gpu.py && \ - cd /etc/workspace/new_criteo_kaggle && \ - python3 /workdir/test/pybind_test/wdl_fp16_8gpu.py /workdir/test/scripts/wdl_fp16_8gpu.json" \ No newline at end of file diff --git a/ci/integration_test/s3/s3_backend_test.sub b/ci/integration_test/s3/s3_backend_test.sub deleted file mode 100644 index afb073d3ed..0000000000 --- a/ci/integration_test/s3/s3_backend_test.sub +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" bash -cx "\ - mkdir -p ~/.aws && cd ~/.aws && \ - echo \"[default]\" > credentials && \ - echo \"aws_access_key_id = ${AWS_ACCESS_KEY_ID}\" >> credentials && \ - echo \"aws_secret_access_key = ${AWS_SECRET_ACCESS_KEY}\" >> credentials && \ - cd /workdir/build/bin && \ - ./s3_backend_test" - diff --git a/ci/integration_test/sok/sok.sub b/ci/integration_test/sok/sok.sub deleted file mode 100644 index b50edc4bc9..0000000000 --- a/ci/integration_test/sok/sok.sub +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - cd /workdir/sparse_operation_kit/unit_test/test_scripts && \ - bash script.sh" - -# Add workaround due to the known issue of tensorflow -sleep 10 -EXITCODE=`sacct -j "${SLURM_JOBID}" -n --format=exitcode | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g'` -echo "Job exit code: ${EXITCODE}" - -if [[ ${EXITCODE} -eq 6 && ${CI_JOB_NAME} == "sparse_operation_kit-TF2" ]]; then - echo "Rerun the job, if job exit code is 6 and job name is sparse_operation_kit-TF2." - srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - cd /workdir/sparse_operation_kit/unit_test/test_scripts && \ - bash script.sh" - # Get the last job step exit code as job exit code. 
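# sacct prints one "rc:signal" pair per job step, so "sort -r -u | head -1"
# reports the highest exit code seen across all steps -- after a successful
# rerun, the failed first attempt would still win. The sed below therefore
# switches the shared helper (assumed to live outside this repo at
# ${JOBSCRIPTSDIR}/mlperf-ci/jobexitcode.sh) to the last step instead. A
# minimal sketch of the resulting extraction, assuming the same sacct output
# format as the EXITCODE line above:
#
#   EXITCODE=$(sacct -j "${SLURM_JOBID}" -n --format=exitcode | tail -1 | cut -f 1 -d":" | sed 's/ //g')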
- sed -i 's/sort -r -u | head -1/tail -1/g' ${JOBSCRIPTSDIR}/mlperf-ci/jobexitcode.sh - echo "Rerun job finished!"; -fi - diff --git a/ci/integration_test/sok/sok_tf1_unit.sub b/ci/integration_test/sok/sok_tf1_unit.sub deleted file mode 100644 index 1efd50d0fd..0000000000 --- a/ci/integration_test/sok/sok_tf1_unit.sub +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -set -e - -srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - cd /workdir/sparse_operation_kit/sparse_operation_kit/test/function_test && \ - bash run_function_test.sh" diff --git a/ci/integration_test/sok/sok_tf2_unit.sub b/ci/integration_test/sok/sok_tf2_unit.sub deleted file mode 100644 index 8e131593e5..0000000000 --- a/ci/integration_test/sok/sok_tf2_unit.sub +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -set -e - -srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - cd /workdir/sparse_operation_kit/sparse_operation_kit/test/function_test && \ - bash run_function_test.sh" - -srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - cd /workdir/sparse_operation_kit/sparse_operation_kit/test/function_test && \ - bash run_function_test_multi_process.sh 1" - -srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - cd /workdir/sparse_operation_kit/sparse_operation_kit/test/function_test && \ - bash run_dump_load_test_multi_process.sh 1" diff --git a/ci/integration_test/sok/test_sok_pypi.sub b/ci/integration_test/sok/test_sok_pypi.sub deleted file mode 100644 index 5d4f406926..0000000000 --- a/ci/integration_test/sok/test_sok_pypi.sub +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -srun --ntasks=1 --container-image="${CONT}" bash -cx " \ - rm -rf /usr/local/lib/python\${PYTHON_VERSION}/dist-packages/merlin_sok* - cd /workdir/sparse_operation_kit/ && \ - python setup.py sdist && \ - cd dist && \ - pip install *.tar.gz --no-build-isolation && \ - cd /workdir/sparse_operation_kit/sparse_operation_kit/test/function_test && \ - bash run_function_test.sh" - diff --git a/ci/integration_test/wdl/wdl.sub b/ci/integration_test/wdl/wdl.sub deleted file mode 100644 index 5cd8a85f6f..0000000000 --- a/ci/integration_test/wdl/wdl.sub +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - cd /dataset/criteo_kaggle/ && \ - python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/wdl_1gpu.json && \ - python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/wdl_fp16_1gpu.json" diff --git a/ci/integration_test/wdl/wdl_daily.sub b/ci/integration_test/wdl/wdl_daily.sub deleted file mode 100644 index 4bc29e3ccd..0000000000 --- a/ci/integration_test/wdl/wdl_daily.sub +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - cd /dataset/criteo_kaggle/ && \ - python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/wdl_8gpu.json && \ - mkdir /workdir/export_predictions_wdl_fp16_8gpu/ && \ - python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/wdl_fp16_8gpu.json" diff --git a/ci/post_test/check_147gb_model_benchmark.sub b/ci/post_test/check_147gb_model_benchmark.sub deleted file mode 100644 index c56d73a020..0000000000 --- a/ci/post_test/check_147gb_model_benchmark.sub +++ /dev/null @@ -1,4 +0,0 @@ 
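# Each ci/post_test/check_*.sub wrapper below follows the same shape: run
# check_performance.py inside the container, where --job_name selects both the
# log-parsing regex (the log_pattern table in check_performance.py) and the
# matching threshold entry in ci/post_test/perf_benchmark.json. A sketch, with
# <job_name> standing in for the per-benchmark key:
#
#   srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
#     python3 /workdir/ci/post_test/check_performance.py --job_name <job_name> --log_path /logs"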
-#!/bin/bash
-
-srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
- python3 /workdir/ci/post_test/check_performance.py --job_name 147gb_model_benchmark --log_path /logs"
diff --git a/ci/post_test/check_cpu_memory_usage.py b/ci/post_test/check_cpu_memory_usage.py
deleted file mode 100644
index 5ed61461a6..0000000000
--- a/ci/post_test/check_cpu_memory_usage.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import time
-import sys
-import pandas as pd
-import re
-
-
-# read top output log file
-if len(sys.argv) == 3:
- print("Top command output file path is " + sys.argv[1])
- original_data = pd.read_csv(sys.argv[1], sep="\s+", header=None)
-else:
- print("Wrong input arguments: expected <top_log_path> <threshold_gb>")
- sys.exit(-1)
-
-
-if original_data.shape[1] == 12:
- original_data.columns = [
- "PID",
- "USER",
- "PR",
- "NI",
- "VIRT",
- "RES",
- "SHR",
- "S",
- "%CPU",
- "%MEM",
- "TIME+",
- "COMMAND",
- ]
-else:
- print("Please check top command output format")
- sys.exit(-1)
-# Sort by %MEM so that row 0 holds the process with the largest resident set
-original_data.sort_values(by="%MEM", ascending=False, inplace=True)
-original_data.reset_index(inplace=True)
-# Get the maximum physical memory usage; top prints RES in KiB by default, with a "t"/"g" suffix for TiB/GiB
-searchObj_forT = re.search(r"(.*)(t|T)", original_data["RES"][0], re.M | re.I)
-searchObj_forG = re.search(r"(.*)(G|g)", original_data["RES"][0], re.M | re.I)
-searchObj_forM = re.search(r"(.*)", original_data["RES"][0], re.M | re.I)
-cpu_usage_gb = 0
-if searchObj_forT:
- cpu_usage_gb = float(searchObj_forT.group(1)) * 1000
-elif searchObj_forG:
- cpu_usage_gb = float(searchObj_forG.group(1))
-else:
- cpu_usage_gb = float(searchObj_forM.group(1)) / 1000 / 1000
-
-if cpu_usage_gb > float(sys.argv[2]):
- print("The maximum physical memory usage exceeds the threshold of " + sys.argv[2] + " GB")
- sys.exit(-1)
diff --git a/ci/post_test/check_cpu_usage.sub b/ci/post_test/check_cpu_usage.sub
deleted file mode 100644
index 2e4608e7bb..0000000000
--- a/ci/post_test/check_cpu_usage.sub
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-
-srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
- python3 /workdir/ci/post_test/check_cpu_memory_usage.py /logs/cpu_dynamic_mem.log 16 && \
- python3 /workdir/ci/post_test/check_cpu_memory_usage.py /logs/cpu_uvm_mem.log 16 && \
- python3 /workdir/ci/post_test/check_cpu_memory_usage.py /logs/cpu_static_mem.log 16 "
diff --git a/ci/post_test/check_dcnv2_dlrm_1node.sub b/ci/post_test/check_dcnv2_dlrm_1node.sub
deleted file mode 100644
index 003616b795..0000000000
--- a/ci/post_test/check_dcnv2_dlrm_1node.sub
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-
-srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
- python3 /workdir/ci/post_test/check_performance.py --job_name dlrm_dcnv2_1node --log_path /logs"
diff --git a/ci/post_test/check_dcnv2_dlrm_8node.sub b/ci/post_test/check_dcnv2_dlrm_8node.sub
deleted file mode 100644
index 21f0800f27..0000000000
--- a/ci/post_test/check_dcnv2_dlrm_8node.sub
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-
-srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
- python3 /workdir/ci/post_test/check_performance.py --job_name dlrm_dcnv2_8node --log_path /logs"
diff --git a/ci/post_test/check_dlrm_14node.sub b/ci/post_test/check_dlrm_14node.sub
deleted file mode 100644
index e821a0686c..0000000000
--- a/ci/post_test/check_dlrm_14node.sub
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-
-srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
- python3
/workdir/ci/post_test/check_performance.py --job_name dlrm_14node --log_path /logs" diff --git a/ci/post_test/check_dlrm_1node.sub b/ci/post_test/check_dlrm_1node.sub deleted file mode 100644 index 0885f20188..0000000000 --- a/ci/post_test/check_dlrm_1node.sub +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - python3 /workdir/ci/post_test/check_performance.py --job_name dlrm_1node --log_path /logs" diff --git a/ci/post_test/check_hps_backend_result.sub b/ci/post_test/check_hps_backend_result.sub deleted file mode 100644 index 7d7a61f990..0000000000 --- a/ci/post_test/check_hps_backend_result.sub +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - cd /hugectr && \ - python3 /hugectr/ci/post_test/check_performance.py --job_name hps_backend_benchmark --log_path /logs " diff --git a/ci/post_test/check_hps_plugin_benchmark.sub b/ci/post_test/check_hps_plugin_benchmark.sub deleted file mode 100644 index d2a6afc915..0000000000 --- a/ci/post_test/check_hps_plugin_benchmark.sub +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - python3 /workdir/ci/post_test/check_performance.py --job_name hps_plugin_benchmark --log_path /logs" diff --git a/ci/post_test/check_hps_tf_fuse_table_benchmark.sub b/ci/post_test/check_hps_tf_fuse_table_benchmark.sub deleted file mode 100644 index 966e9f8376..0000000000 --- a/ci/post_test/check_hps_tf_fuse_table_benchmark.sub +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - python3 /workdir/ci/post_test/check_performance.py --job_name hps_tf_fuse_table_benchmark --log_path /logs" diff --git a/ci/post_test/check_hps_torch_fuse_table_benchmark.sub b/ci/post_test/check_hps_torch_fuse_table_benchmark.sub deleted file mode 100644 index 10edd5a859..0000000000 --- a/ci/post_test/check_hps_torch_fuse_table_benchmark.sub +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - python3 /workdir/ci/post_test/check_performance.py --job_name hps_torch_fuse_table_benchmark --log_path /logs" diff --git a/ci/post_test/check_inference_benchmark.sub b/ci/post_test/check_inference_benchmark.sub deleted file mode 100644 index 8077519bc8..0000000000 --- a/ci/post_test/check_inference_benchmark.sub +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - python3 /workdir/ci/post_test/check_performance.py --job_name inference_benchmark --log_path /logs" diff --git a/ci/post_test/check_performance.py b/ci/post_test/check_performance.py deleted file mode 100644 index 8c6b6100f1..0000000000 --- a/ci/post_test/check_performance.py +++ /dev/null @@ -1,436 +0,0 @@ -import os -from argparse import ArgumentParser -import json -import re -import glob -from collections import defaultdict -import math - -expected_result_json = "./ci/post_test/perf_benchmark.json" -log_pattern = { - "wdl_8gpu": { - "cmd_log": r"python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/wdl_8gpu.json", - "result_log": r"Finish 3000 iterations with batchsize: 16384 in (\d+\.?\d*)s", - }, - "dlrm_1node": { - "cmd_log": r"python3 /workdir/samples/dlrm/dgx_a100.py", - "result_log": r"/ 75868 iterations with 
batchsize 55296 in (\d+\.?\d*)s. Average",
- },
- "dlrm_14node": {
- "cmd_log": r"HugeCTR Version",
- "result_log": r"/ 58527 iterations with batchsize 71680 in (\d+\.?\d*)s. Average",
- },
- "dlrm_dcnv2_1node": {
- "cmd_log": r"python3 train.py",
- "result_log": r"/ (\d+) iterations with batchsize (\d+) in (\d+\.?\d*)s. Average speed is (\d+\.?\d*) records/s",
- },
- "dlrm_dcnv2_8node": {
- "cmd_log": r"python3 train.py",
- "result_log": r"/ (\d+) iterations with batchsize (\d+) in (\d+\.?\d*)s. Average speed is (\d+\.?\d*) records/s",
- },
- "inference_benchmark": {
- "cmd_log": r"Server:",
- "result_log": r"Avg request latency: (\d+\.?\d*) usec",
- },
- "sok": {"cmd_log": r"python3 main.py ", "result_log": r"elapsed_time: (\d+\.?\d*)"},
- "train_bmk": {
- "cmd_log": r"python3 ./benchmark_train.py",
- "result_log": r"Time\(200 iters\): (\d+\.?\d*)s",
- },
- "inference_benchmark_avg": {
- "cmd_log": r"Client:",
- "result_log": r"Avg latency: (\d+\.?\d*) usec",
- },
- "inference_benchmark_p50": {
- "cmd_log": r"Client:",
- "result_log": r"p50 latency: (\d+\.?\d*) usec",
- },
- "inference_benchmark_p90": {
- "cmd_log": r"Client:",
- "result_log": r"p90 latency: (\d+\.?\d*) usec",
- },
- "inference_benchmark_p95": {
- "cmd_log": r"Client:",
- "result_log": r"p95 latency: (\d+\.?\d*) usec",
- },
- "inference_benchmark_p99": {
- "cmd_log": r"Client:",
- "result_log": r"p99 latency: (\d+\.?\d*) usec",
- },
- "hps_plugin_benchmark": {
- "cmd_log": r"compute infer",
- "result_log": r"compute infer (\d+\.?\d*) usec",
- },
- "hps_torch_fuse_table_benchmark": {
- "cmd_log": r"compute infer",
- "result_log": r"compute infer (\d+\.?\d*) usec",
- },
- "hps_tf_fuse_table_benchmark": {
- "cmd_log": r"compute infer",
- "result_log": r"compute infer (\d+\.?\d*) usec",
- },
- "147gb_model_benchmark": {
- "cmd_log": r"compute infer",
- "result_log": r"compute infer (\d+\.?\d*) usec",
- },
- "hps_backend_avg_latency": {
- "cmd_log": r"compute infer",
- "result_log": r"compute infer (\d+\.?\d*) usec",
- },
-}
-
-
-def extract_result_from_log(job_name, log_path):
- log_files = glob.glob(os.path.join(log_path, "*", "results", "*.log"))
- log_files = [fname for fname in log_files if re.match(r".*[0-9]+\.log", fname)]
- print("all log files", log_files)
- latest_log_file = max(log_files, key=os.path.getctime)
- print("use latest log file", latest_log_file)
- job_log_pattern = log_pattern[job_name]
- results = []
- with open(latest_log_file, "r", errors="ignore") as f:
- lines = "".join(f.readlines())
- job_logs = lines.split("+ ")
- for each_job_log in job_logs:
- if re.search(job_log_pattern["cmd_log"], each_job_log):
- for line in each_job_log.split("\n"):
- match = re.search(job_log_pattern["result_log"], line)
- if match is None:
- continue
- if job_name == "dlrm_dcnv2_1node" or job_name == "dlrm_dcnv2_8node":
- result = float(match.group(4))
- else:
- result = float(match.group(1))
- results.append(result)
- if (
- job_name == "hps_plugin_benchmark"
- or job_name == "hps_torch_fuse_table_benchmark"
- or job_name == "hps_tf_fuse_table_benchmark"
- or job_name == "147gb_model_benchmark"
- ):
- return results
- return sum(results) / len(results) if len(results) > 0 else float("inf")
-
-
-def extract_result_from_json(job_name):
- with open(expected_result_json, "r") as f:
- expected_result = json.load(f)
- return expected_result[job_name]
-
-
-def collect_benchmark_result(log_path):
- headers = [
- "name",
- "batch_size",
- "batch_size_per_gpu",
- "total_gpu_num",
- "node_num",
- "precision",
- "platform",
- "ms
per iteration", - "p99 latency(usec)", - "p95 latency(usec)", - "p90 latency(usec)", - "p50 latency(usec)", - "Avg latency(usec)", - "throughput", - "result_log_path", - ] - list_benchmark = [] - for train_bmk_name in ["wdl", "dcn", "deepfm"]: - for bz_per_gpu in [256, 512, 1024, 2048, 4096, 8192]: - for gpu_num in [1, 2, 4, 8, 16, 32]: - for mixed_precision in ["FP16", "FP32"]: - benchmark = ["" for _ in range(len(headers))] - benchmark[headers.index("name")] = train_bmk_name - benchmark[headers.index("batch_size")] = bz_per_gpu * gpu_num - benchmark[headers.index("batch_size_per_gpu")] = bz_per_gpu - benchmark[headers.index("total_gpu_num")] = gpu_num - node_num = (gpu_num - 1) // 8 + 1 - benchmark[headers.index("node_num")] = node_num - benchmark[headers.index("precision")] = mixed_precision - benchmark[headers.index("platform")] = "selene" - - gpu_num_per_node = gpu_num % 8 if gpu_num % 8 != 0 else 8 - result_log_path = os.path.join( - log_path, - "train_benchmark--{bmk_name}--{node_num}x{gpu_num_per_node}x{bz_per_gpu}x{mixed_precision}".format( - bmk_name=train_bmk_name, - node_num=node_num, - gpu_num_per_node=gpu_num_per_node, - bz_per_gpu=bz_per_gpu, - mixed_precision=mixed_precision, - ), - ) - benchmark[headers.index("result_log_path")] = result_log_path - if os.path.exists(result_log_path): - ms_per_iteration = extract_result_from_log("train_bmk", result_log_path) - ms_per_iteration = ms_per_iteration / 200 * 1000 - benchmark[headers.index("ms per iteration")] = ms_per_iteration - list_benchmark.append(benchmark) - - for bz in [1, 32, 64, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]: - for mixed_precision in ["FP16", "FP32"]: - benchmark = ["" for _ in range(len(headers))] - benchmark[headers.index("name")] = "inference_benchmark" - benchmark[headers.index("batch_size")] = bz - benchmark[headers.index("batch_size_per_gpu")] = bz - benchmark[headers.index("total_gpu_num")] = 1 - benchmark[headers.index("node_num")] = 1 - benchmark[headers.index("precision")] = mixed_precision - benchmark[headers.index("platform")] = "selene" - - result_log_path = os.path.join( - log_path, - "inference_benchmark_{bz}x{mixed_precision}".format( - bz=bz, mixed_precision=mixed_precision - ), - ) - benchmark[headers.index("result_log_path")] = result_log_path - if os.path.exists(result_log_path): - avg_latency = extract_result_from_log("inference_benchmark_avg", result_log_path) - benchmark[headers.index("Avg latency(usec)")] = avg_latency - p50_latency = extract_result_from_log("inference_benchmark_p50", result_log_path) - benchmark[headers.index("p50 latency(usec)")] = p50_latency - p90_latency = extract_result_from_log("inference_benchmark_p90", result_log_path) - benchmark[headers.index("p90 latency(usec)")] = p90_latency - p95_latency = extract_result_from_log("inference_benchmark_p95", result_log_path) - benchmark[headers.index("p95 latency(usec)")] = p95_latency - p99_latency = extract_result_from_log("inference_benchmark_p99", result_log_path) - benchmark[headers.index("p99 latency(usec)")] = p99_latency - list_benchmark.append(benchmark) - - for bz in [8192, 16384, 32768, 65536]: - for gpu_num in [1, 2, 4, 8]: - benchmark = ["" for _ in range(len(headers))] - benchmark[headers.index("name")] = "sok" - benchmark[headers.index("batch_size")] = bz - bz_per_gpu = bz // gpu_num - benchmark[headers.index("batch_size_per_gpu")] = bz_per_gpu - benchmark[headers.index("total_gpu_num")] = gpu_num - benchmark[headers.index("node_num")] = 1 - benchmark[headers.index("precision")] = "FP32" - 
benchmark[headers.index("platform")] = "selene"
-
- result_log_path = os.path.join(
- log_path,
- "sok_benchmark_{bz_per_gpu}x{gpu_num}".format(
- bz_per_gpu=bz_per_gpu, gpu_num=gpu_num
- ),
- )
- benchmark[headers.index("result_log_path")] = result_log_path
- if os.path.exists(result_log_path):
- ms_per_iteration = extract_result_from_log("sok", result_log_path)
- ms_per_iteration = ms_per_iteration * 10
- benchmark[headers.index("ms per iteration")] = ms_per_iteration
- list_benchmark.append(benchmark)
-
- for bz in [256, 1024, 2048, 8192, 131072]:
- for gpu_num in [1]:
- benchmark = ["" for _ in range(len(headers))]
- benchmark[headers.index("name")] = "hps_backend"
- benchmark[headers.index("batch_size")] = bz
- bz_per_gpu = bz // gpu_num
- benchmark[headers.index("batch_size_per_gpu")] = bz_per_gpu
- benchmark[headers.index("total_gpu_num")] = gpu_num
- benchmark[headers.index("node_num")] = 1
- benchmark[headers.index("precision")] = "FP32"
- benchmark[headers.index("platform")] = "selene"
-
- result_log_path = os.path.join(
- log_path,
- "hps_backend_benchmark_{bz}".format(bz=bz),
- )
- benchmark[headers.index("result_log_path")] = result_log_path
- if os.path.exists(result_log_path):
- backend_avg_latency = extract_result_from_log(
- "hps_backend_avg_latency", result_log_path
- )
- backend_throughput = int(1000000.0 / backend_avg_latency * bz)
- benchmark[headers.index("Avg latency(usec)")] = backend_avg_latency
- benchmark[headers.index("throughput")] = backend_throughput
- list_benchmark.append(benchmark)
-
- print(",".join(headers))
- for benchmark in list_benchmark:
- print(",".join(str(i) for i in benchmark))
-
-
-def check_perf_result(perf_result, expected_result):
- if math.isinf(perf_result):
- raise RuntimeError("perf_result is {}: no result was parsed from the logs! Please check!".format(math.fabs(perf_result)))
- if float(perf_result) > float(expected_result):
- raise RuntimeError(
- "performance got worse. perf latency: {} vs. upper bound latency: {}".format(
- math.fabs(perf_result), math.fabs(expected_result)
- )
- )
- else:
- print(
- "performance check passed. perf latency: {} vs.
upper bound latency :{}".format( - math.fabs(perf_result), math.fabs(expected_result) - ) - ) - - -if __name__ == "__main__": - parser = ArgumentParser() - parser.add_argument("--collect_result", action="store_true", default=False) - parser.add_argument("--job_name") - parser.add_argument("--log_path", required=True) - args = parser.parse_args() - - if args.collect_result: - collect_benchmark_result(args.log_path) - else: - expected_result = extract_result_from_json(args.job_name) - - if args.job_name == "hps_plugin_benchmark": - perf_result = extract_result_from_log(args.job_name, args.log_path) - idx = 0 - batch_sizes = ["32", "1024", "16384"] - print("DLRM Inference Latency (usec)") - print( - "-----------------------------------------------------------------------------------------" - ) - print("batch_size\tnative tf\ttf_with_hps\tfp32_trt_with_hps\tfp16_trt_with_hps") - print( - "-----------------------------------------------------------------------------------------" - ) - for i in range(len(perf_result) // 4): - print( - "{}\t\t{}\t\t{}\t\t{}\t\t\t{}".format( - batch_sizes[i], - perf_result[i * 4], - perf_result[i * 4 + 1], - perf_result[i * 4 + 2], - perf_result[i * 4 + 3], - ) - ) - print( - "-----------------------------------------------------------------------------------------" - ) - for batch_size in batch_sizes: - for model_name in [ - "native_tf", - "tf_with_hps", - "fp32_trt_with_hps", - "fp16_trt_with_hps", - ]: - perf = perf_result[idx] - expected = expected_result[model_name][batch_size] - check_perf_result(perf, expected) - idx += 1 - elif ( - args.job_name == "hps_tf_fuse_table_benchmark" - or args.job_name == "hps_torch_fuse_table_benchmark" - ): - perf_result = extract_result_from_log(args.job_name, args.log_path) - idx = 0 - batch_sizes = ["256", "1024", "4096", "16384"] - print(f"Job Name: {args.job_name}") - print("HPS Fuse Table Model Inference Latency (usec)") - print("-" * 137) - print( - "batch_size\t8_static_table_unfused\t\t8_static_table_autofused\t8_dynamic_table_unfused\t\t8_dynamic_table_autofused" - ) - print("-" * 137) - for i in range(len(perf_result) // 4): - print( - "{}\t\t{}\t\t\t\t{}\t\t\t\t{}\t\t\t\t{}".format( - batch_sizes[i], - perf_result[i * 4], - perf_result[i * 4 + 1], - perf_result[i * 4 + 2], - perf_result[i * 4 + 3], - ) - ) - print("-" * 137) - for batch_size in batch_sizes: - for model_name in [ - "8_static_table_unfused", - "8_static_table_autofused", - "8_dynamic_table_unfused", - "8_dynamic_table_autofused", - ]: - perf = perf_result[idx] - expected = expected_result[model_name][batch_size] - check_perf_result(perf, expected) - idx += 1 - elif args.job_name == "147gb_model_benchmark": - perf_result = extract_result_from_log(args.job_name, args.log_path) - idx = 0 - batch_sizes = ["256", "1024", "4096", "16384"] - print("147GB Model Inference Latency (usec)") - print("-" * 100) - print( - "batch_size\tdynamic_1fc_lite_hps_trt\tdynamic_3fc_lite_hps_trt\tdynamic_dlrm_hps_trt" - ) - print("-" * 100) - for i in range(len(perf_result) // 3): - print( - "{}\t\t{}\t\t\t\t{}\t\t\t\t{}".format( - batch_sizes[i], - perf_result[i * 3], - perf_result[i * 3 + 1], - perf_result[i * 3 + 2], - ) - ) - print("-" * 100) - for batch_size in batch_sizes: - for model_name in [ - "dynamic_1fc_lite_hps_trt", - "dynamic_3fc_lite_hps_trt", - "dynamic_dlrm_hps_trt", - ]: - perf = perf_result[idx] - expected = expected_result[model_name][batch_size] - check_perf_result(perf, expected) - idx += 1 - elif args.job_name == "hps_backend_benchmark": - idx = 0 - 
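        # A note on the arithmetic used below, assuming latencies are reported
        # in microseconds: a batch of bz requests answered in L usec sustains
        # bz / (L * 1e-6) = 1000000 / L * bz records per second, the same
        # throughput formula applied in collect_benchmark_result above.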
perf_result = [] - batch_sizes = ["256", "1024", "2048", "8192", "131072"] - for bz in batch_sizes: - result_log_path = os.path.join( - args.log_path, - "hps_backend_benchmark_{bz}".format(bz=bz), - ) - if os.path.exists(result_log_path): - backend_avg_latency = extract_result_from_log( - "hps_backend_avg_latency", result_log_path - ) - perf_result.append(backend_avg_latency) - - print("HPS Backend Inference Latency (usec) and Throughput") - print( - "-----------------------------------------------------------------------------------------" - ) - print("batch_size\tavg_latency\tthroughput") - print( - "-----------------------------------------------------------------------------------------" - ) - for i in range(len(perf_result)): - print( - "{}\t\t{}\t\t{}".format( - batch_sizes[i], - perf_result[i], - int(1000000.0 / perf_result[i] * int(batch_sizes[i])), - ) - ) - print( - "-----------------------------------------------------------------------------------------" - ) - idx = 0 - for batch_size in batch_sizes: - perf = perf_result[idx] - print("Check avg_latency for BZ: {}".format(batch_size)) - expected_latency = expected_result["avg_latency"][batch_size] - check_perf_result(perf, expected_latency) - idx += 1 - else: - perf_result = extract_result_from_log(args.job_name, args.log_path) - if args.job_name in ["dlrm_dcnv2_1node", "dlrm_dcnv2_8node"]: - check_perf_result(-perf_result, -expected_result) - else: - check_perf_result(perf_result, expected_result) diff --git a/ci/post_test/check_wdl.sub b/ci/post_test/check_wdl.sub deleted file mode 100644 index 863970cfe6..0000000000 --- a/ci/post_test/check_wdl.sub +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - python3 /workdir/ci/post_test/check_performance.py --job_name wdl_8gpu --log_path /logs" diff --git a/ci/post_test/collect_benchmark.sub b/ci/post_test/collect_benchmark.sub deleted file mode 100644 index fc9918d07f..0000000000 --- a/ci/post_test/collect_benchmark.sub +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - cd /hugectr && \ - python3 /hugectr/ci/post_test/check_performance.py --collect_result --log_path /logs " diff --git a/ci/post_test/perf_benchmark.json b/ci/post_test/perf_benchmark.json deleted file mode 100644 index 48371a571c..0000000000 --- a/ci/post_test/perf_benchmark.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "wdl_1gpu": 20.044, - "wdl_8gpu": 56, - "dlrm_1node": 119, - "dlrm_14node": 60, - "dlrm_dcnv2_1node": 6660000, - "dlrm_dcnv2_8node": 18250000, - "inference_benchmark": 500, - "hps_plugin_benchmark": { - "native_tf": {"32":800, "1024":3000, "16384":50000}, - "tf_with_hps": {"32":800, "1024":1500, "16384":6000}, - "fp32_trt_with_hps": {"32":600, "1024":1000, "16384":5000}, - "fp16_trt_with_hps": {"32":500, "1024":800, "16384":4000} - }, - "hps_torch_fuse_table_benchmark": { - "8_static_table_unfused": {"256":900, "1024":1000, "4096":1300, "16384":3000}, - "8_static_table_autofused": {"256":700, "1024":800, "4096":1300, "16384":3600}, - "8_dynamic_table_unfused": {"256":2200, "1024":5400, "4096":9500, "16384":13000}, - "8_dynamic_table_autofused": {"256":1000, "1024":1300, "4096":2600, "16384":6000} - }, - "hps_tf_fuse_table_benchmark": { - "8_static_table_unfused": {"256":1000, "1024":1200, "4096":1600, "16384":3500}, - "8_static_table_autofused": {"256":800, "1024":1000, "4096":1800, "16384":4500}, - "8_dynamic_table_unfused": {"256":2200, 
"1024":5200, "4096":9500, "16384":13000}, - "8_dynamic_table_autofused": {"256":1100, "1024":1500, "4096":3000, "16384":7000} - }, - "147gb_model_benchmark": { - "dynamic_1fc_lite_hps_trt": {"256":250, "1024":550, "4096":1900, "16384":5800}, - "dynamic_3fc_lite_hps_trt": {"256":300, "1024":600, "4096":2000, "16384":6000}, - "dynamic_dlrm_hps_trt": {"256":500, "1024":800, "4096":2500, "16384":7000} - }, - "hps_backend_benchmark": { - "avg_latency": {"256":600, "1024":1700, "2048":9000, "8192":36000, "131072":520000} - } -} diff --git a/ci/release.yml b/ci/release.yml deleted file mode 100644 index f3e3c8c3a5..0000000000 --- a/ci/release.yml +++ /dev/null @@ -1,258 +0,0 @@ - -# .release_test_job: &release_test_job -# allow_failure: false -# only: -# variables: -# - $RELEASE_TEST == "1" - -# .release_test_job_selene: &release_test_job_selene -# extends: .selene_job -# <<: *release_test_job -# stage: test -# variables: &release_test_job_selene_variables -# CONTAINER_IMAGE: ${TRAIN_IMAGE_RELEASE_ENROOT} -# CONTS: ${CI_PROJECT_DIR}:${CI_PROJECT_DIR},${DATASET_CRITEO_SELENE}:${CRITEO_MOUNT},${DATASET_NEW_CRITEO_SELENE}:${NEW_CRITEO_MOUNT},${DLRM_DATASET}:${DLRM_MOUNT},/lustre/fsw/devtech/hpc-hugectr/inference/:/hugectr/test/utest/ - -# .release_test_job_circe: &release_test_job_circe -# extends: .circe_job -# <<: *release_test_job -# stage: test -# variables: &release_test_job_circe_variables -# CONTAINER_IMAGE: ${TRAIN_IMAGE_RELEASE_ENROOT} -# CONTS: $(pwd):$(pwd),${DATASET_CRITEO_CIRCE}:${CRITEO_MOUNT},${DATASET_NEW_CRITEO_CIRCE}:${NEW_CRITEO_MOUNT},${DLRM_DATASET}:${DLRM_MOUNT} - -# .sample_wdl_job: &sample_wdl_job -# script: -# - srun --ntasks=1 -# -A devtech -J devtech-hugectr:${CI_PIPELINE_ID}:aleliu:hugectr:dl/hugectr/HugeCTR:${CI_COMMIT_BRANCH}:${CI_JOB_ID} -p luna -# --container-image ${CONTAINER_IMAGE} -# --container-mounts ${CONTS} -# --ntasks-per-node 1 -# bash -cx " -# cd ${CI_PROJECT_DIR} && -# export PYTHONPATH=${WORK_DIR}/build/build_single/lib && -# ln -s ${NEW_CRITEO_MOUNT}/wdl_data; -# sed -i 's/criteo_data/wdl_data/g' ./samples/wdl/wdl.py && -# python3 ./samples/wdl/wdl.py" -# - srun --ntasks=4 -# -A devtech -J devtech-hugectr:${CI_PIPELINE_ID}:aleliu:hugectr:dl/hugectr/HugeCTR:${CI_COMMIT_BRANCH}:${CI_JOB_ID} -p luna -# --container-image ${CONTAINER_IMAGE} -# --container-mounts ${CONTS} -# --ntasks-per-node 1 -# --network sharp -# bash -cx " -# cd ${CI_PROJECT_DIR} && -# export PYTHONPATH=${WORK_DIR}/build/build_single/lib && -# ln -s ${NEW_CRITEO_MOUNT}/wdl_data; -# sed -i 's/criteo_data/wdl_data/g' ./samples/wdl/wdl_8gpu.py && -# python3 ./samples/wdl/wdl_8gpu.py" -# - srun --ntasks=1 -# -A devtech -J devtech-hugectr:${CI_PIPELINE_ID}:aleliu:hugectr:dl/hugectr/HugeCTR:${CI_COMMIT_BRANCH}:${CI_JOB_ID} -p luna -# --container-image ${CONTAINER_IMAGE} -# --container-mounts ${CONTS} -# --ntasks-per-node 1 -# bash -cx " -# cd ${CI_PROJECT_DIR} && -# export PYTHONPATH=${WORK_DIR}/build/build_single/lib && -# ln -s ${NEW_CRITEO_MOUNT}/wdl_data_parquet; -# sed -i 's/criteo_data/wdl_data_parquet/g' ./samples/wdl/wdl_parquet.py; -# python3 ./samples/wdl/wdl_parquet.py" -# - srun --ntasks=1 -# -A devtech -J devtech-hugectr:${CI_PIPELINE_ID}:aleliu:hugectr:dl/hugectr/HugeCTR:${CI_COMMIT_BRANCH}:${CI_JOB_ID} -p luna -# --container-image ${CONTAINER_IMAGE} -# --container-mounts ${CONTS} -# --ntasks-per-node 1 -# bash -cx " -# cd ${CI_PROJECT_DIR} && -# export PYTHONPATH=${WORK_DIR}/build/build_single/lib && -# ln -s ${NEW_CRITEO_MOUNT}/wdl_data_nvt_bin; -# sed -i 
's/criteo_data/wdl_data_nvt_bin/g' ./samples/wdl/wdl_bin.py && -# python3 ./samples/wdl/wdl_bin.py" -# timeout: 2 hours - -# .sample_deepfm_job: &sample_deepfm_job -# script: -# - srun --ntasks=1 -# -A devtech -J devtech-hugectr:${CI_PIPELINE_ID}:aleliu:hugectr:dl/hugectr/HugeCTR:${CI_COMMIT_BRANCH}:${CI_JOB_ID} -p luna -# --container-image ${CONTAINER_IMAGE} -# --container-mounts ${CONTS} -# --ntasks-per-node 1 -# bash -cx " -# cd ${CI_PROJECT_DIR} && -# export PYTHONPATH=${WORK_DIR}/build/build_single/lib && -# ln -s ${NEW_CRITEO_MOUNT}/dcn_data; -# sed -i 's/criteo_data/dcn_data/g' ./samples/deepfm/deepfm.py && -# python3 ./samples/deepfm/deepfm.py;" -# - srun --ntasks=1 -# -A devtech -J devtech-hugectr:${CI_PIPELINE_ID}:aleliu:hugectr:dl/hugectr/HugeCTR:${CI_COMMIT_BRANCH}:${CI_JOB_ID} -p luna -# --container-image ${CONTAINER_IMAGE} -# --container-mounts ${CONTS} -# --ntasks-per-node 1 -# bash -cx " -# cd ${CI_PROJECT_DIR} && -# export PYTHONPATH=${WORK_DIR}/build/build_single/lib && -# ln -s ${NEW_CRITEO_MOUNT}/deepfm_data_nvt_bin; -# sed -i 's/criteo_data/deepfm_data_nvt_bin/g' ./samples/deepfm/deepfm_bin.py && -# python3 ./samples/deepfm/deepfm_bin.py" -# - srun --ntasks=1 -# -A devtech -J devtech-hugectr:${CI_PIPELINE_ID}:aleliu:hugectr:dl/hugectr/HugeCTR:${CI_COMMIT_BRANCH}:${CI_JOB_ID} -p luna -# --container-image ${CONTAINER_IMAGE} -# --container-mounts ${CONTS} -# --ntasks-per-node 1 -# bash -cx " -# cd ${CI_PROJECT_DIR} && -# export PYTHONPATH=${WORK_DIR}/build/build_single/lib && -# ln -s ${NEW_CRITEO_MOUNT}/deepfm_data_nvt; -# sed -i 's/criteo_data/deepfm_data_nvt/g' ./samples/deepfm/deepfm_parquet.py && -# python3 ./samples/deepfm/deepfm_parquet.py" -# timeout: 2 hours - -# .sample_dcn_job: &sample_dcn_job -# script: -# - srun --ntasks=1 -# -A devtech -J devtech-hugectr:${CI_PIPELINE_ID}:aleliu:hugectr:dl/hugectr/HugeCTR:${CI_COMMIT_BRANCH}:${CI_JOB_ID} -p luna -# --container-image ${CONTAINER_IMAGE} -# --container-mounts ${CONTS} -# --ntasks-per-node 1 -# bash -cx " -# cd ${CI_PROJECT_DIR} && -# export PYTHONPATH=${WORK_DIR}/build/build_single/lib && -# ln -s ${NEW_CRITEO_MOUNT}/dcn_data; -# sed -i 's/criteo_data/dcn_data/g' ./samples/dcn/dcn.py && -# python3 ./samples/dcn/dcn.py" -# - srun --ntasks=1 -# -A devtech -J devtech-hugectr:${CI_PIPELINE_ID}:aleliu:hugectr:dl/hugectr/HugeCTR:${CI_COMMIT_BRANCH}:${CI_JOB_ID} -p luna -# --container-image ${CONTAINER_IMAGE} -# --container-mounts ${CONTS} -# --ntasks-per-node 1 -# bash -cx " -# cd ${CI_PROJECT_DIR} && -# export PYTHONPATH=${WORK_DIR}/build/build_single/lib && -# ln -s ${NEW_CRITEO_MOUNT}/deepfm_data_nvt_bin; -# sed -i 's/criteo_data/deepfm_data_nvt_bin/g' ./samples/dcn/dcn_bin.py && -# python3 ./samples/dcn/dcn_bin.py" -# - srun --ntasks=1 -# -A devtech -J devtech-hugectr:${CI_PIPELINE_ID}:aleliu:hugectr:dl/hugectr/HugeCTR:${CI_COMMIT_BRANCH}:${CI_JOB_ID} -p luna -# --container-image ${CONTAINER_IMAGE} -# --container-mounts ${CONTS} -# --ntasks-per-node 1 -# bash -cx " -# cd ${CI_PROJECT_DIR} && -# export PYTHONPATH=${WORK_DIR}/build/build_single/lib && -# ln -s ${NEW_CRITEO_MOUNT}/deepfm_data_nvt; -# sed -i 's/criteo_data/deepfm_data_nvt/g' ./samples/dcn/dcn_parquet.py && -# python3 ./samples/dcn/dcn_parquet.py" -# - srun --ntasks=1 -# -A devtech -J devtech-hugectr:${CI_PIPELINE_ID}:aleliu:hugectr:dl/hugectr/HugeCTR:${CI_COMMIT_BRANCH}:${CI_JOB_ID} -p luna -# --container-image ${CONTAINER_IMAGE} -# --container-mounts ${CONTS} -# --ntasks-per-node 1 -# bash -cx " -# cd ${CI_PROJECT_DIR} && -# export 
PYTHONPATH=${WORK_DIR}/build/build_single/lib && -# export CUDA_DEVICE_ORDER=PCI_BUS_ID && -# ln -s ${NEW_CRITEO_MOUNT}/dcn_data; -# sed -i 's/criteo_data/dcn_data/g' ./samples/dcn/dcn_localized_embedding.py && -# python3 ./samples/dcn/dcn_localized_embedding.py" -# - srun --ntasks=2 -# -A devtech -J devtech-hugectr:${CI_PIPELINE_ID}:aleliu:hugectr:dl/hugectr/HugeCTR:${CI_COMMIT_BRANCH}:${CI_JOB_ID} -p luna -# --container-image ${CONTAINER_IMAGE} -# --container-mounts ${CONTS} -# --ntasks-per-node 1 -# --network sharp -# bash -cx " -# cd ${CI_PROJECT_DIR} && -# export PYTHONPATH=${WORK_DIR}/build/build_multi_nccl/lib && -# ln -s ${NEW_CRITEO_MOUNT}/dcn_data; -# sed -i 's/criteo_data/dcn_data/g' ./samples/dcn/dcn_2node_8gpu.py && -# python3 ./samples/dcn/dcn_2node_8gpu.py" -# timeout: 2 hours - -# .sample_criteo_job: &sample_criteo_job -# script: -# - srun --ntasks=1 -# -A devtech -J devtech-hugectr:${CI_PIPELINE_ID}:aleliu:hugectr:dl/hugectr/HugeCTR:${CI_COMMIT_BRANCH}:${CI_JOB_ID} -p luna -# --container-image ${CONTAINER_IMAGE} -# --container-mounts ${CONTS} -# --ntasks-per-node 1 -# bash -cx " -# cd ${CI_PROJECT_DIR} && -# export PYTHONPATH=${WORK_DIR}/build/build_single/lib && -# ln -s ${NEW_CRITEO_MOUNT}/criteo_data; -# python3 ./samples/criteo/criteo.py;" -# - srun --ntasks=1 -# -A devtech -J devtech-hugectr:${CI_PIPELINE_ID}:aleliu:hugectr:dl/hugectr/HugeCTR:${CI_COMMIT_BRANCH}:${CI_JOB_ID} -p luna -# --container-image ${CONTAINER_IMAGE} -# --container-mounts ${CONTS} -# --ntasks-per-node 1 -# bash -cx " -# cd ${CI_PROJECT_DIR} && -# export PYTHONPATH=${WORK_DIR}/build/build_single/lib && -# ln -s ${NEW_CRITEO_MOUNT}/criteo_bin; -# sed -i 's/criteo_data/criteo_bin/g' samples/criteo/criteo_bin.py && -# python3 samples/criteo/criteo_bin.py" -# - srun --ntasks=1 -# -A devtech -J devtech-hugectr:${CI_PIPELINE_ID}:aleliu:hugectr:dl/hugectr/HugeCTR:${CI_COMMIT_BRANCH}:${CI_JOB_ID} -p luna -# --container-image ${CONTAINER_IMAGE} -# --container-mounts ${CONTS} -# --ntasks-per-node 1 -# bash -cx " -# cd ${CI_PROJECT_DIR} && -# export PYTHONPATH=${WORK_DIR}/build/build_single/lib && -# ln -s ${NEW_CRITEO_MOUNT}/criteo_parquet; -# sed -i 's/criteo_data/criteo_parquet/g' samples/criteo/criteo_parquet.py && -# python3 ./samples/criteo/criteo_parquet.py" -# - srun --ntasks=1 -# -A devtech -J devtech-hugectr:${CI_PIPELINE_ID}:aleliu:hugectr:dl/hugectr/HugeCTR:${CI_COMMIT_BRANCH}:${CI_JOB_ID} -p luna -# --container-image ${CONTAINER_IMAGE} -# --container-mounts ${CONTS} -# --ntasks-per-node 1 -# bash -cx " -# cd ${CI_PROJECT_DIR}; -# export PYTHONPATH=${WORK_DIR}/build/build_single/lib && -# ln -s ${NEW_CRITEO_MOUNT}/criteo_multi_slots; -# sed -i 's/criteo_data/criteo_multi_slots/g' samples/criteo_multi_slots/criteo_multi_slots.py && -# python3 ./samples/criteo_multi_slots/criteo_multi_slots.py" -# timeout: 2 hours - -# .sample_dlrm_job: &sample_dlrm_job -# script: -# - srun --ntasks=1 -# -A devtech -J devtech-hugectr:${CI_PIPELINE_ID}:aleliu:hugectr:dl/hugectr/HugeCTR:${CI_COMMIT_BRANCH}:${CI_JOB_ID} -p luna -# --container-image ${CONTAINER_IMAGE} -# --container-mounts ${CONTS} -# --ntasks-per-node 1 -# bash -cx " -# cd ${NEW_CRITEO_MOUNT}/dlrm_kaggle && -# export PYTHONPATH=${WORK_DIR}/build/build_single/lib && -# python3 ${WORK_DIR}/samples/dlrm/dlrm_kaggle_fp32.py" -# - srun --ntasks=1 -# -A devtech -J devtech-hugectr:${CI_PIPELINE_ID}:aleliu:hugectr:dl/hugectr/HugeCTR:${CI_COMMIT_BRANCH}:${CI_JOB_ID} -p luna -# --container-image ${CONTAINER_IMAGE} -# --container-mounts ${CONTS} -# 
--ntasks-per-node 1 -# bash -cx " -# cd ${DLRM_MOUNT} && -# export PYTHONPATH=${WORK_DIR}/build/build_single/lib && -# python3 ${WORK_DIR}/samples/dlrm/dlrm_terabyte_fp16_64k.py" -# timeout: 2 hours - -# sample_wdl_selene: -# <<: *release_test_job_selene -# <<: *sample_wdl_job - -# sample_deepfm_selene: -# <<: *release_test_job_selene -# <<: *sample_deepfm_job - -# sample_dcn_selene: -# <<: *release_test_job_selene -# <<: *sample_dcn_job - -# sample_criteo_selene: -# <<: *release_test_job_selene -# <<: *sample_criteo_job - -# sample_dlrm_selene: -# <<: *release_test_job_selene -# <<: *sample_dlrm_job \ No newline at end of file diff --git a/ci/rules.gitlab_ci.yml b/ci/rules.gitlab_ci.yml deleted file mode 100644 index 145248fb4c..0000000000 --- a/ci/rules.gitlab_ci.yml +++ /dev/null @@ -1,241 +0,0 @@ -############## -# Conditions # -############## -.if-default-ci-action: &if-default-ci-actions - if: '$CI_PIPELINE_SOURCE =~ /^(push|web|merge_request_event|trigger)$/' - -.if-new-image: &if-new-image - if: '$TEST_NEW_IMAGE == "1"' - -.if-merge-request: &if-merge-request - if: '$CI_PIPELINE_SOURCE =~ /^(web|merge_request_event)$/' - -.if-default-branch-refs: &if-default-branch-refs - if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH' - -.if-default-branch-push: &if-default-branch-push - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $CI_PIPELINE_SOURCE == "push"' - -# If MR opened, then a push to the branch -.if-push-to-mr-opened: &if-push-to-mr-opened - if: '$CI_COMMIT_BRANCH && $CI_OPEN_MERGE_REQUESTS && $CI_PIPELINE_SOURCE == "push"' - -.if-daily-schedule: &if-daily-schedule - if: '$CI_PIPELINE_SOURCE == "schedule" && $DAILY == "1"' - -.if-use-dracorno-m2-partition: &if-use-dracorno-m2-partition - if: '$DRACORNO_PARTITION == "m2" && $CI_PIPELINE_SOURCE =~ /^(web|merge_request_event)$/' - -.if-use-dracorno-m3-partition: &if-use-dracorno-m3-partition - if: '$DRACORNO_PARTITION == "m3" && $CI_PIPELINE_SOURCE =~ /^(web|merge_request_event)$/' - -# Below is the conditions for child pipeline on dracorno -.if-child-ci-action: &if-child-ci-actions - if: '$PARENT_SOURCE =~ /^(push|web|merge_request_event|trigger)$/' - -.if-merge-request-child: &if-merge-request-child - if: '$PARENT_SOURCE =~ /^(web|merge_request_event)$/' - -.if-push-to-mr-opened-child: &if-push-to-mr-opened-child - if: '$CI_COMMIT_BRANCH && $CI_OPEN_MERGE_REQUESTS && $PARENT_SOURCE == "push"' - -.if-daily-schedule-child: &if-daily-schedule-child - if: '$PARENT_SOURCE == "schedule" && $DAILY == "1"' - -.if-schedule-benchmark: &if-schedule-benchmark - if: '$CI_PIPELINE_SOURCE == "schedule" && $BENCHMARK == "1"' - -.if-weekly-schedule-benchmark: &if-weekly-schedule-benchmark - if: '$CI_PIPELINE_SOURCE == "schedule" && $WEEKLY == "1" && $BENCHMARK == "1"' - -.if-biweekly-schedule-benchmark: &if-biweekly-schedule-benchmark - if: '$CI_PIPELINE_SOURCE == "schedule" && $BIWEEKLY == "1" && $BENCHMARK == "1"' - -.if-monthly-schedule-benchmark: &if-monthly-schedule-benchmark - if: '$CI_PIPELINE_SOURCE == "schedule" && $MONTHLY == "1" && $BENCHMARK == "1"' - -.if-weekly-ha-schedule: &if-weekly-ha-schedule - if: '$WEEKLY == "1" && $HA =="1"' - -############ -# Changes # -############ -.hugectr-source: &hugectr-source - - "HugeCTR/**/*" - - "ci/**/*" - - "cmake/Modules/*" - - "gpu_cache/**/*" - - "hps_tf/**/*" - - "onnx_converter/**/*" - - "test/**/*" - - "third_party/**/*" - - "tools/**/*" - - "sbin/**/*" - - "CMakeLists.txt" - - ".gitlab-ci.yml" - - ".clang-format" - - ".pre-commit-config.yaml" - -.sok-source: &sok-source - - 
"sparse_operation_kit/**/*" - - "HugeCTR/core/**/*" - - "HugeCTR/core23/**/*" - - "HugeCTR/embedding/**/*" - -.code-source: &code-source - - "HugeCTR/**/*" - - "ci/**/*" - - "cmake/Modules/*" - - "gpu_cache/**/*" - - "hps_tf/**/*" - - "onnx_converter/**/*" - - "test/**/*" - - "third_party/**/*" - - "tools/**/*" - - "CMakeLists.txt" - - ".gitlab-ci.yml" - - ".clang-format" - - ".pre-commit-config.yaml" - - "sparse_operation_kit/**/*" - -.doc-source: &doc-source - - "docs/**/*" - - "tutorial/**/*" - - "notebooks/**/*" - - "samples/**/*" - - "*.md" - -################## -# Conditions set # -################## -.format:rules:check: - rules: - # If push to a branch which has MR opened, ignore the branch pipeline - - <<: *if-push-to-mr-opened - when: never - - <<: *if-default-ci-actions - # when: always - - <<: *if-new-image - # when: always - - when: never - -.hugectr:rules:build: - rules: - # If push to a branch which has MR opened, ignore the branch pipeline - - <<: *if-push-to-mr-opened - when: never - - <<: *if-default-ci-actions - changes: *hugectr-source - #when: always - - <<: *if-new-image - #when: always - - <<: *if-daily-schedule - #when: always - - when: never - -.sok:rules:build: - rules: - # If push to a branch which has MR opened, ignore the branch pipeline - - <<: *if-push-to-mr-opened - when: never - - <<: *if-default-ci-actions - changes: *sok-source - #when: always - - <<: *if-new-image - #when: always - - <<: *if-daily-schedule - #when: always - - when: never - -# Condition for run sanity test -.hugectr:rules:sanity-test: - rules: - - <<: *if-merge-request - changes: *hugectr-source - #when: always - - <<: *if-new-image - #when: always - - when: never - -# Trigger rules for Dracorno -.trigger:rules:dracorno: - rules: - # If push to a branch which has MR opened, ignore the branch pipeline - - <<: *if-use-dracorno-m2-partition - changes: *code-source - variables: - RUNNER_TAG: dracorno_ssh - SLURM_PARTITION: "batch_dgx1_m2,batch_short_dgx1_m2,batch_short_dgx2h_m2,backfill_dgx2h_m2" - DATA_PREFIX: /gpfs - - <<: *if-use-dracorno-m3-partition - changes: *code-source - variables: - RUNNER_TAG: dracornom03_ssh - SLURM_PARTITION: "batch_dgx1_m3,batch_short_dgx1_m3,batch_16gb_dgx1_m3,backfill_dgx1_m3" - DATA_PREFIX: /lustre - - when: never - -.trigger:rules:selene: - rules: - - <<: *if-merge-request - changes: *code-source - - <<: *if-new-image - - when: never - -.hugectr:rules:test_in_child: - rules: - # Use PARENT_SOURCE to check if MR or not - - <<: *if-merge-request-child - changes: *hugectr-source - - <<: *if-new-image - - when: never - -.sok:rules:sanity-test: - rules: - - <<: *if-merge-request - changes: *sok-source - #when: always - - <<: *if-new-image - #when: always - - when: never - -.sok:rules:test_in_child: - rules: - # Use PARENT_SOURCE to check if MR or not - - <<: *if-merge-request-child - changes: *sok-source - - <<: *if-new-image - - when: never - -.default:rules:daily-test: - rules: - - <<: *if-new-image - #when: always - - <<: *if-daily-schedule - #when: always - - when: never - -.benchmark:rules:weekly: - rules: - - <<: *if-weekly-schedule-benchmark - - when: never - -.benchmark:rules:biweekly: - rules: - - <<: *if-biweekly-schedule-benchmark - - when: never - -.benchmark:rules:monthly: - rules: - - <<: *if-monthly-schedule-benchmark - - when: never - -.benchmark:rules: - rules: - - <<: *if-schedule-benchmark - - when: never - -.default:rules:weekly-test: - rules: - - <<: *if-weekly-ha-schedule - - when: never diff --git a/ci/selene/ci.yml b/ci/selene/ci.yml 
diff --git a/ci/selene/ci.yml b/ci/selene/ci.yml
deleted file mode 100644
index 0811166322..0000000000
--- a/ci/selene/ci.yml
+++ /dev/null
@@ -1,484 +0,0 @@
-include:
-  - project: "dl/devops/gitlab-ci-slurm"
-    ref: master
-    file: "/.gitlab-ci.yml"
-  - /ci/common.yml
-  - /ci/template.yml
-  - /ci/rules.gitlab_ci.yml
-
-## Stage: test
-# unit test
-utests:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $LOGDIR/utests
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DATASET}:${DATASET_MOUNT},/lustre/fsw/devtech/hpc-hugectr/inference/:/hugectr/test/utest/,/raid:/raid,/lustre/fsw/mlperf/mlperft-dlrm/datasets/criteo_multihot_raw:/lustre/fsw/mlperf/mlperft-dlrm/datasets/criteo_multihot_raw
-    WALLTIME: "02:00:00"
-    TEST_CMD: ./ci/utest/utest.sub
-
-utests_embedding_collection:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $LOGDIR/utests_embedding_collection
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DATASET}:${DATASET_MOUNT},/lustre/fsw/devtech/hpc-hugectr/inference/:/hugectr/test/utest/,/raid:/raid,/lustre/fsw/mlperf/mlperft-dlrm/datasets/criteo_multihot_raw:/lustre/fsw/mlperf/mlperft-dlrm/datasets/criteo_multihot_raw
-    TEST_CMD: ./ci/utest/utest_embedding_collection.sub
-
-utests_core23:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $LOGDIR/utests_core23
-    CONT: $TRAIN_IMAGE_VERSIONED
-    TEST_CMD: ./ci/utest/utest_core23.sub
-
-utests_layer_1:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $LOGDIR/utests
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DATASET}:${DATASET_MOUNT},/lustre/fsw/devtech/hpc-hugectr/inference/:/hugectr/test/utest/
-    TEST_CMD: ./ci/utest/utest_layer_1.sub
-
-utests_layer_2:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $LOGDIR/utests
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DATASET}:${DATASET_MOUNT},/lustre/fsw/devtech/hpc-hugectr/inference/:/hugectr/test/utest/
-    WALLTIME: "01:00:00"
-    TEST_CMD: ./ci/utest/utest_layer_2.sub
-
-utests_embedding:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $LOGDIR/utests_embedding
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DATASET}:${DATASET_MOUNT},/lustre/fsw/devtech/hpc-hugectr/inference/:/hugectr/test/utest/
-    TEST_CMD: ./ci/utest/utest_embedding.sub
-
-utests_hybrid_e2e:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $LOGDIR/utests_embedding
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DATASET}:${DATASET_MOUNT},/lustre/fsw/devtech/hpc-hugectr/inference/:/hugectr/test/utest/
-    TEST_CMD: ./ci/utest/utest_hybrid_e2e.sub
-
-utests_hps:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_inference
-  variables:
-    GPFSFOLDER: $LOGDIR/utests_hps
-    CONT: $TRAIN_INFER_IMAGE_VERSIONED
-    MOUNTS: ${DATASET}:${DATASET_MOUNT},/lustre/fsw/devtech/hpc-hugectr/inference/:/hugectr/test/utest/
-    TEST_CMD: ./ci/utest/utest_hps.sub
-
-criteo:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $LOGDIR/criteo
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DATASET}:${DATASET_MOUNT}
-    WALLTIME: "00:15:00"
-    TEST_CMD: ./ci/integration_test/criteo/criteo.sub
-
-dcn:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $LOGDIR/dcn
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DATASET}:${DATASET_MOUNT}
-    WALLTIME: "00:40:00"
-    TEST_CMD: ./ci/integration_test/dcn/dcn.sub
-
-dcn_8gpu:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $LOGDIR/dcn
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DATASET}:${DATASET_MOUNT}
-    WALLTIME: "01:00:00"
-    TEST_CMD: ./ci/integration_test/dcn/dcn_8gpu.sub
-
-dlrm_dcnv2_benchmark_1node:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_multi_node
-  variables:
-    GPFSFOLDER: $LOGDIR/dlrm_dcnv2_benchmark_1node
-    CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED
-    MOUNTS: /lustre/fsw/mlperf/mlperft-dlrm/datasets/criteo_multihot_raw:/data,/lustre/fsw/mlperf/mlperft-dlrm/datasets/criteo_multihot_raw:/data_val
-    TEST_CMD: ./ci/integration_test/dlrm/train_dcnv2_1node.sub
-
-wdl:
-  extends: .selene_test_job                            # test on selene needs to extend .cluster_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $LOGDIR/wdl                            # log dir, usually $LOGDIR + job name
-    CONT: $TRAIN_IMAGE_VERSIONED                       # image name
-    MOUNTS: ${DATASET_NEW_CRITEO_SELENE}:${DATASET_MOUNT}  # mount
-    WALLTIME: "00:15:00"                               # estimate job time. Less time, higher priority
-    TEST_CMD: ./ci/integration_test/wdl/wdl.sub        # test script
-
-deepfm:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $LOGDIR/deepfm
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DATASET}:${DATASET_MOUNT}
-    WALLTIME: "00:15:00"
-    TEST_CMD: ./ci/integration_test/deepfm/deepfm.sub
-
-mmoe:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $LOGDIR/mmoe
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${MMOE_DATASET}:${MMOE_DATASET_MOUNT}
-    WALLTIME: "00:15:00"
-    TEST_CMD: ./ci/integration_test/mmoe/mmoe.sub
-
-inference_hps:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $LOGDIR/inference_hps
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DATASET}:${DATASET_MOUNT},/lustre/fsw/devtech/hpc-hugectr/inference/:/hugectr/test/utest/
-    TEST_CMD: ./ci/integration_test/inference/inference_hps.sub
-
-embedding_cache_perf:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $LOGDIR/embedding_cache_perf
-    CONT: $TRAIN_IMAGE_VERSIONED
-    TEST_CMD: ./ci/integration_test/inference/embedding_cache_perf_test.sub
-
-din_single_node:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $LOGDIR/din_single_node
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DIN_DATASET}:${DIN_DATASET_MOUNT}
-    WALLTIME: "00:15:00"
-    TEST_CMD: ./ci/integration_test/din/din.sub
-
-bst_single_node:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $LOGDIR/bst_single_node
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${BST_DATASET}:${BST_DATASET_MOUNT}
-    WALLTIME: "00:15:00"
-    TEST_CMD: ./ci/integration_test/bst/bst.sub
-
-# # python interface single node
-py_single_node:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $LOGDIR/py_single_node
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DATASET}:${DATASET_MOUNT},${DATASET_NEW_CRITEO_SELENE}:${NEW_CRITEO_MOUNT}
-    WALLTIME: "00:30:00"
-    TEST_CMD: ./ci/integration_test/py_interface/py_single_node.sub
-
-ebc_multi_node:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_multi_node
-  variables:
-    GPFSFOLDER: $LOGDIR/ebc_multi_node
-    CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED
-    MOUNTS: ${DATASET_NEW_CRITEO_SELENE}:${DATASET_MOUNT},/raid:/raid
-    WALLTIME: "00:45:00"
-    DGXNNODES: 2
-    TEST_CMD: ./ci/integration_test/ebc/ebc.sub
-
-ebc_utest_multi_node:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_multi_node
-  variables:
-    GPFSFOLDER: $LOGDIR/ebc_utest_multi_node
-    CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED
-    MOUNTS: /raid:/raid
-    WALLTIME: "00:45:00"
-    DGXNNODES: 2
-    TEST_CMD: ./ci/integration_test/ebc/utest.multinode.sub
-
-# hugectr to onnx converter test
-hugectr2onnx:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node
-  variables:
-    GPFSFOLDER: $LOGDIR/hugectr2onnx
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: ${DATASET_NEW_CRITEO_SELENE}:${NEW_CRITEO_MOUNT},${DIN_DATASET}:${DIN_DATASET_MOUNT},${NCF_DATASET}:${NCF_DATASET_MOUNT},${MMOE_DATASET}:${MMOE_DATASET_MOUNT}
-    WALLTIME: "01:00:00"
-    TEST_CMD: ./ci/integration_test/hugectr2onnx/hugectr2onnx.sub
-
-# hps_tf_plugin
-hps_tf_plugin:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_tf_hps_trt_plugin
-  variables:
-    GPFSFOLDER: $LOGDIR/hps_tf
-    CONT: $TF_TRT_IMAGE_VERSIONED
-    MOUNTS: ${DATASET}:${DATASET_MOUNT}
-    TEST_CMD: ./ci/integration_test/hps/hps_tf.sub
-
-# hps_torch_plugin
-hps_torch_plugin:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_pytorch_hps_trt_plugin
-  variables:
-    GPFSFOLDER: $LOGDIR/hps_torch
-    CONT: $PYTORCH_TRT_IMAGE_VERSIONED
-    MOUNTS: ${DATASET}:${DATASET_MOUNT}
-    TEST_CMD: ./ci/integration_test/hps/hps_torch.sub
-
-# embedding_plugin
-sparse_operation_kit_ut-TF2:
-  extends: .selene_sok_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_sok_tf2
-  variables:
-    GPFSFOLDER: $LOGDIR/sparse_operation_kit
-    CONT: $SOK_IMAGE_VERSIONED_TF2
-    MOUNTS: ${DATASET}:${DATASET_MOUNT}
-    WALLTIME: "02:00:00"
-    TEST_CMD: ./ci/integration_test/sok/sok_tf2_unit.sub
-
-sparse_operation_kit_ut-TF1:
-  extends: .selene_sok_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_sok_tf1
-  variables:
-    GPFSFOLDER: $LOGDIR/sparse_operation_kit
-    CONT: $SOK_IMAGE_VERSIONED_TF1
-    MOUNTS: ${DATASET}:${DATASET_MOUNT}
-    WALLTIME: "02:00:00"
-    TEST_CMD: ./ci/integration_test/sok/sok_tf1_unit.sub
-
-#Test jobs for hps_trt plugin
-hps_trt_in_hugectr:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_hugectr_hps_trt_plugin
-  variables:
-    GPFSFOLDER: $LOGDIR/hugectr_hps_trt
-    CONT: $HUGECTR_TRT_IMAGE_VERSIONED
-    TEST_CMD: ./ci/integration_test/hps/hps_trt_in_merlin_hugectr.sub
-
-hps_trt_in_tf:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_tf_hps_trt_plugin
-  variables:
-    GPFSFOLDER: $LOGDIR/tf_hps_trt
-    CONT: $TF_TRT_IMAGE_VERSIONED
-    TEST_CMD: ./ci/integration_test/hps/hps_trt_in_merlin_tf.sub
-
-hps_trt_in_pytorch:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_pytorch_hps_trt_plugin
-  variables:
-    GPFSFOLDER: $LOGDIR/pytorch_hps_trt
-    CONT: $PYTORCH_TRT_IMAGE_VERSIONED
-    TEST_CMD: ./ci/integration_test/hps/hps_trt_in_merlin_pytorch.sub
-
-hps_plugin_benchmark:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_tf_hps_trt_plugin
-  variables:
-    GPFSFOLDER: $LOGDIR/hps_plugin_benchmark
-    CONT: $TF_TRT_IMAGE_VERSIONED
-    MOUNTS: /lustre/fsw/devtech/hpc-hugectr/hps_tf_benchmark/hps_plugin_ci_model_repo:/model_repo,/lustre/fsw/devtech/hpc-hugectr/hps_tf_benchmark/perf_data:/perf_data
-    WALLTIME: "00:45:00"
-    TEST_CMD: ./ci/benchmark/hps_plugin_benchmark/run.sub
-
-# S3 backend_test
-s3_backend_test:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node_with_s3
-  variables:
-    GPFSFOLDER: $LOGDIR/s3_backend_test
-    CONT: $TRAIN_IMAGE_VERSIONED_WITH_S3
-    TEST_CMD: ./ci/integration_test/s3/s3_backend_test.sub
-
-# GCS backend_test
-gcs_backend_test:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_train_single_node_with_gcs
-  variables:
-    GPFSFOLDER: $LOGDIR/gcs_backend_test
-    CONT: $TRAIN_IMAGE_VERSIONED_WITH_GCS
-    TEST_CMD: ./ci/integration_test/gcs/gcs_backend_test.sub
-
-hps_torch_fuse_table_benchmark:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_pytorch_hps_trt_plugin
-  variables:
-    GPFSFOLDER: $LOGDIR/hps_torch_fuse_table_benchmark
-    CONT: $PYTORCH_TRT_IMAGE_VERSIONED
-    MOUNTS: /lustre/fsw/devtech/hpc-hugectr/hps_torch_fuse_table_benchmark/ci_model_repo:/model_repo,/lustre/fsw/devtech/hpc-hugectr/hps_torch_fuse_table_benchmark/perf_data:/perf_data
-    WALLTIME: "00:45:00"
-    TEST_CMD: ./ci/benchmark/hps_torch_fuse_table_benchmark/run.sub
-
-hps_tf_fuse_table_benchmark:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_tf_hps_trt_plugin
-  variables:
-    GPFSFOLDER: $LOGDIR/hps_tf_fuse_table_benchmark
-    CONT: $TF_TRT_IMAGE_VERSIONED
-    MOUNTS: /lustre/fsw/devtech/hpc-hugectr/hps_tf_fuse_table_benchmark/ci_model_repo:/model_repo,/lustre/fsw/devtech/hpc-hugectr/hps_tf_fuse_table_benchmark/perf_data:/perf_data
-    WALLTIME: "00:45:00"
-    TEST_CMD: ./ci/benchmark/hps_tf_fuse_table_benchmark/run.sub
-
-hps_plugin_benchmark_check:
-  extends: .selene_post_test_job
-  needs:
-    - hps_plugin_benchmark
-  variables:
-    GPFSFOLDER: $LOGDIR/hps_plugin_benchmark_check
-    CONT: $TF_TRT_IMAGE_VERSIONED
-    MOUNTS: $LOGDIR/hps_plugin_benchmark:/logs
-    WALLTIME: "00:15:00"
-    TEST_CMD: ./ci/post_test/check_hps_plugin_benchmark.sub
-
-dlrm_dcnv2_1node_check:
-  # Push logs to gitlab
-  extends: .selene_post_test_job
-  needs:
-    - dlrm_dcnv2_benchmark_1node
-  variables:
-    GPFSFOLDER: $LOGDIR/dlrm_dcnv2_1node_check
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: $LOGDIR/dlrm_dcnv2_benchmark_1node:/logs
-    WALLTIME: "00:15:00"
-    TEST_CMD: ./ci/post_test/check_dcnv2_dlrm_1node.sub
-
-hps_torch_fuse_table_benchmark_check:
-  extends: .selene_post_test_job
-  needs:
-    - hps_torch_fuse_table_benchmark
-  variables:
-    GPFSFOLDER: $LOGDIR/hps_torch_fuse_table_benchmark_check
-    CONT: $PYTORCH_TRT_IMAGE_VERSIONED
-    MOUNTS: $LOGDIR/hps_torch_fuse_table_benchmark:/logs
-    WALLTIME: "00:15:00"
-    TEST_CMD: ./ci/post_test/check_hps_torch_fuse_table_benchmark.sub
-
-hps_tf_fuse_table_benchmark_check:
-  extends: .selene_post_test_job
-  needs:
-    - hps_tf_fuse_table_benchmark
-  variables:
-    GPFSFOLDER: $LOGDIR/hps_tf_fuse_table_benchmark_check
-    CONT: $TF_TRT_IMAGE_VERSIONED
-    MOUNTS: $LOGDIR/hps_tf_fuse_table_benchmark:/logs
-    WALLTIME: "00:15:00"
-    TEST_CMD: ./ci/post_test/check_hps_tf_fuse_table_benchmark.sub
-
-147gb_model_benchmark:
-  extends: .selene_test_job
-  needs:
-    - pipeline: $PARENT_PIPELINE_ID
-      job: build_tf_hps_trt_plugin
-  variables:
-    GPFSFOLDER: $LOGDIR/147gb_model_benchmark
-    CONT: $TF_TRT_IMAGE_VERSIONED
-    MOUNTS: /lustre/fsw/devtech/hpc-hugectr/hps_tf_benchmark/147gb_ci_model_repo:/model_repo
-    WALLTIME: "00:45:00"
-    TEST_CMD: ./ci/benchmark/147gb_model_benchmark/run.sub
-
-147gb_model_benchmark_check:
-  extends: .selene_post_test_job
-  needs:
-    - 147gb_model_benchmark
-  variables:
-    GPFSFOLDER: $LOGDIR/147gb_model_benchmark_check
-    CONT: $TF_TRT_IMAGE_VERSIONED
-    MOUNTS: $LOGDIR/147gb_model_benchmark:/logs
-    WALLTIME: "00:15:00"
-    TEST_CMD: ./ci/post_test/check_147gb_model_benchmark.sub
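Every test job in the deleted file above follows one shape: it extends a Slurm submission template from ci/template.yml, and its `needs:` entry reaches into the parent pipeline (these selene jobs ran in a child pipeline) to wait on, and reuse the image from, a specific build job instead of rebuilding. A reduced sketch of the pattern, with hypothetical job and script names:

my_test:                             # hypothetical job name
  extends: .selene_test_job          # Slurm/srun boilerplate from ci/template.yml
  needs:
    - pipeline: $PARENT_PIPELINE_ID  # cross-pipeline dependency on the parent
      job: build_train_single_node   # the build job whose image this test consumes
  variables:
    GPFSFOLDER: $LOGDIR/my_test      # per-job log directory on the cluster
    CONT: $TRAIN_IMAGE_VERSIONED     # container produced by the build job
    MOUNTS: ${DATASET}:${DATASET_MOUNT}
    WALLTIME: "00:30:00"             # shorter walltime, higher Slurm priority
    TEST_CMD: ./ci/integration_test/my_test/my_test.sub  # hypothetical driver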
diff --git a/ci/template.yml b/ci/template.yml
deleted file mode 100644
index e68db56a30..0000000000
--- a/ci/template.yml
+++ /dev/null
@@ -1,558 +0,0 @@
-stages:
-  - build_from_scratch
-  - format_check
-  - build
-  - pre_test
-  - test
-  - inference_benchmark
-  - sok_benchmark
-  - wdl_benchmark
-  - dcn_benchmark
-  - deepfm_benchmark
-  - dlrm_benchmark
-  - hps_benchmark
-  - post_test
-
-.python_format:
-  stage: format_check
-  tags:
-    - nvidia.com/cuda.driver.major=470
-    - $BUILD_TAG
-  extends:
-    - .format:rules:check
-  script:
-    - pwd
-    - ls -all
-    - docker pull python:3.8-alpine;
-    - docker run -d --rm --name python_${CI_PIPELINE_ID} ${EXTRA_DOCKER_RUN_ARGS} -w /src python:3.8-alpine sleep infinity
-    - docker cp $(pwd) python_${CI_PIPELINE_ID}:/src
-    - docker exec python_${CI_PIPELINE_ID} sh -c 'pip install black==22.12.0 && pwd && ls -all . '
-    - docker exec python_${CI_PIPELINE_ID} sh -c "black --line-length 100 --check --diff --color --extend-exclude \"$EXCLUDE\" ./hugectr"
-  after_script:
-    - docker stop python_${CI_PIPELINE_ID}
-  allow_failure: false
-  timeout: 15 minutes
-
-.clang_format:
-  stage: format_check
-  tags:
-    - nvidia.com/cuda.driver.major=470
-    - $BUILD_TAG
-  extends:
-    - .format:rules:check
-  script:
-    - pwd
-    - ls -all
-    - docker login -u ${CI_PRIVATE_USER} -p "${CI_PRIVATE_KEY}" "${CI_REGISTRY}"
-    - docker run -d --rm --name clang_${CI_PIPELINE_ID} ${EXTRA_DOCKER_RUN_ARGS} -w /src gitlab-master.nvidia.com:5005/dl/hugectr/hugectr/clang-format-lint-new sleep infinity
-    - docker cp $(pwd) clang_${CI_PIPELINE_ID}:/src
-    - docker exec clang_${CI_PIPELINE_ID} sh -c "cd ./hugectr && /run-clang-format.py --clang-format-executable /clang-format/$EXECUTABLE -r --exclude $EXCLUDE --style $STYLE --extensions $EXTENSIONS ."
-  after_script:
-    - docker stop clang_${CI_PIPELINE_ID}
-  allow_failure: false
-  timeout: 15 minutes
-
-.codespell_check:
-  stage: format_check
-  tags:
-    - nvidia.com/cuda.driver.major=470
-    - $BUILD_TAG
-  extends:
-    - .format:rules:check
-  script:
-    - pwd
-    - ls -all
-    - docker pull ${PRE_COM_IMAGE}
-    - docker run -d --rm --name codespell_${CI_PIPELINE_ID} ${EXTRA_DOCKER_RUN_ARGS} -w /src ${PRE_COM_IMAGE} sleep infinity
-    - docker cp $(pwd) codespell_${CI_PIPELINE_ID}:/src
-    - docker exec codespell_${CI_PIPELINE_ID} sh -c "cd ./hugectr && pre-commit run --all-files"
-  after_script:
-    - docker stop codespell_${CI_PIPELINE_ID}
-  allow_failure: false
-  timeout: 15 minutes
-
-.build_nightly:
-  stage: build_from_scratch
-  tags:
-    - nvidia.com/cuda.driver.major=470
-    - $BUILD_TAG
-  script:
-    - docker login -u "\$oauthtoken" -p "${NVSTAGE_KEY}" "${NVSTAGE_REGISTRY}"
-    - docker login -u ${CI_PRIVATE_USER} -p "${CI_PRIVATE_KEY}" "${CI_REGISTRY}"
-    - if [[ "$MERLIN_REMOTE_REPO" == "" ]]; then
-        git clone $REMOTE_REPO;
-      else
-        git clone $MERLIN_REMOTE_REPO;
-      fi
-    - if [[ "$OPTIMIZED" == 1 ]]; then
-        cd optimized/recommendation/hugectr;
-      else
-        cd Merlin/docker;
-      fi
-    - if [[ "$MERLIN_REMOTE_BRANCH" != "" ]]; then
-        git checkout $MERLIN_REMOTE_BRANCH;
-      fi
-    - if [[ "$TEST_NEW_IMAGE" == "1" ]]; then
-        DST_IMAGE=${DST_IMAGE}.new_image;
-      fi
-    - docker build --pull
-        -t ${DST_IMAGE}
-        -f ${DOCKER_FILE}
-        $BUILD_ARGS
-        --no-cache
-        . ;
-    - docker push ${DST_IMAGE}
-  allow_failure: false
-  rules:
-    - if: $NIGHTLY == "1"
-      when: always
-    - if: $TEST_NEW_IMAGE == "1"
-      when: always
-    - when: never
-  timeout: 5 hours
-
-# nightly build for sok tf1
-.build_nightly_tf1:
-  stage: build_from_scratch
-  tags:
-    - nvidia.com/cuda.driver.major=470
-  script:
-    - docker login -u ${CI_PRIVATE_USER} -p "${CI_PRIVATE_KEY}" "${CI_REGISTRY}"
-    - cd tools/dockerfiles
-    - docker build --pull
-        -t ${DST_IMAGE}
-        -f ${DOCKER_FILE}
-        $BUILD_ARGS
-        --no-cache
-        . ;
-    - docker push ${DST_IMAGE}
-  allow_failure: false
-  rules:
-    - if: $NIGHTLY == "1"
-      when: always
-    - if: $TEST_NEW_IMAGE == "1"
-      when: always
-    - when: never
-  timeout: 2 hours
-
-.build:
-  stage: build
-  tags:
-    - nvidia.com/cuda.driver.major=470
-    - $BUILD_TAG
-  script:
-    - export JOB_DOCKERFILE="Dockerfile.${CI_JOB_NAME%%--*}.${CI_PIPELINE_ID}" && echo ${JOB_DOCKERFILE}
-    - echo "BUILD_HUGECTR=${BUILD_HUGECTR}"
-    - echo "BUILD_HUGECTR2ONNX=${BUILD_HUGECTR2ONNX}"
-    - echo "BUILD_SOK=${BUILD_SOK}"
-    - echo "BUILD_TF_PLUGIN=${BUILD_TF_PLUGIN}"
-    - echo "BUILD_TORCH_PLUGIN=${BUILD_TORCH_PLUGIN}"
-    #- git submodule update --init --recursive
-    - if [[ "$TEST_NEW_IMAGE" == "1" ]]; then
-        echo "FROM ${FROM_IMAGE}.new_image" > ${JOB_DOCKERFILE};
-      else
-        echo "FROM ${FROM_IMAGE}" > ${JOB_DOCKERFILE};
-      fi
-    - echo "WORKDIR /workdir" >> ${JOB_DOCKERFILE}
-    - echo "RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/lib/libcuda.so.1" >> ${JOB_DOCKERFILE}
-    - echo "COPY . ." >> ${JOB_DOCKERFILE}
-    - echo "RUN git log -n 1" >> ${JOB_DOCKERFILE}
-    - if [[ "$BUILD_HUGECTR" == 1 ]]; then
-        echo "RUN cd /workdir && rm -rf build && mkdir -p build && cd build && cmake ${CMAKE_OPTION} .. && make -j\$(nproc) && make install" >> ${JOB_DOCKERFILE};
-      fi
-    - if [[ "$BUILD_SOK" == 1 ]]; then
-        echo "RUN cd /workdir/sparse_operation_kit/ && python setup.py install" >> ${JOB_DOCKERFILE};
-        echo "RUN pip install nvtx" >> ${JOB_DOCKERFILE};
-        echo "ENV LD_LIBRARY_PATH=/usr/local/hugectr/lib:/usr/local/lib:\$LD_LIBRARY_PATH" >> ${JOB_DOCKERFILE};
-        echo "ENV LIBRARY_PATH=/usr/local/hugectr/lib:/usr/local/lib:\$LIBRARY_PATH" >> ${JOB_DOCKERFILE};
-        echo "ENV PYTHONPATH=/workdir/sparse_operation_kit:\$PYTHONPATH" >> ${JOB_DOCKERFILE};
-      fi
-    - if [[ "$BUILD_HUGECTR2ONNX" == 1 ]]; then
-        echo "RUN cd /workdir/onnx_converter && python3 setup.py install" >> ${JOB_DOCKERFILE};
-      fi
-    #- if [[ "$BUILD_HPS_BACKEND" == 1 ]]; then
-    #    echo "RUN git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab-master.nvidia.com/dl/hugectr/hugectr_inference_backend.git hugectr_inference_backend && cd hugectr_inference_backend && git checkout hugectr_performance_test && mkdir build && cd build && cmake -DCMAKE_INSTALL_PREFIX:PATH=/usr/local/hugectr -DTRITON_COMMON_REPO_TAG=$TRITON_BRANCH -DTRITON_CORE_REPO_TAG=$TRITON_BRANCH -DTRITON_BACKEND_REPO_TAG=$TRITON_BRANCH .. && make -j\$(nproc) && make install && cd ../.. && rm -rfv hugectr_inference_backend" >> ${JOB_DOCKERFILE};
-    #  fi
-    - if [[ "$BUILD_HPS_BACKEND" == 1 ]]; then
-        echo "RUN git clone --branch $HUGECTR_BACKEND_VER https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab-master.nvidia.com/dl/hugectr/hugectr_inference_backend.git hugectr_inference_backend && cd hugectr_inference_backend/hps_backend && mkdir build && cd build && cmake -DCMAKE_INSTALL_PREFIX:PATH=/usr/local/hugectr -DTRITON_COMMON_REPO_TAG=$TRITON_BRANCH -DTRITON_CORE_REPO_TAG=$TRITON_BRANCH -DTRITON_BACKEND_REPO_TAG=$TRITON_BRANCH .. && make -j\$(nproc) && make install && cd ../../.. && rm -rfv hugectr_inference_backend" >> ${JOB_DOCKERFILE};
-        echo "RUN ln -s /usr/local/hugectr/backends/hps /opt/tritonserver/backends/hps" >> ${JOB_DOCKERFILE};
-      fi
-    - if [[ "$BUILD_TF_PLUGIN" == 1 ]]; then
-        echo "RUN pip install ninja" >> ${JOB_DOCKERFILE};
-        echo "RUN pip install tf2onnx" >> ${JOB_DOCKERFILE};
-        echo "RUN cd /workdir/hps_tf/ && python setup.py install" >> ${JOB_DOCKERFILE};
-      fi
-    - if [[ "$BUILD_TORCH_PLUGIN" == 1 ]]; then
-        echo "RUN pip install ninja" >> ${JOB_DOCKERFILE};
-        echo "RUN cd /workdir/hps_torch/ && TORCH_CUDA_ARCH_LIST=\"7.0 7.5 8.0 9.0\" python setup.py install" >> ${JOB_DOCKERFILE};
-      fi
-    - if [[ "$BUILD_TRT_PLUGIN" == 1 ]]; then
-        echo "RUN pip install tf2onnx" >> ${JOB_DOCKERFILE};
-        echo "RUN mkdir /workdir/hps_trt/build && cd /workdir/hps_trt/build && cmake ${TRT_CMAKE_OPTION} .. && make -j\$(nproc) && make install" >> ${JOB_DOCKERFILE};
-      fi
-    - echo "RUN rm /usr/local/lib/libcuda.so.1" >> ${JOB_DOCKERFILE};
-    - cat ${JOB_DOCKERFILE}
-    - docker login -u ${CI_PRIVATE_USER} -p "${CI_PRIVATE_KEY}" "${CI_REGISTRY}"
-    - if [[ "$TEST_NEW_IMAGE" == "1" ]]; then
-        docker pull ${FROM_IMAGE}.new_image;
-      else
-        docker pull ${FROM_IMAGE};
-      fi
-    - source sbin/docker_buildx.sh
-    - BUILDX_ARGS="--push --no-cache -t ${DST_IMAGE} -f ${JOB_DOCKERFILE}"
-    - docker_buildx::docker_buildx "$CI_RUNNER_ID" "${BUILDX_ARGS}"
-  variables:
-    GIT_SUBMODULE_STRATEGY: recursive
-  allow_failure: false
-  rules:
-    - if: $CI_PIPELINE_SOURCE =~ /^(push|web|merge_request_event|trigger)$/
-      when: always
-    - if: $TEST_NEW_IMAGE == "1"
-      when: always
-    - when: never
-  timeout: 5 hours
&& make -j\$(nproc) && make install" >> ${JOB_DOCKERFILE}; - fi - - echo "RUN rm /usr/local/lib/libcuda.so.1" >> ${JOB_DOCKERFILE}; - - cat ${JOB_DOCKERFILE} - - docker login -u ${CI_PRIVATE_USER} -p "${CI_PRIVATE_KEY}" "${CI_REGISTRY}" - - if [[ "$TEST_NEW_IMAGE" == "1" ]]; then - docker pull ${FROM_IMAGE}.new_image; - else - docker pull ${FROM_IMAGE}; - fi - - source sbin/docker_buildx.sh - - BUILDX_ARGS="--push --no-cache -t ${DST_IMAGE} -f ${JOB_DOCKERFILE}" - - docker_buildx::docker_buildx "$CI_RUNNER_ID" "${BUILDX_ARGS}" - variables: - GIT_SUBMODULE_STRATEGY: recursive - allow_failure: false - rules: - - if: $CI_PIPELINE_SOURCE =~ /^(push|web|merge_request_event|trigger)$/ - when: always - - if: $TEST_NEW_IMAGE == "1" - when: always - - when: never - timeout: 5 hours - -.test_local: - stage: test - extends: - - .default:rules:weekly-test - tags: - - local_test - script: - - docker login -u ${CI_PRIVATE_USER} -p "${CI_PRIVATE_KEY}" "${CI_REGISTRY}" - - echo "Container_Name=${CI_JOB_NAME}_${CI_PIPELINE_ID}" - - echo "Image=${CONT}" - - echo "Mounts=${MOUNTS}" - - echo "Command=${CMD}" - - docker pull ${CONT} - - if [[ "${MOUNTS}" == "" ]]; then - docker run --rm --name ${CI_JOB_NAME}_${CI_PIPELINE_ID} --net=host --gpus=all --privileged --runtime=nvidia --ulimit memlock=-1 --ulimit stack=67108864 --shm-size 16g -u root ${CONT} bash -cx "${CMD}"; - else - docker run --rm --name ${CI_JOB_NAME}_${CI_PIPELINE_ID} --net=host --gpus=all --privileged --runtime=nvidia --ulimit memlock=-1 --ulimit stack=67108864 --shm-size 16g ${MOUNTS} -u root ${CONT} bash -cx "${CMD}"; - fi -# - docker logs -f ${CI_JOB_NAME}_${CI_PIPELINE_ID} - after_script: - - docker stop ${CI_JOB_NAME}_${CI_PIPELINE_ID} - -.build_hugectr: - extends: - - .build - - .hugectr:rules:build - -.build_hugectr_daily: - extends: - - .build - - .default:rules:daily-test - -.build_sok: - extends: - - .build - - .sok:rules:build - -.cluster_test_job: - extends: - - .selene_luna_job - - .hugectr:rules:sanity-test - variables: - SLURM_ACCOUNT: coreai_devtech_hugectr - OLD_SLURM_ACCOUNT: "devtech" - GPFSFOLDER: "/lustre/fsw/$OLD_SLURM_ACCOUNT" - GPFSFOLDER: "/lustre/fsw/$OLD_SLURM_ACCOUNT" - DGXNNODES: 1 - allow_failure: false - -.cluster_test_job_daily: - extends: - - .draco_oci_test_job - - .default:rules:daily-test - variables: - DRACO_OCI_LOGDIR: /lustre/fsw/portfolios/coreai/users/svcnvdlfw/hugectr_ci/${CI_PIPELINE_ID} - -.selene_test_job: - extends: - - .selene_luna_job - - .hugectr:rules:test_in_child - variables: - SLURM_ACCOUNT: coreai_devtech_hugectr - OLD_SLURM_ACCOUNT: "devtech" - GPFSFOLDER: "/lustre/fsw/$OLD_SLURM_ACCOUNT" - DGXNNODES: 1 - WALLTIME: "00:30:00" - GIT_CLONE_PATH: /lustre/fsw/devtech/hpc-hugectr/hugectr-ci/$CI_CONCURRENT_ID/$CI_PROJECT_NAME - allow_failure: false - -.dracorno_test_job: - extends: - - .dracorno_job - - .hugectr:rules:test_in_child - variables: - CLUSTER: "dracorno" - SLURM_ACCOUNT: "coreai_devtech_hugectr" - SLURM_PARTITION: ${DRACO_SLURM_PARTITION} - DATA_PREFIX: ${DRACO_DATA_PREFIX} - WALLTIME: "02:00:00" - SBATCH_OTHER_PARAMS: "--nv-meta ml-model.hugectr --gpus-per-node=8" - tags: - - $RUNNER_TAG - allow_failure: false - -.draco_oci_test_job: - extends: - - .hugectr:rules:test_in_child - script: - - echo ${MOUNTS} - - export LOG_DIR="${DRACO_OCI_LOGDIR}/${CI_JOB_NAME}/${CI_JOB_ID}/results" - - echo ${LOG_DIR} - - mkdir -p "${LOG_DIR}" - - export LOG_FILE="${LOG_DIR}/${DATESTAMP}.log" - - export CONT=$(echo "${CONT}" | sed 's/:5005//g') - - chmod +x ${TEST_CMD} - - bash ${TEST_CMD} | tee 
-
-.draco_oci_sok_test_job:
-  extends:
-    - .sok:rules:test_in_child
-  script:
-    - echo ${MOUNTS}
-    - export LOG_DIR="${DRACO_OCI_LOGDIR}/${CI_JOB_NAME}/${CI_JOB_ID}/results"
-    - echo ${LOG_DIR}
-    - mkdir -p "${LOG_DIR}"
-    - export LOG_FILE="${LOG_DIR}/${DATESTAMP}.log"
-    - export CONT=$(echo "${CONT}" | sed 's/:5005//g')
-    - chmod +x ${TEST_CMD}
-    - bash ${TEST_CMD} | tee ${LOG_FILE}
-  variables:
-    CI_SLURM_ACCOUNT: "coreai_devtech_all" #svcnvdlfw only attends account "coreai_devtech_all"
-    CI_SCHEDULER_TYPE: "slurm"
-    CI_SLURM_PARTITION: "batch_block1"
-    CI_SLURM_GPUS_PER_NODE: "8"
-    DRACO_OCI_PREFIX: "/lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr"
-    SLURM_JOB_NUM_NODES: 1
-    CI_SLURM_TIME: "00:30:00"
-    GIT_DEPTH: "1"
-  tags:
-    - draco_oci_generic
-  allow_failure: false
-
-.dlcluster_test_job:
-  extends:
-    - .dlcluster_job
-    - .hugectr:rules:sanity-test
-  allow_failure: false
-
-.dlcluster_test_job_daily:
-  extends:
-    - .dlcluster_job
-    - .default:rules:daily-test
-  allow_failure: false
-
-.computelab_test_job_daily:
-  extends:
-    - .dlcluster_job
-    - .default:rules:daily-test
-  variables:
-    CI_SLURM_PARTITION: "a100-pcie-40gb-product,a100-pcie-80gb-product"
-    CI_SLURM_ACCOUNT: "cag"
-    CI_SCHEDULER_TYPE: docker
-    GIT_DEPTH: "1"
-    WALLTIME: "02:00:00"
-  tags:
-    - computelab_generic
-  allow_failure: false
-
-.sok_test_job:
-  extends:
-    - .selene_luna_job
-    - .sok:rules:sanity-test
-  allow_failure: false
-
-.selene_sok_test_job:
-  extends:
-    - .selene_luna_job
-    - .sok:rules:test_in_child
-  variables:
-    SLURM_ACCOUNT: coreai_devtech_hugectr
-    OLD_SLURM_ACCOUNT: "devtech"
-    GPFSFOLDER: "/lustre/fsw/$OLD_SLURM_ACCOUNT"
-    DGXNNODES: 1
-    WALLTIME: "00:30:00"
-    GIT_CLONE_PATH: /lustre/fsw/devtech/hpc-hugectr/hugectr-ci/$CI_CONCURRENT_ID/$CI_PROJECT_NAME
-  allow_failure: false
-
-.dracorno_sok_test_job:
-  extends:
-    - .dracorno_job
-    - .sok:rules:test_in_child
-  variables:
-    CLUSTER: "dracorno"
-    SLURM_ACCOUNT: "coreai_devtech_hugectr"
-    DATA_PREFIX: ${DRACO_DATA_PREFIX}
-    SLURM_PARTITION: ${DRACO_SLURM_PARTITION}
-    WALLTIME: "02:00:00"
-    SBATCH_OTHER_PARAMS: "--nv-meta ml-model.hugectr --gpus-per-node=8"
-  tags:
-    - $RUNNER_TAG
-  allow_failure: false
-
-.sok_test_job_daily:
-  extends:
-    - .selene_luna_job
-    - .default:rules:daily-test
-  allow_failure: false
-
-.cluster_post_test_job:
-  extends:
-    - .cluster_test_job
-    - .hugectr:rules:sanity-test
-  stage: post_test
-
-.selene_post_test_job:
-  extends:
-    - .selene_test_job
-    - .hugectr:rules:test_in_child
-  variables:
-    SLURM_ACCOUNT: coreai_devtech_hugectr
-    OLD_SLURM_ACCOUNT: "devtech"
-    GPFSFOLDER: "/lustre/fsw/$OLD_SLURM_ACCOUNT"
-    DGXNNODES: 1
-    WALLTIME: "00:30:00"
-    GIT_CLONE_PATH: /lustre/fsw/devtech/hpc-hugectr/hugectr-ci/$CI_CONCURRENT_ID/$CI_PROJECT_NAME
-  stage: post_test
-
-.dracorno_post_test_job:
-  extends:
-    - .dracorno_test_job
-    - .hugectr:rules:test_in_child
-  variables:
-    WALLTIME: "00:30:00"
-  stage: post_test
-
-.draco_oci_post_test_job:
-  extends:
-    - .draco_oci_test_job
-    - .hugectr:rules:test_in_child
-  variables:
-    WALLTIME: "00:30:00"
-  stage: post_test
-
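The *post_test* templates above pair a benchmark job with a checker that runs in the later post_test stage: the benchmark writes its logs under its GPFSFOLDER, and the checker mounts that directory as /logs and validates the numbers. Schematically, with hypothetical names:

my_benchmark:                        # hypothetical benchmark job
  extends: .selene_test_job
  variables:
    GPFSFOLDER: $LOGDIR/my_benchmark               # results accumulate here
    TEST_CMD: ./ci/benchmark/my_benchmark/run.sub  # hypothetical driver

my_benchmark_check:                  # paired checker in stage post_test
  extends: .selene_post_test_job
  needs:
    - my_benchmark                   # run only after the benchmark finished
  variables:
    MOUNTS: $LOGDIR/my_benchmark:/logs               # read the benchmark's logs
    TEST_CMD: ./ci/post_test/check_my_benchmark.sub  # hypothetical check script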
-.cluster_post_test_job_daily:
-  extends:
-    - .draco_oci_test_job
-    - .default:rules:daily-test
-  variables:
-    DRACO_OCI_LOGDIR: /lustre/fsw/portfolios/coreai/users/svcnvdlfw/hugectr_ci/${CI_PIPELINE_ID}
-  stage: post_test
-
-.inference_benchmark:
-  extends:
-    - .selene_luna_job
-    - .benchmark:rules:weekly
-  stage: inference_benchmark
-  before_script:
-    - export PARAM=$(echo ${CI_JOB_NAME} | awk -F-- '{print $2}')
-    - export BZ=$(echo ${PARAM} | awk -Fx '{print $1}')
-    - export MIXED_PRECISION=$(echo ${PARAM} | awk -Fx '{print $2}')
-    - export GPFSFOLDER=$LOGDIR/inference_benchmark_${BZ}x${MIXED_PRECISION}
-  variables:
-    GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
-    CONT: ${UNIFIED_CTR_LATEST}
-    MOUNTS: /lustre/fsw/devtech/hpc-hugectr/inference/dlrm_regression/dlrm/1:/model/dlrm/1,/lustre/fsw/devtech/hpc-hugectr/keynote_inference/perf_data:/perf_data,${CI_PROJECT_DIR}:/hugectr
-    WORKDIR: /hugectr
-    SLURM_ACCOUNT: coreai_devtech_hugectr
-    OLD_SLURM_ACCOUNT: "devtech"
-    GPFSFOLDER: "/lustre/fsw/$OLD_SLURM_ACCOUNT"
-    WALLTIME: "00:15:00"
-    DGXNNODES: 1
-    TEST_CMD: ./ci/benchmark/inference_benchmark/run.sub
-
-.sok_benchmark:
-  extends:
-    - .selene_luna_job
-    - .benchmark:rules
-  stage: sok_benchmark
-  before_script:
-    - export PARAM=$(echo ${CI_JOB_NAME} | awk -F-- '{print $2}')
-    - export BZ=$(echo ${PARAM} | awk -Fx '{print $1}')
-    - export GPU_NUM=$(echo ${PARAM} | awk -Fx '{print $2}')
-    - export GPFSFOLDER=$LOGDIR/sok_benchmark_${BZ}x${GPU_NUM}
-  variables:
-    GPFSFOLDER: $LOGDIR/sok_benchmark
-    CONT: ${UNIFIED_TF_LATEST}
-    MOUNTS: /lustre/fsw/mlperf/mlperft-dlrm/datasets/terabyte_portion_csv/:/dataset,${CI_PROJECT_DIR}:/hugectr
-    SLURM_ACCOUNT: coreai_devtech_hugectr
-    OLD_SLURM_ACCOUNT: "devtech"
-    GPFSFOLDER: "/lustre/fsw/$OLD_SLURM_ACCOUNT"
-    WALLTIME: "00:45:00"
-    DGXNNODES: 1
-    TEST_CMD: ./ci/benchmark/sok/sok_dlrm.sub
-
-.train_benchmark:
-  extends:
-    - .selene_luna_job
-    - .benchmark:rules
-  before_script:
-    - export BENCHMARK=$(echo ${CI_JOB_NAME} | awk -F-- '{print $2}')
-    - export PARAM=$(echo ${CI_JOB_NAME} | awk -F-- '{print $3}')
-    - export NODE_NUM=$(echo ${PARAM} | awk -Fx '{print $1}')
-    - export GPU_NUM=$(echo ${PARAM} | awk -Fx '{print $2}')
-    - export BZ_PER_GPU=$(echo ${PARAM} | awk -Fx '{print $3}')
-    - export MIXED_PRECISION=$(echo ${PARAM} | awk -Fx '{print $4}')
-    - export DGXNNODES=${NODE_NUM}
-    - export GPFSFOLDER=$LOGDIR/train_benchmark--${BENCHMARK}--${NODE_NUM}x${GPU_NUM}x${BZ_PER_GPU}x${MIXED_PRECISION}
-  variables:
-    GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
-    CONT: ${UNIFIED_CTR_LATEST}
-    MOUNTS: ${DATASET_NEW_CRITEO_SELENE}:${NEW_CRITEO_MOUNT},${DATASET_CRITEO_SELENE}:${CRITEO_MOUNT},/raid:/raid,${CI_PROJECT_DIR}:/hugectr
-    SLURM_ACCOUNT: coreai_devtech_hugectr
-    OLD_SLURM_ACCOUNT: "devtech"
-    GPFSFOLDER: "/lustre/fsw/$OLD_SLURM_ACCOUNT"
-    WALLTIME: "00:15:00"
-    TEST_CMD: ./ci/benchmark/train_benchmark/run.sub
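Note how the benchmark templates take their parameters from the job name itself: before_script splits CI_JOB_NAME on `--` and the trailing field on `x`, so one template serves a whole matrix of configurations. For a hypothetical concrete job:

# .train_benchmark's before_script would parse this name as
#   BENCHMARK=dcn, NODE_NUM=1, GPU_NUM=8, BZ_PER_GPU=6912, MIXED_PRECISION=fp16
# and derive DGXNNODES and GPFSFOLDER from those fields.
train_benchmark--dcn--1x8x6912xfp16:   # hypothetical instance
  extends: .train_benchmark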
-
-.hps_backend_benchmark:
-  extends:
-    - .selene_luna_job
-    - .benchmark:rules:weekly
-  stage: hps_benchmark
-  before_script:
-    - export BZ=$(echo ${CI_JOB_NAME} | awk -F-- '{print $2}')
-    - export GPFSFOLDER=$LOGDIR/hps_backend_benchmark_${BZ}
-    - rm -rf /hps_backend_benchmark/*.out
-  variables:
-    GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
-    CONT: ${UNIFIED_TF_LATEST}
-    MOUNTS: /lustre/fsw/devtech/hpc-hugectr/emmaq/yingcan_benchmark:/hps_backend_benchmark,${CI_PROJECT_DIR}:/hugectr
-    SLURM_ACCOUNT: coreai_devtech_hugectr
-    OLD_SLURM_ACCOUNT: "devtech"
-    GPFSFOLDER: "/lustre/fsw/$OLD_SLURM_ACCOUNT"
-    WALLTIME: "00:30:00"
-    DGXNNODES: 1
-    TEST_CMD: ./ci/benchmark/hps_backend_benchmark/run.sub
-
-collect_benchmark_result:
-  extends:
-    - .selene_luna_job
-    - .benchmark:rules
-  stage: post_test
-  variables:
-    GPFSFOLDER: $LOGDIR/collect_benchmark_result
-    GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
-    CONT: ${UNIFIED_CTR_LATEST}
-    MOUNTS: $LOGDIR:/logs,${CI_PROJECT_DIR}:/hugectr
-    SLURM_ACCOUNT: coreai_devtech_hugectr
-    OLD_SLURM_ACCOUNT: "devtech"
-    GPFSFOLDER: "/lustre/fsw/$OLD_SLURM_ACCOUNT"
-    WALLTIME: "00:15:00"
-    TEST_CMD: ./ci/post_test/collect_benchmark.sub
diff --git a/ci/test_unit.sh b/ci/test_unit.sh
old mode 100755
new mode 100644
index 102c1a4b46..a39ddcd145
--- a/ci/test_unit.sh
+++ b/ci/test_unit.sh
@@ -43,3 +43,4 @@ elif [ "$container" == "merlin-tensorflow" ]; then
     bash run_function_test.sh && \
     popd
 fi
+
diff --git a/ci/utest/utest.sub b/ci/utest/utest.sub
deleted file mode 100644
index d49ce258dd..0000000000
--- a/ci/utest/utest.sub
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx "\
-    cd /workdir/build/bin && \
-    ./checker_test && \
-    ./data_reader_test && \
-    ./device_map_test && \
-    ./loss_test && \
-    ./optimizer_test && \
-    ./regularizers_test && \
-    ./auc_test && \
-    ./averageloss_test && \
-    ./communication_test"
diff --git a/ci/utest/utest_core23.sub b/ci/utest/utest_core23.sub
deleted file mode 100644
index 7e91973953..0000000000
--- a/ci/utest/utest_core23.sub
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" bash -cx "\
-    cd /workdir/build/bin && \
-    ./core23_test"
diff --git a/ci/utest/utest_embedding.sub b/ci/utest/utest_embedding.sub
deleted file mode 100644
index 240ff2a984..0000000000
--- a/ci/utest/utest_embedding.sub
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx "\
-    cd /workdir/build/bin \
-    &&./embedding_test --gtest_filter=-hybrid_e2e.*"
diff --git a/ci/utest/utest_embedding_collection.sub b/ci/utest/utest_embedding_collection.sub
deleted file mode 100644
index 5404d0ddb3..0000000000
--- a/ci/utest/utest_embedding_collection.sub
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx "\
-    cd /workdir/build/bin \
-    && ./embedding_collection_test --gtest_filter=-test_embedding_collection.benchmark*:test_embedding_collection.utest_2node"
diff --git a/ci/utest/utest_hps.sub b/ci/utest/utest_hps.sub
deleted file mode 100644
index ee3fd546db..0000000000
--- a/ci/utest/utest_hps.sub
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx "\
-    cd /workdir/build/bin && \
-    ./lookup_session_fusing_table_test"
\ No newline at end of file
diff --git a/ci/utest/utest_hybrid_e2e.sub b/ci/utest/utest_hybrid_e2e.sub
deleted file mode 100644
index a2a352ff9d..0000000000
--- a/ci/utest/utest_hybrid_e2e.sub
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx "\
-    cd /workdir/build/bin \
-    &&./embedding_test --gtest_filter=hybrid_e2e.*"
--ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx "\ - cd /workdir/build/bin && \ - ./inference_test" \ No newline at end of file diff --git a/ci/utest/utest_layer_1.sub b/ci/utest/utest_layer_1.sub deleted file mode 100644 index 562e18accf..0000000000 --- a/ci/utest/utest_layer_1.sub +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx "\ - cd /workdir/build/bin && \ - ./core23_layers_test --gtest_filter=concat_3d_layer.*:matmul_layer.*" diff --git a/ci/utest/utest_layer_2.sub b/ci/utest/utest_layer_2.sub deleted file mode 100644 index d753bd84bb..0000000000 --- a/ci/utest/utest_layer_2.sub +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx "\ - cd /workdir/build/bin && \ - ./core23_layers_test --gtest_filter=-concat_3d_layer.*:matmul_layer.*"