diff --git a/dali/benchmark/dali_bench.h b/dali/benchmark/dali_bench.h
index a32c7c4bb03..598c1212493 100644
--- a/dali/benchmark/dali_bench.h
+++ b/dali/benchmark/dali_bench.h
@@ -21,6 +21,7 @@
 #include
 #include
+#include "dali/test/dali_test_config.h"
 #include "dali/core/common.h"
 #include "dali/core/tensor_shape.h"
 #include "dali/util/image.h"
@@ -28,7 +29,7 @@ namespace dali {

 // Note: this is setup for the binary to be executed from "build"
-const string image_folder = "/data/dali/benchmark/benchmark_images";  // NOLINT
+static const string image_folder = testing::dali_extra_path() + "/db/single/jpeg";  // NOLINT

 class DALIBenchmark : public benchmark::Fixture {
  public:
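
Note: testing::dali_extra_path() resolves the DALI_extra data checkout at run time, so the benchmark no longer depends on a machine-specific /data mount. This is roughly the same lookup the Python tests do via get_dali_extra_path() in test_utils - a sketch assuming the usual DALI_EXTRA_PATH convention:

    import os

    # assumes DALI_EXTRA_PATH points at a checkout of the DALI_extra repository
    image_folder = os.path.join(os.environ["DALI_EXTRA_PATH"], "db", "single", "jpeg")
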
diff --git a/dali/benchmark/resnet50_bench.py b/dali/benchmark/resnet50_bench.py
deleted file mode 100755
index 0df97ad0869..00000000000
--- a/dali/benchmark/resnet50_bench.py
+++ /dev/null
@@ -1,153 +0,0 @@
-# Copyright (c) 2017-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-from nvidia.dali.pipeline import Pipeline
-import nvidia.dali.ops as ops
-import nvidia.dali.types as types
-import numpy as np
-from timeit import default_timer as timer
-
-image_folder = "/data/dali/benchmark/benchmark_images"
-
-
-def read_jpegs(folder):
-    with open(folder + "/image_list.txt", "r") as file:
-        files = [line.rstrip() for line in file]
-
-    images = []
-    for fname in files:
-        f = open(image_folder + "/" + fname, "rb")
-        images.append(np.fromstring(f.read(), dtype=np.uint8))
-    return images
-
-
-def make_batch(size):
-    data = read_jpegs(image_folder)
-    return [data[i % len(data)] for i in range(size)]
-
-
-class C2Pipe(Pipeline):
-    def __init__(self, batch_size, num_threads, device_id, pipelined=True, exec_async=True):
-        super(C2Pipe, self).__init__(
-            batch_size, num_threads, device_id, exec_pipelined=pipelined, exec_async=exec_async
-        )
-        self.input = ops.ExternalSource()
-        self.decode = ops.ImageDecoder(device="cpu", output_type=types.RGB)
-        self.rcm = ops.FastResizeCropMirror(crop=(224, 224))
-        self.np = ops.CropMirrorNormalize(
-            device="gpu", dtype=types.FLOAT16, mean=[128.0, 128.0, 128.0], std=[1.0, 1.0, 1.0]
-        )
-        self.uniform = ops.random.Uniform(range=(0.0, 1.0))
-        self.resize_uniform = ops.random.Uniform(range=(256.0, 480.0))
-        self.mirror = ops.random.CoinFlip(probability=0.5)
-
-    def define_graph(self):
-        self.jpegs = self.input()
-        images = self.decode(self.jpegs)
-        resized = self.rcm(
-            images,
-            crop_pos_x=self.uniform(),
-            crop_pos_y=self.uniform(),
-            mirror=self.mirror(),
-            resize_shorter=self.resize_uniform(),
-        )
-        output = self.np(resized.gpu())
-        return output
-
-    def iter_setup(self):
-        raw_data = make_batch(self.batch_size)
-        self.feed_input(self.jpegs, raw_data)
-
-
-class HybridPipe(Pipeline):
-    def __init__(self, batch_size, num_threads, device_id, pipelined=True, exec_async=True):
-        super(HybridPipe, self).__init__(
-            batch_size, num_threads, device_id, exec_pipelined=pipelined, exec_async=exec_async
-        )
-        self.input = ops.ExternalSource()
-        self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB)
-        self.resize = ops.Resize(device="gpu", interp_type=types.INTERP_LINEAR)
-        self.cmnp = ops.CropMirrorNormalize(
-            device="gpu",
-            dtype=types.FLOAT16,
-            crop=(224, 224),
-            mean=[128.0, 128.0, 128.0],
-            std=[1.0, 1.0, 1.0],
-        )
-        self.uniform = ops.random.Uniform(range=(0.0, 1.0))
-        self.resize_uniform = ops.random.Uniform(range=(256.0, 480.0))
-        self.mirror = ops.random.CoinFlip(probability=0.5)
-
-    def define_graph(self):
-        self.jpegs = self.input()
-        images = self.decode(self.jpegs)
-        resized = self.resize(images, resize_shorter=self.resize_uniform())
-        output = self.cmnp(
-            resized, mirror=self.mirror(), crop_pos_x=self.uniform(), crop_pos_y=self.uniform()
-        )
-        return output
-
-    def iter_setup(self):
-        raw_data = make_batch(self.batch_size)
-        self.feed_input(self.jpegs, raw_data)
-
-
-def run_benchmarks(PipeType, args):
-    print("Running Benchmarks For {}".format(PipeType.__name__))
-    for executor in args.executors:
-        pipelined = executor > 0
-        exec_async = executor > 1
-        for batch_size in args.batch_sizes:
-            for num_threads in args.thread_counts:
-                pipe = PipeType(batch_size, num_threads, 0, pipelined, exec_async)
-                pipe.build()
-                start_time = timer()
-                for i in range(args.num_iters):
-                    pipe.run()
-
-                total_time = timer() - start_time
-                print(
-                    "{}/{}/{}/{}: FPS={}".format(
-                        PipeType.__name__,
-                        executor,
-                        batch_size,
-                        num_threads,
-                        float(batch_size * args.num_iters) / total_time,
-                    )
-                )
-
-
-def get_args():
-    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument(
-        "--batch-sizes", default=[128], help="Comma separated list of batch sizes to run"
-    )
-    parser.add_argument(
-        "--thread-counts", default=[1, 2, 3, 4], help="Comma separated list of thread counts"
-    )
-    parser.add_argument("--executors", default=[2], help="List of executors to run")
-    parser.add_argument("--num-iters", type=int, default=100, help="Number of iterations to run")
-    return parser.parse_args()
-
-
-def main():
-    args = get_args()
-    pipe_types = [C2Pipe, HybridPipe]
-    for PipeType in pipe_types:
-        run_benchmarks(PipeType, args)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/dali/benchmark/resnet50_nvjpeg_bench.cc b/dali/benchmark/resnet50_nvjpeg_bench.cc
index 8985f5bce8e..3358446a1c8 100755
--- a/dali/benchmark/resnet50_nvjpeg_bench.cc
+++ b/dali/benchmark/resnet50_nvjpeg_bench.cc
@@ -41,7 +41,7 @@ BENCHMARK_DEFINE_F(RealRN50, nvjpegPipe)(benchmark::State& st) { // NOLINT

   pipe.AddOperator(
       OpSpec("Caffe2Reader")
-          .AddArg("path", "/data/imagenet/train-c2lmdb-480")
+          .AddArg("path", testing::dali_extra_path() + "/db/c2lmdb")
          .AddOutput("raw_jpegs", StorageDevice::CPU)
          .AddOutput("labels", StorageDevice::CPU));
diff --git a/dali/test/python/test_RN50_data_fw_iterators.py b/dali/test/python/test_RN50_data_fw_iterators.py
index ca17614253b..b8fdd3b724f 100644
--- a/dali/test/python/test_RN50_data_fw_iterators.py
+++ b/dali/test/python/test_RN50_data_fw_iterators.py
@@ -17,8 +17,10 @@
 import nvidia.dali.types as types
 import argparse
 import time
+from test_utils import get_dali_extra_path
+import os

-data_paths = ["/data/imagenet/train-jpeg"]
+data_paths = os.path.join(get_dali_extra_path(), "db", "single", "jpeg")


 class RN50Pipeline(Pipeline):
@@ -28,9 +30,7 @@ def __init__(
         super(RN50Pipeline, self).__init__(
             batch_size, num_threads, device_id, prefetch_queue_depth=prefetch
         )
-        self.input = ops.readers.File(
-            file_root=data_paths[0], shard_id=device_id, num_shards=num_gpus
-        )
+        self.input = ops.readers.File(file_root=data_paths, shard_id=device_id, num_shards=num_gpus)
         self.decode_gpu = ops.decoders.Image(device="mixed", output_type=types.RGB)
         self.res = ops.RandomResizedCrop(device="gpu", size=(224, 224))
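
Note: the reader now takes the DALI_extra jpeg directory directly; shard_id/num_shards make each GPU's pipeline read a disjoint slice of the sorted file list, which is why device_id can double as the shard index. A conceptual sketch of the contiguous-chunk assignment (illustration only, not DALI's code):

    def shard_slice(files, shard_id, num_shards):
        # each shard gets a contiguous chunk of the sorted file list
        files = sorted(files)
        begin = shard_id * len(files) // num_shards
        end = (shard_id + 1) * len(files) // num_shards
        return files[begin:end]
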
diff --git a/dali/test/python/test_RN50_data_pipeline.py b/dali/test/python/test_RN50_data_pipeline.py
index dce25dca6ab..4577d779c9a 100644
--- a/dali/test/python/test_RN50_data_pipeline.py
+++ b/dali/test/python/test_RN50_data_pipeline.py
@@ -21,6 +21,8 @@
 import argparse
 import time
 from test_utils import get_dali_extra_path, AverageMeter
+from subprocess import call
+import tempfile


 class CommonPipeline(Pipeline):
@@ -179,6 +181,16 @@ def __init__(self, **kwargs):
         super(TFRecordPipeline, self).__init__(**kwargs)
         tfrecord = sorted(glob.glob(kwargs["data_paths"][0]))
         tfrecord_idx = sorted(glob.glob(kwargs["data_paths"][1]))
+        if len(tfrecord_idx) == 0:
+            # no index files found - generate one per tfrecord into a temporary
+            # directory that lives as long as the pipeline object
+            self.temp_dir = tempfile.TemporaryDirectory()
+            tfrecord_idx = [
+                f"{self.temp_dir.name}/{os.path.basename(f)}.idx" for f in tfrecord
+            ]
+            for tfrecord_file, tfrecord_idx_file in zip(tfrecord, tfrecord_idx):
+                print(f"Generating index file for {tfrecord_file}")
+                call(["tfrecord2idx", tfrecord_file, tfrecord_idx_file])
         cache_enabled = kwargs["decoder_cache_params"]["cache_enabled"]
         self.input = ops.readers.TFRecord(
             path=tfrecord,
@@ -240,23 +257,13 @@ def define_graph(self):


 test_data = {
+    # RecordIO & LMDB are not that frequently used any more, so we won't test full datasets,
+    # just small ones
     FileReadPipeline: [["/data/imagenet/train-jpeg"], ["/data/imagenet/val-jpeg"]],
-    MXNetReaderPipeline: [
-        [
-            "/data/imagenet/train-480-val-256-recordio/train.rec",
-            "/data/imagenet/train-480-val-256-recordio/train.idx",
-        ],
-        [
-            "/data/imagenet/train-480-val-256-recordio/val.rec",
-            "/data/imagenet/train-480-val-256-recordio/val.idx",
-        ],
-    ],
-    CaffeReadPipeline: [["/data/imagenet/train-lmdb-256x256"], ["/data/imagenet/val-lmdb-256x256"]],
-    Caffe2ReadPipeline: [["/data/imagenet/train-c2lmdb-480"], ["/data/imagenet/val-c2lmdb-256"]],
     TFRecordPipeline: [
         [
-            "/data/imagenet/train-val-tfrecord-480/train-*",
-            "/data/imagenet/train-val-tfrecord-480.idx/train-*",
+            "/data/imagenet/train-val-tfrecord/train-*",
+            "/data/imagenet/train-val-tfrecord.idx/train-*",
         ]
     ],
 }
@@ -617,4 +624,4 @@ def define_graph(self):
         )
         end = time.time()

-    print("OK {0}/{1}: {2}".format(i + 1, data_set_len, pipe_name.__name__))
+    print("OK {0}/{1}: {2}".format(i, data_set_len, pipe_name.__name__))
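
Note: when the .idx files are absent they are now produced on the fly with the tfrecord2idx tool that ships with DALI, one index per tfrecord, and the resulting list is passed as index_path alongside path. The same step as a standalone sketch (the glob pattern is only an example matching the paths above; keep a reference to the TemporaryDirectory or the index files vanish):

    import glob
    import os
    import tempfile
    from subprocess import call

    tfrecords = sorted(glob.glob("/data/imagenet/train-val-tfrecord/train-*"))
    idx_dir = tempfile.TemporaryDirectory()  # must stay referenced while the reader runs
    idx_files = [os.path.join(idx_dir.name, os.path.basename(f) + ".idx") for f in tfrecords]
    for rec, idx in zip(tfrecords, idx_files):
        call(["tfrecord2idx", rec, idx])  # one index file per tfrecord
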
diff --git a/dali/test/python/test_data_containers.py b/dali/test/python/test_data_containers.py
index 3657dfd41ec..412e9647624 100644
--- a/dali/test/python/test_data_containers.py
+++ b/dali/test/python/test_data_containers.py
@@ -19,6 +19,9 @@
 import nvidia.dali.types as types
 import os
 from nvidia.dali.pipeline import Pipeline
+from subprocess import call
+import tempfile
+import ast
 from test_utils import get_dali_extra_path


@@ -96,6 +99,16 @@ def __init__(self, batch_size, num_threads, device_id, num_gpus, data_paths, don
         super(TFRecordPipeline, self).__init__(batch_size, num_threads, device_id)
         tfrecord = sorted(glob.glob(data_paths[0]))
         tfrecord_idx = sorted(glob.glob(data_paths[1]))
+        if len(tfrecord_idx) == 0:
+            # no index files found - generate one per tfrecord into a temporary
+            # directory that lives as long as the pipeline object
+            self.temp_dir = tempfile.TemporaryDirectory()
+            tfrecord_idx = [
+                f"{self.temp_dir.name}/{os.path.basename(f)}.idx" for f in tfrecord
+            ]
+            for tfrecord_file, tfrecord_idx_file in zip(tfrecord, tfrecord_idx):
+                print(f"Generating index file for {tfrecord_file}")
+                call(["tfrecord2idx", tfrecord_file, tfrecord_idx_file])
         self.input = ops.readers.TFRecord(
             path=tfrecord,
             index_path=tfrecord_idx,
@@ -132,23 +150,13 @@ def define_graph(self):


 test_data = {
+    # RecordIO & LMDB are not that frequently used any more, so we won't test full datasets,
+    # just small ones
     FileReadPipeline: [["/data/imagenet/train-jpeg"], ["/data/imagenet/val-jpeg"]],
-    MXNetReaderPipeline: [
-        [
-            "/data/imagenet/train-480-val-256-recordio/train.rec",
-            "/data/imagenet/train-480-val-256-recordio/train.idx",
-        ],
-        [
-            "/data/imagenet/train-480-val-256-recordio/val.rec",
-            "/data/imagenet/train-480-val-256-recordio/val.idx",
-        ],
-    ],
-    CaffeReadPipeline: [["/data/imagenet/train-lmdb-256x256"], ["/data/imagenet/val-lmdb-256x256"]],
-    Caffe2ReadPipeline: [["/data/imagenet/train-c2lmdb-480"], ["/data/imagenet/val-c2lmdb-256"]],
     TFRecordPipeline: [
         [
-            "/data/imagenet/train-val-tfrecord-480/train-*",
-            "/data/imagenet/train-val-tfrecord-480.idx/train-*",
+            "/data/imagenet/train-val-tfrecord/train-*",
+            "/data/imagenet/train-val-tfrecord.idx/train-*",
         ]
     ],
     COCOReaderPipeline: [
@@ -189,6 +197,28 @@ def define_graph(self):
     ],
 }

+
+def parse_nested_square_brackets(string):
+    """Parse a string containing nested square brackets into a list of lists."""
+    try:
+        parsed_list = ast.literal_eval(string)
+        if isinstance(parsed_list, list):
+            return parsed_list
+        else:
+            raise ValueError("The provided string does not represent a list.")
+    except (ValueError, SyntaxError) as e:
+        raise ValueError("Invalid input string. Ensure it is a valid list format.") from e
+
+
+def parse_key_value_pairs(pairs):
+    """Convert a list of key=value strings into a dictionary."""
+    result = {}
+    for pair in pairs:
+        key, value = pair.split("=", 1)
+        result[key] = parse_nested_square_brackets(value)
+    return result
+
+
 parser = argparse.ArgumentParser(description="ImageDecoder RN50 dataset test")
 parser.add_argument(
     "-g", "--gpus", default=1, type=int, metavar="N", help="number of GPUs (default: 1)"
@@ -203,8 +233,17 @@ def define_graph(self):
     "-s", "--small", action="store_true", help="use small dataset, DALI_EXTRA_PATH needs to be set"
 )
 parser.add_argument("-n", "--no-mmap", action="store_true", help="don't mmap files from data set")
+parser.add_argument(
+    "datasets",
+    metavar="KEY=VALUE",
+    type=str,
+    nargs="*",
+    help="Pipeline_name=datasets pairs that replace the built-in data paths",
+)
 args = parser.parse_args()

+updated_datasets = parse_key_value_pairs(args.datasets)
+
 N = args.gpus  # number of GPUs
 BATCH_SIZE = args.batch  # batch size
 LOG_INTERVAL = args.print_freq
@@ -219,6 +258,10 @@ def define_graph(self):
 if SMALL_DATA_SET:
     test_data = small_test_data

+for k in test_data:
+    if k.__name__ in updated_datasets:
+        test_data[k] = updated_datasets[k.__name__]
+
 for pipe_name in test_data.keys():
     data_set_len = len(test_data[pipe_name])
     for i, data_set in enumerate(test_data[pipe_name]):
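
Note: the new positional KEY=VALUE arguments let a caller swap a pipeline's dataset without editing the file; the TL1 nvJPEG test below uses this to point COCOReaderPipeline at a freshly unpacked COCO copy. What the parsing amounts to, with a hypothetical path (ast.literal_eval is the same call parse_nested_square_brackets makes):

    import ast

    arg = "COCOReaderPipeline=[['/tmp/coco/train2017', '/tmp/coco/annotations/instances_train2017.json']]"
    key, value = arg.split("=", 1)
    overrides = {key: ast.literal_eval(value)}
    # {'COCOReaderPipeline': [['/tmp/coco/train2017',
    #                          '/tmp/coco/annotations/instances_train2017.json']]}
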
diff --git a/docs/examples/frameworks/paddle/paddle-basic_example.ipynb b/docs/examples/frameworks/paddle/paddle-basic_example.ipynb
index 2f5205d02c9..6bdd44c0ab2 100644
--- a/docs/examples/frameworks/paddle/paddle-basic_example.ipynb
+++ b/docs/examples/frameworks/paddle/paddle-basic_example.ipynb
@@ -25,18 +25,20 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os.path\n",
+   "import subprocess\n",
    "\n",
    "test_data_root = os.environ[\"DALI_EXTRA_PATH\"]\n",
    "\n",
    "# Caffe LMDB\n",
    "lmdb_folder = os.path.join(test_data_root, \"db\", \"lmdb\")\n",
    "\n",
-   "N = 8  # number of GPUs\n",
+   "res = subprocess.run([\"nvidia-smi\", \"-L\"], stdout=subprocess.PIPE, text=True)\n",
+   "N = res.stdout.count(\"\\n\")  # number of GPUs\n",
    "BATCH_SIZE = 128  # batch size per GPU\n",
    "IMAGE_SIZE = 3"
  ]
diff --git a/docs/examples/frameworks/paddle/paddle-various-readers.ipynb b/docs/examples/frameworks/paddle/paddle-various-readers.ipynb
index 34db796cbc3..9937374dbfc 100644
--- a/docs/examples/frameworks/paddle/paddle-various-readers.ipynb
+++ b/docs/examples/frameworks/paddle/paddle-various-readers.ipynb
@@ -31,11 +31,12 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os.path\n",
+   "import subprocess\n",
    "\n",
    "test_data_root = os.environ[\"DALI_EXTRA_PATH\"]\n",
    "\n",
@@ -53,7 +54,8 @@
    "tfrecord_idx = \"idx_files/train.idx\"\n",
    "tfrecord2idx_script = \"tfrecord2idx\"\n",
    "\n",
-   "N = 8  # number of GPUs\n",
+   "res = subprocess.run([\"nvidia-smi\", \"-L\"], stdout=subprocess.PIPE, text=True)\n",
+   "N = res.stdout.count(\"\\n\")  # number of GPUs\n",
    "BATCH_SIZE = 128  # batch size per GPU\n",
    "IMAGE_SIZE = 3"
  ]
diff --git a/docs/examples/frameworks/pytorch/pytorch-basic_example.ipynb b/docs/examples/frameworks/pytorch/pytorch-basic_example.ipynb
index 504db6e955c..5b64d0fa6ba 100644
--- a/docs/examples/frameworks/pytorch/pytorch-basic_example.ipynb
+++ b/docs/examples/frameworks/pytorch/pytorch-basic_example.ipynb
@@ -25,18 +25,20 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os.path\n",
+   "import subprocess\n",
    "\n",
    "test_data_root = os.environ[\"DALI_EXTRA_PATH\"]\n",
    "\n",
    "# Caffe LMDB\n",
    "lmdb_folder = os.path.join(test_data_root, \"db\", \"lmdb\")\n",
    "\n",
-   "N = 8  # number of GPUs\n",
+   "res = subprocess.run([\"nvidia-smi\", \"-L\"], stdout=subprocess.PIPE, text=True)\n",
+   "N = res.stdout.count(\"\\n\")  # number of GPUs\n",
    "BATCH_SIZE = 128  # batch size per GPU\n",
    "ITERATIONS = 32\n",
    "IMAGE_SIZE = 3"
  ]
diff --git a/docs/examples/frameworks/pytorch/pytorch-various-readers.ipynb b/docs/examples/frameworks/pytorch/pytorch-various-readers.ipynb
index b9e5f4f533e..3c9844a9e99 100644
--- a/docs/examples/frameworks/pytorch/pytorch-various-readers.ipynb
+++ b/docs/examples/frameworks/pytorch/pytorch-various-readers.ipynb
@@ -31,11 +31,12 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os.path\n",
+   "import subprocess\n",
    "\n",
    "test_data_root = os.environ[\"DALI_EXTRA_PATH\"]\n",
    "\n",
@@ -53,7 +54,8 @@
    "tfrecord_idx = \"idx_files/train.idx\"\n",
    "tfrecord2idx_script = \"tfrecord2idx\"\n",
    "\n",
-   "N = 8  # number of GPUs\n",
+   "res = subprocess.run([\"nvidia-smi\", \"-L\"], stdout=subprocess.PIPE, text=True)\n",
+   "N = res.stdout.count(\"\\n\")  # number of GPUs\n",
    "BATCH_SIZE = 128  # batch size per GPU\n",
    "IMAGE_SIZE = 3"
  ]
diff --git a/docs/examples/frameworks/tensorflow/tensorflow-various-readers.ipynb b/docs/examples/frameworks/tensorflow/tensorflow-various-readers.ipynb
index 9ce9c8d3a8c..89e30ecd2cd 100644
--- a/docs/examples/frameworks/tensorflow/tensorflow-various-readers.ipynb
+++ b/docs/examples/frameworks/tensorflow/tensorflow-various-readers.ipynb
@@ -31,11 +31,12 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os.path\n",
+   "import subprocess\n",
    "\n",
    "test_data_root = os.environ[\"DALI_EXTRA_PATH\"]\n",
    "\n",
@@ -53,7 +54,8 @@
    "tfrecord_idx = \"idx_files/train.idx\"\n",
    "tfrecord2idx_script = \"tfrecord2idx\"\n",
    "\n",
-   "N = 8  # number of GPUs\n",
+   "res = subprocess.run([\"nvidia-smi\", \"-L\"], stdout=subprocess.PIPE, text=True)\n",
+   "N = res.stdout.count(\"\\n\")  # number of GPUs\n",
    "BATCH_SIZE = 128  # batch size per GPU\n",
    "ITERATIONS = 32\n",
    "IMAGE_SIZE = 3"
  ]
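
Note: instead of hard-coding N = 8, the notebooks now derive the GPU count from nvidia-smi -L, which prints one line per device. A slightly more defensive variant, with a fallback for machines without nvidia-smi (the fallback value is an assumption, not part of the notebooks):

    import subprocess

    def gpu_count(default=1):
        # nvidia-smi -L prints one "GPU <n>: ..." line per device
        try:
            res = subprocess.run(["nvidia-smi", "-L"], stdout=subprocess.PIPE, text=True, check=True)
            return len(res.stdout.splitlines()) or default
        except (OSError, subprocess.CalledProcessError):
            return default

    N = gpu_count()
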
diff --git a/docs/examples/use_cases/tensorflow/resnet-n/README.rst b/docs/examples/use_cases/tensorflow/resnet-n/README.rst
index d18ae5a124f..2f9d1bf7d47 100644
--- a/docs/examples/use_cases/tensorflow/resnet-n/README.rst
+++ b/docs/examples/use_cases/tensorflow/resnet-n/README.rst
@@ -24,7 +24,7 @@ For the full training on 8 GPUs::

     mpiexec --allow-run-as-root --bind-to socket -np 8 \
         python resnet.py --num_iter=90 --iter_unit=epoch \
-        --data_dir=/data/imagenet/train-val-tfrecord-480/ \
+        --data_dir=/data/imagenet/train-val-tfrecord/ \
         --precision=fp16 --display_every=100 \
         --export_dir=/tmp --dali_mode="GPU"

@@ -32,7 +32,7 @@ For the benchmark training on 8 GPUs::

     mpiexec --allow-run-as-root --bind-to socket -np 8 \
         python resnet.py --num_iter=400 --iter_unit=batch \
-        --data_dir=/data/imagenet/train-val-tfrecord-480/ \
+        --data_dir=/data/imagenet/train-val-tfrecord/ \
         --precision=fp16 --display_every=100 --dali_mode="GPU"


@@ -49,7 +49,7 @@ For the full training on 8 GPUs::

     mpiexec --allow-run-as-root --bind-to socket -np 8 \
         python resnet_ctl.py --num_iter=90 --iter_unit=epoch \
-        --data_dir=/data/imagenet/train-val-tfrecord-480/ \
+        --data_dir=/data/imagenet/train-val-tfrecord/ \
         --precision=fp16 --display_every=100 \
         --export_dir=/tmp --dali_mode="GPU"

@@ -57,7 +57,7 @@ For the benchmark training on 8 GPUs::

     mpiexec --allow-run-as-root --bind-to socket -np 8 \
         python resnet_ctl.py --num_iter=400 --iter_unit=batch \
-        --data_dir=/data/imagenet/train-val-tfrecord-480/ \
+        --data_dir=/data/imagenet/train-val-tfrecord/ \
         --precision=fp16 --display_every=100 --dali_mode="GPU"

 Predicting in CTL (Custom Training Loop) mode
diff --git a/qa/TL0_rn50-benchmarks/test.sh b/qa/TL0_rn50-benchmarks/test.sh
deleted file mode 100755
index fcf45287953..00000000000
--- a/qa/TL0_rn50-benchmarks/test.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/bin/bash -ex
-
-test_body() {
-    BINNAME=dali_benchmark.bin
-
-    for DIRNAME in \
-        "../../build/dali/python/nvidia/dali" \
-        "$(python -c 'import os; from nvidia import dali; print(os.path.dirname(dali.__file__))' 2>/dev/null || echo '')"
-    do
-        if [ -x "$DIRNAME/test/$BINNAME" ]; then
-            FULLPATH="$DIRNAME/test/$BINNAME"
-            break
-        fi
-    done
-
-    if [[ -z "$FULLPATH" ]]; then
-        echo "ERROR: $BINNAME not found"
-        exit 1
-    fi
-
-    "$FULLPATH" --benchmark_filter="RN50*"
-}
-
-pushd ../..
-source ./qa/test_template.sh
-popd
diff --git a/qa/TL0_rn50_python-benchmarks/test.sh b/qa/TL0_rn50_python-benchmarks/test.sh
deleted file mode 100755
index de32e66ed7f..00000000000
--- a/qa/TL0_rn50_python-benchmarks/test.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash -e
-# used pip packages
-pip_packages='numpy'
-target_dir=./dali/benchmark
-
-test_body() {
-    # test code
-    python resnet50_bench.py
-}
-
-pushd ../..
-source ./qa/test_template.sh
-popd
diff --git a/qa/TL1_python-nvjpeg_test/test.sh b/qa/TL1_python-nvjpeg_test/test.sh
index 8b610cf353a..b9864a27278 100755
--- a/qa/TL1_python-nvjpeg_test/test.sh
+++ b/qa/TL1_python-nvjpeg_test/test.sh
@@ -8,8 +8,29 @@ do_once() {
 }

 test_body() {
+    export DATA_DIR=/data/coco/coco-2017/coco2017/
+    export IS_TMP_DIR=0
+    if [ ! -f "/data/coco/coco-2017/coco2017/train2017/000000581929.jpg" ] && [ -f "/data/coco/coco-2017/coco2017/train2017.zip" ]; then
+        export DATA_DIR=$(mktemp -d)
+        export IS_TMP_DIR=1
+        pushd ${DATA_DIR}
+        cp /data/coco/coco-2017/coco2017/train2017.zip . &
+        cp /data/coco/coco-2017/coco2017/val2017.zip . &
+        cp /data/coco/coco-2017/coco2017/annotations_trainval2017.zip . &
+        wait
+        unzip -q train2017.zip &
+        unzip -q val2017.zip &
+        unzip -q annotations_trainval2017.zip &
+        wait
+        popd
+    fi
     # test code
-    python test_data_containers.py --gpus ${NUM_GPUS} -b 2048 -p 10
+    python test_data_containers.py --gpus ${NUM_GPUS} -b 2048 -p 10 \
+        COCOReaderPipeline="[['${DATA_DIR}/train2017', \
+                             '${DATA_DIR}/annotations/instances_train2017.json'], \
+                            ['${DATA_DIR}/val2017', \
+                             '${DATA_DIR}/annotations/instances_val2017.json']]"
+    ((IS_TMP_DIR)) && rm -rf ${DATA_DIR}
 }

 pushd ../..
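
Note: several QA scripts now stage COCO out of zip archives into a scratch directory when the unpacked tree is not present on the mount. A rough Python equivalent of that staging step, assuming the same /data/coco layout (the function name is ours, not the scripts'):

    import shutil
    import tempfile
    import zipfile
    from concurrent.futures import ThreadPoolExecutor
    from pathlib import Path

    ARCHIVES = ["train2017.zip", "val2017.zip", "annotations_trainval2017.zip"]

    def stage_coco(src="/data/coco/coco-2017/coco2017"):
        """Return (data_dir, is_tmp); unpack the zips into a temp dir when needed."""
        src = Path(src)
        if (src / "train2017").is_dir():  # images already unpacked
            return src, False
        data_dir = Path(tempfile.mkdtemp())

        def unzip(name):
            with zipfile.ZipFile(data_dir / name) as zf:
                zf.extractall(data_dir)

        # copy, then extract, in parallel - mirroring the scripts' backgrounded cp/unzip plus wait
        with ThreadPoolExecutor() as pool:
            list(pool.map(lambda name: shutil.copy(src / name, data_dir / name), ARCHIVES))
            list(pool.map(unzip, ARCHIVES))
        return data_dir, True
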
diff --git a/qa/TL1_separate_executor/test_nofw.sh b/qa/TL1_separate_executor/test_nofw.sh
index c9b10274c83..5d8786739b6 100755
--- a/qa/TL1_separate_executor/test_nofw.sh
+++ b/qa/TL1_separate_executor/test_nofw.sh
@@ -9,6 +9,19 @@ do_once() {
 }

 test_body() {
+    # time a listing of the dataset with a 10 s watchdog: whichever of the two
+    # background jobs finishes first kills the other via pkill
+    start=`date +%s`
+    (sleep 10 && pkill -HUP ls && true) &
+    (ls /data/imagenet/train-jpeg > /dev/null && pkill -HUP sleep) &
+    wait
+    end=`date +%s`
+    runtime=$((end-start))
+    echo "Data access time: $runtime seconds"
+    if [ $runtime -gt 3 ]; then
+        echo "Data access time is greater than 3 seconds, skipping the test"
+        return 0
+    fi
     python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 256 --workers 3 --separate_queue \
         --cpu_size 2 --gpu_size 2 --fp16 --nhwc
     python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 256 --workers 3 --separate_queue \
diff --git a/qa/TL1_separate_executor/test_tf.sh b/qa/TL1_separate_executor/test_tf.sh
index f26ad4ab7ba..a9cc1289ed7 100755
--- a/qa/TL1_separate_executor/test_tf.sh
+++ b/qa/TL1_separate_executor/test_tf.sh
@@ -1,6 +1,6 @@
 #!/bin/bash -e
 # used pip packages
-pip_packages='tensorflow-gpu'
+pip_packages='tensorflow-gpu nose'
 target_dir=./dali/test/python

 one_config_only=true
diff --git a/qa/TL1_ssd_training/test.sh b/qa/TL1_ssd_training/test.sh
index 17bc44c5312..0826986e7b3 100755
--- a/qa/TL1_ssd_training/test.sh
+++ b/qa/TL1_ssd_training/test.sh
@@ -5,7 +5,25 @@ target_dir=./docs/examples/use_cases/pytorch/single_stage_detector/

 test_body() {
     NUM_GPUS=$(nvidia-smi -L | wc -l)
-    torchrun --nproc_per_node=${NUM_GPUS} main.py --backbone resnet50 --warmup 300 --bs 64 --eval-batch-size 8 --epochs 4 --data /data/coco/coco-2017/coco2017/ --data_pipeline dali --target 0.085
+    export DATA_DIR=/data/coco/coco-2017/coco2017/
+    export IS_TMP_DIR=0
+    if [ -f "/data/coco/coco-2017/coco2017/train2017.zip" ]; then
+        apt update && apt install -y unzip
+        export DATA_DIR=$(mktemp -d)
+        export IS_TMP_DIR=1
+        pushd ${DATA_DIR}
+        cp /data/coco/coco-2017/coco2017/train2017.zip . &
+        cp /data/coco/coco-2017/coco2017/val2017.zip . &
+        cp /data/coco/coco-2017/coco2017/annotations_trainval2017.zip . &
+        wait
+        unzip -q train2017.zip &
+        unzip -q val2017.zip &
+        unzip -q annotations_trainval2017.zip &
+        wait
+        popd
+    fi
+    torchrun --nproc_per_node=${NUM_GPUS} main.py --backbone resnet50 --warmup 300 --bs 256 --eval-batch-size 8 --epochs 4 --data ${DATA_DIR} --data_pipeline dali --target 0.085
+    ((IS_TMP_DIR)) && rm -rf ${DATA_DIR}
 }

 pushd ../..
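
Note: the perf-oriented TL1/TL2 scripts probe how fast the dataset mount responds before benchmarking: they time an ls guarded by a 10-second watchdog and bail out when it exceeds 3 seconds, since a slow network share would turn the run into an I/O benchmark. The same probe restated in Python (path and thresholds copied from the scripts):

    import subprocess
    import time

    def dataset_is_fast(path="/data/imagenet/train-jpeg", limit_s=3, watchdog_s=10):
        # list the dataset directory, giving up after watchdog_s seconds
        start = time.monotonic()
        try:
            subprocess.run(["ls", path], stdout=subprocess.DEVNULL, timeout=watchdog_s)
        except subprocess.TimeoutExpired:
            return False
        return time.monotonic() - start <= limit_s
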
diff --git a/qa/TL1_superres_pytorch/test.sh b/qa/TL1_superres_pytorch/test.sh
index b3c62788624..a5039f400c8 100644
--- a/qa/TL1_superres_pytorch/test.sh
+++ b/qa/TL1_superres_pytorch/test.sh
@@ -29,7 +29,7 @@ do_once() {

     # Pre-trained FlowNet2.0 weights
     # publicly available on https://drive.google.com/file/d/1QW03eyYG_vD-dT-Mx4wopYvtPu_msTKn/view
-    FLOWNET_PATH=/data/dali/pretrained_models/FlowNet2-SD_checkpoint.pth.tar
+    FLOWNET_PATH=/dali_internal/pretrained_models/FlowNet2-SD_checkpoint.pth.tar

     git clone https://github.com/NVIDIA/flownet2-pytorch.git
     cd flownet2-pytorch
diff --git a/qa/TL1_tensorflow-dali_test/test.sh b/qa/TL1_tensorflow-dali_test/test.sh
index fe0a5a28339..72097b21d6c 100644
--- a/qa/TL1_tensorflow-dali_test/test.sh
+++ b/qa/TL1_tensorflow-dali_test/test.sh
@@ -60,9 +60,9 @@ do_once() {
     # TF is already available and we can set env variables
     install_pip_pkg "pip install --force-reinstall horovod==0.28.1 -f /pip-packages"

-    for file in $(ls /data/imagenet/train-val-tfrecord-480-subset);
+    for file in $(ls /data/imagenet/train-val-tfrecord-small);
     do
-        python ../../../../../tools/tfrecord2idx /data/imagenet/train-val-tfrecord-480-subset/${file} \
+        python ../../../../../tools/tfrecord2idx /data/imagenet/train-val-tfrecord-small/${file} \
             idx-files/${file}.idx &
     done
     wait
@@ -78,7 +78,7 @@ test_body() {
     # test code
     mpiexec --allow-run-as-root --bind-to none -np ${NUM_GPUS} \
         python -u resnet.py \
-        --data_dir=/data/imagenet/train-val-tfrecord-480-subset --data_idx_dir=idx-files/ \
+        --data_dir=/data/imagenet/train-val-tfrecord-small --data_idx_dir=idx-files/ \
         --precision=fp16 --num_iter=100 --iter_unit=batch --display_every=50 \
         --batch=64 --use_xla --dali_mode="GPU" --log_dir=./
 }
diff --git a/qa/TL2_FW_iterators_perf/test_pytorch.sh b/qa/TL2_FW_iterators_perf/test_pytorch.sh
index 5df1a4a74e3..f508b9596cb 100755
--- a/qa/TL2_FW_iterators_perf/test_pytorch.sh
+++ b/qa/TL2_FW_iterators_perf/test_pytorch.sh
@@ -13,6 +13,17 @@ test_body() {
         python test_RN50_data_fw_iterators.py --framework ${fw} --gpus ${NUM_GPUS} -b 13 \
             --workers 3 --prefetch 2 --epochs 3
     done
+    start=`date +%s`
+    (sleep 10 && pkill -HUP ls && true) &
+    (ls /data/imagenet/train-jpeg > /dev/null && pkill -HUP sleep) &
+    wait
+    end=`date +%s`
+    runtime=$((end-start))
+    echo "Data access time: $runtime seconds"
+    if [ $runtime -gt 3 ]; then
+        echo "Data access time is greater than 3 seconds, skipping the test"
+        return 0
+    fi
     torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --worker_init fork --benchmark_iters 500 --test_pipes parallel
     torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --worker_init spawn --benchmark_iters 500 --test_pipes parallel
     torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --benchmark_iters 500 --test_pipes file_reader
diff --git a/qa/TL2_RN50_data_perf/test.sh b/qa/TL2_RN50_data_perf/test.sh
index 8fa95864f1a..d6ae95ffa1f 100755
--- a/qa/TL2_RN50_data_perf/test.sh
+++ b/qa/TL2_RN50_data_perf/test.sh
@@ -8,6 +8,17 @@ do_once() {
 }

 test_body() {
+    start=`date +%s`
+    (sleep 10 && pkill -HUP ls && true) &
+    (ls /data/imagenet/train-jpeg > /dev/null && pkill -HUP sleep) &
+    wait
+    end=`date +%s`
+    runtime=$((end-start))
+    echo "Data access time: $runtime seconds"
+    if [ $runtime -gt 3 ]; then
+        echo "Data access time is greater than 3 seconds, skipping the test"
+        return 0
+    fi
     # test code
     python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 256 --workers 3 --prefetch 2 --decoder_type "legacy"
     python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 256 --workers 3 --prefetch 2 --decoder_type "experimental"
diff --git a/qa/TL3_EfficientDet_convergence/test_tensorflow.sh b/qa/TL3_EfficientDet_convergence/test_tensorflow.sh
index 6f995e6afde..d8ecf4f8c19 100644
--- a/qa/TL3_EfficientDet_convergence/test_tensorflow.sh
+++ b/qa/TL3_EfficientDet_convergence/test_tensorflow.sh
@@ -14,11 +14,28 @@ export NCCL_NVLS_ENABLE=0
 # workaround for https://github.com/tensorflow/tensorflow/issues/63548
 export WRAPT_DISABLE_EXTENSIONS=1

+export DATA_DIR=/data/coco/coco-2017/coco2017/
+export IS_TMP_DIR=0
+if [ ! -f "/data/coco/coco-2017/coco2017/train2017/000000581929.jpg" ] && [ -f "/data/coco/coco-2017/coco2017/train2017.zip" ]; then
+    export DATA_DIR=$(mktemp -d)
+    export IS_TMP_DIR=1
+    pushd ${DATA_DIR}
+    cp /data/coco/coco-2017/coco2017/train2017.zip . &
+    cp /data/coco/coco-2017/coco2017/val2017.zip . &
+    cp /data/coco/coco-2017/coco2017/annotations_trainval2017.zip . &
+    wait
+    unzip -q train2017.zip &
+    unzip -q val2017.zip &
+    unzip -q annotations_trainval2017.zip &
+    wait
+    popd
+fi
+
 python train.py \
     --epochs 1 \
     --input_type coco \
-    --images_path /data/coco/coco-2017/coco2017/train2017 \
-    --annotations_path /data/coco/coco-2017/coco2017/annotations/instances_train2017.json \
+    --images_path ${DATA_DIR}/train2017 \
+    --annotations_path ${DATA_DIR}/annotations/instances_train2017.json \
     --batch_size 3 \
     --train_steps 6000 \
     --eval_steps 1000 \
@@ -33,3 +50,6 @@ python train.py
     --output_filename out_weights_1.h5 2>&1 | tee $LOG

-CLEAN_AND_EXIT ${PIPESTATUS[0]}
+RET=${PIPESTATUS[0]}
+# drop the staged dataset before exiting
+((IS_TMP_DIR)) && rm -rf ${DATA_DIR}
+CLEAN_AND_EXIT ${RET}
diff --git a/qa/TL3_SSD_convergence/test_pytorch.sh b/qa/TL3_SSD_convergence/test_pytorch.sh
index d66adcb45bd..6b5e26abf50 100644
--- a/qa/TL3_SSD_convergence/test_pytorch.sh
+++ b/qa/TL3_SSD_convergence/test_pytorch.sh
@@ -15,6 +15,24 @@ pip install git+https://github.com/NVIDIA/cocoapi.git#subdirectory=PythonAPI

 NUM_GPUS=$(nvidia-smi -L | wc -l)

+export DATA_DIR=/data/coco/coco-2017/coco2017/
+export IS_TMP_DIR=0
+if [ -f "/data/coco/coco-2017/coco2017/train2017.zip" ]; then
+    apt update && apt install -y unzip
+    export DATA_DIR=$(mktemp -d)
+    export IS_TMP_DIR=1
+    pushd ${DATA_DIR}
+    cp /data/coco/coco-2017/coco2017/train2017.zip . &
+    cp /data/coco/coco-2017/coco2017/val2017.zip . &
+    cp /data/coco/coco-2017/coco2017/annotations_trainval2017.zip . &
+    wait
+    unzip -q train2017.zip &
+    unzip -q val2017.zip &
+    unzip -q annotations_trainval2017.zip &
+    wait
+    popd
+fi
+
 LOG=dali.log

 SECONDS=0
@@ -24,7 +42,9 @@ export NCCL_NVLS_ENABLE=0
 # Prevent OOM due to fragmentation on 16G machines
 export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:4096

-torchrun --nproc_per_node=${NUM_GPUS} main.py --backbone resnet50 --warmup 300 --bs 64 --eval-batch-size 8 --data /coco --data /data/coco/coco-2017/coco2017/ --data_pipeline dali --target 0.25 2>&1 | tee $LOG
+torchrun --nproc_per_node=${NUM_GPUS} main.py --backbone resnet50 --warmup 300 --bs 256 --eval-batch-size 8 --data /coco --data ${DATA_DIR} --data_pipeline dali --target 0.25 2>&1 | tee $LOG

 RET=${PIPESTATUS[0]}
+# clean up the staged dataset only after PIPESTATUS has been read
+((IS_TMP_DIR)) && rm -rf ${DATA_DIR}
 echo "Training ran in $SECONDS seconds"
diff --git a/qa/setup_packages.py b/qa/setup_packages.py
index 90f7890b176..94b0ae1ee63 100755
--- a/qa/setup_packages.py
+++ b/qa/setup_packages.py
@@ -550,7 +550,10 @@ def get_pyvers_name(self, url, cuda_version):
                 python_min_ver="3.8",
                 python_max_ver="3.12",
             )
-        ]
+        ],
+        # skip tests for CUDA 12 as PaddlePaddle doesn't support this CUDA version yet
+        # and we may hit a runner that requires it
+        "120": [],
     },
     links_index="https://www.paddlepaddle.org.cn/" "whl/linux/mkl/avx/stable.html",
 ),