diff --git a/dali/benchmark/dali_bench.h b/dali/benchmark/dali_bench.h
index a32c7c4bb03..598c1212493 100644
--- a/dali/benchmark/dali_bench.h
+++ b/dali/benchmark/dali_bench.h
@@ -21,6 +21,7 @@
 #include
 #include
+#include "dali/test/dali_test_config.h"
 #include "dali/core/common.h"
 #include "dali/core/tensor_shape.h"
 #include "dali/util/image.h"
@@ -28,7 +29,7 @@ namespace dali {

 // Note: this is setup for the binary to be executed from "build"
-const string image_folder = "/data/dali/benchmark/benchmark_images";  // NOLINT
+static const string image_folder = testing::dali_extra_path() + "/db/single/jpeg";  // NOLINT

 class DALIBenchmark : public benchmark::Fixture {
  public:
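
Note: testing::dali_extra_path() resolves the DALI_extra data checkout at run time, so the benchmark no longer depends on a machine-specific /data mount. This is roughly the same lookup the Python tests do via get_dali_extra_path() in test_utils - a sketch assuming the usual DALI_EXTRA_PATH convention:

    import os

    # assumes DALI_EXTRA_PATH points at a checkout of the DALI_extra repository
    image_folder = os.path.join(os.environ["DALI_EXTRA_PATH"], "db", "single", "jpeg")
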
diff --git a/dali/benchmark/resnet50_bench.py b/dali/benchmark/resnet50_bench.py
deleted file mode 100755
index 0df97ad0869..00000000000
--- a/dali/benchmark/resnet50_bench.py
+++ /dev/null
@@ -1,153 +0,0 @@
-# Copyright (c) 2017-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-from nvidia.dali.pipeline import Pipeline
-import nvidia.dali.ops as ops
-import nvidia.dali.types as types
-import numpy as np
-from timeit import default_timer as timer
-
-image_folder = "/data/dali/benchmark/benchmark_images"
-
-
-def read_jpegs(folder):
-    with open(folder + "/image_list.txt", "r") as file:
-        files = [line.rstrip() for line in file]
-
-    images = []
-    for fname in files:
-        f = open(image_folder + "/" + fname, "rb")
-        images.append(np.fromstring(f.read(), dtype=np.uint8))
-    return images
-
-
-def make_batch(size):
-    data = read_jpegs(image_folder)
-    return [data[i % len(data)] for i in range(size)]
-
-
-class C2Pipe(Pipeline):
-    def __init__(self, batch_size, num_threads, device_id, pipelined=True, exec_async=True):
-        super(C2Pipe, self).__init__(
-            batch_size, num_threads, device_id, exec_pipelined=pipelined, exec_async=exec_async
-        )
-        self.input = ops.ExternalSource()
-        self.decode = ops.ImageDecoder(device="cpu", output_type=types.RGB)
-        self.rcm = ops.FastResizeCropMirror(crop=(224, 224))
-        self.np = ops.CropMirrorNormalize(
-            device="gpu", dtype=types.FLOAT16, mean=[128.0, 128.0, 128.0], std=[1.0, 1.0, 1.0]
-        )
-        self.uniform = ops.random.Uniform(range=(0.0, 1.0))
-        self.resize_uniform = ops.random.Uniform(range=(256.0, 480.0))
-        self.mirror = ops.random.CoinFlip(probability=0.5)
-
-    def define_graph(self):
-        self.jpegs = self.input()
-        images = self.decode(self.jpegs)
-        resized = self.rcm(
-            images,
-            crop_pos_x=self.uniform(),
-            crop_pos_y=self.uniform(),
-            mirror=self.mirror(),
-            resize_shorter=self.resize_uniform(),
-        )
-        output = self.np(resized.gpu())
-        return output
-
-    def iter_setup(self):
-        raw_data = make_batch(self.batch_size)
-        self.feed_input(self.jpegs, raw_data)
-
-
-class HybridPipe(Pipeline):
-    def __init__(self, batch_size, num_threads, device_id, pipelined=True, exec_async=True):
-        super(HybridPipe, self).__init__(
-            batch_size, num_threads, device_id, exec_pipelined=pipelined, exec_async=exec_async
-        )
-        self.input = ops.ExternalSource()
-        self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB)
-        self.resize = ops.Resize(device="gpu", interp_type=types.INTERP_LINEAR)
-        self.cmnp = ops.CropMirrorNormalize(
-            device="gpu",
-            dtype=types.FLOAT16,
-            crop=(224, 224),
-            mean=[128.0, 128.0, 128.0],
-            std=[1.0, 1.0, 1.0],
-        )
-        self.uniform = ops.random.Uniform(range=(0.0, 1.0))
-        self.resize_uniform = ops.random.Uniform(range=(256.0, 480.0))
-        self.mirror = ops.random.CoinFlip(probability=0.5)
-
-    def define_graph(self):
-        self.jpegs = self.input()
-        images = self.decode(self.jpegs)
-        resized = self.resize(images, resize_shorter=self.resize_uniform())
-        output = self.cmnp(
-            resized, mirror=self.mirror(), crop_pos_x=self.uniform(), crop_pos_y=self.uniform()
-        )
-        return output
-
-    def iter_setup(self):
-        raw_data = make_batch(self.batch_size)
-        self.feed_input(self.jpegs, raw_data)
-
-
-def run_benchmarks(PipeType, args):
-    print("Running Benchmarks For {}".format(PipeType.__name__))
-    for executor in args.executors:
-        pipelined = executor > 0
-        exec_async = executor > 1
-        for batch_size in args.batch_sizes:
-            for num_threads in args.thread_counts:
-                pipe = PipeType(batch_size, num_threads, 0, pipelined, exec_async)
-                pipe.build()
-                start_time = timer()
-                for i in range(args.num_iters):
-                    pipe.run()
-
-                total_time = timer() - start_time
-                print(
-                    "{}/{}/{}/{}: FPS={}".format(
-                        PipeType.__name__,
-                        executor,
-                        batch_size,
-                        num_threads,
-                        float(batch_size * args.num_iters) / total_time,
-                    )
-                )
-
-
-def get_args():
-    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument(
-        "--batch-sizes", default=[128], help="Comma separated list of batch sizes to run"
-    )
-    parser.add_argument(
-        "--thread-counts", default=[1, 2, 3, 4], help="Comma separated list of thread counts"
-    )
-    parser.add_argument("--executors", default=[2], help="List of executors to run")
-    parser.add_argument("--num-iters", type=int, default=100, help="Number of iterations to run")
-    return parser.parse_args()
-
-
-def main():
-    args = get_args()
-    pipe_types = [C2Pipe, HybridPipe]
-    for PipeType in pipe_types:
-        run_benchmarks(PipeType, args)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/dali/benchmark/resnet50_nvjpeg_bench.cc b/dali/benchmark/resnet50_nvjpeg_bench.cc
index 8985f5bce8e..3358446a1c8 100755
--- a/dali/benchmark/resnet50_nvjpeg_bench.cc
+++ b/dali/benchmark/resnet50_nvjpeg_bench.cc
@@ -41,7 +41,7 @@ BENCHMARK_DEFINE_F(RealRN50, nvjpegPipe)(benchmark::State& st) { // NOLINT

   pipe.AddOperator(
       OpSpec("Caffe2Reader")
-          .AddArg("path", "/data/imagenet/train-c2lmdb-480")
+          .AddArg("path", testing::dali_extra_path() + "/db/c2lmdb")
          .AddOutput("raw_jpegs", StorageDevice::CPU)
          .AddOutput("labels", StorageDevice::CPU));
diff --git a/dali/test/python/test_RN50_data_fw_iterators.py b/dali/test/python/test_RN50_data_fw_iterators.py
index ca17614253b..b8fdd3b724f 100644
--- a/dali/test/python/test_RN50_data_fw_iterators.py
+++ b/dali/test/python/test_RN50_data_fw_iterators.py
@@ -17,8 +17,10 @@
 import nvidia.dali.types as types
 import argparse
 import time
+from test_utils import get_dali_extra_path
+import os

-data_paths = ["/data/imagenet/train-jpeg"]
+data_paths = os.path.join(get_dali_extra_path(), "db", "single", "jpeg")


 class RN50Pipeline(Pipeline):
@@ -28,9 +30,7 @@ def __init__(
         super(RN50Pipeline, self).__init__(
             batch_size, num_threads, device_id, prefetch_queue_depth=prefetch
         )
-        self.input = ops.readers.File(
-            file_root=data_paths[0], shard_id=device_id, num_shards=num_gpus
-        )
+        self.input = ops.readers.File(file_root=data_paths, shard_id=device_id, num_shards=num_gpus)
         self.decode_gpu = ops.decoders.Image(device="mixed", output_type=types.RGB)
         self.res = ops.RandomResizedCrop(device="gpu", size=(224, 224))
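
Note: the reader now takes the DALI_extra jpeg directory directly; shard_id/num_shards make each GPU's pipeline read a disjoint slice of the sorted file list, which is why device_id can double as the shard index. A conceptual sketch of the contiguous-chunk assignment (illustration only, not DALI's code):

    def shard_slice(files, shard_id, num_shards):
        # each shard gets a contiguous chunk of the sorted file list
        files = sorted(files)
        begin = shard_id * len(files) // num_shards
        end = (shard_id + 1) * len(files) // num_shards
        return files[begin:end]
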
diff --git a/dali/test/python/test_RN50_data_pipeline.py b/dali/test/python/test_RN50_data_pipeline.py
index dce25dca6ab..4577d779c9a 100644
--- a/dali/test/python/test_RN50_data_pipeline.py
+++ b/dali/test/python/test_RN50_data_pipeline.py
@@ -21,6 +21,8 @@
 import argparse
 import time
 from test_utils import get_dali_extra_path, AverageMeter
+from subprocess import call
+import tempfile


 class CommonPipeline(Pipeline):
@@ -179,6 +181,16 @@ def __init__(self, **kwargs):
         super(TFRecordPipeline, self).__init__(**kwargs)
         tfrecord = sorted(glob.glob(kwargs["data_paths"][0]))
         tfrecord_idx = sorted(glob.glob(kwargs["data_paths"][1]))
+        if len(tfrecord_idx) == 0:
+            # no index files found - generate one per tfrecord into a temporary
+            # directory that lives as long as the pipeline object
+            self.temp_dir = tempfile.TemporaryDirectory()
+            tfrecord_idx = [
+                f"{self.temp_dir.name}/{os.path.basename(f)}.idx" for f in tfrecord
+            ]
+            for tfrecord_file, tfrecord_idx_file in zip(tfrecord, tfrecord_idx):
+                print(f"Generating index file for {tfrecord_file}")
+                call(["tfrecord2idx", tfrecord_file, tfrecord_idx_file])
         cache_enabled = kwargs["decoder_cache_params"]["cache_enabled"]
         self.input = ops.readers.TFRecord(
             path=tfrecord,
@@ -240,23 +257,13 @@ def define_graph(self):


 test_data = {
+    # RecordIO & LMDB are not that frequently used any more, so we won't test full datasets,
+    # just small ones
     FileReadPipeline: [["/data/imagenet/train-jpeg"], ["/data/imagenet/val-jpeg"]],
-    MXNetReaderPipeline: [
-        [
-            "/data/imagenet/train-480-val-256-recordio/train.rec",
-            "/data/imagenet/train-480-val-256-recordio/train.idx",
-        ],
-        [
-            "/data/imagenet/train-480-val-256-recordio/val.rec",
-            "/data/imagenet/train-480-val-256-recordio/val.idx",
-        ],
-    ],
-    CaffeReadPipeline: [["/data/imagenet/train-lmdb-256x256"], ["/data/imagenet/val-lmdb-256x256"]],
-    Caffe2ReadPipeline: [["/data/imagenet/train-c2lmdb-480"], ["/data/imagenet/val-c2lmdb-256"]],
     TFRecordPipeline: [
         [
-            "/data/imagenet/train-val-tfrecord-480/train-*",
-            "/data/imagenet/train-val-tfrecord-480.idx/train-*",
+            "/data/imagenet/train-val-tfrecord/train-*",
+            "/data/imagenet/train-val-tfrecord.idx/train-*",
         ]
     ],
 }
@@ -617,4 +624,4 @@ def define_graph(self):
         )
         end = time.time()

-    print("OK {0}/{1}: {2}".format(i + 1, data_set_len, pipe_name.__name__))
+    print("OK {0}/{1}: {2}".format(i, data_set_len, pipe_name.__name__))
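
Note: when the .idx files are absent they are now produced on the fly with the tfrecord2idx tool that ships with DALI, one index per tfrecord, and the resulting list is passed as index_path alongside path. The same step as a standalone sketch (the glob pattern is only an example matching the paths above; keep a reference to the TemporaryDirectory or the index files vanish):

    import glob
    import os
    import tempfile
    from subprocess import call

    tfrecords = sorted(glob.glob("/data/imagenet/train-val-tfrecord/train-*"))
    idx_dir = tempfile.TemporaryDirectory()  # must stay referenced while the reader runs
    idx_files = [os.path.join(idx_dir.name, os.path.basename(f) + ".idx") for f in tfrecords]
    for rec, idx in zip(tfrecords, idx_files):
        call(["tfrecord2idx", rec, idx])  # one index file per tfrecord
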
diff --git a/dali/test/python/test_data_containers.py b/dali/test/python/test_data_containers.py
index 3657dfd41ec..412e9647624 100644
--- a/dali/test/python/test_data_containers.py
+++ b/dali/test/python/test_data_containers.py
@@ -19,6 +19,9 @@
 import nvidia.dali.types as types
 import os
 from nvidia.dali.pipeline import Pipeline
+from subprocess import call
+import tempfile
+import ast
 from test_utils import get_dali_extra_path


@@ -96,6 +99,16 @@ def __init__(self, batch_size, num_threads, device_id, num_gpus, data_paths, don
         super(TFRecordPipeline, self).__init__(batch_size, num_threads, device_id)
         tfrecord = sorted(glob.glob(data_paths[0]))
         tfrecord_idx = sorted(glob.glob(data_paths[1]))
+        if len(tfrecord_idx) == 0:
+            # no index files found - generate one per tfrecord into a temporary
+            # directory that lives as long as the pipeline object
+            self.temp_dir = tempfile.TemporaryDirectory()
+            tfrecord_idx = [
+                f"{self.temp_dir.name}/{os.path.basename(f)}.idx" for f in tfrecord
+            ]
+            for tfrecord_file, tfrecord_idx_file in zip(tfrecord, tfrecord_idx):
+                print(f"Generating index file for {tfrecord_file}")
+                call(["tfrecord2idx", tfrecord_file, tfrecord_idx_file])
         self.input = ops.readers.TFRecord(
             path=tfrecord,
             index_path=tfrecord_idx,
@@ -132,23 +150,13 @@ def define_graph(self):


 test_data = {
+    # RecordIO & LMDB are not that frequently used any more, so we won't test full datasets,
+    # just small ones
     FileReadPipeline: [["/data/imagenet/train-jpeg"], ["/data/imagenet/val-jpeg"]],
-    MXNetReaderPipeline: [
-        [
-            "/data/imagenet/train-480-val-256-recordio/train.rec",
-            "/data/imagenet/train-480-val-256-recordio/train.idx",
-        ],
-        [
-            "/data/imagenet/train-480-val-256-recordio/val.rec",
-            "/data/imagenet/train-480-val-256-recordio/val.idx",
-        ],
-    ],
-    CaffeReadPipeline: [["/data/imagenet/train-lmdb-256x256"], ["/data/imagenet/val-lmdb-256x256"]],
-    Caffe2ReadPipeline: [["/data/imagenet/train-c2lmdb-480"], ["/data/imagenet/val-c2lmdb-256"]],
     TFRecordPipeline: [
         [
-            "/data/imagenet/train-val-tfrecord-480/train-*",
-            "/data/imagenet/train-val-tfrecord-480.idx/train-*",
+            "/data/imagenet/train-val-tfrecord/train-*",
+            "/data/imagenet/train-val-tfrecord.idx/train-*",
         ]
     ],
     COCOReaderPipeline: [
@@ -189,6 +197,28 @@ def define_graph(self):
     ],
 }

+
+def parse_nested_square_brackets(string):
+    """Parse a string containing nested square brackets into a list of lists."""
+    try:
+        parsed_list = ast.literal_eval(string)
+        if isinstance(parsed_list, list):
+            return parsed_list
+        else:
+            raise ValueError("The provided string does not represent a list.")
+    except (ValueError, SyntaxError) as e:
+        raise ValueError("Invalid input string. Ensure it is a valid list format.") from e
+
+
+def parse_key_value_pairs(pairs):
+    """Convert a list of key=value strings into a dictionary."""
+    result = {}
+    for pair in pairs:
+        key, value = pair.split("=", 1)
+        result[key] = parse_nested_square_brackets(value)
+    return result
+
+
 parser = argparse.ArgumentParser(description="ImageDecoder RN50 dataset test")
 parser.add_argument(
     "-g", "--gpus", default=1, type=int, metavar="N", help="number of GPUs (default: 1)"
@@ -203,8 +233,17 @@ def define_graph(self):
     "-s", "--small", action="store_true", help="use small dataset, DALI_EXTRA_PATH needs to be set"
 )
 parser.add_argument("-n", "--no-mmap", action="store_true", help="don't mmap files from data set")
+parser.add_argument(
+    "datasets",
+    metavar="KEY=VALUE",
+    type=str,
+    nargs="*",
+    help="Pipeline_name=datasets pairs that replace the built-in data paths",
+)
 args = parser.parse_args()

+updated_datasets = parse_key_value_pairs(args.datasets)
+
 N = args.gpus  # number of GPUs
 BATCH_SIZE = args.batch  # batch size
 LOG_INTERVAL = args.print_freq
@@ -219,6 +258,10 @@ def define_graph(self):
 if SMALL_DATA_SET:
     test_data = small_test_data

+for k in test_data:
+    if k.__name__ in updated_datasets:
+        test_data[k] = updated_datasets[k.__name__]
+
 for pipe_name in test_data.keys():
     data_set_len = len(test_data[pipe_name])
     for i, data_set in enumerate(test_data[pipe_name]):
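
Note: the new positional KEY=VALUE arguments let a caller swap a pipeline's dataset without editing the file; the TL1 nvJPEG test below uses this to point COCOReaderPipeline at a freshly unpacked COCO copy. What the parsing amounts to, with a hypothetical path (ast.literal_eval is the same call parse_nested_square_brackets makes):

    import ast

    arg = "COCOReaderPipeline=[['/tmp/coco/train2017', '/tmp/coco/annotations/instances_train2017.json']]"
    key, value = arg.split("=", 1)
    overrides = {key: ast.literal_eval(value)}
    # {'COCOReaderPipeline': [['/tmp/coco/train2017',
    #                          '/tmp/coco/annotations/instances_train2017.json']]}
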
diff --git a/docs/examples/frameworks/paddle/paddle-basic_example.ipynb b/docs/examples/frameworks/paddle/paddle-basic_example.ipynb
index 2f5205d02c9..6bdd44c0ab2 100644
--- a/docs/examples/frameworks/paddle/paddle-basic_example.ipynb
+++ b/docs/examples/frameworks/paddle/paddle-basic_example.ipynb
@@ -25,18 +25,20 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os.path\n",
+   "import subprocess\n",
    "\n",
    "test_data_root = os.environ[\"DALI_EXTRA_PATH\"]\n",
    "\n",
    "# Caffe LMDB\n",
    "lmdb_folder = os.path.join(test_data_root, \"db\", \"lmdb\")\n",
    "\n",
-   "N = 8  # number of GPUs\n",
+   "res = subprocess.run([\"nvidia-smi\", \"-L\"], stdout=subprocess.PIPE, text=True)\n",
+   "N = res.stdout.count(\"\\n\")  # number of GPUs\n",
    "BATCH_SIZE = 128  # batch size per GPU\n",
    "IMAGE_SIZE = 3"
  ]
diff --git a/docs/examples/frameworks/paddle/paddle-various-readers.ipynb b/docs/examples/frameworks/paddle/paddle-various-readers.ipynb
index 34db796cbc3..9937374dbfc 100644
--- a/docs/examples/frameworks/paddle/paddle-various-readers.ipynb
+++ b/docs/examples/frameworks/paddle/paddle-various-readers.ipynb
@@ -31,11 +31,12 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os.path\n",
+   "import subprocess\n",
    "\n",
    "test_data_root = os.environ[\"DALI_EXTRA_PATH\"]\n",
    "\n",
@@ -53,7 +54,8 @@
    "tfrecord_idx = \"idx_files/train.idx\"\n",
    "tfrecord2idx_script = \"tfrecord2idx\"\n",
    "\n",
-   "N = 8  # number of GPUs\n",
+   "res = subprocess.run([\"nvidia-smi\", \"-L\"], stdout=subprocess.PIPE, text=True)\n",
+   "N = res.stdout.count(\"\\n\")  # number of GPUs\n",
    "BATCH_SIZE = 128  # batch size per GPU\n",
    "IMAGE_SIZE = 3"
  ]
diff --git a/docs/examples/frameworks/pytorch/pytorch-basic_example.ipynb b/docs/examples/frameworks/pytorch/pytorch-basic_example.ipynb
index 504db6e955c..5b64d0fa6ba 100644
--- a/docs/examples/frameworks/pytorch/pytorch-basic_example.ipynb
+++ b/docs/examples/frameworks/pytorch/pytorch-basic_example.ipynb
@@ -25,18 +25,20 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os.path\n",
+   "import subprocess\n",
    "\n",
    "test_data_root = os.environ[\"DALI_EXTRA_PATH\"]\n",
    "\n",
    "# Caffe LMDB\n",
    "lmdb_folder = os.path.join(test_data_root, \"db\", \"lmdb\")\n",
    "\n",
-   "N = 8  # number of GPUs\n",
+   "res = subprocess.run([\"nvidia-smi\", \"-L\"], stdout=subprocess.PIPE, text=True)\n",
+   "N = res.stdout.count(\"\\n\")  # number of GPUs\n",
    "BATCH_SIZE = 128  # batch size per GPU\n",
    "ITERATIONS = 32\n",
    "IMAGE_SIZE = 3"
  ]
diff --git a/docs/examples/frameworks/pytorch/pytorch-various-readers.ipynb b/docs/examples/frameworks/pytorch/pytorch-various-readers.ipynb
index b9e5f4f533e..3c9844a9e99 100644
--- a/docs/examples/frameworks/pytorch/pytorch-various-readers.ipynb
+++ b/docs/examples/frameworks/pytorch/pytorch-various-readers.ipynb
@@ -31,11 +31,12 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os.path\n",
+   "import subprocess\n",
    "\n",
    "test_data_root = os.environ[\"DALI_EXTRA_PATH\"]\n",
    "\n",
@@ -53,7 +54,8 @@
    "tfrecord_idx = \"idx_files/train.idx\"\n",
    "tfrecord2idx_script = \"tfrecord2idx\"\n",
    "\n",
-   "N = 8  # number of GPUs\n",
+   "res = subprocess.run([\"nvidia-smi\", \"-L\"], stdout=subprocess.PIPE, text=True)\n",
+   "N = res.stdout.count(\"\\n\")  # number of GPUs\n",
    "BATCH_SIZE = 128  # batch size per GPU\n",
    "IMAGE_SIZE = 3"
  ]
diff --git a/docs/examples/frameworks/tensorflow/tensorflow-various-readers.ipynb b/docs/examples/frameworks/tensorflow/tensorflow-various-readers.ipynb
index 9ce9c8d3a8c..89e30ecd2cd 100644
--- a/docs/examples/frameworks/tensorflow/tensorflow-various-readers.ipynb
+++ b/docs/examples/frameworks/tensorflow/tensorflow-various-readers.ipynb
@@ -31,11 +31,12 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os.path\n",
+   "import subprocess\n",
    "\n",
    "test_data_root = os.environ[\"DALI_EXTRA_PATH\"]\n",
    "\n",
@@ -53,7 +54,8 @@
    "tfrecord_idx = \"idx_files/train.idx\"\n",
    "tfrecord2idx_script = \"tfrecord2idx\"\n",
    "\n",
-   "N = 8  # number of GPUs\n",
+   "res = subprocess.run([\"nvidia-smi\", \"-L\"], stdout=subprocess.PIPE, text=True)\n",
+   "N = res.stdout.count(\"\\n\")  # number of GPUs\n",
    "BATCH_SIZE = 128  # batch size per GPU\n",
    "ITERATIONS = 32\n",
    "IMAGE_SIZE = 3"
  ]
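
Note: instead of hard-coding N = 8, the notebooks now derive the GPU count from nvidia-smi -L, which prints one line per device. A slightly more defensive variant, with a fallback for machines without nvidia-smi (the fallback value is an assumption, not part of the notebooks):

    import subprocess

    def gpu_count(default=1):
        # nvidia-smi -L prints one "GPU <n>: ..." line per device
        try:
            res = subprocess.run(["nvidia-smi", "-L"], stdout=subprocess.PIPE, text=True, check=True)
            return len(res.stdout.splitlines()) or default
        except (OSError, subprocess.CalledProcessError):
            return default

    N = gpu_count()
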
diff --git a/docs/examples/use_cases/tensorflow/resnet-n/README.rst b/docs/examples/use_cases/tensorflow/resnet-n/README.rst
index d18ae5a124f..2f9d1bf7d47 100644
--- a/docs/examples/use_cases/tensorflow/resnet-n/README.rst
+++ b/docs/examples/use_cases/tensorflow/resnet-n/README.rst
@@ -24,7 +24,7 @@ For the full training on 8 GPUs::

     mpiexec --allow-run-as-root --bind-to socket -np 8 \
         python resnet.py --num_iter=90 --iter_unit=epoch \
-        --data_dir=/data/imagenet/train-val-tfrecord-480/ \
+        --data_dir=/data/imagenet/train-val-tfrecord/ \
         --precision=fp16 --display_every=100 \
         --export_dir=/tmp --dali_mode="GPU"

@@ -32,7 +32,7 @@ For the benchmark training on 8 GPUs::

     mpiexec --allow-run-as-root --bind-to socket -np 8 \
         python resnet.py --num_iter=400 --iter_unit=batch \
-        --data_dir=/data/imagenet/train-val-tfrecord-480/ \
+        --data_dir=/data/imagenet/train-val-tfrecord/ \
         --precision=fp16 --display_every=100 --dali_mode="GPU"


@@ -49,7 +49,7 @@ For the full training on 8 GPUs::

     mpiexec --allow-run-as-root --bind-to socket -np 8 \
         python resnet_ctl.py --num_iter=90 --iter_unit=epoch \
-        --data_dir=/data/imagenet/train-val-tfrecord-480/ \
+        --data_dir=/data/imagenet/train-val-tfrecord/ \
         --precision=fp16 --display_every=100 \
         --export_dir=/tmp --dali_mode="GPU"

@@ -57,7 +57,7 @@ For the benchmark training on 8 GPUs::

     mpiexec --allow-run-as-root --bind-to socket -np 8 \
         python resnet_ctl.py --num_iter=400 --iter_unit=batch \
-        --data_dir=/data/imagenet/train-val-tfrecord-480/ \
+        --data_dir=/data/imagenet/train-val-tfrecord/ \
         --precision=fp16 --display_every=100 --dali_mode="GPU"

 Predicting in CTL (Custom Training Loop) mode
diff --git a/qa/TL0_rn50-benchmarks/test.sh b/qa/TL0_rn50-benchmarks/test.sh
deleted file mode 100755
index fcf45287953..00000000000
--- a/qa/TL0_rn50-benchmarks/test.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/bin/bash -ex
-
-test_body() {
-    BINNAME=dali_benchmark.bin
-
-    for DIRNAME in \
-        "../../build/dali/python/nvidia/dali" \
-        "$(python -c 'import os; from nvidia import dali; print(os.path.dirname(dali.__file__))' 2>/dev/null || echo '')"
-    do
-        if [ -x "$DIRNAME/test/$BINNAME" ]; then
-            FULLPATH="$DIRNAME/test/$BINNAME"
-            break
-        fi
-    done
-
-    if [[ -z "$FULLPATH" ]]; then
-        echo "ERROR: $BINNAME not found"
-        exit 1
-    fi
-
-    "$FULLPATH" --benchmark_filter="RN50*"
-}
-
-pushd ../..
-source ./qa/test_template.sh
-popd
diff --git a/qa/TL0_rn50_python-benchmarks/test.sh b/qa/TL0_rn50_python-benchmarks/test.sh
deleted file mode 100755
index de32e66ed7f..00000000000
--- a/qa/TL0_rn50_python-benchmarks/test.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash -e
-# used pip packages
-pip_packages='numpy'
-target_dir=./dali/benchmark
-
-test_body() {
-    # test code
-    python resnet50_bench.py
-}
-
-pushd ../..
-source ./qa/test_template.sh
-popd
diff --git a/qa/TL1_python-nvjpeg_test/test.sh b/qa/TL1_python-nvjpeg_test/test.sh
index 8b610cf353a..b9864a27278 100755
--- a/qa/TL1_python-nvjpeg_test/test.sh
+++ b/qa/TL1_python-nvjpeg_test/test.sh
@@ -8,8 +8,29 @@ do_once() {
 }

 test_body() {
+    export DATA_DIR=/data/coco/coco-2017/coco2017/
+    export IS_TMP_DIR=0
+    if [ ! -f "/data/coco/coco-2017/coco2017/train2017/000000581929.jpg" ] && [ -f "/data/coco/coco-2017/coco2017/train2017.zip" ]; then
+        export DATA_DIR=$(mktemp -d)
+        export IS_TMP_DIR=1
+        pushd ${DATA_DIR}
+        cp /data/coco/coco-2017/coco2017/train2017.zip . &
+        cp /data/coco/coco-2017/coco2017/val2017.zip . &
+        cp /data/coco/coco-2017/coco2017/annotations_trainval2017.zip . &
+        wait
+        unzip -q train2017.zip &
+        unzip -q val2017.zip &
+        unzip -q annotations_trainval2017.zip &
+        wait
+        popd
+    fi
     # test code
-    python test_data_containers.py --gpus ${NUM_GPUS} -b 2048 -p 10
+    python test_data_containers.py --gpus ${NUM_GPUS} -b 2048 -p 10 \
+        COCOReaderPipeline="[['${DATA_DIR}/train2017', \
+                             '${DATA_DIR}/annotations/instances_train2017.json'], \
+                            ['${DATA_DIR}/val2017', \
+                             '${DATA_DIR}/annotations/instances_val2017.json']]"
+    ((IS_TMP_DIR)) && rm -rf ${DATA_DIR}
 }

 pushd ../..
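
Note: several QA scripts now stage COCO out of zip archives into a scratch directory when the unpacked tree is not present on the mount. A rough Python equivalent of that staging step, assuming the same /data/coco layout (the function name is ours, not the scripts'):

    import shutil
    import tempfile
    import zipfile
    from concurrent.futures import ThreadPoolExecutor
    from pathlib import Path

    ARCHIVES = ["train2017.zip", "val2017.zip", "annotations_trainval2017.zip"]

    def stage_coco(src="/data/coco/coco-2017/coco2017"):
        """Return (data_dir, is_tmp); unpack the zips into a temp dir when needed."""
        src = Path(src)
        if (src / "train2017").is_dir():  # images already unpacked
            return src, False
        data_dir = Path(tempfile.mkdtemp())

        def unzip(name):
            with zipfile.ZipFile(data_dir / name) as zf:
                zf.extractall(data_dir)

        # copy, then extract, in parallel - mirroring the scripts' backgrounded cp/unzip plus wait
        with ThreadPoolExecutor() as pool:
            list(pool.map(lambda name: shutil.copy(src / name, data_dir / name), ARCHIVES))
            list(pool.map(unzip, ARCHIVES))
        return data_dir, True
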
diff --git a/qa/TL1_separate_executor/test_nofw.sh b/qa/TL1_separate_executor/test_nofw.sh
index c9b10274c83..5d8786739b6 100755
--- a/qa/TL1_separate_executor/test_nofw.sh
+++ b/qa/TL1_separate_executor/test_nofw.sh
@@ -9,6 +9,19 @@ do_once() {
 }

 test_body() {
+    # time a listing of the dataset with a 10 s watchdog: whichever of the two
+    # background jobs finishes first kills the other via pkill
+    start=`date +%s`
+    (sleep 10 && pkill -HUP ls && true) &
+    (ls /data/imagenet/train-jpeg > /dev/null && pkill -HUP sleep) &
+    wait
+    end=`date +%s`
+    runtime=$((end-start))
+    echo "Data access time: $runtime seconds"
+    if [ $runtime -gt 3 ]; then
+        echo "Data access time is greater than 3 seconds, skipping the test"
+        return 0
+    fi
     python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 256 --workers 3 --separate_queue \
         --cpu_size 2 --gpu_size 2 --fp16 --nhwc
     python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 256 --workers 3 --separate_queue \
diff --git a/qa/TL1_separate_executor/test_tf.sh b/qa/TL1_separate_executor/test_tf.sh
index f26ad4ab7ba..a9cc1289ed7 100755
--- a/qa/TL1_separate_executor/test_tf.sh
+++ b/qa/TL1_separate_executor/test_tf.sh
@@ -1,6 +1,6 @@
 #!/bin/bash -e
 # used pip packages
-pip_packages='tensorflow-gpu'
+pip_packages='tensorflow-gpu nose'
 target_dir=./dali/test/python

 one_config_only=true
diff --git a/qa/TL1_ssd_training/test.sh b/qa/TL1_ssd_training/test.sh
index 17bc44c5312..0826986e7b3 100755
--- a/qa/TL1_ssd_training/test.sh
+++ b/qa/TL1_ssd_training/test.sh
@@ -5,7 +5,25 @@ target_dir=./docs/examples/use_cases/pytorch/single_stage_detector/

 test_body() {
     NUM_GPUS=$(nvidia-smi -L | wc -l)
-    torchrun --nproc_per_node=${NUM_GPUS} main.py --backbone resnet50 --warmup 300 --bs 64 --eval-batch-size 8 --epochs 4 --data /data/coco/coco-2017/coco2017/ --data_pipeline dali --target 0.085
+    export DATA_DIR=/data/coco/coco-2017/coco2017/
+    export IS_TMP_DIR=0
+    if [ -f "/data/coco/coco-2017/coco2017/train2017.zip" ]; then
+        apt update && apt install -y unzip
+        export DATA_DIR=$(mktemp -d)
+        export IS_TMP_DIR=1
+        pushd ${DATA_DIR}
+        cp /data/coco/coco-2017/coco2017/train2017.zip . &
+        cp /data/coco/coco-2017/coco2017/val2017.zip . &
+        cp /data/coco/coco-2017/coco2017/annotations_trainval2017.zip . &
+        wait
+        unzip -q train2017.zip &
+        unzip -q val2017.zip &
+        unzip -q annotations_trainval2017.zip &
+        wait
+        popd
+    fi
+    torchrun --nproc_per_node=${NUM_GPUS} main.py --backbone resnet50 --warmup 300 --bs 256 --eval-batch-size 8 --epochs 4 --data ${DATA_DIR} --data_pipeline dali --target 0.085
+    ((IS_TMP_DIR)) && rm -rf ${DATA_DIR}
 }

 pushd ../..
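
Note: the perf-oriented TL1/TL2 scripts probe how fast the dataset mount responds before benchmarking: they time an ls guarded by a 10-second watchdog and bail out when it exceeds 3 seconds, since a slow network share would turn the run into an I/O benchmark. The same probe restated in Python (path and thresholds copied from the scripts):

    import subprocess
    import time

    def dataset_is_fast(path="/data/imagenet/train-jpeg", limit_s=3, watchdog_s=10):
        # list the dataset directory, giving up after watchdog_s seconds
        start = time.monotonic()
        try:
            subprocess.run(["ls", path], stdout=subprocess.DEVNULL, timeout=watchdog_s)
        except subprocess.TimeoutExpired:
            return False
        return time.monotonic() - start <= limit_s
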
diff --git a/qa/TL1_superres_pytorch/test.sh b/qa/TL1_superres_pytorch/test.sh
index b3c62788624..a5039f400c8 100644
--- a/qa/TL1_superres_pytorch/test.sh
+++ b/qa/TL1_superres_pytorch/test.sh
@@ -29,7 +29,7 @@ do_once() {

     # Pre-trained FlowNet2.0 weights
     # publicly available on https://drive.google.com/file/d/1QW03eyYG_vD-dT-Mx4wopYvtPu_msTKn/view
-    FLOWNET_PATH=/data/dali/pretrained_models/FlowNet2-SD_checkpoint.pth.tar
+    FLOWNET_PATH=/dali_internal/pretrained_models/FlowNet2-SD_checkpoint.pth.tar

     git clone https://github.com/NVIDIA/flownet2-pytorch.git
     cd flownet2-pytorch
diff --git a/qa/TL1_tensorflow-dali_test/test.sh b/qa/TL1_tensorflow-dali_test/test.sh
index fe0a5a28339..72097b21d6c 100644
--- a/qa/TL1_tensorflow-dali_test/test.sh
+++ b/qa/TL1_tensorflow-dali_test/test.sh
@@ -60,9 +60,9 @@ do_once() {
     # TF is already available and we can set env variables
     install_pip_pkg "pip install --force-reinstall horovod==0.28.1 -f /pip-packages"

-    for file in $(ls /data/imagenet/train-val-tfrecord-480-subset);
+    for file in $(ls /data/imagenet/train-val-tfrecord-small);
     do
-        python ../../../../../tools/tfrecord2idx /data/imagenet/train-val-tfrecord-480-subset/${file} \
+        python ../../../../../tools/tfrecord2idx /data/imagenet/train-val-tfrecord-small/${file} \
             idx-files/${file}.idx &
     done
     wait
@@ -78,7 +78,7 @@ test_body() {
     # test code
     mpiexec --allow-run-as-root --bind-to none -np ${NUM_GPUS} \
         python -u resnet.py \
-        --data_dir=/data/imagenet/train-val-tfrecord-480-subset --data_idx_dir=idx-files/ \
+        --data_dir=/data/imagenet/train-val-tfrecord-small --data_idx_dir=idx-files/ \
         --precision=fp16 --num_iter=100 --iter_unit=batch --display_every=50 \
         --batch=64 --use_xla --dali_mode="GPU" --log_dir=./
 }
diff --git a/qa/TL2_FW_iterators_perf/test_pytorch.sh b/qa/TL2_FW_iterators_perf/test_pytorch.sh
index 5df1a4a74e3..f508b9596cb 100755
--- a/qa/TL2_FW_iterators_perf/test_pytorch.sh
+++ b/qa/TL2_FW_iterators_perf/test_pytorch.sh
@@ -13,6 +13,17 @@ test_body() {
         python test_RN50_data_fw_iterators.py --framework ${fw} --gpus ${NUM_GPUS} -b 13 \
             --workers 3 --prefetch 2 --epochs 3
     done
+    start=`date +%s`
+    (sleep 10 && pkill -HUP ls && true) &
+    (ls /data/imagenet/train-jpeg > /dev/null && pkill -HUP sleep) &
+    wait
+    end=`date +%s`
+    runtime=$((end-start))
+    echo "Data access time: $runtime seconds"
+    if [ $runtime -gt 3 ]; then
+        echo "Data access time is greater than 3 seconds, skipping the test"
+        return 0
+    fi
     torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --worker_init fork --benchmark_iters 500 --test_pipes parallel
     torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --worker_init spawn --benchmark_iters 500 --test_pipes parallel
     torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --benchmark_iters 500 --test_pipes file_reader
diff --git a/qa/TL2_RN50_data_perf/test.sh b/qa/TL2_RN50_data_perf/test.sh
index 8fa95864f1a..d6ae95ffa1f 100755
--- a/qa/TL2_RN50_data_perf/test.sh
+++ b/qa/TL2_RN50_data_perf/test.sh
@@ -8,6 +8,17 @@ do_once() {
 }

 test_body() {
+    start=`date +%s`
+    (sleep 10 && pkill -HUP ls && true) &
+    (ls /data/imagenet/train-jpeg > /dev/null && pkill -HUP sleep) &
+    wait
+    end=`date +%s`
+    runtime=$((end-start))
+    echo "Data access time: $runtime seconds"
+    if [ $runtime -gt 3 ]; then
+        echo "Data access time is greater than 3 seconds, skipping the test"
+        return 0
+    fi
     # test code
     python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 256 --workers 3 --prefetch 2 --decoder_type "legacy"
     python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 256 --workers 3 --prefetch 2 --decoder_type "experimental"
diff --git a/qa/TL3_EfficientDet_convergence/test_tensorflow.sh b/qa/TL3_EfficientDet_convergence/test_tensorflow.sh
index 6f995e6afde..d8ecf4f8c19 100644
--- a/qa/TL3_EfficientDet_convergence/test_tensorflow.sh
+++ b/qa/TL3_EfficientDet_convergence/test_tensorflow.sh
@@ -14,11 +14,28 @@ export NCCL_NVLS_ENABLE=0
 # workaround for https://github.com/tensorflow/tensorflow/issues/63548
 export WRAPT_DISABLE_EXTENSIONS=1

+export DATA_DIR=/data/coco/coco-2017/coco2017/
+export IS_TMP_DIR=0
+if [ ! -f "/data/coco/coco-2017/coco2017/train2017/000000581929.jpg" ] && [ -f "/data/coco/coco-2017/coco2017/train2017.zip" ]; then
+    export DATA_DIR=$(mktemp -d)
+    export IS_TMP_DIR=1
+    pushd ${DATA_DIR}
+    cp /data/coco/coco-2017/coco2017/train2017.zip . &
+    cp /data/coco/coco-2017/coco2017/val2017.zip . &
+    cp /data/coco/coco-2017/coco2017/annotations_trainval2017.zip . &
+    wait
+    unzip -q train2017.zip &
+    unzip -q val2017.zip &
+    unzip -q annotations_trainval2017.zip &
+    wait
+    popd
+fi
+
 python train.py \
     --epochs 1 \
     --input_type coco \
-    --images_path /data/coco/coco-2017/coco2017/train2017 \
-    --annotations_path /data/coco/coco-2017/coco2017/annotations/instances_train2017.json \
+    --images_path ${DATA_DIR}/train2017 \
+    --annotations_path ${DATA_DIR}/annotations/instances_train2017.json \
     --batch_size 3 \
     --train_steps 6000 \
     --eval_steps 1000 \
@@ -33,3 +50,6 @@ python train.py
     --output_filename out_weights_1.h5 2>&1 | tee $LOG

-CLEAN_AND_EXIT ${PIPESTATUS[0]}
+RET=${PIPESTATUS[0]}
+# drop the staged dataset before exiting
+((IS_TMP_DIR)) && rm -rf ${DATA_DIR}
+CLEAN_AND_EXIT ${RET}
diff --git a/qa/TL3_SSD_convergence/test_pytorch.sh b/qa/TL3_SSD_convergence/test_pytorch.sh
index d66adcb45bd..6b5e26abf50 100644
--- a/qa/TL3_SSD_convergence/test_pytorch.sh
+++ b/qa/TL3_SSD_convergence/test_pytorch.sh
@@ -15,6 +15,24 @@ pip install git+https://github.com/NVIDIA/cocoapi.git#subdirectory=PythonAPI

 NUM_GPUS=$(nvidia-smi -L | wc -l)

+export DATA_DIR=/data/coco/coco-2017/coco2017/
+export IS_TMP_DIR=0
+if [ -f "/data/coco/coco-2017/coco2017/train2017.zip" ]; then
+    apt update && apt install -y unzip
+    export DATA_DIR=$(mktemp -d)
+    export IS_TMP_DIR=1
+    pushd ${DATA_DIR}
+    cp /data/coco/coco-2017/coco2017/train2017.zip . &
+    cp /data/coco/coco-2017/coco2017/val2017.zip . &
+    cp /data/coco/coco-2017/coco2017/annotations_trainval2017.zip . &
+    wait
+    unzip -q train2017.zip &
+    unzip -q val2017.zip &
+    unzip -q annotations_trainval2017.zip &
+    wait
+    popd
+fi
+
 LOG=dali.log

 SECONDS=0
@@ -24,7 +42,9 @@ export NCCL_NVLS_ENABLE=0
 # Prevent OOM due to fragmentation on 16G machines
 export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:4096

-torchrun --nproc_per_node=${NUM_GPUS} main.py --backbone resnet50 --warmup 300 --bs 64 --eval-batch-size 8 --data /coco --data /data/coco/coco-2017/coco2017/ --data_pipeline dali --target 0.25 2>&1 | tee $LOG
+torchrun --nproc_per_node=${NUM_GPUS} main.py --backbone resnet50 --warmup 300 --bs 256 --eval-batch-size 8 --data /coco --data ${DATA_DIR} --data_pipeline dali --target 0.25 2>&1 | tee $LOG

 RET=${PIPESTATUS[0]}
+# clean up the staged dataset only after PIPESTATUS has been read
+((IS_TMP_DIR)) && rm -rf ${DATA_DIR}
 echo "Training ran in $SECONDS seconds"
diff --git a/qa/setup_packages.py b/qa/setup_packages.py
index 90f7890b176..94b0ae1ee63 100755
--- a/qa/setup_packages.py
+++ b/qa/setup_packages.py
@@ -550,7 +550,10 @@ def get_pyvers_name(self, url, cuda_version):
                 python_min_ver="3.8",
                 python_max_ver="3.12",
             )
-        ]
+        ],
+        # skip tests for CUDA 12 as PaddlePaddle doesn't support this CUDA version yet
+        # and we may hit a runner that requires it
+        "120": [],
     },
     links_index="https://www.paddlepaddle.org.cn/" "whl/linux/mkl/avx/stable.html",
 ),