More fixes
Signed-off-by: Janusz Lisiecki <[email protected]>
JanuszL committed Feb 20, 2025
1 parent 6b8012a commit 70af6fe
Showing 11 changed files with 71 additions and 47 deletions.
2 changes: 1 addition & 1 deletion dali/benchmark/resnet50_nvjpeg_bench.cc
@@ -41,7 +41,7 @@ BENCHMARK_DEFINE_F(RealRN50, nvjpegPipe)(benchmark::State& st) { // NOLINT

   pipe.AddOperator(
       OpSpec("Caffe2Reader")
-          .AddArg("path", "/data/imagenet/train-c2lmdb-480")
+          .AddArg("path", testing::dali_extra_path() + "db/c2lmdb")
           .AddOutput("raw_jpegs", StorageDevice::CPU)
           .AddOutput("labels", StorageDevice::CPU));

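The fix above replaces a hard-coded ImageNet path with a location inside the DALI_EXTRA test-data repository, resolved via the testing::dali_extra_path() helper. A rough Python sketch of the same lookup — the helper body below is an assumption; only the DALI_EXTRA_PATH environment-variable convention is confirmed by the notebooks later in this commit:

    # Sketch (assumption): resolve the DALI_EXTRA test-data root from the
    # DALI_EXTRA_PATH environment variable, the same convention the notebooks
    # below use via os.environ["DALI_EXTRA_PATH"].
    import os

    def dali_extra_path() -> str:
        # KeyError here means the test-data repository location is not configured.
        return os.environ["DALI_EXTRA_PATH"]

    # Python equivalent of the new C++ argument: <DALI_EXTRA>/db/c2lmdb
    c2lmdb = os.path.join(dali_extra_path(), "db", "c2lmdb")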
35 changes: 21 additions & 14 deletions dali/test/python/test_RN50_data_pipeline.py
@@ -21,6 +21,8 @@
 import argparse
 import time
 from test_utils import get_dali_extra_path, AverageMeter
+import logging
+from subprocess import call
+import tempfile


class CommonPipeline(Pipeline):
@@ -174,11 +176,26 @@ def define_graph(self):
return self.base_define_graph(images, labels)


+temp_dir = None
 class TFRecordPipeline(CommonPipeline):
     def __init__(self, **kwargs):
         super(TFRecordPipeline, self).__init__(**kwargs)
         tfrecord = sorted(glob.glob(kwargs["data_paths"][0]))
         tfrecord_idx = sorted(glob.glob(kwargs["data_paths"][1]))
+        if not tfrecord_idx:
+            # no index files found - generate them into a temporary directory
+            global temp_dir
+            temp_dir = tempfile.TemporaryDirectory()
+            tfrecord_idx = [f"{temp_dir.name}/{os.path.basename(f)}.idx" for f in tfrecord]
+            for tfrecord_file, tfrecord_idx_file in zip(tfrecord, tfrecord_idx):
+                logging.info(f"Generating index file for {tfrecord_file}")
+                call(["tfrecord2idx", tfrecord_file, tfrecord_idx_file])
         cache_enabled = kwargs["decoder_cache_params"]["cache_enabled"]
         self.input = ops.readers.TFRecord(
             path=tfrecord,
@@ -240,23 +257,13 @@ def define_graph(self):


 test_data = {
     # RecordIO & LMDB are not that frequently used anymore, so we won't test
     # full datasets, just small ones
     FileReadPipeline: [["/data/imagenet/train-jpeg"], ["/data/imagenet/val-jpeg"]],
     MXNetReaderPipeline: [
         [
             "/data/imagenet/train-480-val-256-recordio/train.rec",
             "/data/imagenet/train-480-val-256-recordio/train.idx",
         ],
         [
             "/data/imagenet/train-480-val-256-recordio/val.rec",
             "/data/imagenet/train-480-val-256-recordio/val.idx",
         ],
     ],
     CaffeReadPipeline: [["/data/imagenet/train-lmdb-256x256"], ["/data/imagenet/val-lmdb-256x256"]],
     Caffe2ReadPipeline: [["/data/imagenet/train-c2lmdb-480"], ["/data/imagenet/val-c2lmdb-256"]],
     TFRecordPipeline: [
         [
-            "/data/imagenet/train-val-tfrecord-480/train-*",
-            "/data/imagenet/train-val-tfrecord-480.idx/train-*",
+            "/data/imagenet/train-val-tfrecord/train-*",
+            "/data/imagenet/train-val-tfrecord.idx/train-*",
         ]
     ],
 }
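A note on the new branch in TFRecordPipeline above: DALI's readers.TFRecord needs one .idx file per TFRecord shard so it can locate record boundaries, and those files are produced by the tfrecord2idx script shipped with DALI (the same tool the qa/TL1_tensorflow-dali_test script below invokes). A standalone sketch of that generation step, assuming tfrecord2idx is on PATH and using an example glob pattern:

    # Generate DALI index files for a set of TFRecord shards.
    # Assumes the `tfrecord2idx` script shipped with DALI is on PATH.
    import glob
    import os
    import tempfile
    from subprocess import call

    shards = sorted(glob.glob("/data/imagenet/train-val-tfrecord/train-*"))
    tmp = tempfile.TemporaryDirectory()  # keep referenced while the pipeline runs
    idx_files = [os.path.join(tmp.name, os.path.basename(s) + ".idx") for s in shards]
    for shard, idx in zip(shards, idx_files):
        call(["tfrecord2idx", shard, idx])  # usage: tfrecord2idx <tfrecord> <idx>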
35 changes: 21 additions & 14 deletions dali/test/python/test_data_containers.py
@@ -19,6 +19,8 @@
 import nvidia.dali.types as types
 import os
 from nvidia.dali.pipeline import Pipeline
+import logging
+from subprocess import call
+import tempfile

from test_utils import get_dali_extra_path

@@ -91,11 +93,26 @@ def define_graph(self):
return self.base_define_graph(images, labels)


+temp_dir = None
 class TFRecordPipeline(CommonPipeline):
     def __init__(self, batch_size, num_threads, device_id, num_gpus, data_paths, dont_use_mmap):
         super(TFRecordPipeline, self).__init__(batch_size, num_threads, device_id)
         tfrecord = sorted(glob.glob(data_paths[0]))
         tfrecord_idx = sorted(glob.glob(data_paths[1]))
+        if not tfrecord_idx:
+            # no index files found - generate them into a temporary directory
+            global temp_dir
+            temp_dir = tempfile.TemporaryDirectory()
+            tfrecord_idx = [f"{temp_dir.name}/{os.path.basename(f)}.idx" for f in tfrecord]
+            for tfrecord_file, tfrecord_idx_file in zip(tfrecord, tfrecord_idx):
+                logging.info(f"Generating index file for {tfrecord_file}")
+                call(["tfrecord2idx", tfrecord_file, tfrecord_idx_file])
         self.input = ops.readers.TFRecord(
             path=tfrecord,
             index_path=tfrecord_idx,
@@ -132,23 +149,13 @@ def define_graph(self):


 test_data = {
     # RecordIO & LMDB are not that frequently used anymore, so we won't test
     # full datasets, just small ones
     FileReadPipeline: [["/data/imagenet/train-jpeg"], ["/data/imagenet/val-jpeg"]],
     MXNetReaderPipeline: [
         [
             "/data/imagenet/train-480-val-256-recordio/train.rec",
             "/data/imagenet/train-480-val-256-recordio/train.idx",
         ],
         [
             "/data/imagenet/train-480-val-256-recordio/val.rec",
             "/data/imagenet/train-480-val-256-recordio/val.idx",
         ],
     ],
     CaffeReadPipeline: [["/data/imagenet/train-lmdb-256x256"], ["/data/imagenet/val-lmdb-256x256"]],
     Caffe2ReadPipeline: [["/data/imagenet/train-c2lmdb-480"], ["/data/imagenet/val-c2lmdb-256"]],
     TFRecordPipeline: [
         [
-            "/data/imagenet/train-val-tfrecord-480/train-*",
-            "/data/imagenet/train-val-tfrecord-480.idx/train-*",
+            "/data/imagenet/train-val-tfrecord/train-*",
+            "/data/imagenet/train-val-tfrecord.idx/train-*",
         ]
     ],
     COCOReaderPipeline: [
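The same pattern appears in both test files: temp_dir is a module-level global rather than a local. That is deliberate — tempfile.TemporaryDirectory deletes its directory once the object is finalized, so a reference must outlive the pipeline that reads the generated .idx files. A minimal sketch of the failure mode being avoided (standard-library behavior, not DALI-specific):

    # tempfile.TemporaryDirectory removes its directory when the object is
    # finalized, so whoever consumes files inside it must keep the object alive.
    import os
    import tempfile

    def scratch_dir_wrong():
        d = tempfile.TemporaryDirectory()
        return d.name  # bug: `d` is finalized on return (in CPython), dir vanishes

    def scratch_dir_right():
        d = tempfile.TemporaryDirectory()
        return d  # caller holds the object, so the directory stays around

    keep = scratch_dir_right()
    assert os.path.isdir(keep.name)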
6 changes: 4 additions & 2 deletions docs/examples/frameworks/paddle/paddle-basic_example.ipynb
@@ -25,18 +25,20 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os.path\n",
"import subprocess\n",
"\n",
"test_data_root = os.environ[\"DALI_EXTRA_PATH\"]\n",
"\n",
"# Caffe LMDB\n",
"lmdb_folder = os.path.join(test_data_root, \"db\", \"lmdb\")\n",
"\n",
"N = 8 # number of GPUs\n",
"res = subprocess.run(['nvidia-smi', '-L'], stdout=subprocess.PIPE, text=True)\n",
"N = res.stdout.count('\\n') # number of GPUs\n",
"BATCH_SIZE = 128 # batch size per GPU\n",
"IMAGE_SIZE = 3"
]
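All four framework notebooks receive the same edit: instead of hard-coding N = 8, they derive the GPU count from nvidia-smi -L, which prints one line per device. A slightly more defensive variant — an alternative sketch, not what the notebooks use — counts only the lines that actually start with "GPU", so any stray output cannot inflate the count:

    # Count GPUs from `nvidia-smi -L`, which prints one line per device, e.g.
    # "GPU 0: <device name> (UUID: GPU-...)".
    import subprocess

    res = subprocess.run(["nvidia-smi", "-L"], stdout=subprocess.PIPE,
                         text=True, check=True)  # check=True: fail loudly if absent
    N = sum(1 for line in res.stdout.splitlines() if line.startswith("GPU"))
    print(f"Detected {N} GPU(s)")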
6 changes: 4 additions & 2 deletions docs/examples/frameworks/paddle/paddle-various-readers.ipynb
@@ -31,11 +31,12 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os.path\n",
"import subprocess\n",
"\n",
"test_data_root = os.environ[\"DALI_EXTRA_PATH\"]\n",
"\n",
@@ -53,7 +54,8 @@
"tfrecord_idx = \"idx_files/train.idx\"\n",
"tfrecord2idx_script = \"tfrecord2idx\"\n",
"\n",
"N = 8 # number of GPUs\n",
"res = subprocess.run(['nvidia-smi', '-L'], stdout=subprocess.PIPE, text=True)\n",
"N = res.stdout.count('\\n') # number of GPUs\n",
"BATCH_SIZE = 128 # batch size per GPU\n",
"IMAGE_SIZE = 3"
]
6 changes: 4 additions & 2 deletions docs/examples/frameworks/pytorch/pytorch-basic_example.ipynb
@@ -25,18 +25,20 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os.path\n",
"import subprocess\n",
"\n",
"test_data_root = os.environ[\"DALI_EXTRA_PATH\"]\n",
"\n",
"# Caffe LMDB\n",
"lmdb_folder = os.path.join(test_data_root, \"db\", \"lmdb\")\n",
"\n",
"N = 8 # number of GPUs\n",
"res = subprocess.run(['nvidia-smi', '-L'], stdout=subprocess.PIPE, text=True)\n",
"N = res.stdout.count('\\n') # number of GPUs\n",
"BATCH_SIZE = 128 # batch size per GPU\n",
"ITERATIONS = 32\n",
"IMAGE_SIZE = 3"
@@ -31,11 +31,12 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os.path\n",
"import subprocess\n",
"\n",
"test_data_root = os.environ[\"DALI_EXTRA_PATH\"]\n",
"\n",
@@ -53,7 +54,8 @@
"tfrecord_idx = \"idx_files/train.idx\"\n",
"tfrecord2idx_script = \"tfrecord2idx\"\n",
"\n",
"N = 8 # number of GPUs\n",
"res = subprocess.run(['nvidia-smi', '-L'], stdout=subprocess.PIPE, text=True)\n",
"N = res.stdout.count('\\n') # number of GPUs\n",
"BATCH_SIZE = 128 # batch size per GPU\n",
"IMAGE_SIZE = 3"
]
@@ -31,11 +31,12 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os.path\n",
"import subprocess\n",
"\n",
"test_data_root = os.environ[\"DALI_EXTRA_PATH\"]\n",
"\n",
@@ -53,7 +54,8 @@
"tfrecord_idx = \"idx_files/train.idx\"\n",
"tfrecord2idx_script = \"tfrecord2idx\"\n",
"\n",
"N = 8 # number of GPUs\n",
"res = subprocess.run(['nvidia-smi', '-L'], stdout=subprocess.PIPE, text=True)\n",
"N = res.stdout.count('\\n') # number of GPUs\n",
"BATCH_SIZE = 128 # batch size per GPU\n",
"ITERATIONS = 32\n",
"IMAGE_SIZE = 3"
8 changes: 4 additions & 4 deletions docs/examples/use_cases/tensorflow/resnet-n/README.rst
@@ -24,15 +24,15 @@ For the full training on 8 GPUs::

mpiexec --allow-run-as-root --bind-to socket -np 8 \
python resnet.py --num_iter=90 --iter_unit=epoch \
-      --data_dir=/data/imagenet/train-val-tfrecord-480/ \
+      --data_dir=/data/imagenet/train-val-tfrecord/ \
--precision=fp16 --display_every=100 \
--export_dir=/tmp --dali_mode="GPU"

For the benchmark training on 8 GPUs::

mpiexec --allow-run-as-root --bind-to socket -np 8 \
python resnet.py --num_iter=400 --iter_unit=batch \
-      --data_dir=/data/imagenet/train-val-tfrecord-480/ \
+      --data_dir=/data/imagenet/train-val-tfrecord/ \
--precision=fp16 --display_every=100 --dali_mode="GPU"


@@ -49,15 +49,15 @@ For the full training on 8 GPUs::

mpiexec --allow-run-as-root --bind-to socket -np 8 \
python resnet_ctl.py --num_iter=90 --iter_unit=epoch \
-      --data_dir=/data/imagenet/train-val-tfrecord-480/ \
+      --data_dir=/data/imagenet/train-val-tfrecord/ \
--precision=fp16 --display_every=100 \
--export_dir=/tmp --dali_mode="GPU"

For the benchmark training on 8 GPUs::

mpiexec --allow-run-as-root --bind-to socket -np 8 \
python resnet_ctl.py --num_iter=400 --iter_unit=batch \
-      --data_dir=/data/imagenet/train-val-tfrecord-480/ \
+      --data_dir=/data/imagenet/train-val-tfrecord/ \
--precision=fp16 --display_every=100 --dali_mode="GPU"

Predicting in CTL (Custom Training Loop) mode
2 changes: 1 addition & 1 deletion qa/TL1_superres_pytorch/test.sh
@@ -29,7 +29,7 @@ do_once() {

# Pre-trained FlowNet2.0 weights
# publicly available on https://drive.google.com/file/d/1QW03eyYG_vD-dT-Mx4wopYvtPu_msTKn/view
-  FLOWNET_PATH=/data/dali/pretrained_models/FlowNet2-SD_checkpoint.pth.tar
+  FLOWNET_PATH=/dali_internal/pretrained_models/FlowNet2-SD_checkpoint.pth.tar

git clone https://github.com/NVIDIA/flownet2-pytorch.git
cd flownet2-pytorch
6 changes: 3 additions & 3 deletions qa/TL1_tensorflow-dali_test/test.sh
@@ -60,9 +60,9 @@ do_once() {
# TF is already available and we can set env variables
install_pip_pkg "pip install --force-reinstall horovod==0.28.1 -f /pip-packages"

-  for file in $(ls /data/imagenet/train-val-tfrecord-480-subset);
+  for file in $(ls /data/imagenet/train-val-tfrecord-small);
do
-    python ../../../../../tools/tfrecord2idx /data/imagenet/train-val-tfrecord-480-subset/${file} \
+    python ../../../../../tools/tfrecord2idx /data/imagenet/train-val-tfrecord-small/${file} \
idx-files/${file}.idx &
done
wait
@@ -78,7 +78,7 @@ test_body() {
# test code
mpiexec --allow-run-as-root --bind-to none -np ${NUM_GPUS} \
python -u resnet.py \
-      --data_dir=/data/imagenet/train-val-tfrecord-480-subset --data_idx_dir=idx-files/ \
+      --data_dir=/data/imagenet/train-val-tfrecord-small --data_idx_dir=idx-files/ \
--precision=fp16 --num_iter=100 --iter_unit=batch --display_every=50 \
--batch=64 --use_xla --dali_mode="GPU" --log_dir=./
}
