From db824809ff02e2993f4a258b96ac12dc7a08f09d Mon Sep 17 00:00:00 2001
From: Janusz <39967756+JanuszL@users.noreply.github.com>
Date: Thu, 25 Oct 2018 06:24:15 +0200
Subject: [PATCH] Fixes for #212 (#239)

- remove hardcoded queue depth (3) in PipelinedExecutor
- fix PyTorch iterator for multi-GPU
- adjust PyTorch example to use new nvJPEG API

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
---
 dali/benchmark/resnet50_bench.cc            | 6 +++---
 dali/pipeline/executor/pipelined_executor.h | 1 -
 dali/python/nvidia/dali/pipeline.py         | 1 +
 dali/python/nvidia/dali/plugin/pytorch.py   | 6 +++---
 docs/examples/pytorch/main.py               | 4 +++-
 5 files changed, 10 insertions(+), 8 deletions(-)
diff --git a/dali/benchmark/resnet50_bench.cc b/dali/benchmark/resnet50_bench.cc
index 873eaf63b8f..8a60e9a584d 100644
--- a/dali/benchmark/resnet50_bench.cc
+++ b/dali/benchmark/resnet50_bench.cc
@@ -37,7 +37,7 @@ BENCHMARK_DEFINE_F(RN50, C2Pipe)(benchmark::State& st) { // NOLINT
   Pipeline pipe(
       batch_size,
       num_thread,
-      0, -1, pipelined, 2,
+      0, -1, pipelined, 3,
       async);
 
   TensorList<CPUBackend> data;
@@ -167,7 +167,7 @@ BENCHMARK_DEFINE_F(RN50, HybridPipe)(benchmark::State& st) { // NOLINT
   Pipeline pipe(
       batch_size,
       num_thread,
-      0, -1, pipelined, 2,
+      0, -1, pipelined, 3,
       async);
 
   TensorList<CPUBackend> data;
@@ -299,7 +299,7 @@ BENCHMARK_DEFINE_F(RN50, nvJPEGPipe)(benchmark::State& st) { // NOLINT
   Pipeline pipe(
       batch_size,
       num_thread,
-      0, -1, pipelined, 2,
+      0, -1, pipelined, 3,
       async);
 
   TensorList<CPUBackend> data;
diff --git a/dali/pipeline/executor/pipelined_executor.h b/dali/pipeline/executor/pipelined_executor.h
index 1587b401cab..2e34706743b 100644
--- a/dali/pipeline/executor/pipelined_executor.h
+++ b/dali/pipeline/executor/pipelined_executor.h
@@ -42,7 +42,6 @@ class DLL_PUBLIC PipelinedExecutor : public Executor {
       bool set_affinity = false, int max_num_stream = -1, int prefetch_queue_depth = 2) :
     Executor(batch_size, num_thread, device_id, bytes_per_sample_hint,
         set_affinity, max_num_stream, prefetch_queue_depth) {
-    Executor::queue_depth_ = 3;
   }
 
   DLL_PUBLIC virtual ~PipelinedExecutor() = default;
diff --git a/dali/python/nvidia/dali/pipeline.py b/dali/python/nvidia/dali/pipeline.py
index 41e1656c876..a011ef15a7e 100644
--- a/dali/python/nvidia/dali/pipeline.py
+++ b/dali/python/nvidia/dali/pipeline.py
@@ -313,6 +313,7 @@ def deserialize_and_build(self, serialized_pipeline):
                                 self._num_threads,
                                 self._device_id,
                                 self._exec_pipelined,
+                                self._prefetch_queue_depth,
                                 self._exec_async,
                                 self._bytes_per_sample,
                                 self._set_affinity,
diff --git a/dali/python/nvidia/dali/plugin/pytorch.py b/dali/python/nvidia/dali/plugin/pytorch.py
index 69092436ef0..d6c1e8dbb2d 100644
--- a/dali/python/nvidia/dali/plugin/pytorch.py
+++ b/dali/python/nvidia/dali/plugin/pytorch.py
@@ -146,10 +146,10 @@ def __next__(self):
                 feed_ndarray(d_arr, pyt_data[j])
             for j, l_arr in enumerate(labels):
                 feed_ndarray(l_arr, pyt_labels[j])
-            for p in self._pipes:
-                p._release_outputs()
-                p._start_run()
 
+        for p in self._pipes:
+            p._release_outputs()
+            p._start_run()
 
         copy_db_index = self._current_data_batch
         # Change index for double buffering
diff --git a/docs/examples/pytorch/main.py b/docs/examples/pytorch/main.py
index f5c3a070a34..53a1fe76d13 100644
--- a/docs/examples/pytorch/main.py
+++ b/docs/examples/pytorch/main.py
@@ -80,7 +80,9 @@ class HybridTrainPipe(Pipeline):
     def __init__(self, batch_size, num_threads, device_id, data_dir, crop):
         super(HybridTrainPipe, self).__init__(batch_size, num_threads, device_id, seed=12 + device_id)
         self.input = ops.FileReader(file_root=data_dir, shard_id=args.local_rank, num_shards=args.world_size, random_shuffle=True)
-        self.decode = ops.nvJPEGDecoder(device="mixed", output_type=types.RGB)
+        # This padding sets the size of the internal nvJPEG buffers to be able to handle all images from full-sized ImageNet
+        # without additional reallocations
+        self.decode = ops.nvJPEGDecoder(device="mixed", output_type=types.RGB, device_memory_padding=211025920, host_memory_padding=140544512)
         self.rrc = ops.RandomResizedCrop(device="gpu", size =(crop, crop))
         self.cmnp = ops.CropMirrorNormalize(device="gpu",
                                             output_dtype=types.FLOAT,