Skip to content

Commit

Permalink
Fixes for #212 (#239)
Browse files (browse the repository at this point in the history)
- remove hardcoded pipeline length in PipelinedExecutor
- fix PyTorch iterator for multi-GPU
- adjust PyTorch example to use new nvJpeg API

Signed-off-by: Janusz Lisiecki <[email protected]>
  • Branch information
JanuszL committed Oct 31, 2018
1 parent 2c1c9c8 commit db82480
Show file tree
Hide file tree
Showing 5 changed files with 10 additions and 8 deletions.
6 changes: 3 additions & 3 deletions dali/benchmark/resnet50_bench.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ BENCHMARK_DEFINE_F(RN50, C2Pipe)(benchmark::State& st) { // NOLINT
Pipeline pipe(
batch_size,
num_thread,
0, -1, pipelined, 2,
0, -1, pipelined, 3,
async);

TensorList<CPUBackend> data;
Expand Down Expand Up @@ -167,7 +167,7 @@ BENCHMARK_DEFINE_F(RN50, HybridPipe)(benchmark::State& st) { // NOLINT
Pipeline pipe(
batch_size,
num_thread,
0, -1, pipelined, 2,
0, -1, pipelined, 3,
async);

TensorList<CPUBackend> data;
Expand Down Expand Up @@ -299,7 +299,7 @@ BENCHMARK_DEFINE_F(RN50, nvJPEGPipe)(benchmark::State& st) { // NOLINT
Pipeline pipe(
batch_size,
num_thread,
0, -1, pipelined, 2,
0, -1, pipelined, 3,
async);

TensorList<CPUBackend> data;
Expand Down
1 change: 0 additions & 1 deletion dali/pipeline/executor/pipelined_executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ class DLL_PUBLIC PipelinedExecutor : public Executor {
bool set_affinity = false, int max_num_stream = -1, int prefetch_queue_depth = 2) :
Executor(batch_size, num_thread, device_id, bytes_per_sample_hint,
set_affinity, max_num_stream, prefetch_queue_depth) {
Executor::queue_depth_ = 3;
}

DLL_PUBLIC virtual ~PipelinedExecutor() = default;
Expand Down
1 change: 1 addition & 0 deletions dali/python/nvidia/dali/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,7 @@ def deserialize_and_build(self, serialized_pipeline):
self._num_threads,
self._device_id,
self._exec_pipelined,
self._prefetch_queue_depth,
self._exec_async,
self._bytes_per_sample,
self._set_affinity,
Expand Down
6 changes: 3 additions & 3 deletions dali/python/nvidia/dali/plugin/pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,10 +146,10 @@ def __next__(self):
feed_ndarray(d_arr, pyt_data[j])
for j, l_arr in enumerate(labels):
feed_ndarray(l_arr, pyt_labels[j])
for p in self._pipes:
p._release_outputs()
p._start_run()

for p in self._pipes:
p._release_outputs()
p._start_run()

copy_db_index = self._current_data_batch
# Change index for double buffering
Expand Down
4 changes: 3 additions & 1 deletion docs/examples/pytorch/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,9 @@ class HybridTrainPipe(Pipeline):
def __init__(self, batch_size, num_threads, device_id, data_dir, crop):
super(HybridTrainPipe, self).__init__(batch_size, num_threads, device_id, seed=12 + device_id)
self.input = ops.FileReader(file_root=data_dir, shard_id=args.local_rank, num_shards=args.world_size, random_shuffle=True)
self.decode = ops.nvJPEGDecoder(device="mixed", output_type=types.RGB)
# This padding sets the size of the internal nvJPEG buffers to be able to handle all images from full-sized ImageNet
# without additional reallocations
self.decode = ops.nvJPEGDecoder(device="mixed", output_type=types.RGB, device_memory_padding=211025920, host_memory_padding=140544512)
self.rrc = ops.RandomResizedCrop(device="gpu", size =(crop, crop))
self.cmnp = ops.CropMirrorNormalize(device="gpu",
output_dtype=types.FLOAT,
Expand Down

0 comments on commit db82480

Please sign in to comment.