Commit

some test cleanup
definitelynotmcarilli committed Mar 2, 2019
1 parent 2445031 commit 484292f
Showing 5 changed files with 20 additions and 6 deletions.
2 changes: 1 addition & 1 deletion apex/amp/handle.py
@@ -1,6 +1,7 @@
import contextlib
import logging
import warnings
import torch

from . import utils
from .opt import OptimWrapper
@@ -83,7 +84,6 @@ def skip_step():
"loss scale to {}".format(optimizer.loss_scaler.loss_scale()))
optimizer.step = optimizer_step
optimizer.step = skip_step

# Probably ok to skip this if not delay_unscale
if _amp_state.opt_properties.patch_torch_functions:
_amp_state.handle._clear_cache()
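For context, the hunk above belongs to amp's overflow handling: optimizer.step is temporarily monkey-patched with a skip_step closure that logs the reduced loss scale, restores the original step, and returns without applying an update. Below is a minimal sketch of that pattern only; the function name patch_step_for_overflow is hypothetical and this is not apex's actual implementation.

import logging

def patch_step_for_overflow(optimizer, new_loss_scale):
    # Keep a reference to the real step() so it can be restored later.
    optimizer_step = optimizer.step

    def skip_step(*args, **kwargs):
        logging.warning("Gradient overflow detected, skipping update; "
                        "reducing loss scale to {}".format(new_loss_scale))
        # Restore the real step() so the next iteration behaves normally.
        optimizer.step = optimizer_step

    # Replace step() for this iteration only.
    optimizer.step = skip_step
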
12 changes: 8 additions & 4 deletions apex/amp/scaler.py
@@ -81,9 +81,7 @@ def clear_overflow_state(self):
self._overflow_buf.zero_()

def unscale(self, model_params, master_params, scale):
# torch.cuda.nvtx.range_push("unscale")
if self._has_overflow:
# torch.cuda.nvtx.range_pop()
return

# Lots of defensive list processing going on here. Way more less efficient than
@@ -92,6 +90,12 @@ def unscale(self, model_params, master_params, scale):
in zip(model_params, master_params)] # some of these may be None

if LossScaler.has_fused_kernel:
# TODO: Make these lists permanent attributes of self, so they don't need to be created
# or garbage collected. Profiler shows that garbage collection overhead may be
# substantial (200-300 usec).
# This may be tricky because right now the lists need to be packed densely.
# Maybe this could be handled within the multi_tensor_apply wrapper
# (allow some Tensors to be None using at::optional).
src_dst_pairs = {torch.float16 : {torch.float16 : [[],[]], torch.float32 : [[],[]]},
torch.float32 : {torch.float16 : [[],[]], torch.float32 : [[],[]]}}

@@ -142,6 +146,8 @@ def unscale(self, model_params, master_params, scale):
if scale == 1.0 and all_same and not self.dynamic:
return

# TODO: Make these lists permanent attributes of self, so they don't need to be created
# or garbage collected?
model_grads = [mmp[0].grad.data for mmp in model_master_params if mmp[0].grad is not None]
master_grads = [mmp[1].grad.data for mmp in model_master_params if mmp[1].grad is not None]

@@ -151,8 +157,6 @@ def unscale(self, model_params, master_params, scale):
if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
self._has_overflow = self._overflow_buf.item()

# torch.cuda.nvtx.range_pop()

# Separate so unscale() can be called more that once before updating.
def update_scale(self):
if self._has_overflow and self.dynamic:
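The two TODO comments added above point at the same optimization: build the densely packed gradient lists once, keep them as attributes of the scaler, and reuse them instead of allocating fresh lists (and paying the 200-300 usec of garbage-collection overhead the profiler shows) on every unscale() call. A rough sketch of that idea follows; it is not part of this commit, and the class and method names are invented for illustration.

class ReusableGradBuffers:
    """Holds the packed grad lists across unscale() calls, avoiding per-call allocation."""

    def __init__(self):
        self.model_grads = []
        self.master_grads = []

    def repack(self, model_master_params):
        # Clear in place rather than rebinding, so the list objects themselves persist.
        del self.model_grads[:]
        del self.master_grads[:]
        for model, master in model_master_params:
            # The lists must stay densely packed: pairs with missing grads are skipped,
            # which is why at::optional support in multi_tensor_apply would simplify this.
            if model.grad is not None and master.grad is not None:
                self.model_grads.append(model.grad.data)
                self.master_grads.append(master.grad.data)
        return self.model_grads, self.master_grads
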
2 changes: 1 addition & 1 deletion apex/multi_tensor_apply/multi_tensor_apply.py
@@ -10,7 +10,7 @@ def __init__(self, chunk_size):
MultiTensorApply.available = True
self.chunk_size = chunk_size
except ImportError as err:
MultiTensorApply.availble = False
MultiTensorApply.available = False
MultiTensorApply.import_err = err

def check_avail(self):
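The one-character change above fixes a real bug: the misspelled assignment created a new availble attribute and left MultiTensorApply.available unset (or stale) when the fused extension failed to import, so check_avail() could not report the failure. A self-contained sketch of the same detect-and-defer pattern is below; the class name, extension name, and check_avail body are illustrative assumptions, not apex's exact code.

class FusedOpWrapper:
    available = False
    import_err = None

    def __init__(self, chunk_size):
        try:
            import amp_C  # fused-kernel extension; name assumed for illustration
            FusedOpWrapper.available = True
            self.chunk_size = chunk_size
        except ImportError as err:
            # A typo here (e.g. "availble") would silently create a new attribute
            # instead of updating the intended class-level flag.
            FusedOpWrapper.available = False
            FusedOpWrapper.import_err = err

    def check_avail(self):
        if not FusedOpWrapper.available:
            raise RuntimeError("Fused kernels unavailable: {}".format(FusedOpWrapper.import_err))
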
2 changes: 2 additions & 0 deletions tests/L1/common/main_amp.py
@@ -107,6 +107,8 @@ def fast_collate(batch):
print("loss_scale = {}".format(args.loss_scale), type(args.loss_scale))


print("\nCUDNN VERSION: {}\n".format(torch.backends.cudnn.version()))

if args.deterministic:
cudnn.benchmark = False
cudnn.deterministic = True
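The added print makes the cuDNN version visible in the test logs, alongside the existing --deterministic branch that controls run-to-run reproducibility. A small sketch of the knobs involved is below; the helper name and the else branch are assumptions, not shown in the diff.

import torch
import torch.backends.cudnn as cudnn

def configure_cudnn(deterministic):
    # Log the cuDNN version so test results can be matched to the library build.
    print("\nCUDNN VERSION: {}\n".format(torch.backends.cudnn.version()))
    if deterministic:
        cudnn.benchmark = False      # disable the autotuner, which may pick different kernels per run
        cudnn.deterministic = True   # restrict cuDNN to deterministic algorithms
    else:
        cudnn.benchmark = True       # assumption: typical non-deterministic setting for these tests
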
8 changes: 8 additions & 0 deletions tests/L1/common/run_test.sh
@@ -46,6 +46,8 @@ rm False*

set -e

print_banner "Installing Apex with --cuda_ext and --cpp_ext"

pushd ../../..
python setup.py install --cuda_ext --cpp_ext
popd
@@ -76,6 +78,8 @@ do
set +x
done

print_banner "Reinstalling apex without extensions"

pushd ../../..
python setup.py install
popd
@@ -102,13 +106,17 @@ do
do
for keep_batchnorm in "${keep_batchnorms[@]}"
do
echo ""
echo "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} [--has-ext] $DATADIR"
set -x
python compare.py --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm}
set +x
done
done
done

print_banner "Reinstalling Apex with --cuda_ext and --cpp_ext"

pushd ../../..
python setup.py install --cuda_ext --cpp_ext
popd
