Tgrel/minor mlperf fixes (facebookresearch#54)

* Fix a command-line flag typo in `bench/run_and_time.sh` (`--mlperf-bin-file` → `--mlperf-bin-loader`).
* Remove the end-of-epoch evaluation in MLPerf mode to avoid running two evaluations close to each other.
jianan-gu · Feb 15, 2020 · a5d707f
1 parent 1768658
commit a5d707f
Show file tree

Hide file tree

Showing 2 changed files with 2 additions and 2 deletions.
diff --git a/bench/run_and_time.sh b/bench/run_and_time.sh
@@ -14,6 +14,6 @@ else
 fi
 #echo $dlrm_extra_option
 
-python dlrm_s_pytorch.py --arch-sparse-feature-size=128 --arch-mlp-bot="13-512-256-128" --arch-mlp-top="1024-1024-512-256-1" --max-ind-range=40000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=1.0 --mini-batch-size=2048 --print-freq=2048 --print-time --test-freq=102400 --test-mini-batch-size=16384 --test-num-workers=16 --memory-map --mlperf-logging --mlperf-auc-threshold=0.8025 --mlperf-bin-file --mlperf-bin-shuffle $dlrm_extra_option 2>&1 | tee run_terabyte_mlperf_pt.log
+python dlrm_s_pytorch.py --arch-sparse-feature-size=128 --arch-mlp-bot="13-512-256-128" --arch-mlp-top="1024-1024-512-256-1" --max-ind-range=40000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=1.0 --mini-batch-size=2048 --print-freq=2048 --print-time --test-freq=102400 --test-mini-batch-size=16384 --test-num-workers=16 --memory-map --mlperf-logging --mlperf-auc-threshold=0.8025 --mlperf-bin-loader --mlperf-bin-shuffle $dlrm_extra_option 2>&1 | tee run_terabyte_mlperf_pt.log
 
 echo "done"
diff --git a/dlrm_s_pytorch.py b/dlrm_s_pytorch.py
@@ -923,7 +923,7 @@ def loss_fn_wrap(Z, T, use_gpu, device):
                 should_test = (
                     (args.test_freq > 0)
                     and (args.data_generation == "dataset")
-                    and (((j + 1) % args.test_freq == 0) or (j + 1 == nbatches))
+                    and (((j + 1) % args.test_freq == 0) or (j + 1 == nbatches and not args.mlperf_logging))
                 )
 
                 # print time, loss and accuracy