Merge branch 'jitendra/mlper-tlt-bfloat-cpx' into 'develop-ng'
mlperf Transformer LT Readme update

See merge request intelai/models!108
ashahba committed May 30, 2020
1 parent de00677 commit b439716
Showing 273 changed files with 300,364 additions and 3,246 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -8,4 +8,5 @@
.coverage
.tox
test_data/
*.bak
download_glue_data.py
data/
2 changes: 1 addition & 1 deletion CODEOWNERS
@@ -2,7 +2,7 @@
# Each line is a file pattern followed by one or more owners.

# These owners will be the default owners for everything in the repo.
* @mlukaszewski @claynerobison @chuanqi129 @agramesh1
* @mlukaszewski @claynerobison @chuanqi129 @agramesh1 @justkw

# Order is important. The last matching pattern has the most precedence.
# So if a pull request only touches javascript files, only these owners
16 changes: 11 additions & 5 deletions benchmarks/README.md
@@ -20,16 +20,22 @@ dependencies to be installed:
| Image Recognition | TensorFlow | [Inception V4](https://arxiv.org/pdf/1602.07261.pdf) | Inference | [Int8](image_recognition/tensorflow/inceptionv4/README.md#int8-inference-instructions) [FP32](image_recognition/tensorflow/inceptionv4/README.md#fp32-inference-instructions) |
| Image Recognition | TensorFlow | [MobileNet V1*](https://arxiv.org/pdf/1704.04861.pdf) | Inference | [Int8](image_recognition/tensorflow/mobilenet_v1/README.md#int8-inference-instructions) [FP32](image_recognition/tensorflow/mobilenet_v1/README.md#fp32-inference-instructions) |
| Image Recognition | TensorFlow | [ResNet 101](https://arxiv.org/pdf/1512.03385.pdf) | Inference | [Int8](image_recognition/tensorflow/resnet101/README.md#int8-inference-instructions) [FP32](image_recognition/tensorflow/resnet101/README.md#fp32-inference-instructions) |
| Image Recognition | TensorFlow | [ResNet 50](https://arxiv.org/pdf/1512.03385.pdf) | Inference | [Int8](image_recognition/tensorflow/resnet50/README.md#int8-inference-instructions) [FP32](image_recognition/tensorflow/resnet50/README.md#fp32-inference-instructions) |
| Image Recognition | TensorFlow | [ResNet 50v1.5*](https://github.com/tensorflow/models/tree/master/official/resnet) | Inference | [Int8](image_recognition/tensorflow/resnet50v1_5/README.md#int8-inference-instructions) [FP32](image_recognition/tensorflow/resnet50v1_5/README.md#fp32-inference-instructions) |
| Image Recognition | TensorFlow | [ResNet 50v1.5*](https://github.com/tensorflow/models/tree/master/official/resnet) | Training | [FP32](image_recognition/tensorflow/resnet50v1_5/README.md#fp32-training-instructions) |
| Reinforcement | TensorFlow | [MiniGo](https://arxiv.org/abs/1712.01815.pdf) | Training | [FP32](reinforcement/tensorflow/minigo/README.md#fp32-training-instructions)|
| Image Recognition | TensorFlow | [ResNet 50](https://arxiv.org/pdf/1512.03385.pdf) | Inference | [Int8](image_recognition/tensorflow/resnet50/README.md#int8-inference-instructions) [FP32](image_recognition/tensorflow/resnet50/README.md#fp32-inference-instructions)|
| Image Recognition | TensorFlow | [ResNet 50v1.5](https://github.com/tensorflow/models/tree/master/official/resnet) | Inference | [Int8](image_recognition/tensorflow/resnet50v1_5/README.md#int8-inference-instructions) [FP32](image_recognition/tensorflow/resnet50v1_5/README.md#fp32-inference-instructions) [BFloat16**](image_recognition/tensorflow/resnet50v1_5/README.md#bfloat16-inference-instructions)|
| Image Recognition | TensorFlow | [ResNet 50v1.5](https://github.com/tensorflow/models/tree/master/official/resnet) | Training | [FP32](image_recognition/tensorflow/resnet50v1_5/README.md#fp32-training-instructions) [BFloat16**](image_recognition/tensorflow/resnet50v1_5/README.md#bfloat16-training-instructions)|
| Language Modeling | TensorFlow | [BERT](https://arxiv.org/pdf/1810.04805.pdf) | Inference | [FP32](language_modeling/tensorflow/bert_large/README.md#fp32-inference-instructions) [BFloat16**](language_modeling/tensorflow/bert_large/README.md#bfloat16-inference-instructions) |
| Language Modeling | TensorFlow | [BERT](https://arxiv.org/pdf/1810.04805.pdf) | Training | [FP32](language_modeling/tensorflow/bert_large/README.md#fp32-training-instructions) [BFloat16**](language_modeling/tensorflow/bert_large/README.md#bfloat16-training-instructions) |
| Language Translation | TensorFlow | [GNMT*](https://arxiv.org/pdf/1609.08144.pdf) | Inference | [FP32](language_translation/tensorflow/mlperf_gnmt/README.md#fp32-inference-instructions) |
| Reinforcement | TensorFlow | [MiniGo](https://arxiv.org/abs/1712.01815.pdf) | Training | [FP32](reinforcement/tensorflow/minigo/README.md#fp32-training-instructions)|
| Language Translation | TensorFlow | [Transformer_LT_Official ](https://arxiv.org/pdf/1706.03762.pdf)| Inference | [FP32](language_translation/tensorflow/transformer_lt_official/README.md#fp32-inference-instructions) |
| Language Translation | TensorFlow | [Transformer_LT_mlperf ](https://arxiv.org/pdf/1706.03762.pdf)| Training | [FP32](language_translation/tensorflow/transformer_mlperf/README.md#fp32-training-instructions) [BFloat16**](language_translation/tensorflow/transformer_mlperf/README.md#bfloat16-training-instructions) |
| Object Detection | TensorFlow | [R-FCN](https://arxiv.org/pdf/1605.06409.pdf) | Inference | [Int8](object_detection/tensorflow/rfcn/README.md#int8-inference-instructions) [FP32](object_detection/tensorflow/rfcn/README.md#fp32-inference-instructions) |
| Object Detection | TensorFlow | [SSD-MobileNet*](https://arxiv.org/pdf/1704.04861.pdf) | Inference | [Int8](object_detection/tensorflow/ssd-mobilenet/README.md#int8-inference-instructions) [FP32](object_detection/tensorflow/ssd-mobilenet/README.md#fp32-inference-instructions) |
| Object Detection | TensorFlow | [SSD-ResNet34*](https://arxiv.org/pdf/1512.02325.pdf) | Inference | [Int8](object_detection/tensorflow/ssd-resnet34/README.md#int8-inference-instructions) [FP32](object_detection/tensorflow/ssd-resnet34/README.md#fp32-inference-instructions) |
| Object Detection | TensorFlow | [SSD-ResNet34](https://arxiv.org/pdf/1512.02325.pdf) | Training | [FP32](object_detection/tensorflow/ssd-resnet34/README.md#fp32-training-instructions) [BFloat16**](object_detection/tensorflow/ssd-resnet34/README.md#bf16-training-instructions) |
| Recommendation | TensorFlow | [Wide & Deep Large Dataset](https://arxiv.org/pdf/1606.07792.pdf) | Inference | [Int8](recommendation/tensorflow/wide_deep_large_ds/README.md#int8-inference-instructions) [FP32](recommendation/tensorflow/wide_deep_large_ds/README.md#fp32-inference-instructions) |
| Recommendation | TensorFlow | [Wide & Deep](https://arxiv.org/pdf/1606.07792.pdf) | Inference | [FP32](recommendation/tensorflow/wide_deep/README.md#fp32-inference-instructions) |

*Means the model is belong to [MLPerf](https://mlperf.org/) models, will long term support.
*Means the model belongs to [MLPerf](https://mlperf.org/) models and will be supported long-term.

**Means the BFloat16 data type support is experimental.
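The table above only links to the per-precision instructions; as a quick orientation, the hedged sketch below shows how one of the new BFloat16 entries might be launched through the repository's benchmark wrapper. Only `--precision` (with its new `bfloat16` choice) and `--num-train-steps` are taken from the parser changes in this commit; the script name `launch_benchmark.py`, the model name `transformer_mlperf`, and the remaining flags are illustrative assumptions.

```python
# Hedged sketch: launching a BFloat16 training run via the benchmark wrapper.
# Flag names --precision and --num-train-steps come from this commit; the
# other flags, the model name, and the script path are illustrative assumptions.
import subprocess

cmd = [
    "python", "launch_benchmark.py",
    "--model-name", "transformer_mlperf",   # assumed model name
    "--framework", "tensorflow",
    "--mode", "training",
    "--precision", "bfloat16",              # new choice added in this commit
    "--num-train-steps", "200",             # new option added in this commit
]
subprocess.run(cmd, check=True)
```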
24 changes: 18 additions & 6 deletions benchmarks/common/base_benchmark_util.py
@@ -67,8 +67,8 @@ def _define_args(self):

self._common_arg_parser.add_argument(
"-p", "--precision",
help="Specify the model precision to use: fp32, int8",
required=required_arg, choices=["fp32", "int8"],
help="Specify the model precision to use: fp32, int8, or bfloat16",
required=required_arg, choices=["fp32", "int8", "bfloat16"],
dest="precision")

self._common_arg_parser.add_argument(
@@ -132,6 +132,11 @@ def _define_args(self):
help="Specify the number threads between layers",
dest="num_inter_threads", default=None)

self._common_arg_parser.add_argument(
"-ts", "--num-train-steps", type=check_positive_number,
help="Specify the number of training steps ",
dest="num_train_steps", default=1)

self._common_arg_parser.add_argument(
"--data-num-intra-threads", type=check_positive_number,
help="The number intra op threads for the data layer config",
@@ -152,6 +157,13 @@
"of using frozen graphs.",
dest="checkpoint", default=None, type=check_valid_folder)

self._common_arg_parser.add_argument(
"-bb", "--backbone-model",
help="Specify the location of backbone-model directory. "
"This option can be used by models (like SSD_Resnet34) "
"to do fine-tuning training or achieve convergence.",
dest="backbone_model", default=None, type=check_valid_folder)

self._common_arg_parser.add_argument(
"-g", "--in-graph", help="Full path to the input graph ",
dest="input_graph", default=None, type=check_valid_filename)
@@ -220,11 +232,11 @@ def _define_args(self):
def _validate_args(self):
"""validate the args and initializes platform_util"""
# check if socket id is in socket number range
num_sockets = self._platform_util.num_cpu_sockets
num_numas = self._platform_util.num_numa_nodes
args = self.args
if not -1 <= args.socket_id < num_sockets:
raise ValueError("Socket id must be within socket number range: "
"[0, {}].".format(num_sockets - 1))
if not -1 <= args.socket_id < num_numas:
raise ValueError("Socket id must be within NUMA number range: "
"[0, {}].".format(num_numas - 1))

# check number of cores
num_logical_cores_per_socket = \
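Taken together, the parser changes above add a bfloat16 precision choice, a --num-train-steps option, and a --backbone-model option, and the socket check now ranges over NUMA nodes instead of CPU sockets. The following minimal sketch (not the repo's actual class; check_positive_number and the NUMA node count are simplified stand-ins) shows the pieces working together:

```python
# Minimal, self-contained sketch of the new/changed options and the
# NUMA-aware socket check; helper names are simplified stand-ins.
import argparse


def check_positive_number(value):
    # Simplified stand-in for the repo's validator of the same name.
    ivalue = int(value)
    if ivalue <= 0:
        raise argparse.ArgumentTypeError("{} is not a positive number".format(value))
    return ivalue


def build_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-p", "--precision", dest="precision",
        help="Specify the model precision to use: fp32, int8, or bfloat16",
        choices=["fp32", "int8", "bfloat16"])
    parser.add_argument(
        "-ts", "--num-train-steps", dest="num_train_steps",
        type=check_positive_number, default=1,
        help="Specify the number of training steps")
    parser.add_argument(
        "-bb", "--backbone-model", dest="backbone_model", default=None,
        help="Location of the backbone-model directory, used by models "
             "like SSD-ResNet34 for fine-tuning or convergence runs")
    parser.add_argument("--socket-id", type=int, dest="socket_id", default=-1)
    return parser


def validate_socket_id(socket_id, num_numa_nodes):
    # Mirrors the updated check: -1 (no pinning) or a valid NUMA node index.
    if not -1 <= socket_id < num_numa_nodes:
        raise ValueError("Socket id must be within NUMA number range: "
                         "[0, {}].".format(num_numa_nodes - 1))


if __name__ == "__main__":
    args = build_parser().parse_args(
        ["--precision", "bfloat16", "--num-train-steps", "200", "--socket-id", "0"])
    validate_socket_id(args.socket_id, num_numa_nodes=2)
    print(args.precision, args.num_train_steps, args.backbone_model)
```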
8 changes: 8 additions & 0 deletions benchmarks/common/base_model_init.py
@@ -144,6 +144,9 @@ def set_num_inter_intra_threads(self, num_inter_threads=None, num_intra_threads=
* num_inter_threads = The number of sockets
* num_intra_threads = The total number of cores across all sockets, or
self.args.num_cores if a specific number of cores was defined.
* in case MPI_NUM_PROCESSES is used
* num_inter_threads = 1
* num_intra_threads = the number of cores on a single socket minus 2
"""
# if num_inter_threads is specified, use that value as long as the arg isn't set
if num_inter_threads and not self.args.num_inter_threads:
@@ -163,11 +166,16 @@ def set_num_inter_intra_threads(self, num_inter_threads=None, num_intra_threads=
else:
if not self.args.num_inter_threads:
self.args.num_inter_threads = self.platform_util.num_cpu_sockets
if os.environ["MPI_NUM_PROCESSES"] != "None":
self.args.num_inter_threads = 1
if not self.args.num_intra_threads:
if self.args.num_cores == -1:
self.args.num_intra_threads = \
int(self.platform_util.num_cores_per_socket *
self.platform_util.num_cpu_sockets)
if os.environ["MPI_NUM_PROCESSES"] != "None":
self.args.num_intra_threads = \
self.platform_util.num_cores_per_socket - 2
else:
self.args.num_intra_threads = self.args.num_cores

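The docstring above states the new defaulting rule: inter-op threads default to the socket count and intra-op threads to the total core count, but when MPI_NUM_PROCESSES is set the defaults become 1 inter-op thread and (cores per socket minus 2) intra-op threads. Below is a standalone sketch of just that rule; it uses os.environ.get for the lookup so the example also runs when the variable is unset.

```python
import os


def default_inter_intra_threads(num_cpu_sockets, num_cores_per_socket):
    """Sketch of the defaulting rule documented in set_num_inter_intra_threads.

    Illustrative only; not the repo's implementation.
    """
    # Non-MPI default: one inter-op thread per socket, intra-op threads equal
    # to the total number of physical cores across all sockets.
    num_inter_threads = num_cpu_sockets
    num_intra_threads = num_cores_per_socket * num_cpu_sockets

    # When MPI_NUM_PROCESSES is set, each process gets 1 inter-op thread and
    # the cores of a single socket minus 2 for intra-op work.
    if os.environ.get("MPI_NUM_PROCESSES", "None") != "None":
        num_inter_threads = 1
        num_intra_threads = num_cores_per_socket - 2

    return num_inter_threads, num_intra_threads


if __name__ == "__main__":
    # Example: a 2-socket machine with 28 physical cores per socket.
    print(default_inter_intra_threads(2, 28))   # (2, 56) without MPI
    os.environ["MPI_NUM_PROCESSES"] = "4"
    print(default_inter_intra_threads(2, 28))   # (1, 26) with MPI
```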
(Diff content for the remaining changed files is not shown.)