Adding HPC logging and extending logger to support different WG (#151)

* initial refactoring commit with HPC support. Works fine on NVIDIA training submission 1.0.0
* enhanced number of samples in deepcam rcp for more variable batchsize cases
  initial refactoring commit with HPC support. Works fine on NVIDIA training submission 1.0.0
  enhanced number of samples in deepcam rcp for more variable batchsize cases
  enhanced number of samples in deepcam rcp for more variable batchsize cases
* reformatted rcps
* removed eval_accuracy check from hpc common.yaml because we use eval_error for the two benchmarks with MAE metric.
* replace dimenet -> oc20 in hpc closed_common
  This is in line with the reference logging convention.
* Removed required grad accum logging
  2/3 reference codes don't log this (yet). We shouldn't require it now at this stage. If folks do grad accum they should log it, however, so we still check the value.
* adding cosmoflow rules
* adding oc20 rules
* adding preliminary RCP for cosmoflow
* added open deepcam compliance file
* fix suggested changes
* fixed a yaml formatting error and removed the usage mixin in parse_file
* adopting result_summarizer output
* updated result summarizer
* commit initial RCP for oc20
* adding bs 1024 rcp for cosmoflow
* fix cosmoflow rcp epoch numbering
  Increasing cosmoflow RCP epoch counts all by one, to account for the epoch numbering convention fix coming from mlcommons/hpc#13
* fix oc20 rcp epoch numbering
  Increasing opencatalyst RCP epoch counts all by one, to account for the epoch numbering convention fix coming from mlcommons/hpc#14
* Fixing errors.
* factored out benchmark descriptions into separate file, used by package checker and results summarizer
* Fixing errors.
* Fixing errors.
* created a utility function for performing the allowed benchmark and file counts checks
* Fix errors.
* fixed rule choices and usage choices in argument parser

Co-authored-by: Steve Farrell <[email protected]>
Co-authored-by: Shang Wang <[email protected]>
mlcommons · Sep 7, 2021 · 1ce757e · 1ce757e
1 parent 9aa718d
commit 1ce757e
Show file tree

Hide file tree

Showing 100 changed files with 777 additions and 164 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,2 +1,3 @@
 include VERSION
 include mlperf_logging/compliance_checker/**/*.yaml
+include mlperf_logging/rcp_checker/**/*.json
diff --git a/mlperf_logging/benchmark_meta.py b/mlperf_logging/benchmark_meta.py
@@ -0,0 +1,96 @@
+# benchmark dictionary: result file count per benchmark, keyed by usage
+_ALL_RESULT_FILE_COUNTS = {
+    'training': {
+        'bert': 10,
+        'dlrm': 5,
+        'gnmt': 10,
+        'maskrcnn': 5,
+        'minigo': 10,
+        'resnet': 5,
+        'ssd': 5,
+        'transformer': 10,
+        'ncf': 10,
+        'rnnt': 10,
+        'unet3d': 40,
+    },
+
+    'hpc': {
+        'deepcam': 5,
+        'cosmoflow': 10,
+        'oc20': 10,
+    },
+}
+
+
+# benchmarks allowed per usage, keyed by (short) ruleset version
+_ALL_ALLOWED_BENCHMARKS = {
+    'training': {
+        '0.6': [
+            'resnet',
+            'ssd',
+            'maskrcnn',
+            'gnmt',
+            'transformer',
+            'ncf',
+            'minigo',
+        ],
+
+        '0.7': [
+            'bert',
+            'dlrm',
+            'gnmt',
+            'maskrcnn',
+            'minigo',
+            'resnet',
+            'ssd',
+            'transformer',
+        ],
+
+        '1.0': [
+            'bert',
+            'dlrm',
+            'maskrcnn',
+            'minigo',
+            'resnet',
+            'ssd',
+            'rnnt',
+            'unet3d',
+        ],
+    },
+
+    'hpc': {
+        '0.7': [
+            'cosmoflow',
+            'deepcam',
+        ],
+
+        '1.0': [
+            'cosmoflow',
+            'deepcam',
+            'oc20',
+        ],
+    },
+}
+
+
+def get_allowed_benchmarks(usage, ruleset):
+    """Return the list of benchmarks allowed for a usage and ruleset.
+
+    The ruleset is looked up verbatim first. If that fails, its last
+    dot-separated component is dropped (e.g. '1.0.0' -> '1.0') and the
+    lookup is retried with the shortened version.
+
+    Raises:
+        ValueError: if the usage is unknown, or if neither the full nor
+            the shortened ruleset is known for that usage.
+    """
+    rulesets = _ALL_ALLOWED_BENCHMARKS.get(usage)
+    if rulesets is None:
+        raise ValueError('usage {} not supported!'.format(usage))
+
+    # exact match wins
+    if ruleset in rulesets:
+        return rulesets[ruleset]
+
+    # no exact match: retry without the trailing version component
+    ruleset_short = ruleset.rpartition(".")[0]
+    if ruleset_short in rulesets:
+        return rulesets[ruleset_short]
+
+    raise ValueError('ruleset {} is not supported in {}'.format(ruleset, usage))
+
+
+def get_result_file_counts(usage):
+    """Return the benchmark -> result file count mapping for a usage.
+
+    Raises:
+        ValueError: if the usage has no entry in the file count table.
+    """
+    counts = _ALL_RESULT_FILE_COUNTS.get(usage)
+    if counts is None:
+        raise ValueError('usage {} not supported!'.format(usage))
+    return counts
diff --git a/mlperf_logging/compliance_checker/__main__.py b/mlperf_logging/compliance_checker/__main__.py
@@ -6,9 +6,10 @@
 parser = mlp_compliance.get_parser()
 args = parser.parse_args()
 
-config_file = args.config or f'{args.ruleset}/common.yaml'
+config_file = args.config or f'{args.usage}_{args.ruleset}/common.yaml'
 
 checker = mlp_compliance.make_checker(
+    args.usage,
     args.ruleset,
     args.quiet,
     args.werror,

diff --git a/mlperf_logging/compliance_checker/hpc_1.0.0/closed_common.yaml b/mlperf_logging/compliance_checker/hpc_1.0.0/closed_common.yaml
@@ -0,0 +1,14 @@
+
+# Common rules for HPC 1.0.0 closed submissions.
+# submission_benchmark must appear exactly once and name a known HPC
+# benchmark; its value selects the benchmark-specific rule file to enqueue.
+- KEY:
+    NAME:  submission_benchmark
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] in ['deepcam', 'cosmoflow', 'oc20'] "
+    POST:  " enqueue_config('hpc_1.0.0/closed_{}.yaml'.format(v['value'])) "
+
+# The gradient accumulation keys carry no REQ: logging them is not required
+# here, but a logged value is still checked to be positive.
+- KEY:
+    NAME:  gradient_accumulation_steps
+    CHECK: " v['value'] > 0 "
+
+- KEY:
+    NAME:  gradient_accumulation_frequency
+    CHECK: " v['value'] > 0 "
diff --git a/mlperf_logging/compliance_checker/hpc_1.0.0/closed_cosmoflow.yaml b/mlperf_logging/compliance_checker/hpc_1.0.0/closed_cosmoflow.yaml
@@ -0,0 +1,47 @@
+# Rules for the cosmoflow benchmark (HPC 1.0.0, closed).
+# Enqueued via submission_benchmark == 'cosmoflow' in closed_common.yaml.
+- KEY:
+    NAME:  global_batch_size
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] > 0"
+
+# Optimizer: only SGD is accepted (either spelling).
+- KEY:
+    NAME:  opt_name
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] in ['sgd', 'SGD'] "
+
+- KEY:
+    NAME:  opt_base_learning_rate
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] >= 0."
+
+- KEY:
+    NAME:  opt_learning_rate_warmup_epochs
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] >= 0"
+
+- KEY:
+    NAME:  opt_learning_rate_warmup_factor
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] >= 0."
+
+# The two decay keys must be logged exactly once; their values are not checked.
+- KEY:
+    NAME:  opt_learning_rate_decay_boundary_epochs
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  opt_learning_rate_decay_factor
+    REQ:   EXACTLY_ONE
+
+# dropout and opt_weight_decay have no REQ; only their value range is checked.
+- KEY:
+    NAME:  dropout
+    CHECK: " v['value'] >= 0. and v['value'] < 1."
+
+- KEY:
+    NAME: opt_weight_decay
+    CHECK: " v['value'] >= 0."
+
+# Convergence: eval_error target is 0.124.
+# NOTE(review): presumably CHECK applies to every eval_error entry while
+# ATLEAST_ONE_CHECK must hold for at least one of them - confirm in checker.
+- KEY:
+    NAME:  eval_error
+    REQ:   AT_LEAST_ONE
+    CHECK:
+        - "'epoch_num' in v['metadata']"
+    ATLEAST_ONE_CHECK: "v['value'] <= 0.124 and v['value'] > 0."
diff --git a/mlperf_logging/compliance_checker/hpc_1.0.0/closed_deepcam.yaml b/mlperf_logging/compliance_checker/hpc_1.0.0/closed_deepcam.yaml
@@ -0,0 +1,85 @@
+# Rules for the deepcam benchmark (HPC 1.0.0, closed).
+# Enqueued via submission_benchmark == 'deepcam' in closed_common.yaml.
+
+# General Settings
+# Unlike closed_common.yaml, gradient_accumulation_frequency is required here.
+- KEY:
+    NAME:  gradient_accumulation_frequency
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] > 0 "
+
+- KEY:
+    NAME:  seed
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] > 0"
+
+- KEY:
+    NAME:  global_batch_size
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] > 0"
+
+- KEY:
+    NAME:  num_workers
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] > 0"
+
+- KEY:
+    NAME:  batchnorm_group_size
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] > 0"
+
+
+# Optimizer Parameters
+# Choosing LAMB additionally enqueues the LAMB-specific rule file.
+- KEY:
+    NAME:  opt_name
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] in ['Adam', 'AdamW', 'LAMB']"
+    POST:  " if (v['value'] == 'LAMB'): enqueue_config('hpc_1.0.0/closed_deepcam_lamb.yaml') "
+
+- KEY:
+    NAME:  opt_lr
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] >0."
+
+# opt_betas must be a pair of values.
+- KEY:
+    NAME:  opt_betas
+    REQ:   EXACTLY_ONE
+    CHECK: " len(v['value']) == 2"
+
+# opt_eps is pinned to 1e-6 (compared with math.isclose default tolerances).
+- KEY:
+    NAME:  opt_eps
+    REQ:   EXACTLY_ONE
+    CHECK: " math.isclose(v['value'], 1e-6)"
+
+
+# Scheduler Parameters
+# The scheduler type selects a further rule file:
+# closed_deepcam_multistep.yaml or closed_deepcam_cosine_annealing.yaml.
+- KEY:
+    NAME:  scheduler_type
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] in ['multistep', 'cosine_annealing']"
+    POST:  " enqueue_config('hpc_1.0.0/closed_deepcam_{}.yaml'.format(v['value'].lower())) "
+
+- KEY:
+    NAME:  scheduler_lr_warmup_steps
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] >= 0 "
+
+- KEY:
+    NAME:  scheduler_lr_warmup_factor
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] >= 1. "
+
+# Dataset Properties
+# Sample counts are pinned to fixed values.
+- KEY:
+    NAME:  train_samples
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 121266"
+
+- KEY:
+    NAME:  eval_samples
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 15158"
+
+# Convergence Properties
+# eval_accuracy target is 0.82.
+# NOTE(review): presumably CHECK applies to every eval_accuracy entry while
+# ATLEAST_ONE_CHECK must hold for at least one of them - confirm in checker.
+- KEY:
+    NAME:  eval_accuracy
+    REQ:   AT_LEAST_ONE
+    CHECK:
+        - "'epoch_num' in v['metadata']"
+    ATLEAST_ONE_CHECK: "v['value'] >= 0.82 and v['value'] <= 1."
diff --git a/mlperf_logging/compliance_checker/hpc_1.0.0/closed_deepcam_cosine_annealing.yaml b/mlperf_logging/compliance_checker/hpc_1.0.0/closed_deepcam_cosine_annealing.yaml
@@ -0,0 +1,10 @@
+# Scheduler Parameters
+# Extra deepcam rules, enqueued by closed_deepcam.yaml when
+# scheduler_type == 'cosine_annealing'.
+- KEY:
+    NAME:  scheduler_t_max
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] >= 1. "
+
+- KEY:
+    NAME:  scheduler_eta_min
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] >= 0. "
diff --git a/mlperf_logging/compliance_checker/hpc_1.0.0/closed_deepcam_lamb.yaml b/mlperf_logging/compliance_checker/hpc_1.0.0/closed_deepcam_lamb.yaml
@@ -0,0 +1,16 @@
+# Optimizer Parameters
+# Extra deepcam rules, enqueued by closed_deepcam.yaml when opt_name == 'LAMB'.
+
+# CHECK is the bare value, i.e. the logged value itself must be truthy.
+- KEY:
+    NAME:  opt_bias_correction
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] "
+
+# Same form: the logged value itself must be truthy.
+- KEY:
+    NAME:  opt_grad_averaging
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] "
+
+# The maximum gradient norm is pinned to exactly 1.
+- KEY:
+    NAME:  opt_max_grad_norm
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 1."
+
diff --git a/mlperf_logging/compliance_checker/hpc_1.0.0/closed_deepcam_multistep.yaml b/mlperf_logging/compliance_checker/hpc_1.0.0/closed_deepcam_multistep.yaml
@@ -0,0 +1,10 @@
+# Scheduler Parameters
+# Extra deepcam rules, enqueued by closed_deepcam.yaml when
+# scheduler_type == 'multistep'.
+
+# NOTE(review): len(...) >= 0 is true for any sized value, so this only
+# verifies that the value has a length (an empty list passes). If at least
+# one milestone is required, `> 0` may have been intended - confirm.
+- KEY:
+    NAME:  scheduler_milestones
+    REQ:   EXACTLY_ONE
+    CHECK: " len(v['value']) >= 0 "
+
+# Only an upper bound is enforced on the decay rate.
+- KEY:
+    NAME:  scheduler_decay_rate
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] <= 1. "
diff --git a/mlperf_logging/compliance_checker/hpc_1.0.0/closed_oc20.yaml b/mlperf_logging/compliance_checker/hpc_1.0.0/closed_oc20.yaml
@@ -0,0 +1,39 @@
+# Rules for the oc20 benchmark (HPC 1.0.0, closed).
+# Enqueued via submission_benchmark == 'oc20' in closed_common.yaml.
+- KEY:
+    NAME:  global_batch_size
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] > 0"
+
+# Optimizer: only AdamW is accepted.
+- KEY:
+    NAME:  opt_name
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 'AdamW'"
+
+- KEY:
+    NAME:  opt_base_learning_rate
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] >= 0."
+
+# Warmup and decay boundaries are keyed in steps here (cosmoflow uses epochs).
+- KEY:
+    NAME:  opt_learning_rate_warmup_steps
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] >= 0"
+
+- KEY:
+    NAME:  opt_learning_rate_warmup_factor
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] >= 0."
+
+# The two decay keys must be logged exactly once; their values are not checked.
+- KEY:
+    NAME:  opt_learning_rate_decay_boundary_steps
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  opt_learning_rate_decay_factor
+    REQ:   EXACTLY_ONE
+
+# Convergence: eval_error target is 0.036.
+# NOTE(review): presumably CHECK applies to every eval_error entry while
+# ATLEAST_ONE_CHECK must hold for at least one of them - confirm in checker.
+- KEY:
+    NAME:  eval_error
+    REQ:   AT_LEAST_ONE
+    CHECK:
+        - "'epoch_num' in v['metadata']"
+    ATLEAST_ONE_CHECK: "v['value'] <= 0.036 and v['value'] > 0."