Skip to content

Commit

Permalink
Adding HPC logging and extending logger to support different WG (#151)
Browse files Browse the repository at this point in the history
* initial refactoring commit with HPC support. Works fine on NVIDIA training submission 1.0.0

* enhanced number of samples in deepcam rcp for more variable batchsize cases

initial refactoring commit with HPC support. Works fine on NVIDIA training submission 1.0.0

enhanced number of samples in deepcam rcp for more variable batchsize cases

enhanced number of samples in deepcam rcp for more variable batchsize cases

* reformatted rcps

* removed eval_accuracy check from hpc common.yaml

because we use eval_error for the two benchmarks with MAE metric.

* replace dimenet -> oc20 in hpc closed_common

This is in line with the reference logging convention.

* Removed required grad accum logging

2/3 reference codes don't log this (yet). We shouldn't require it now at
this stage. If folks do grad accum they should log it, however, so we
still check the value.

* adding cosmoflow rules

* adding oc20 rules

* adding preliminary RCP for cosmoflow

* added open deepcam compliance file

* fix suggested changes

* fixed a yaml formatting error and removed the usage mixin in parse_file

* adopting result_summarizer output

* updated result summarizer

* commit initial RCP for oc20

* adding bs 1024 rcp for cosmoflow

* fix cosmoflow rcp epoch numbering

Increasing cosmoflow RCP epoch counts all by one, to account for the
epoch numbering convention fix coming from
mlcommons/hpc#13

* fix oc20 rcp epoch numbering

Increasing opencatalyst RCP epoch counts all by one, to account for the
epoch numbering convention fix coming from
mlcommons/hpc#14

* Fixing errors.

* factored out benchmark descriptions into separate file, used by package checker and results summarizer

* Fixing errors.

* Fixing errors.

* created a utility function for performing the allowed benchmark and file counts checks

* Fix errors.

* fixed rule choices and usage choices in argument parser

Co-authored-by: Steve Farrell <[email protected]>
Co-authored-by: Shang Wang <[email protected]>
  • Loading branch information
3 people authored Sep 7, 2021
1 parent 9aa718d commit 1ce757e
Show file tree
Hide file tree
Showing 100 changed files with 777 additions and 164 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
include VERSION
include mlperf_logging/compliance_checker/**/*.yaml
include mlperf_logging/rcp_checker/**/*.json
96 changes: 96 additions & 0 deletions mlperf_logging/benchmark_meta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# Result-file counts per benchmark, grouped first by usage ('training' or
# 'hpc') and then by benchmark name. Exposed through get_result_file_counts().
_ALL_RESULT_FILE_COUNTS = {
    'training': {
        'bert': 10,
        'dlrm': 5,
        'gnmt': 10,
        'maskrcnn': 5,
        'minigo': 10,
        'resnet': 5,
        'ssd': 5,
        'transformer': 10,
        'ncf': 10,
        'rnnt': 10,
        'unet3d': 40,
    },
    'hpc': {
        'deepcam': 5,
        'cosmoflow': 10,
        'oc20': 10,
    },
}


# Benchmarks accepted for each usage and ruleset. Keys are the usage
# ('training' or 'hpc'), then the two-component ruleset version string;
# values are lists of benchmark names. Read by get_allowed_benchmarks().
_ALL_ALLOWED_BENCHMARKS = {
    'training': {
        '0.6': [
            'resnet',
            'ssd',
            'maskrcnn',
            'gnmt',
            'transformer',
            'ncf',
            'minigo',
        ],
        '0.7': [
            'bert',
            'dlrm',
            'gnmt',
            'maskrcnn',
            'minigo',
            'resnet',
            'ssd',
            'transformer',
        ],
        '1.0': [
            'bert',
            'dlrm',
            'maskrcnn',
            'minigo',
            'resnet',
            'ssd',
            'rnnt',
            'unet3d',
        ],
    },
    'hpc': {
        '0.7': [
            'cosmoflow',
            'deepcam',
        ],
        '1.0': [
            'cosmoflow',
            'deepcam',
            'oc20',
        ],
    },
}


def get_allowed_benchmarks(usage, ruleset):
    """Return the benchmark names allowed for a usage and ruleset.

    The ruleset is first looked up verbatim. If that fails, its last
    dot-separated component is dropped (e.g. '1.0.0' -> '1.0') and the
    lookup is retried once with the shortened string.

    Args:
        usage: key into _ALL_ALLOWED_BENCHMARKS (e.g. 'training', 'hpc').
        ruleset: ruleset version string, either a direct key of the
            per-usage table or such a key followed by one more component.

    Returns:
        The list stored in _ALL_ALLOWED_BENCHMARKS for the matched ruleset
        (the stored object itself, not a copy).

    Raises:
        ValueError: if the usage is unknown, or neither the ruleset nor its
            shortened form is known for that usage.
    """
    if usage not in _ALL_ALLOWED_BENCHMARKS:
        raise ValueError('usage {} not supported!'.format(usage))
    per_ruleset = _ALL_ALLOWED_BENCHMARKS[usage]

    # An exact match wins.
    if ruleset in per_ruleset:
        return per_ruleset[ruleset]

    # Otherwise retry without the trailing component; rpartition yields ''
    # when there is no dot, which is never a key and so falls through.
    shortened = ruleset.rpartition(".")[0]
    if shortened in per_ruleset:
        return per_ruleset[shortened]

    raise ValueError('ruleset {} is not supported in {}'.format(ruleset, usage))


def get_result_file_counts(usage):
    """Return the per-benchmark result-file counts for a usage.

    Args:
        usage: key into _ALL_RESULT_FILE_COUNTS (e.g. 'training', 'hpc').

    Returns:
        The dict mapping benchmark name to its count for that usage
        (the stored object itself, not a copy).

    Raises:
        ValueError: if the usage is not a key of _ALL_RESULT_FILE_COUNTS.
    """
    if usage in _ALL_RESULT_FILE_COUNTS:
        return _ALL_RESULT_FILE_COUNTS[usage]
    raise ValueError('usage {} not supported!'.format(usage))
3 changes: 2 additions & 1 deletion mlperf_logging/compliance_checker/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
parser = mlp_compliance.get_parser()
args = parser.parse_args()

config_file = args.config or f'{args.ruleset}/common.yaml'
config_file = args.config or f'{args.usage}_{args.ruleset}/common.yaml'

checker = mlp_compliance.make_checker(
args.usage,
args.ruleset,
args.quiet,
args.werror,
Expand Down
14 changes: 14 additions & 0 deletions mlperf_logging/compliance_checker/hpc_1.0.0/closed_common.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@

# Rules common to the closed HPC 1.0.0 benchmarks.
#
# submission_benchmark must be logged exactly once with one of the three
# listed names. POST then enqueues the matching per-benchmark rule file,
# hpc_1.0.0/closed_<benchmark>.yaml.
- KEY:
NAME: submission_benchmark
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['deepcam', 'cosmoflow', 'oc20'] "
POST: " enqueue_config('hpc_1.0.0/closed_{}.yaml'.format(v['value'])) "

# The two gradient-accumulation keys carry no REQ field: per the commit notes
# they are deliberately not required, but a logged value must be positive.
- KEY:
NAME: gradient_accumulation_steps
CHECK: " v['value'] > 0 "

- KEY:
NAME: gradient_accumulation_frequency
CHECK: " v['value'] > 0 "
47 changes: 47 additions & 0 deletions mlperf_logging/compliance_checker/hpc_1.0.0/closed_cosmoflow.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Closed rules for the cosmoflow benchmark (HPC 1.0.0).
# Enqueued from closed_common.yaml when submission_benchmark is 'cosmoflow'.
- KEY:
NAME: global_batch_size
REQ: EXACTLY_ONE
CHECK: " v['value'] > 0"

# Optimizer name is accepted in either lower- or upper-case spelling.
- KEY:
NAME: opt_name
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['sgd', 'SGD'] "

- KEY:
NAME: opt_base_learning_rate
REQ: EXACTLY_ONE
CHECK: " v['value'] >= 0."

- KEY:
NAME: opt_learning_rate_warmup_epochs
REQ: EXACTLY_ONE
CHECK: " v['value'] >= 0"

- KEY:
NAME: opt_learning_rate_warmup_factor
REQ: EXACTLY_ONE
CHECK: " v['value'] >= 0."

# The two decay keys below only have a REQ: they must be logged exactly once,
# but no constraint is placed on their values.
- KEY:
NAME: opt_learning_rate_decay_boundary_epochs
REQ: EXACTLY_ONE

- KEY:
NAME: opt_learning_rate_decay_factor
REQ: EXACTLY_ONE

# dropout and opt_weight_decay have no REQ; only the value range is checked.
- KEY:
NAME: dropout
CHECK: " v['value'] >= 0. and v['value'] < 1."

- KEY:
NAME: opt_weight_decay
CHECK: " v['value'] >= 0."

# eval_error entries must carry 'epoch_num' in their metadata.
# NOTE(review): presumably CHECK applies to every entry while
# ATLEAST_ONE_CHECK must hold for at least one of them (error in (0, 0.124])
# - confirm against the checker implementation.
- KEY:
NAME: eval_error
REQ: AT_LEAST_ONE
CHECK:
- "'epoch_num' in v['metadata']"
ATLEAST_ONE_CHECK: "v['value'] <= 0.124 and v['value'] > 0."
85 changes: 85 additions & 0 deletions mlperf_logging/compliance_checker/hpc_1.0.0/closed_deepcam.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Closed rules for the deepcam benchmark (HPC 1.0.0).
# Enqueued from closed_common.yaml when submission_benchmark is 'deepcam'.

# General Settings
# Unlike closed_common.yaml, this file requires gradient_accumulation_frequency
# to be logged exactly once.
- KEY:
NAME: gradient_accumulation_frequency
REQ: EXACTLY_ONE
CHECK: " v['value'] > 0 "

# NOTE(review): a seed of 0 is rejected by this check - confirm that is intended.
- KEY:
NAME: seed
REQ: EXACTLY_ONE
CHECK: " v['value'] > 0"

- KEY:
NAME: global_batch_size
REQ: EXACTLY_ONE
CHECK: " v['value'] > 0"

- KEY:
NAME: num_workers
REQ: EXACTLY_ONE
CHECK: " v['value'] > 0"

- KEY:
NAME: batchnorm_group_size
REQ: EXACTLY_ONE
CHECK: " v['value'] > 0"


# Optimizer Parameters
# Only when the optimizer is LAMB does POST enqueue the extra rule file
# hpc_1.0.0/closed_deepcam_lamb.yaml.
- KEY:
NAME: opt_name
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['Adam', 'AdamW', 'LAMB']"
POST: " if (v['value'] == 'LAMB'): enqueue_config('hpc_1.0.0/closed_deepcam_lamb.yaml') "

- KEY:
NAME: opt_lr
REQ: EXACTLY_ONE
CHECK: " v['value'] >0."

# opt_betas must be a sequence of exactly two elements.
- KEY:
NAME: opt_betas
REQ: EXACTLY_ONE
CHECK: " len(v['value']) == 2"

# opt_eps is pinned to 1e-6, compared with math.isclose's default tolerance.
- KEY:
NAME: opt_eps
REQ: EXACTLY_ONE
CHECK: " math.isclose(v['value'], 1e-6)"


# Scheduler Parameters
# POST enqueues the scheduler-specific rule file
# hpc_1.0.0/closed_deepcam_<scheduler_type, lower-cased>.yaml.
- KEY:
NAME: scheduler_type
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['multistep', 'cosine_annealing']"
POST: " enqueue_config('hpc_1.0.0/closed_deepcam_{}.yaml'.format(v['value'].lower())) "

- KEY:
NAME: scheduler_lr_warmup_steps
REQ: EXACTLY_ONE
CHECK: " v['value'] >= 0 "

- KEY:
NAME: scheduler_lr_warmup_factor
REQ: EXACTLY_ONE
CHECK: " v['value'] >= 1. "

# Dataset Properties
# Both sample counts are pinned to exact values.
- KEY:
NAME: train_samples
REQ: EXACTLY_ONE
CHECK: " v['value'] == 121266"

- KEY:
NAME: eval_samples
REQ: EXACTLY_ONE
CHECK: " v['value'] == 15158"

# Convergence Properties
# eval_accuracy entries must carry 'epoch_num' in their metadata.
# NOTE(review): presumably CHECK applies to every entry while
# ATLEAST_ONE_CHECK must hold for at least one of them (accuracy in
# [0.82, 1]) - confirm against the checker implementation.
- KEY:
NAME: eval_accuracy
REQ: AT_LEAST_ONE
CHECK:
- "'epoch_num' in v['metadata']"
ATLEAST_ONE_CHECK: "v['value'] >= 0.82 and v['value'] <= 1."
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# NOTE(review): the file header is missing from this view; presumably this is
# hpc_1.0.0/closed_deepcam_cosine_annealing.yaml, the rule file enqueued by
# closed_deepcam.yaml when scheduler_type is 'cosine_annealing' - confirm.
# Scheduler Parameters
- KEY:
NAME: scheduler_t_max
REQ: EXACTLY_ONE
CHECK: " v['value'] >= 1. "

- KEY:
NAME: scheduler_eta_min
REQ: EXACTLY_ONE
CHECK: " v['value'] >= 0. "
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# NOTE(review): the file header is missing from this view; presumably this is
# hpc_1.0.0/closed_deepcam_lamb.yaml, the rule file enqueued by
# closed_deepcam.yaml when opt_name is 'LAMB' - confirm.
# Optimizer Parameters
# The first two checks are the bare value, so they pass only when the logged
# value is truthy.
- KEY:
NAME: opt_bias_correction
REQ: EXACTLY_ONE
CHECK: " v['value'] "

- KEY:
NAME: opt_grad_averaging
REQ: EXACTLY_ONE
CHECK: " v['value'] "

# The maximum gradient norm is pinned to exactly 1.
- KEY:
NAME: opt_max_grad_norm
REQ: EXACTLY_ONE
CHECK: " v['value'] == 1."

Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Scheduler Parameters
- KEY:
NAME: scheduler_milestones
REQ: EXACTLY_ONE
CHECK: " len(v['value']) >= 0 "

- KEY:
NAME: scheduler_decay_rate
REQ: EXACTLY_ONE
CHECK: " v['value'] <= 1. "
39 changes: 39 additions & 0 deletions mlperf_logging/compliance_checker/hpc_1.0.0/closed_oc20.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Closed rules for the oc20 benchmark (HPC 1.0.0).
# Enqueued from closed_common.yaml when submission_benchmark is 'oc20'.
- KEY:
NAME: global_batch_size
REQ: EXACTLY_ONE
CHECK: " v['value'] > 0"

# Only the exact spelling 'AdamW' is accepted.
- KEY:
NAME: opt_name
REQ: EXACTLY_ONE
CHECK: " v['value'] == 'AdamW'"

- KEY:
NAME: opt_base_learning_rate
REQ: EXACTLY_ONE
CHECK: " v['value'] >= 0."

- KEY:
NAME: opt_learning_rate_warmup_steps
REQ: EXACTLY_ONE
CHECK: " v['value'] >= 0"

- KEY:
NAME: opt_learning_rate_warmup_factor
REQ: EXACTLY_ONE
CHECK: " v['value'] >= 0."

# The two decay keys below only have a REQ: they must be logged exactly once,
# but no constraint is placed on their values.
- KEY:
NAME: opt_learning_rate_decay_boundary_steps
REQ: EXACTLY_ONE

- KEY:
NAME: opt_learning_rate_decay_factor
REQ: EXACTLY_ONE

# eval_error entries must carry 'epoch_num' in their metadata.
# NOTE(review): presumably CHECK applies to every entry while
# ATLEAST_ONE_CHECK must hold for at least one of them (error in (0, 0.036])
# - confirm against the checker implementation.
- KEY:
NAME: eval_error
REQ: AT_LEAST_ONE
CHECK:
- "'epoch_num' in v['metadata']"
ATLEAST_ONE_CHECK: "v['value'] <= 0.036 and v['value'] > 0."
Loading

0 comments on commit 1ce757e

Please sign in to comment.