diff --git a/mlperf_logging/benchmark_meta.py b/mlperf_logging/benchmark_meta.py index efc7fc8..6b56ef5 100644 --- a/mlperf_logging/benchmark_meta.py +++ b/mlperf_logging/benchmark_meta.py @@ -122,7 +122,16 @@ 'llama2_70b_lora', 'stable_diffusion', 'gnn' - ] + ], + '4.1': [ + 'bert', + 'dlrm_dcnv2', + 'gpt3', + 'ssd', + 'stable_diffusion', + 'llama2_70b_lora', + 'gnn' + ] }, 'hpc': { diff --git a/mlperf_logging/compliance_checker/README.md b/mlperf_logging/compliance_checker/README.md index 3fc4f31..9af31c8 100644 --- a/mlperf_logging/compliance_checker/README.md +++ b/mlperf_logging/compliance_checker/README.md @@ -10,9 +10,9 @@ To check a log file for compliance: python -m mlperf_logging.compliance_checker [--config YAML] [--usage training/hpc] [--ruleset MLPERF_EDITION] FILENAME -By default, 3.1.0 training edition rules are used and the default config is set to `3.1.0/common.yaml`. +By default, 4.1.0 training edition rules are used and the default config is set to `4.1.0/common.yaml`. This config will check all common keys and enqueue benchmark specific config to be checked as well. -Old training editions, still supported are 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0 +Older training editions still supported are 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0 To check hpc compliance rules (only 1.0.0 ruleset is supported), set --usage hpc --ruleset 1.0.0. @@ -22,27 +22,23 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_ ### Existing config files for training submissions - 3.1.0/common.yaml - currently the default config file, checks common fields complience and equeues benchmark-specific config file - 3.1.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks - 3.1.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks - 3.1.0/closed_resnet.yaml - Per-benchmark rules, closed submissions. - 3.1.0/closed_ssd.yaml - 3.1.0/closed_maskrcnn.yaml - 3.1.0/closed_rnnt.yaml - 3.1.0/closed_unet3d.yaml - 3.1.0/closed_bert.yaml - 3.1.0/closed_dlrm_dcnv2.yaml - 3.1.0/closed_gpt3.yaml - 3.1.0/closed_stable_diffusion.yaml - 3.1.0/open_resnet.yaml - Per-benchmark rules, closed submissions. - 3.1.0/open_ssd.yaml - 3.1.0/open_maskrcnn.yaml - 3.1.0/open_rnnt.yaml - 3.1.0/open_unet3d.yaml - 3.1.0/open_bert.yaml - 3.1.0/open_dlrm_dcnv2.yaml - 3.1.0/open_gpt3.yaml - 3.1.0/open_stable_diffusion.yaml + 4.1.0/common.yaml - currently the default config file, checks common fields compliance and enqueues the benchmark-specific config file + 4.1.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks + 4.1.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks + 4.1.0/closed_ssd.yaml - Per-benchmark rules, closed submissions. + 4.1.0/closed_bert.yaml + 4.1.0/closed_dlrm_dcnv2.yaml + 4.1.0/closed_gpt3.yaml + 4.1.0/closed_gnn.yaml + 4.1.0/closed_llama2_70b_lora.yaml + 4.1.0/closed_stable_diffusion.yaml + 4.1.0/open_ssd.yaml - Per-benchmark rules, open submissions. + 4.1.0/open_bert.yaml + 4.1.0/open_dlrm_dcnv2.yaml + 4.1.0/open_gpt3.yaml + 4.1.0/open_gnn.yaml + 4.1.0/open_llama2_70b_lora.yaml + 4.1.0/open_stable_diffusion.yaml
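For example, to check a single 4.1.0 training log against these configs (here `result_0.txt` is a placeholder for a real submission log file):

```
python -m mlperf_logging.compliance_checker --usage training --ruleset 4.1.0 result_0.txt
```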
### Existing config files for HPC submissions diff --git a/mlperf_logging/compliance_checker/mlp_parser/__init__.py b/mlperf_logging/compliance_checker/mlp_parser/__init__.py index 48e6bc1..6fa7b9c 100644 --- a/mlperf_logging/compliance_checker/mlp_parser/__init__.py +++ b/mlperf_logging/compliance_checker/mlp_parser/__init__.py @@ -7,6 +7,7 @@ from .ruleset_300 import parse_file as parse_file_300 from .ruleset_310 import parse_file as parse_file_310 from .ruleset_400 import parse_file as parse_file_400 +from .ruleset_410 import parse_file as parse_file_410 def parse_file(filename, ruleset='0.6.0'): @@ -28,5 +29,7 @@ def parse_file(filename, ruleset='0.6.0'): return parse_file_310(filename) elif ruleset == '4.0.0': return parse_file_400(filename) + elif ruleset == '4.1.0': + return parse_file_410(filename) else: - raise Exception(f'Ruleset "{ruleset}" is not supported') \ No newline at end of file + raise Exception(f'Ruleset "{ruleset}" is not supported')
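Since the dispatcher above routes '4.1.0' to the new parser, 4.1.0 logs can also be parsed directly from Python. A minimal sketch (`result_0.txt` is a placeholder path):

```python
from mlperf_logging.compliance_checker.mlp_parser import parse_file

# parse_file returns a list of LogLine objects plus a list of
# (line, error_msg) tuples for lines that failed to parse.
loglines, errors = parse_file('result_0.txt', ruleset='4.1.0')
print('parsed {} lines, {} errors'.format(len(loglines), len(errors)))
```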
"1.2.3" + # TODO check for weird values + args.append(ts) + + args.append(m['key']) # key + + j = { 'value': m['value'], 'metadata': m['metadata'] } + args.append(j) + + args.append(lineno) + return LogLine(*args) + + +def parse_file(filename): + ''' Reads a file by name and returns list of loglines and list of errors''' + with open(filename, encoding='latin-1') as f: + return parse_generator(f) + + +def strip_and_dedup(gen): + lines = [] + for l in gen: + if TOKEN not in l: + continue + lines.append(re.sub(".*"+TOKEN, TOKEN, l)) + return lines + + + +def parse_generator(gen): + ''' Reads a generator of lines and returns (loglines, errors) + The list of errors are any parsing issues as a tuple (str_line, error_msg) + ''' + loglines = [] + failed = [] + for lineno, line in enumerate(strip_and_dedup(gen)): + line = line.strip() + try: + ll = string_to_logline(lineno, line) + loglines.append(ll) + except ValueError as e: + failed.append((line, str(e))) + return loglines, failed + + +if __name__ == '__main__': + if len(sys.argv) != 2: + print('usage: mlp_parser.py FILENAME') + print(' tests parsing on the file.') + sys.exit(1) + + filename = sys.argv[1] + lines, errors = parse_file(filename) + + print('Parsed {} log lines with {} errors.'.format(len(lines), len(errors))) + + if len(errors) > 0: + print('Lines which failed to parse:') + for line, error in errors: + print(' Following line failed: {}'.format(error)) + print(line) + diff --git a/mlperf_logging/compliance_checker/training_4.1.0/closed_bert.yaml b/mlperf_logging/compliance_checker/training_4.1.0/closed_bert.yaml new file mode 100644 index 0000000..408f669 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.1.0/closed_bert.yaml @@ -0,0 +1,48 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_lamb_epsilon + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_training_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: num_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: start_warmup_step + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_lamb_beta_1 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_lamb_beta_2 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_lamb_weight_decay_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] >= 0.720) and v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_4.1.0/closed_common.yaml b/mlperf_logging/compliance_checker/training_4.1.0/closed_common.yaml new file mode 100755 index 0000000..e551bfe --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.1.0/closed_common.yaml @@ -0,0 +1,11 @@ + +- KEY: + NAME: submission_benchmark + REQ: EXACTLY_ONE + CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d', 'gnn','llama2_70b_lora'] " + POST: " enqueue_config('training_4.1.0/closed_{}.yaml'.format(v['value'])) " + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " diff --git a/mlperf_logging/compliance_checker/training_4.1.0/closed_dlrm_dcnv2.yaml b/mlperf_logging/compliance_checker/training_4.1.0/closed_dlrm_dcnv2.yaml new file mode 100644 index 0000000..45344bd --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.1.0/closed_dlrm_dcnv2.yaml @@ -0,0 +1,59 @@ +- KEY: + NAME: 
global_batch_size + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adagrad' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adagrad_learning_rate_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_adagrad_initial_accumulator_value + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_adagrad_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-8 " + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_learning_rate_decay_start_step + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] >= 0.80275 and v['value'] <= 1.0" + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 89137319 " diff --git a/mlperf_logging/compliance_checker/training_4.1.0/closed_gnn.yaml b/mlperf_logging/compliance_checker/training_4.1.0/closed_gnn.yaml new file mode 100644 index 0000000..2c1f728 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.1.0/closed_gnn.yaml @@ -0,0 +1,21 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0" + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adam' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0.0" + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] >= 0.72 and v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_4.1.0/closed_gpt3.yaml b/mlperf_logging/compliance_checker/training_4.1.0/closed_gpt3.yaml new file mode 100644 index 0000000..8007184 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.1.0/closed_gpt3.yaml @@ -0,0 +1,79 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 2048 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adam' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] == 2e-5 or v['value'] == 3e-5 " + +- KEY: + NAME: opt_end_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'cosine with linear warmup' " + +- KEY: + NAME: opt_adam_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adam_beta_2 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.95 " + +- KEY: + NAME: opt_adam_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-8 " + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1.0 " + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 2.69) and v['value'] > 2.6" + +- KEY: + NAME: init_checkpoint_step + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + diff --git 
a/mlperf_logging/compliance_checker/training_4.1.0/closed_llama2_70b_lora.yaml b/mlperf_logging/compliance_checker/training_4.1.0/closed_llama2_70b_lora.yaml new file mode 100755 index 0000000..3d80b91 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.1.0/closed_llama2_70b_lora.yaml @@ -0,0 +1,41 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: lora_alpha + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_training_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: lora_rank + REQ: EXACTLY_ONE + CHECK: " v['value'] == 16" + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 0.925) and v['value'] > 0.0" diff --git a/mlperf_logging/compliance_checker/training_4.1.0/closed_ssd.yaml b/mlperf_logging/compliance_checker/training_4.1.0/closed_ssd.yaml new file mode 100644 index 0000000..794ab7a --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.1.0/closed_ssd.yaml @@ -0,0 +1,35 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adam' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0.0" + +- KEY: + NAME: opt_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.0" + +- KEY: + NAME: opt_learning_rate_warmup_epochs + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0.0" + +- KEY: + NAME: opt_learning_rate_warmup_factor + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0.0" + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] >= 0.340 and v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_4.1.0/closed_stable_diffusion.yaml b/mlperf_logging/compliance_checker/training_4.1.0/closed_stable_diffusion.yaml new file mode 100644 index 0000000..3cdc3e6 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.1.0/closed_stable_diffusion.yaml @@ -0,0 +1,74 @@ +# Stable diffusion uses two metrics, FID and CLIP. +# These metrics can be calculated offline, using different scripts +# and logged separately. Therefore, we create a virtual key +# called aggregated_eval_accuracy, which aggregates +# both metrics into a single log line
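To make the aggregation concrete, here is a sketch with made-up numbers of the two offline-logged metric values and what the BEGIN block below produces from them:

```python
# Illustrative parsed eval_accuracy values (LogLine.value), invented for this example:
fid  = {'value': 23.4, 'metadata': {'metric': 'FID',  'samples_count': 512000}}
clip = {'value': 0.31, 'metadata': {'metric': 'CLIP', 'samples_count': 512000}}
# The BEGIN block merges them into one aggregated_eval_accuracy line whose value is
# {'value': {'samples_count': 512000, 'FID': 23.4, 'CLIP': 0.31}, 'metadata': {}}
```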
+ +- BEGIN: + CODE: | + from dataclasses import replace + agg_eval_lines = {} + for line in loglines: + if line.key == "eval_accuracy" and 'metric' in line.value['metadata']: + samples_count = line.value['metadata']['samples_count'] + if samples_count not in agg_eval_lines: + new_line = replace(line) # Make a copy + new_line.key = "aggregated_eval_accuracy" + new_line.full_string = "" # Not needed + new_line.lineno = -1 # Not needed + new_line.value = {'value': {'samples_count': samples_count}, 'metadata':{}} + agg_eval_lines[samples_count] = new_line + + agg_eval_lines[samples_count].timestamp = max(line.timestamp, agg_eval_lines[samples_count].timestamp) + agg_eval_lines[samples_count].value['value'][line.value['metadata']['metric']] = line.value['value'] + loglines.extend(agg_eval_lines.values()) + + +- KEY: + NAME: global_batch_size + REQ: AT_LEAST_ONE + CHECK: " v['value'] >= 0 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.999 " + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-08 " + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.01 " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0.0 " + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0 " + +- KEY: + NAME: aggregated_eval_accuracy + REQ: AT_LEAST(2) + CHECK: + - "'FID' in v['value']" + - "'CLIP' in v['value']" + - "'samples_count' in v['value']" + ATLEAST_ONE_CHECK: "(0.0 <= v['value']['FID'] <= 90.0) and (0.15 <= v['value']['CLIP'] <= 1.0)" diff --git a/mlperf_logging/compliance_checker/training_4.1.0/common.yaml b/mlperf_logging/compliance_checker/training_4.1.0/common.yaml new file mode 100755 index 0000000..f050222 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.1.0/common.yaml @@ -0,0 +1,151 @@ +# This file lists all the KEYs to be checked. Every line that matches the mlperf logging token (:::MLLOG ...) will be checked against these rules. +# In order of appearance in the log, for each line the code specified under CHECK for that line's KEY will be executed. +# The code is evaluated with local state 'v', which is the content of the value field of the log line, and global state 's'. +# Global state 's' exists to allow cross-line checks, like start/stop pairs etc. To initialize 's', use a BEGIN record, whose CODE will +# be executed before any checks. +# In addition, occurrences of each key are counted, and at the end any defined requirement on the number of occurrences will +# be confirmed. This could be implemented using global state, but since this is such a common check it is natively supported.
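+# As an illustration, a minimal line that this file would check looks like the following (made-up values; real lines carry additional fields): +# :::MLLOG {"time_ms": 1700000000000, "key": "submission_org", "value": "ExampleOrg", "metadata": {"lineno": 1}}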
+# +# KEY record: +# NAME +# REQ - optional - e.g. {EXACTLY_ONE, AT_LEAST_ONE, AT_LEAST_ONE_OR(other_key), AT_LEAST(n)} +# PRE - optional - code to be executed before CHECK +# CHECK - optional - expression to be evaluated to verify correctness +# POST - optional - code to be executed after CHECK + +- BEGIN: + CODE: > + s.update({ + 'init_started': False, + 'init_stopped' : False, + 'run_started' : False, + 'run_stopped' : False, + 'in_epoch' : False, + 'last_epoch' : 0, + 'in_block' : False, + 'block_first_epoch' : -1, + 'first_init_start': 9e99, + 'compile_time_mins': 0, + }) + +- KEY: + NAME: submission_org + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + +- KEY: + NAME: submission_platform + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + +- KEY: + NAME: submission_division + REQ: EXACTLY_ONE + CHECK: " v['value'] in ['closed', 'open'] " + POST: " enqueue_config('training_4.1.0/{}_common.yaml'.format(v['value'])); s['compile_time_mins'] = 240 if v['value'] == 'open' else 30 " + +- KEY: + NAME: submission_status + REQ: EXACTLY_ONE + CHECK: " v['value'] in ['cloud', 'onprem', 'preview', 'research'] " + +# at least one record should be found, but any found records must pass the test +- KEY: + NAME: cache_clear + REQ: AT_LEAST_ONE + CHECK: + - "'value' in v" + +# frequency not checked +- KEY: + NAME: init_start + REQ: AT_LEAST_ONE + CHECK: + - "not s['init_stopped']" + - "not s['run_started']" + POST: " s['init_started'] = True; s['first_init_start']=min(s['first_init_start'], ll.timestamp) " + +# confirm that init_stop occurs within the compile-time budget (30 min closed, 240 min open) of the very first init_start +- KEY: + NAME: init_stop + REQ: EXACTLY_ONE + CHECK: + - "s['init_started']" + - "not s['run_started']" + - "ll.timestamp - s['first_init_start'] < (s['compile_time_mins']*60*1e3)" + POST: " s['init_stopped'] = True" + +- KEY: + NAME: run_start + REQ: EXACTLY_ONE + CHECK: " ( s['init_stopped'] == True )" + POST: " s['run_started'] = True " + +# status can also be aborted, but not allowing it here for now +# if eval is inside an epoch and we decide to terminate, epoch_stop may be missing; that is ok +- KEY: + NAME: run_stop + REQ: EXACTLY_ONE + CHECK: + - "s['run_started']" + - "'status' in v['metadata']" + POST: " s['run_stopped'] = True " + +# FIXME: check epoch_count value match +- KEY: + NAME: block_start + REQ: AT_LEAST_ONE_OR(epoch_start) + CHECK: + - "s['run_started']" + - "('epoch_count' in v['metadata']) | ('samples_count' in v['metadata'])" + - "'first_epoch_num' in v['metadata'] if 'epoch_count' in v['metadata'] else True" + - "v['metadata']['epoch_count'] > 0 if 'epoch_count' in v['metadata'] else True" + - "v['metadata']['samples_count'] >= 0 if 'samples_count' in v['metadata'] else True" + +- KEY: + NAME: block_stop + REQ: AT_LEAST_ONE_OR(epoch_stop) + CHECK: + - "('first_epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: epoch_start + REQ: AT_LEAST_ONE_OR(block_start) + CHECK: + - "'epoch_num' in v['metadata']" + +- KEY: + NAME: epoch_stop + REQ: AT_LEAST_ONE_OR(block_stop) + CHECK: + - "'epoch_num' in v['metadata']" + +# making sure the previous eval printed its accuracy result +- KEY: + NAME: eval_start + REQ: AT_LEAST_ONE_OR(block_start) + CHECK: + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: eval_stop + REQ: AT_LEAST_ONE_OR(block_stop) + CHECK: + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: train_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " +
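The CHECK and POST entries above are Python fragments evaluated once per matching log line. As a mental model only — a simplified sketch, not the checker's actual implementation — the binding looks roughly like this:

```python
# Simplified sketch: applying one CHECK expression to a parsed log line.
# 'v' is the line's value field ({'value': ..., 'metadata': ...}), 's' is the
# global state dict seeded by the BEGIN record, and 'll' is the LogLine itself
# (so rules can reference e.g. ll.timestamp).
def run_check(check_expr, logline, state):
    env = {'v': logline.value, 's': state, 'll': logline}
    return bool(eval(check_expr, {}, env))
```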
diff --git a/mlperf_logging/compliance_checker/training_4.1.0/open_bert.yaml b/mlperf_logging/compliance_checker/training_4.1.0/open_bert.yaml new file mode 100644 index 0000000..14c4176 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.1.0/open_bert.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_4.1.0/open_common.yaml b/mlperf_logging/compliance_checker/training_4.1.0/open_common.yaml new file mode 100644 index 0000000..3e17477 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.1.0/open_common.yaml @@ -0,0 +1,6 @@ + +- KEY: + NAME: submission_benchmark + REQ: EXACTLY_ONE + CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d', 'gnn', 'llama2_70b_lora'] " + POST: " enqueue_config('training_4.1.0/open_{}.yaml'.format(v['value'])) " diff --git a/mlperf_logging/compliance_checker/training_4.1.0/open_dlrm_dcnv2.yaml b/mlperf_logging/compliance_checker/training_4.1.0/open_dlrm_dcnv2.yaml new file mode 100644 index 0000000..7f70c0c --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.1.0/open_dlrm_dcnv2.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] <= 1.0" diff --git a/mlperf_logging/compliance_checker/training_4.1.0/open_gnn.yaml b/mlperf_logging/compliance_checker/training_4.1.0/open_gnn.yaml new file mode 100644 index 0000000..14c4176 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.1.0/open_gnn.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_4.1.0/open_gpt3.yaml b/mlperf_logging/compliance_checker/training_4.1.0/open_gpt3.yaml new file mode 100644 index 0000000..8007184 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.1.0/open_gpt3.yaml @@ -0,0 +1,79 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 2048 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adam' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] == 2e-5 or v['value'] == 3e-5 " + +- KEY: + NAME: opt_end_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'cosine with linear warmup' " + +- KEY: + NAME: opt_adam_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adam_beta_2 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.95 " + +- KEY: + NAME: opt_adam_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-8 " + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1.0 " + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 2.69) and v['value'] > 2.6" + +- KEY: + NAME: init_checkpoint_step + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + diff --git a/mlperf_logging/compliance_checker/training_4.1.0/open_llama2_70b_lora.yaml b/mlperf_logging/compliance_checker/training_4.1.0/open_llama2_70b_lora.yaml new file mode 100755 index 0000000..14c4176 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.1.0/open_llama2_70b_lora.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_4.1.0/open_ssd.yaml b/mlperf_logging/compliance_checker/training_4.1.0/open_ssd.yaml new file mode 100644 index 0000000..14c4176 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.1.0/open_ssd.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_4.1.0/open_stable_diffusion.yaml b/mlperf_logging/compliance_checker/training_4.1.0/open_stable_diffusion.yaml new file mode 100644 index 0000000..fe25e31 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.1.0/open_stable_diffusion.yaml @@ -0,0 +1,33 @@ +# Stable diffusion uses two metrics, FID and CLIP. +# These metrics can be calculated offline, using different scripts +# and logged separately. Therefore, we create a virtual key +# called aggregated_eval_accuracy, which aggregates +# both metrics into a single log line + +- BEGIN: + CODE: | + from dataclasses import replace + agg_eval_lines = {} + for line in loglines: + if line.key == "eval_accuracy" and 'metric' in line.value['metadata']: + samples_count = line.value['metadata']['samples_count'] + if samples_count not in agg_eval_lines: + new_line = replace(line) # Make a copy + new_line.key = "aggregated_eval_accuracy" + new_line.full_string = "" # Not needed + new_line.lineno = -1 # Not needed + new_line.value = {'value': {'samples_count': samples_count}, 'metadata':{}} + agg_eval_lines[samples_count] = new_line + + agg_eval_lines[samples_count].timestamp = max(line.timestamp, agg_eval_lines[samples_count].timestamp) + agg_eval_lines[samples_count].value['value'][line.value['metadata']['metric']] = line.value['value'] + loglines.extend(agg_eval_lines.values()) + +- KEY: + NAME: aggregated_eval_accuracy + REQ: AT_LEAST(2) + CHECK: + - "'FID' in v['value']" + - "'CLIP' in v['value']" + - "'samples_count' in v['value']" + ATLEAST_ONE_CHECK: "v['value']['FID'] >= 0.0 and v['value']['CLIP'] <= 1.0" diff --git a/mlperf_logging/mllog/examples/power/compute_metric_example.py b/mlperf_logging/mllog/examples/power/compute_metric_example.py index add3627..a882593 100644 --- a/mlperf_logging/mllog/examples/power/compute_metric_example.py +++ b/mlperf_logging/mllog/examples/power/compute_metric_example.py @@ -6,7 +6,7 @@ def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--input-log", type=str, default=None) parser.add_argument("--hardware-type", type=str, choices=["node", "sw"], default="node") - parser.add_argument("--ruleset", type=str, choices=["0.6.0", "0.7.0", "1.0.0", "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0"], default="4.0.0") + parser.add_argument("--ruleset", type=str, choices=["0.6.0", "0.7.0", "1.0.0", "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0", "4.1.0"], default="4.1.0") args = parser.parse_args() return args
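A typical invocation of the power example with the new default might look like this (a sketch: `power_result.txt` is a placeholder log, and the script is assumed to be run as a module):

```
python3 -m mlperf_logging.mllog.examples.power.compute_metric_example --input-log power_result.txt --hardware-type node --ruleset 4.1.0
```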
diff --git a/mlperf_logging/package_checker/README.md b/mlperf_logging/package_checker/README.md index 6c7422c..6056238 100644 --- a/mlperf_logging/package_checker/README.md +++ b/mlperf_logging/package_checker/README.md @@ -10,7 +10,7 @@ To check an organization's submission package for compliance: python3 -m mlperf_logging.package_checker FOLDER USAGE RULESET ``` -Currently, USAGE in ["training"] and RULESET in ["0.6.0", "0.7.0", "1.0.0", "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0"] are supported. +Currently, USAGE in ["training"] and RULESET in ["0.6.0", "0.7.0", "1.0.0", "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0", "4.1.0"] are supported. The package checker checks: 1. The number of result files for each benchmark matches the required count. If diff --git a/mlperf_logging/rcp_checker/README.md b/mlperf_logging/rcp_checker/README.md index 17af28e..79dcdf2 100644 --- a/mlperf_logging/rcp_checker/README.md +++ b/mlperf_logging/rcp_checker/README.md @@ -8,10 +8,10 @@ Run Reference Convergence Point checks for a submission directory. This consists of testing whether a submission does not converge statistically faster than the reference. -For training, RCPs are loaded from directory mlperf_logging/rcp_checker/training_4.0.0/*.json +For training, RCPs are loaded from directory mlperf_logging/rcp_checker/training_4.1.0/*.json The RCP checker supports only the 1.0.0 version onwards. -The current training version is 4.0.0. +The current training version is 4.1.0. ## Usage diff --git a/mlperf_logging/repo_checker/README.md b/mlperf_logging/repo_checker/README.md index c4f0fe5..0c4e495 100644 --- a/mlperf_logging/repo_checker/README.md +++ b/mlperf_logging/repo_checker/README.md @@ -12,7 +12,7 @@ review process. python3 -m mlperf_logging.repo_checker FOLDER USAGE RULESET ``` -Currently, USAGE in ["training", "hpc"] and RULESETS 2.0.0, 2.1.0, 3.0.0, 3.1.0 and 4.0.0 are supported. +Currently, USAGE in ["training", "hpc"] and RULESETS 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0 and 4.1.0 are supported. The repo checker checks: 1. Whether the repo contains filenames that github does not like, e.g. files with spaces, diff --git a/mlperf_logging/result_summarizer/config.yaml b/mlperf_logging/result_summarizer/config.yaml index 7b873ff..4ad71db 100644 --- a/mlperf_logging/result_summarizer/config.yaml +++ b/mlperf_logging/result_summarizer/config.yaml @@ -75,6 +75,15 @@ columns: llama2_70b_lora: ["Benchmark results (minutes)", "LLM-Finetune", "SCROLSS Gov Report", "LLama2-70B-LoRA"] gnn: ["Benchmark results (minutes)", "Graph node classification", "IGBH-Full", "R-GAT"] default: [" ", " ", " "] + "4.1.0": + bert: ["Benchmark results (minutes)", "NLP", "Wikipedia", "BERT"] + gpt3: ["Benchmark results (minutes)", "LLM", "C4", "GPT3"] + dlrm_dcnv2: ["Benchmark results (minutes)", "Recommendation", "1TB Multihot Clickthrough", "DLRM DCNv2"] + ssd: ["Benchmark results (minutes)", "Object detection, light-weight", "OpenImages", "RetinaNet"] + stable_diffusion: ["Benchmark results (minutes)", "Text to image", "Laion 400m and Coco-2017", "StableDiffusion"] + llama2_70b_lora: ["Benchmark results (minutes)", "LLM-Finetune", "SCROLSS Gov Report", "LLama2-70B-LoRA"] + gnn: ["Benchmark results (minutes)", "Graph node classification", "IGBH-Full", "R-GAT"] + default: [" ", " ", " "] hpc: "2.0.0":