Skip to content

Commit

Permalink
Merge pull request #55 from xyhuang/0.7-branch
Browse files Browse the repository at this point in the history
0.7 branch
  • Loading branch information
xyhuang authored Jun 22, 2020
2 parents 29f9ca5 + 01cd340 commit 4f4f4ea
Show file tree
Hide file tree
Showing 25 changed files with 144 additions and 43 deletions.
6 changes: 6 additions & 0 deletions mlperf_logging/compliance_checker/0.7.0/closed_common.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@

# Closed-division dispatch: when the submission_benchmark key is logged,
# enqueue the matching closed-division benchmark rules file.
- KEY:
    NAME: submission_benchmark
    REQ: EXACTLY_ONE
    CHECK: " v['value'] in ['resnet', 'ssd', 'maskrcnn', 'transformer', 'gnmt', 'minigo', 'dlrm', 'bert'] "
    POST: " enqueue_config('0.7.0/closed_{}.yaml'.format(v['value'])) "
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
REQ: AT_LEAST_ONE
CHECK:
- "'epoch_num' in v['metadata']"
ATLEAST_ONE_CHECK: "v['value'] >= 0.55 and v['value'] < 1.0"
ATLEAST_ONE_CHECK: "v['value'] >= 0.5 and v['value'] < 1.0"

- KEY:
NAME: block_start
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
NAME: opt_name
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['sgd', 'lars'] "
POST: " enqueue_config('0.7.0/resnet_{}.yaml'.format(v['value'])) "
POST: " enqueue_config('0.7.0/closed_resnet_{}.yaml'.format(v['value'])) "

- KEY:
NAME: eval_accuracy
Expand Down
2 changes: 1 addition & 1 deletion mlperf_logging/compliance_checker/0.7.0/common.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
NAME: submission_division
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['closed', 'open'] "
POST: " enqueue_config('0.7.0/{}_common.yaml'.format(v['value'])) "

- KEY:
NAME: submission_status
Expand All @@ -51,7 +52,6 @@
NAME: submission_benchmark
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['resnet', 'ssd', 'maskrcnn', 'transformer', 'gnmt', 'minigo', 'dlrm', 'bert'] "
POST: " enqueue_config('0.7.0/{}.yaml'.format(v['value'])) "

# at least one record should be found, but any found records must pass the test
- KEY:
Expand Down
7 changes: 7 additions & 0 deletions mlperf_logging/compliance_checker/0.7.0/open_bert.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

# Open-division BERT rule: eval_accuracy records must reference an
# epoch_num, and at least one value must land in [0.712, 1.0).
- KEY:
    NAME: eval_accuracy
    REQ: AT_LEAST_ONE
    CHECK:
        - "'epoch_num' in v['metadata']"
    ATLEAST_ONE_CHECK: "v['value'] >= 0.712 and v['value'] < 1.0"
6 changes: 6 additions & 0 deletions mlperf_logging/compliance_checker/0.7.0/open_common.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@

# Open-division dispatch: when the submission_benchmark key is logged,
# enqueue the matching open-division benchmark rules file.
- KEY:
    NAME: submission_benchmark
    REQ: EXACTLY_ONE
    CHECK: " v['value'] in ['resnet', 'ssd', 'maskrcnn', 'transformer', 'gnmt', 'minigo', 'dlrm', 'bert'] "
    POST: " enqueue_config('0.7.0/open_{}.yaml'.format(v['value'])) "
7 changes: 7 additions & 0 deletions mlperf_logging/compliance_checker/0.7.0/open_dlrm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

# Open-division DLRM rule: eval_accuracy records must reference an
# epoch_num, and at least one value must land in [0.8025, 1.0).
- KEY:
    NAME: eval_accuracy
    REQ: AT_LEAST_ONE
    CHECK:
        - "'epoch_num' in v['metadata']"
    ATLEAST_ONE_CHECK: "v['value'] >= 0.8025 and v['value'] < 1.0"
8 changes: 8 additions & 0 deletions mlperf_logging/compliance_checker/0.7.0/open_gnmt.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@

# Open-division GNMT rule: eval_accuracy records must reference an
# epoch_num, and at least one value must land in [0.240, 1).
- KEY:
    NAME: eval_accuracy
    REQ: AT_LEAST_ONE
    CHECK:
        - "'epoch_num' in v['metadata']"
    ATLEAST_ONE_CHECK: "v['value'] >= 0.240 and v['value'] < 1"

12 changes: 12 additions & 0 deletions mlperf_logging/compliance_checker/0.7.0/open_maskrcnn.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

# Open-division Mask R-CNN rule: eval_accuracy values carry both BBOX and
# SEGM components plus an epoch_num; at least one record must satisfy
# BBOX in [0.377, 1.0) and SEGM in [0.339, 1.0). The POST hook flags that
# an accuracy record was seen for later checks.
- KEY:
    NAME: eval_accuracy
    REQ: AT_LEAST_ONE
    CHECK:
        - "'BBOX' in v['value']"
        - "'SEGM' in v['value']"
        - "'epoch_num' in v['metadata']"
    POST: " s['accuracy_printed'] = True "
    ATLEAST_ONE_CHECK: "v['value']['BBOX'] >= 0.377 and v['value']['BBOX'] < 1.0 and v['value']['SEGM'] >= 0.339 and v['value']['SEGM'] < 1.0"


9 changes: 9 additions & 0 deletions mlperf_logging/compliance_checker/0.7.0/open_minigo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@

# Open-division MiniGo rule: eval_accuracy records must reference an
# epoch_num, and at least one value must land in [0.5, 1.0).
- KEY:
    NAME: eval_accuracy
    REQ: AT_LEAST_ONE
    CHECK:
        - "'epoch_num' in v['metadata']"
    ATLEAST_ONE_CHECK: "v['value'] >= 0.5 and v['value'] < 1.0"


7 changes: 7 additions & 0 deletions mlperf_logging/compliance_checker/0.7.0/open_resnet.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

# Open-division ResNet rule: eval_accuracy records must reference an
# epoch_num, and at least one value must land in [0.7590, 1.0).
- KEY:
    NAME: eval_accuracy
    REQ: AT_LEAST_ONE
    CHECK:
        - "'epoch_num' in v['metadata']"
    ATLEAST_ONE_CHECK: "v['value'] >= 0.7590 and v['value'] < 1.0"
7 changes: 7 additions & 0 deletions mlperf_logging/compliance_checker/0.7.0/open_ssd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

# Open-division SSD rule: eval_accuracy records must reference an
# epoch_num, and at least one value must land in [0.230, 1.0).
- KEY:
    NAME: eval_accuracy
    REQ: AT_LEAST_ONE
    CHECK:
        - "'epoch_num' in v['metadata']"
    ATLEAST_ONE_CHECK: "v['value'] >= 0.230 and v['value'] < 1.0"
7 changes: 7 additions & 0 deletions mlperf_logging/compliance_checker/0.7.0/open_transformer.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

# Open-division Transformer rule: eval_accuracy records must reference an
# epoch_num, and at least one value must land in [0.2500, 1).
- KEY:
    NAME: eval_accuracy
    REQ: AT_LEAST_ONE
    CHECK:
        - "'epoch_num' in v['metadata']"
    ATLEAST_ONE_CHECK: "v['value'] >= 0.2500 and v['value'] < 1"
18 changes: 0 additions & 18 deletions mlperf_logging/compliance_checker/0.7.0_warn/minigo.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,6 @@
- KEY:
NAME: save_model
REQ: AT_LEAST_ONE
CHECK:
- "s['in_epoch']"
- "v['value']['iteration'] == s['last_epoch']"
- "not s['model_saved']"
POST: " s['model_saved'] = True ; s['save_model_ts'].append(ll.timestamp) "


- KEY:
NAME: global_batch_size
Expand All @@ -24,11 +18,6 @@
REQ: EXACTLY_ONE
CHECK: " len(v['value']) > 0"

- KEY:
NAME: opt_base_learning_rate
REQ: EXACTLY_ONE
CHECK: " len(v['value']) > 0 "

- KEY:
NAME: opt_weight_decay
REQ: EXACTLY_ONE
Expand Down Expand Up @@ -94,10 +83,3 @@
REQ: EXACTLY_ONE
CHECK: " v['value'] > 0 "

# block_start/stop keys are not required in minigo, so re-define without REQ
- KEY:
NAME: block_start

- KEY:
NAME: block_stop

13 changes: 7 additions & 6 deletions mlperf_logging/compliance_checker/mlp_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,13 @@ def overwrite_messages(self, keys):

def log_messages(self):
    """Print every accumulated message, separated by a divider line.

    Joins warnings, overwritable errors and non-overwritable errors into
    one report. Prints nothing when no messages have accumulated, so a
    clean run does not emit a spurious blank line.
    """
    message_separator = '\n' + '-' * 30 + '\n'
    message = message_separator.join([
        *self.warnings.values(),
        *self.overwritable.values(),
        *self.not_overwritable
    ])
    # Guard the print: an unconditional print(message) would output an
    # empty line even when there is nothing to report.
    if message:
        print(message)

def has_messages(self):
    """Return a truthy value when any error messages are pending.

    Mirrors ``not_overwritable or overwritable``: yields the
    non-overwritable list when it is non-empty, otherwise the
    overwritable collection (which may itself be empty/falsy).
    Warnings alone do not count as pending messages.
    """
    hard_errors = self.not_overwritable
    if hard_errors:
        return hard_errors
    return self.overwritable
Expand Down
14 changes: 9 additions & 5 deletions mlperf_logging/package_checker/package_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,7 @@ def check_training_result_files(folder, ruleset, quiet, werror):
ruleset: The ruleset such as 0.6.0 or 0.7.0.
"""

errors_found = 0

too_many_errors = False
result_folder = os.path.join(folder, 'results')
for system_folder in _get_sub_folders(result_folder):
for benchmark_folder in _get_sub_folders(system_folder):
Expand Down Expand Up @@ -88,6 +87,7 @@ def check_training_result_files(folder, ruleset, quiet, werror):
_EXPECTED_RESULT_FILE_COUNTS[benchmark],
len(result_files)))

errors_found = 0
result_files.sort()
for result_file in result_files:
result_basename = os.path.basename(result_file)
Expand All @@ -107,11 +107,15 @@ def check_training_result_files(folder, ruleset, quiet, werror):
valid, _, _, _ = mlp_compliance.main(result_file, config_file, checker)
if not valid:
errors_found += 1

if errors_found == 1:
print('WARNING: One file does not comply.')
print('WARNING: Allowing this failure under olympic scoring rules.')
if errors_found > 1:
too_many_errors = True

_print_divider_bar()
if errors_found > 0:
raise Exception('Found errors in logging, see log above for details.')
if too_many_errors:
raise Exception('Found too many errors in logging, see log above for details.')


def check_training_package(folder, ruleset, quiet, werror):
Expand Down
60 changes: 49 additions & 11 deletions mlperf_logging/result_summarizer/result_summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
import re
import sys

from ..compliance_checker import mlp_compliance

_ALLOWED_BENCHMARKS_V06 = [
'resnet',
'ssd',
Expand Down Expand Up @@ -106,17 +108,27 @@ def _code_url(system_desc, ruleset):


def _row_key(system_desc):
system_name = system_desc['system_name']
system_name = '{}-{}'.format(system_desc['system_name'], system_desc['framework'])
if system_name == 'tpu-v3':
chips = int(system_desc['accelerators_per_node']) * 2
return 'tpu-v3-{:04d}'.format(chips)
return system_name


def _read_mlperf_score(result_file):
def _read_mlperf_score(result_file, ruleset):
with open(result_file, 'r') as f:
result = f.read()

config_file = '{ruleset}/common.yaml'.format(ruleset=ruleset)
checker = mlp_compliance.make_checker(
ruleset=ruleset,
quiet=True,
werror=False)
valid, _, _, _ = mlp_compliance.main(result_file, config_file, checker)

if not valid:
return None

run_start = re.search(_RUN_START_REGEX, result)
if run_start is None:
raise Exception('Failed to match run_start!.')
Expand All @@ -129,11 +141,25 @@ def _read_mlperf_score(result_file):
return minutes


def _compute_olympic_average(scores):
copied_scores = copy.deepcopy(scores)
copied_scores.sort()
copied_scores = copied_scores[1:-1]
return sum(copied_scores) / len(copied_scores)
def _compute_olympic_average(scores, dropped_scores):
"""There are two possible cases we might handle:
If dropped_scores == 0, then we compute a normal olympiic score.
If dropped_scores > 0, then the maximum was already dropped
(and does not appear in scores) because it did not converge.
"""
sum_of_scores = sum(scores)
count = len(scores)

# Subtract off the min
sum_of_scores -= min(scores)
count -= 1

# Subtract off the max, only if the max was not already dropped
if dropped_scores == 0:
sum_of_scores -= max(scores)
count -= 1

return sum_of_scores * 1.0 / count


def _is_organization_folder(folder):
Expand Down Expand Up @@ -216,11 +242,23 @@ def summarize_results(folder, ruleset):
pattern = '{folder}/result_*.txt'.format(folder=benchmark_folder)
result_files = glob.glob(pattern, recursive=True)
scores = []
dropped_scores = 0
for result_file in result_files:
score = _read_mlperf_score(result_file)
scores.append(score)

benchmark_scores[benchmark] = _compute_olympic_average(scores)
score = _read_mlperf_score(result_file, ruleset)
if score is None:
dropped_scores += 1
else:
scores.append(score)
if dropped_scores > 1:
print('CRITICAL ERROR: Too many non-converging runs for {} {}/{}'.
format(desc['submitter'], system, benchmark))
print('** CRITICAL ERROR ** Results in the table for {} {}/{} are NOT correct'.
format(desc['submitter'], system, benchmark))
if dropped_scores == 1:
print('NOTICE: Dropping non-converged run for {} {}/{} using olympic scoring.'
.format(desc['submitter'], system, benchmark))

benchmark_scores[benchmark] = _compute_olympic_average(scores, dropped_scores)

# Construct scores portion of the row.
if ruleset == '0.6.0':
Expand Down

0 comments on commit 4f4f4ea

Please sign in to comment.