Add training 2.1 ruleset (#263)
pgmpablo157321 authored Sep 28, 2022
1 parent ae5ae9a commit d12211b
Showing 40 changed files with 2,863 additions and 37 deletions.
10 changes: 10 additions & 0 deletions mlperf_logging/benchmark_meta.py
@@ -74,6 +74,16 @@
'rnnt',
'unet3d',
],
'2.1': [
'bert',
'dlrm',
'maskrcnn',
'minigo',
'resnet',
'ssd',
'rnnt',
'unet3d',
],
},

'hpc': {
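
The dictionary above maps a usage ('training', 'hpc') and a ruleset version to the benchmarks accepted for that round. Below is a minimal sketch of a lookup over that structure; `ALLOWED_BENCHMARKS` and `benchmarks_for` are hypothetical names used only for illustration and are not part of benchmark_meta.py:

```python
# Hypothetical, trimmed-down copy of the nested mapping shown in the diff above.
ALLOWED_BENCHMARKS = {
    'training': {
        '2.1': ['bert', 'dlrm', 'maskrcnn', 'minigo',
                'resnet', 'ssd', 'rnnt', 'unet3d'],
    },
}

def benchmarks_for(usage, ruleset):
    """Return the benchmark list for a usage/ruleset pair; raises KeyError if unknown."""
    return ALLOWED_BENCHMARKS[usage][ruleset]

print(benchmarks_for('training', '2.1'))  # ['bert', 'dlrm', ..., 'unet3d']
```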
42 changes: 21 additions & 21 deletions mlperf_logging/compliance_checker/README.md
@@ -10,9 +10,9 @@ To check a log file for compliance:

python -m mlperf_logging.compliance_checker [--config YAML] [--usage training/hpc] [--ruleset MLPERF_EDITION] FILENAME

By default, 2.0.0 training edition rules are used and the default config is set to `2.0.0/common.yaml`.
By default, 2.1.0 training edition rules are used and the default config is set to `2.1.0/common.yaml`.
This config will check all common keys and enqueue benchmark specific config to be checked as well.
Old training editions still supported are 1.1.0, 1.0.0, 0.7.0 and 0.6.0
Old training editions still supported are 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0

To check hpc compliance rules (only 1.0.0 ruleset is supported), set --usage hpc --ruleset 1.0.0.
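
For example, to validate a single training result log against the new 2.1.0 rules (the log file name below is illustrative):

    python -m mlperf_logging.compliance_checker --usage training --ruleset 2.1.0 result_0.txt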

Expand All @@ -22,25 +22,25 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_

### Existing config files for training submissions

2.0.0/common.yaml - currently the default config file, checks common fields compliance and enqueues benchmark-specific config file
2.0.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks
2.0.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks
2.0.0/closed_resnet.yaml - Per-benchmark rules, closed submissions.
2.0.0/closed_ssd.yaml
2.0.0/closed_minigo.yaml
2.0.0/closed_maskrcnn.yaml
2.0.0/closed_rnnt.yaml
2.0.0/closed_unet3d.yaml
2.0.0/closed_bert.yaml
2.0.0/closed_dlrm.yaml
2.0.0/open_resnet.yaml - Per-benchmark rules, open submissions.
2.0.0/open_ssd.yaml
2.0.0/open__minigo.yaml
2.0.0/open_maskrcnn.yaml
2.0.0/open_rnnt.yaml
2.0.0/open_unet3d.yaml
2.0.0/open_bert.yaml
2.0.0/open_dlrm.yaml
2.1.0/common.yaml - currently the default config file, checks common fields compliance and enqueues benchmark-specific config file
2.1.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks
2.1.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks
2.1.0/closed_resnet.yaml - Per-benchmark rules, closed submissions.
2.1.0/closed_ssd.yaml
2.1.0/closed_minigo.yaml
2.1.0/closed_maskrcnn.yaml
2.1.0/closed_rnnt.yaml
2.1.0/closed_unet3d.yaml
2.1.0/closed_bert.yaml
2.1.0/closed_dlrm.yaml
2.1.0/open_resnet.yaml - Per-benchmark rules, open submissions.
2.1.0/open_ssd.yaml
2.1.0/open__minigo.yaml
2.1.0/open_maskrcnn.yaml
2.1.0/open_rnnt.yaml
2.1.0/open_unet3d.yaml
2.1.0/open_bert.yaml
2.1.0/open_dlrm.yaml

### Existing config files for HPC submissions

2 changes: 1 addition & 1 deletion mlperf_logging/compliance_checker/mlp_compliance.py
@@ -304,7 +304,7 @@ def get_parser():
parser.add_argument('--usage', type=str, default='training',
choices=usage_choices(),
help='what WG do the benchmarks come from')
parser.add_argument('--ruleset', type=str, default='2.0.0',
parser.add_argument('--ruleset', type=str, default='2.1.0',
choices=rule_choices(),
help='what version of rules to check the log against')
parser.add_argument('--config', type=str,
7 changes: 5 additions & 2 deletions mlperf_logging/compliance_checker/mlp_parser/__init__.py
@@ -1,8 +1,9 @@
from .ruleset_060 import parse_file as parse_file_060
from .ruleset_070 import parse_file as parse_file_070
from .ruleset_100 import parse_file as parse_file_100
from .ruleset_100 import parse_file as parse_file_110
from .ruleset_100 import parse_file as parse_file_200
from .ruleset_110 import parse_file as parse_file_110
from .ruleset_200 import parse_file as parse_file_200
from .ruleset_210 import parse_file as parse_file_210


def parse_file(filename, ruleset='0.6.0'):
@@ -16,5 +17,7 @@ def parse_file(filename, ruleset='0.6.0'):
return parse_file_110(filename)
elif ruleset == '2.0.0':
return parse_file_200(filename)
elif ruleset == '2.1.0':
return parse_file_210(filename)
else:
raise Exception(f'Ruleset "{ruleset}" is not supported')
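
A small usage sketch of the dispatcher above; `result_0.txt` is an illustrative file name, and the return value mirrors the per-ruleset parsers: a list of parsed log lines plus a list of lines that failed to parse.

```python
from mlperf_logging.compliance_checker.mlp_parser import parse_file

# Dispatches to ruleset_210.parse_file because ruleset is '2.1.0'.
loglines, errors = parse_file('result_0.txt', ruleset='2.1.0')
print(f'parsed {len(loglines)} log lines, {len(errors)} failures')
```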
104 changes: 104 additions & 0 deletions mlperf_logging/compliance_checker/mlp_parser/ruleset_210.py
@@ -0,0 +1,104 @@
'''
Parses a text MLPerf log into a structured format.
'''

from __future__ import print_function

import collections
import json
import re
import sys

from io import open

LogLine = collections.namedtuple('LogLine', [
'full_string', # the complete line as a string
'timestamp', # seconds as a float, e.g. 1234.567
'key', # the string key
'value', # the parsed value associated with the tag, or None if no value
'lineno', # the line number in the file
])


TOKEN = ':::MLLOG '


def parse_line(line):
if not line.startswith(TOKEN):
return None

return json.loads(line[len(TOKEN):])


def string_to_logline(lineno, string):
''' Returns a LogLine or raises a ValueError '''
m = parse_line(string)

if m is None:
raise ValueError('does not match regex')

args = []
args.append(string) # full string

ts = float(m['time_ms']) # may raise error, e.g. "1.2.3"
# TODO check for weird values
args.append(ts)

args.append(m['key']) # key

j = { 'value': m['value'], 'metadata': m['metadata'] }
args.append(j)

args.append(lineno)
return LogLine(*args)


def parse_file(filename):
''' Reads a file by name and returns list of loglines and list of errors'''
with open(filename, encoding='latin-1') as f:
return parse_generator(f)


def strip_and_dedup(gen):
lines = []
for l in gen:
if TOKEN not in l:
continue
lines.append(re.sub(".*"+TOKEN, TOKEN, l))
return lines



def parse_generator(gen):
''' Reads a generator of lines and returns (loglines, errors)
The list of errors are any parsing issues as a tuple (str_line, error_msg)
'''
loglines = []
failed = []
for lineno, line in enumerate(strip_and_dedup(gen)):
line = line.strip()
try:
ll = string_to_logline(lineno, line)
loglines.append(ll)
except ValueError as e:
failed.append((line, str(e)))
return loglines, failed


if __name__ == '__main__':
if len(sys.argv) != 2:
print('usage: mlp_parser.py FILENAME')
print(' tests parsing on the file.')
sys.exit(1)

filename = sys.argv[1]
lines, errors = parse_file(filename)

print('Parsed {} log lines with {} errors.'.format(len(lines), len(errors)))

if len(errors) > 0:
print('Lines which failed to parse:')
for line, error in errors:
print(' Following line failed: {}'.format(error))
print(line)
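
A quick, self-contained check of the parser above on in-memory lines; the `:::MLLOG` payload is a made-up example that only follows the JSON fields (`time_ms`, `key`, `value`, `metadata`) expected by `string_to_logline`:

```python
from mlperf_logging.compliance_checker.mlp_parser import ruleset_210

sample = [
    # A well-formed MLLOG record (contents are illustrative) ...
    ':::MLLOG {"time_ms": 1664400000000, "key": "submission_benchmark", '
    '"value": "bert", "metadata": {"lineno": 1}}',
    # ... and a stray stdout line, which strip_and_dedup silently drops.
    'some unrelated stdout noise',
]

loglines, errors = ruleset_210.parse_generator(sample)
print(loglines[0].key)    # submission_benchmark
print(loglines[0].value)  # {'value': 'bert', 'metadata': {'lineno': 1}}
print(len(errors))        # 0
```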

