Commit

Add scorer and sample data
johann-petrak committed May 1, 2024
1 parent ae7e780 commit 6cf93ca
Showing 6 changed files with 1,318 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
scores.json
25 changes: 24 additions & 1 deletion README.md
@@ -1 +1,24 @@
# GermEval2024-GerMS

This repo contains:

* The public web page sources for the [GermEval2024 GerMS-Detect Shared Task](https://ofai.github.io/GermEval2024-GerMS/)
* Code and other information related to the shared task


## Scorer

The source code for the scorer used in the shared task is in
[python/scoring.py](python/scoring.py).

The scorer can be tested with the targets and sample submissions for subtasks 1 and 2:

```
# run the scorer for the subtask 1 sample submission
python python/scoring.py --st 1 --reference-dir test/trial/targets/ --submission-dir test/trial/submission-st1-01/
# run the scorer for the subtask 2 sample submission
python python/scoring.py --st 2 --reference-dir test/trial/targets/ --submission-dir test/trial/submission-st2-01/
```
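The scorer expects the submission directory to contain exactly one `.tsv` file whose header row has the columns listed in `ST1_COLUMNS` or `ST2_COLUMNS` in `python/scoring.py`; the resulting `scores.json` is written to the directory given by `--score-dir`. As a rough sketch (the file name, ids, and prediction values below are made up for illustration, a real submission must use the ids from the corresponding targets), a minimal subtask 1 submission file could be generated like this:

```
import csv

# hypothetical ids, for illustration only
ids = ["0001", "0002"]
columns = ["id", "bin_maj", "bin_one", "bin_all", "multi_maj", "disagree_bin"]

# write the predictions as a single TSV file; place it (alone) in the directory
# that is passed to the scorer via --submission-dir
with open("predictions.tsv", "w", newline="") as outfp:
    writer = csv.DictWriter(outfp, fieldnames=columns, delimiter="\t")
    writer.writeheader()
    for theid in ids:
        # constant dummy predictions, just to illustrate the expected format and label values
        writer.writerow({
            "id": theid,
            "bin_maj": "0",
            "bin_one": "0",
            "bin_all": "0",
            "multi_maj": "0-Kein",
            "disagree_bin": "0",
        })
```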


290 changes: 290 additions & 0 deletions python/scoring.py
@@ -0,0 +1,290 @@
#!/usr/bin/env python

# This file is licensed under the Apache 2.0 License terms, see
# https://www.apache.org/licenses/LICENSE-2.0


# see https://github.com/codalab/codabench/wiki/Competition-Bundle-Structure#scoring-program

import sys
import os
import json
import csv
from collections import defaultdict
import numpy as np
import argparse
from sklearn.metrics import accuracy_score, f1_score
from scipy.spatial import distance

GLOBALS = dict(debug=False)
EPS=0.001 # small value to allow for rounding errors when checking for a valid distribution that sums to 1.0

MULT_LABELS = ["0-Kein", "1-Gering", "2-Vorhanden", "3-Stark", "4-Extrem"]

ST1_COLUMNS = ['id', 'bin_maj', 'bin_one', 'bin_all', 'multi_maj', 'disagree_bin']
ST2_COLUMNS = ['id', 'dist_bin_0', 'dist_bin_1', 'dist_multi_0', 'dist_multi_1', 'dist_multi_2', 'dist_multi_3', 'dist_multi_4']
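# Note: each group of dist_* columns in a subtask 2 submission is a per-item probability distribution
# (dist_bin_* over the two binary labels, dist_multi_* over the five multi-class labels) and must sum to 1.0,
# which is checked by check_dist() below.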

def load_targets(targets_file):
    with open(targets_file, "rt") as infp:
        targets = json.load(infp)
    return targets

def check_columns(data, columns):
    """Check that the expected columns, and only the expected columns, are present in the data;
    if not, raise a ValueError. Otherwise return to the caller."""
    for column in columns:
        if column not in data:
            raise ValueError(f"Column {column} not found in data, got {data.keys()}")
    for column in data.keys():
        if column not in columns:
            raise ValueError(f"Column {column} not expected in data, expected {columns}")

def check_allowed(data, column, allowed=["0", "1"]):
    """Check that all predictions in the given column are in the allowed set;
    if not, raise a ValueError that also reports the offending id. Otherwise return to the caller."""
    if column not in data:
        raise ValueError(f"Column {column} not found in data")
    for i, value in enumerate(data[column]):
        if value not in allowed:
            raise ValueError(f"Invalid value {value} not one of {allowed} in column {column} at index {i} with id {data['id'][i]}")
    print(f"Column {column} is OK")

def check_dist(data, columns):
    """Check that the predicted values are valid probabilities and that each row sums to 1.0;
    if not, raise a ValueError that also reports the offending id. Otherwise return to the caller."""
    for column in columns:
        if column not in data:
            raise ValueError(f"Column {column} not found in data")
    for i in range(len(data["id"])):
        total = 0.0
        theid = data["id"][i]
        for column in columns:
            try:
                value = float(data[column][i])
            except ValueError:
                raise ValueError(f"Invalid value {data[column][i]} not a float in column {column} at index {i} with id {theid}")
            if value < 0.0 or value > 1.0:
                raise ValueError(f"Invalid value {value} not in range [0.0, 1.0] in column {column} at index {i} with id {theid}")
            total += value
        if abs(total - 1.0) > EPS:
            raise ValueError(f"Values in columns {columns} do not sum to 1.0 at index {i} with id {theid}")

def load_tsv(submission_dir, expected_rows, expected_cols):
    """
    Try to load a TSV file from the submission directory. This expects a single TSV file to be present in the submission directory.
    If there is no TSV file or there are multiple TSV files, log an error to stderr and return None.
    """
    tsv_files = [f for f in os.listdir(submission_dir) if f.endswith('.tsv')]
    if len(tsv_files) == 0:
        print("No TSV file ending with '.tsv' found in submission directory", file=sys.stderr)
        return None
    if len(tsv_files) > 1:
        print("Multiple TSV files found in submission directory", file=sys.stderr)
        return None
    tsv_file = tsv_files[0]
    tsv_path = os.path.join(submission_dir, tsv_file)
    print("Loading TSV file", tsv_path)
    # Read the TSV file row by row and build a dictionary where the key is the column name and the
    # value is the list of values for that column. Expect the column names in the first row of the TSV file.
    # Abort reading and log an error to stderr if the column names are not exactly the expected ones
    # or if there are more rows than expected.
    data = defaultdict(list)
    with open(tsv_path, 'rt') as infp:
        reader = csv.DictReader(infp, delimiter='\t')
        for i, row in enumerate(reader):
            if i == 0:
                if set(reader.fieldnames) != set(expected_cols):
                    gotcols = ", ".join(list(set(reader.fieldnames)))
                    print(f"Invalid column names in TSV file, expected:\n {', '.join(expected_cols)}\ngot\n {gotcols}", file=sys.stderr)
                    return None
            if i >= expected_rows:
                print(f"Too many rows in TSV file, expected {expected_rows}", file=sys.stderr)
                return None
            for col_name in reader.fieldnames:
                data[col_name].append(row[col_name])
    if len(data['id']) != expected_rows:
        print(f"Missing values in TSV file, expected {expected_rows} rows, got {len(data['id'])}", file=sys.stderr)
        return None
    return data

def score_st1(data, targets):
    """Calculate the scores for subtask 1"""
    # NOTE: targets is a dictionary with the same keys as data, and the values are lists of target values;
    # for some columns, more than one prediction is allowed and the corresponding target values are lists of admissible labels.

    # For those columns we replace the list by a single label: the model's prediction if it is admissible,
    # otherwise one of the admissible labels, so that the prediction gets scored as incorrect.

    check_columns(data, ST1_COLUMNS)
    check_allowed(data, 'bin_maj', ["0", "1"])
    check_allowed(data, 'bin_one', ["0", "1"])
    check_allowed(data, 'bin_all', ["0", "1"])
    check_allowed(data, 'multi_maj', MULT_LABELS)
    check_allowed(data, 'disagree_bin', ["0", "1"])

    target_bin_maj = []
    for pred, target in zip(data['bin_maj'], targets['bin_maj']):
        if isinstance(target, list):
            # more than one value is admissible: count the model's prediction as correct
            target_bin_maj.append(pred)
        else:
            target_bin_maj.append(target)
    targets['bin_maj'] = target_bin_maj

    target_multi_maj = []
    for pred, target in zip(data['multi_maj'], targets['multi_maj']):
        if isinstance(target, list):
            if pred not in target:
                # the prediction is not admissible: pick the first admissible label, so the prediction is scored as incorrect
                target_multi_maj.append(target[0])
            else:
                # the prediction is admissible, count it as correct
                target_multi_maj.append(pred)
        else:
            target_multi_maj.append(target)
    targets['multi_maj'] = target_multi_maj

    scores = {}
    used_scores = []
    for col_name in data.keys():
        if col_name == 'id':
            continue
        if GLOBALS['debug']:
            print(f"Calculating scores for {col_name}")
        scores[col_name + "_acc"] = accuracy_score(data[col_name], targets[col_name])
        scores[col_name + "_f1"] = f1_score(data[col_name], targets[col_name], average='macro')
        used_scores.append(scores[col_name + "_f1"])
    # the overall score is the average over all macro F1 scores
    scores['score'] = np.mean(used_scores)
    return scores


def score_st2(data, targets):
    """Calculate the scores for subtask 2"""
    check_dist(data, ['dist_bin_0', 'dist_bin_1'])
    check_dist(data, ['dist_multi_0', 'dist_multi_1', 'dist_multi_2', 'dist_multi_3', 'dist_multi_4'])
    scores = {}
    sum_bin = 0.0
    sum_multi = 0.0
    for idx in range(len(data['id'])):
        # build the predicted distributions for the binary and multi-class labels
        dist_bin = [float(data['dist_bin_0'][idx]), float(data['dist_bin_1'][idx])]
        dist_multi = [float(data[colname][idx]) for colname in ['dist_multi_0', 'dist_multi_1', 'dist_multi_2', 'dist_multi_3', 'dist_multi_4']]
        # build the target distributions for the binary and multi-class labels
        target_bin = [targets['dist_bin_0'][idx], targets['dist_bin_1'][idx]]
        target_multi = [targets['dist_multi_0'][idx], targets['dist_multi_1'][idx], targets['dist_multi_2'][idx], targets['dist_multi_3'][idx], targets['dist_multi_4'][idx]]
        # calculate the Jensen-Shannon distances (with base=2 the values lie in [0, 1])
        score_bin = distance.jensenshannon(dist_bin, target_bin, base=2)
        score_multi = distance.jensenshannon(dist_multi, target_multi, base=2)
        sum_bin += score_bin
        sum_multi += score_multi
    scores['js_dist_bin'] = sum_bin / len(data['id'])
    scores['js_dist_multi'] = sum_multi / len(data['id'])
    scores['score'] = np.mean([scores['js_dist_bin'], scores['js_dist_multi']])
    return scores


def main():

    parser = argparse.ArgumentParser(description='Scorer for the competition')
    parser.add_argument('--submission-dir', help='Directory containing the submission (.)', default=".")
    parser.add_argument('--reference-dir', help='Directory containing the reference data (./dev_phase/reference_data/)', default="./dev_phase/reference_data/")
    parser.add_argument('--score-dir', help='Directory to write the scores to (.)', default=".")
    parser.add_argument('--codabench', help='Indicate we are running on codabench, not locally', action='store_true')
    parser.add_argument("--st", required=True, choices=["1", "2"], help='Subtask to evaluate, one of 1, 2')
    parser.add_argument("--debug", help='Print debug information', action='store_true')
    args = parser.parse_args()
    GLOBALS['debug'] = args.debug
    print(f'Running scorer for subtask {args.st}')
    if args.codabench:
        run_locally = False
        print("Running on codabench")
        submission_dir = '/app/input/res'
        reference_dir = '/app/input/ref'
        score_dir = '/app/output/'
    else:
        run_locally = True
        print("Running locally")
        submission_dir = args.submission_dir
        reference_dir = args.reference_dir
        score_dir = args.score_dir

    # if we are on codabench, list all files and directories under /app recursively
    # if not run_locally:
    #     for root, dirs, files in os.walk('/app'):
    #         for file in files:
    #             print("FILE:", os.path.join(root, file))
    #         for dir in dirs:
    #             print("DIR:", os.path.join(root, dir))

    # # also load and show the contents of the metadata file in /app/input/res/metadata as a text file, if it exists
    # metadata_file = os.path.join(submission_dir, "metadata")
    # if os.path.exists(metadata_file):
    #     with open(metadata_file, 'rt') as infp:
    #         metadata = infp.read()
    #     print("Metadata file contents:", metadata)
    # else:
    #     print("No metadata file found")

    targets_file = os.path.join(reference_dir, "targets.json")
    print(f"Using targets file {targets_file}")
    targets = load_targets(targets_file)
    print(f"Loaded {len(targets)} targets")

    # index the targets by id for easy lookup
    targets_index = {t['id']: t for t in targets}

    # load the submission TSV file
    if args.st == "1":
        data = load_tsv(submission_dir, expected_rows=len(targets), expected_cols=ST1_COLUMNS)
    elif args.st == "2":
        data = load_tsv(submission_dir, expected_rows=len(targets), expected_cols=ST2_COLUMNS)
    if data is None:
        print("Problems loading the submission, aborting", file=sys.stderr)
        sys.exit(1)
    print(f"Loaded {len(data['id'])} rows from the submission")
    # check that the ids in the submission match the ids in the targets exactly: the targets are a list of
    # dictionaries with keys 'id' and the various prediction targets, the data is a dictionary where the keys are
    # the column names (one of which is "id") and the values are lists of values for that column
    if set(data['id']) != set(targets_index.keys()):
        print("IDs in submission do not match IDs in targets", file=sys.stderr)
        sys.exit(1)

    # convert the targets to the same format as the submission, in the same order by id as the submission
    targets_dict = {}
    for col_name in data.keys():
        if col_name == 'id':
            continue
        col_values = []
        for idx, id in enumerate(data['id']):
            if id not in targets_index:
                print(f"ID {id} not found in targets (row {idx})", file=sys.stderr)
                sys.exit(1)
            if col_name not in targets_index[id]:
                print(f"Column {col_name} not found in targets for id {id} in row {idx}", file=sys.stderr)
                sys.exit(1)
            col_values.append(targets_index[id][col_name])
        targets_dict[col_name] = col_values

    if args.st == "1":
        scores = score_st1(data, targets_dict)
    elif args.st == "2":
        scores = score_st2(data, targets_dict)
    else:
        print("Unknown subtask", file=sys.stderr)
        sys.exit(1)

    print("Scores:", scores)

    with open(os.path.join(score_dir, 'scores.json'), 'w') as score_file:
        score_file.write(json.dumps(scores))
    # with open(os.path.join(score_dir, 'details.html'), 'w') as html_file:
    #     html_file.write("<html>Some text</html>")
    print("Ending scorer")

if __name__ == '__main__':
    main()
