SWE-Bench evaluation

JetBrains-Research · Jul 26, 2024 · 9bd03ae · 9bd03ae
1 parent 5ad0f0b
commit 9bd03ae
Show file tree

Hide file tree

Showing 8 changed files with 594 additions and 254 deletions.
diff --git a/code_editing/agents/run.py b/code_editing/agents/run.py
@@ -1,7 +1,7 @@
 import collections
 import os
 from enum import Enum
-from typing import Dict, TypedDict, Union, Type
+from typing import Dict, Type, TypedDict, Union
 
 from hydra.core.hydra_config import HydraConfig
 

diff --git a/code_editing/data_sources/hf_source.py b/code_editing/data_sources/hf_source.py
@@ -33,6 +33,7 @@ def __init__(
         super().__init__(extractor, base_data_path)
         self._dataset = load_dataset(hub_name, config, split=split, cache_dir=cache_dir)
         self.name = hub_name
+        self.split = split
         self._hub_name = hub_name
 
         # Initialize the git repositories

diff --git a/code_editing/metrics/__init__.py b/code_editing/metrics/__init__.py
@@ -13,6 +13,7 @@
     PyScopeViewLocalizationMetric,
     TotalContextMetric,
 )
+from .swe_bench_metric import SWEBenchMetric
 
 __all__ = [
     "CodeBertScoreMetric",
@@ -32,4 +33,5 @@
     "PyScopeViewLocalizationMetric",
     "TotalContextMetric",
     "GPT4ComparisonMetric",
+    "SWEBenchMetric",
 ]
diff --git a/code_editing/metrics/swe_bench_metric.py b/code_editing/metrics/swe_bench_metric.py
@@ -0,0 +1,74 @@
+import json
+import os
+import subprocess
+import tempfile
+import uuid
+from typing import List
+
+import pandas as pd
+
+from code_editing.data_sources import SWEBenchDataSource
+from code_editing.data_sources.base_source import CEDataSource
+from code_editing.metrics.base_metric import BaseMetric
+
+
+class SWEBenchMetric(BaseMetric):
+    """
+    This class implements the pass rate metric using the SWE-Bench evaluation harness.
+    """
+
+    def __init__(self, data_source: CEDataSource, max_workers: int = 12, **kwargs):
+        self.max_workers = max_workers
+
+        if not isinstance(data_source, SWEBenchDataSource):
+            raise ValueError("SWEBench pass rate calculations can only be made with SWEBenchDataSource")
+        self.data_source: SWEBenchDataSource = data_source
+
+    def _score(self, _: List[str], __: List[str], df: pd.DataFrame):
+        model_name = df["model_name"].iloc[0] if "model_name" in df.columns else "unknown"
+        swebench_obj, instance_ids = self.data_source.to_swebench_results(df, model_name)
+
+        # create a temporary file with the model predictions
+        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f:
+            json.dump(swebench_obj, f)
+            f.close()
+            swebench_path = f.name
+        unique_hex = uuid.uuid4().hex[:8]
+
+        # We need docker to run the evaluation harness
+        #  HACK: To give access to the docker socket, we can use the following command:
+        #  sudo chmod 666 /var/run/docker.sock
+        cmd = [
+            "python",
+            "-m",
+            "swebench.harness.run_evaluation",
+            "--dataset_name",
+            self.data_source.name,
+            "--split",
+            self.data_source.split,
+            "--predictions_path",
+            swebench_path,
+            "--max_workers",
+            str(self.max_workers),
+            "--run_id",
+            unique_hex,
+        ]
+
+        # run the evaluation harness
+        subprocess.run(cmd, check=True)
+
+        # read the results
+        results_path = f"{model_name}.{unique_hex}.json"
+        with open(results_path) as f:
+            results = json.load(f)
+
+        # remove the temporary files
+        os.remove(swebench_path)
+        os.remove(results_path)
+
+        # calculate the pass rate
+        total = len(df)
+        n_resolved = results.get("resolved_instances", 0)
+        results["pass_rate"] = n_resolved / total if total > 0 else 0
+
+        return results
diff --git a/code_editing/scripts/conf/evaluation.yaml b/code_editing/scripts/conf/evaluation.yaml
@@ -46,6 +46,9 @@ metrics:
   total_context:
     _target_: "code_editing.metrics.TotalContextMetric"
     add_sample_metrics: true
+  swebench:
+    _target_: "code_editing.metrics.SWEBenchMetric"
+    max_workers: 32
 
 wandb:
   project: lca-code-editing

diff --git a/format.sh b/format.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env sh
+black --line-length 120 --target-version py310 .  # reformat the Python sources under the current directory
+isort --profile black .  # sort imports with the black-compatible profile