Skip to content

Commit

Permalink
SWE-Bench evaluation
Browse files Browse the repository at this point in the history
  • Loading branch information
waleko committed Jul 26, 2024
1 parent 5ad0f0b commit 9bd03ae
Show file tree
Hide file tree
Showing 8 changed files with 594 additions and 254 deletions.
2 changes: 1 addition & 1 deletion code_editing/agents/run.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import collections
import os
from enum import Enum
from typing import Dict, TypedDict, Union, Type
from typing import Dict, Type, TypedDict, Union

from hydra.core.hydra_config import HydraConfig

Expand Down
1 change: 1 addition & 0 deletions code_editing/data_sources/hf_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def __init__(
super().__init__(extractor, base_data_path)
self._dataset = load_dataset(hub_name, config, split=split, cache_dir=cache_dir)
self.name = hub_name
self.split = split
self._hub_name = hub_name

# Initialize the git repositories
Expand Down
2 changes: 2 additions & 0 deletions code_editing/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
PyScopeViewLocalizationMetric,
TotalContextMetric,
)
from .swe_bench_metric import SWEBenchMetric

__all__ = [
"CodeBertScoreMetric",
Expand All @@ -32,4 +33,5 @@
"PyScopeViewLocalizationMetric",
"TotalContextMetric",
"GPT4ComparisonMetric",
"SWEBenchMetric",
]
74 changes: 74 additions & 0 deletions code_editing/metrics/swe_bench_metric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import json
import os
import subprocess
import tempfile
import uuid
from typing import List

import pandas as pd

from code_editing.data_sources import SWEBenchDataSource
from code_editing.data_sources.base_source import CEDataSource
from code_editing.metrics.base_metric import BaseMetric


class SWEBenchMetric(BaseMetric):
    """
    Pass-rate metric computed by running the SWE-Bench evaluation harness.

    The predictions are exported through the data source, written to a temporary
    JSON file and handed to ``python -m swebench.harness.run_evaluation`` in a
    subprocess. The JSON report produced by the harness is returned with an extra
    ``pass_rate`` entry.
    """

    def __init__(self, data_source: CEDataSource, max_workers: int = 12, **kwargs):
        """
        Args:
            data_source: Source of the evaluated samples; must be a ``SWEBenchDataSource``.
            max_workers: Value forwarded to the harness ``--max_workers`` option.
            **kwargs: Accepted for interface compatibility and ignored here.

        Raises:
            ValueError: If ``data_source`` is not a ``SWEBenchDataSource``.
        """
        self.max_workers = max_workers

        if not isinstance(data_source, SWEBenchDataSource):
            raise ValueError("SWEBench pass rate calculations can only be made with SWEBenchDataSource")
        self.data_source: SWEBenchDataSource = data_source

    def _score(self, _: List[str], __: List[str], df: pd.DataFrame):
        """
        Run the SWE-Bench harness on the predictions in ``df``.

        Args:
            _: Unused.
            __: Unused.
            df: Evaluated samples. The first value of the optional ``model_name``
                column names the model; ``"unknown"`` is used otherwise.

        Returns:
            The harness report (parsed JSON) extended with ``pass_rate``, i.e.
            ``resolved_instances / len(df)`` (``0`` for an empty frame).

        Raises:
            subprocess.CalledProcessError: If the harness exits with a non-zero status.
            FileNotFoundError: If the harness did not leave a report where expected.
        """
        # Guard against an empty frame: ``.iloc[0]`` would raise IndexError there.
        has_model_name = "model_name" in df.columns and len(df) > 0
        model_name = df["model_name"].iloc[0] if has_model_name else "unknown"
        swebench_obj, _instance_ids = self.data_source.to_swebench_results(df, model_name)

        unique_hex = uuid.uuid4().hex[:8]
        # The harness writes "<model>.<run_id>.json" into the working directory.
        # NOTE(review): the harness appears to replace "/" in the model name with "__"
        # when naming the report (e.g. "org/model") - confirm against the installed
        # swebench version.
        safe_model_name = str(model_name).replace("/", "__")
        results_path = f"{safe_model_name}.{unique_hex}.json"

        # Create a temporary file with the model predictions. ``delete=False`` keeps it
        # on disk after the ``with`` block closes it, so the subprocess can read it.
        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f:
            json.dump(swebench_obj, f)
            swebench_path = f.name

        # We need docker to run the evaluation harness
        # HACK: To give access to the docker socket, we can use the following command:
        # sudo chmod 666 /var/run/docker.sock
        cmd = [
            "python",
            "-m",
            "swebench.harness.run_evaluation",
            "--dataset_name",
            self.data_source.name,
            "--split",
            self.data_source.split,
            "--predictions_path",
            swebench_path,
            "--max_workers",
            str(self.max_workers),
            "--run_id",
            unique_hex,
        ]

        try:
            # Run the evaluation harness
            subprocess.run(cmd, check=True)

            # Read the results
            with open(results_path) as f:
                results = json.load(f)
        finally:
            # Remove the temporary files even when the harness or the read failed;
            # the report may legitimately be missing in that case.
            for path in (swebench_path, results_path):
                try:
                    os.remove(path)
                except FileNotFoundError:
                    pass

        # Calculate the pass rate
        total = len(df)
        n_resolved = results.get("resolved_instances", 0)
        results["pass_rate"] = n_resolved / total if total > 0 else 0

        return results
3 changes: 3 additions & 0 deletions code_editing/scripts/conf/evaluation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ metrics:
total_context:
_target_: "code_editing.metrics.TotalContextMetric"
add_sample_metrics: true
swebench:
_target_: "code_editing.metrics.SWEBenchMetric"
max_workers: 32

wandb:
project: lca-code-editing
Expand Down
3 changes: 3 additions & 0 deletions format.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/usr/bin/env sh
# Format the repository from the current directory: black first (120-column
# lines, Python 3.10 target), then isort with its black-compatible profile.
black --line-length 120 --target-version py310 .
isort --profile black .
Loading

0 comments on commit 9bd03ae

Please sign in to comment.