diff --git a/swebench/harness/docker_build.py b/swebench/harness/docker_build.py index 0fb9ae3a..9d8fb64c 100644 --- a/swebench/harness/docker_build.py +++ b/swebench/harness/docker_build.py @@ -29,15 +29,14 @@ class BuildImageError(Exception): def __init__(self, image_name, message, logger): super().__init__(message) + self.super_str = super().__str__() self.image_name = image_name self.log_path = logger.log_file self.logger = logger def __str__(self): - log_msg = traceback.format_exc() - self.logger.info(log_msg) return ( - f"{self.image_name}: {super().__str__()}\n" + f"Error building image {self.image_name}: {self.super_str}\n" f"Check ({self.log_path}) for more information." ) diff --git a/swebench/harness/docker_utils.py b/swebench/harness/docker_utils.py index 65a18b93..dd8037ee 100644 --- a/swebench/harness/docker_utils.py +++ b/swebench/harness/docker_utils.py @@ -5,6 +5,7 @@ import signal import tarfile import threading +import time import traceback from pathlib import Path @@ -181,21 +182,25 @@ def exec_run_with_timeout(container, cmd, timeout: int|None=60): timeout (int): Timeout in seconds. """ # Local variables to store the result of executing the command - exec_result = None + exec_result = '' exec_id = None exception = None + timed_out = False # Wrapper function to run the command def run_command(): nonlocal exec_result, exec_id, exception try: exec_id = container.client.api.exec_create(container.id, cmd)["Id"] - exec_result = container.client.api.exec_start(exec_id) + exec_stream = container.client.api.exec_start(exec_id, stream=True) + for chunk in exec_stream: + exec_result += chunk.decode() except Exception as e: exception = e # Start the command in a separate thread thread = threading.Thread(target=run_command) + start_time = time.time() thread.start() thread.join(timeout) @@ -204,9 +209,12 @@ def run_command(): # If the thread is still alive, the command timed out if thread.is_alive(): - raise TimeoutError(f"Command '{cmd}' timed out after {timeout} seconds") - - return exec_result + if exec_id is not None: + exec_pid = container.client.api.exec_inspect(exec_id)["Pid"] + container.exec_run(f"kill -TERM {exec_pid}", detach=True) + timed_out = True + end_time = time.time() + return exec_result, timed_out, end_time - start_time def find_dependent_images(client: docker.DockerClient, image_name: str): diff --git a/swebench/harness/run_evaluation.py b/swebench/harness/run_evaluation.py index 0ad958a5..4d2e5911 100644 --- a/swebench/harness/run_evaluation.py +++ b/swebench/harness/run_evaluation.py @@ -26,6 +26,7 @@ clean_images, ) from swebench.harness.docker_build import ( + BuildImageError, build_container, build_env_images, close_logger, @@ -39,15 +40,14 @@ class EvaluationError(Exception): def __init__(self, instance_id, message, logger): super().__init__(message) + self.super_str = super().__str__() self.instance_id = instance_id self.log_file = logger.log_file self.logger = logger def __str__(self): - log_msg = traceback.format_exc() - self.logger.info(log_msg) return ( - f"{self.instance_id}: {super().__str__()}\n" + f"Evaluation error for {self.instance_id}: {self.super_str}\n" f"Check ({self.log_file}) for more information." ) @@ -59,7 +59,7 @@ def run_instance( force_rebuild: bool, client: docker.DockerClient, run_id: str, - timeout: int = None, + timeout: int | None = None, ): """ Run a single instance with the given prediction. @@ -149,17 +149,28 @@ def run_instance( eval_file = Path(log_dir / "eval.sh") eval_file.write_text(test_spec.eval_script) logger.info( - f"Eval script for {instance_id} written to {patch_file}, now applying to container..." + f"Eval script for {instance_id} written to {eval_file}; copying to container..." ) copy_to_container(container, eval_file, Path("/eval.sh")) # Run eval script, write output to logs - result = exec_run_with_timeout(container, "/bin/bash /eval.sh", timeout=timeout) - test_output = result.decode("utf-8") + # eval_cmd = "/bin/bash /eval.sh" + # if timeout: + # eval_cmd = f"timeout -s SIGKILL {timeout} {eval_cmd}" + # result = container.exec_run(eval_cmd) + test_output, timed_out, total_runtime = exec_run_with_timeout(container, "/bin/bash /eval.sh", timeout) test_output_path = log_dir / "test_output.txt" + logger.info(f'Test runtime: {total_runtime:_.2f} seconds') with open(test_output_path, "w") as f: f.write(test_output) - logger.info(f"Test output for {instance_id} written to {test_output_path}") + logger.info(f"Test output for {instance_id} written to {test_output_path}") + if timed_out: + f.write(f"\n\nTimeout error: {timeout} seconds exceeded.") + raise EvaluationError( + instance_id, + f"Test timed out after {timeout} seconds.", + logger, + ) # Get git diff after running eval script git_diff_output_after = ( @@ -189,23 +200,26 @@ def run_instance( f.write(json.dumps(report, indent=4)) return instance_id, report except EvaluationError as e: - error_msg = (f"EvaluationError {instance_id}: {e}\n" - f"{traceback.format_exc()}\n" - f"Check ({logger.log_file}) for more information.") + error_msg = traceback.format_exc() + logger.info(error_msg) + print(e) + except BuildImageError as e: + error_msg = traceback.format_exc() logger.info(error_msg) - print(error_msg) + print(e) except Exception as e: error_msg = (f"Error in evaluating model for {instance_id}: {e}\n" f"{traceback.format_exc()}\n" f"Check ({logger.log_file}) for more information.") - logger.info(error_msg) - print(error_msg) + logger.error(error_msg) finally: # Remove instance container + image, close logger cleanup_container(client, container, logger) if rm_image: remove_image(client, test_spec.instance_image_key, logger) close_logger(logger) + return + def run_instances( predictions: dict,