Merge pull request #183 from princeton-nlp/fix-timeout

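Fix timeout handling: stream exec output, terminate timed-out commands, and report timeouts as evaluation errors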
princeton-nlp · Jul 15, 2024 · f7556db · f7556db
2 parents c6ded6e + dd4c457
commit f7556db
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 22 deletions.
diff --git a/swebench/harness/docker_build.py b/swebench/harness/docker_build.py
@@ -29,15 +29,14 @@
 class BuildImageError(Exception):
     def __init__(self, image_name, message, logger):
         super().__init__(message)
+        self.super_str = super().__str__()
         self.image_name = image_name
         self.log_path = logger.log_file
         self.logger = logger
 
     def __str__(self):
-        log_msg = traceback.format_exc()
-        self.logger.info(log_msg)
         return (
-            f"{self.image_name}: {super().__str__()}\n"
+            f"Error building image {self.image_name}: {self.super_str}\n"
             f"Check ({self.log_path}) for more information."
         )
 

diff --git a/swebench/harness/docker_utils.py b/swebench/harness/docker_utils.py
@@ -5,6 +5,7 @@
 import signal
 import tarfile
 import threading
+import time
 import traceback
 from pathlib import Path
 
@@ -181,21 +182,25 @@ def exec_run_with_timeout(container, cmd, timeout: int|None=60):
         timeout (int): Timeout in seconds.
     """
     # Local variables to store the result of executing the command
-    exec_result = None
+    exec_result = b''  # raw bytes from the stream; decoded once at return
     exec_id = None
     exception = None
+    timed_out = False
 
     # Wrapper function to run the command
     def run_command():
         nonlocal exec_result, exec_id, exception
         try:
             exec_id = container.client.api.exec_create(container.id, cmd)["Id"]
-            exec_result = container.client.api.exec_start(exec_id)
+            exec_stream = container.client.api.exec_start(exec_id, stream=True)
+            for chunk in exec_stream:
+                exec_result += chunk  # a chunk may end mid-character, so never decode per chunk
         except Exception as e:
             exception = e
 
     # Start the command in a separate thread
     thread = threading.Thread(target=run_command)
+    start_time = time.time()
     thread.start()
     thread.join(timeout)
 
@@ -204,9 +209,12 @@ def run_command():
 
     # If the thread is still alive, the command timed out
     if thread.is_alive():
-        raise TimeoutError(f"Command '{cmd}' timed out after {timeout} seconds")
-
-    return exec_result
+        if exec_id is not None:
+            exec_pid = container.client.api.exec_inspect(exec_id)["Pid"]  # NOTE(review): may be a host-namespace PID - confirm the in-container kill reaches it
+            container.exec_run(f"kill -TERM {exec_pid}", detach=True)
+        timed_out = True
+    end_time = time.time()
+    return exec_result.decode(errors="replace"), timed_out, end_time - start_time  # output cut off by a timeout may end mid-character
 
 
 def find_dependent_images(client: docker.DockerClient, image_name: str):

diff --git a/swebench/harness/run_evaluation.py b/swebench/harness/run_evaluation.py
@@ -26,6 +26,7 @@
     clean_images,
 )
 from swebench.harness.docker_build import (
+    BuildImageError,
     build_container,
     build_env_images,
     close_logger,
@@ -39,15 +40,14 @@
 class EvaluationError(Exception):
     def __init__(self, instance_id, message, logger):
         super().__init__(message)
+        self.super_str = super().__str__()
         self.instance_id = instance_id
         self.log_file = logger.log_file
         self.logger = logger
 
     def __str__(self):
-        log_msg = traceback.format_exc()
-        self.logger.info(log_msg)
         return (
-            f"{self.instance_id}: {super().__str__()}\n"
+            f"Evaluation error for {self.instance_id}: {self.super_str}\n"
             f"Check ({self.log_file}) for more information."
         )
 
@@ -59,7 +59,7 @@ def run_instance(
         force_rebuild: bool,
         client: docker.DockerClient,
         run_id: str,
-        timeout: int = None,
+        timeout: int | None = None,
     ):
     """
     Run a single instance with the given prediction.
@@ -149,17 +149,28 @@ def run_instance(
         eval_file = Path(log_dir / "eval.sh")
         eval_file.write_text(test_spec.eval_script)
         logger.info(
-            f"Eval script for {instance_id} written to {patch_file}, now applying to container..."
+            f"Eval script for {instance_id} written to {eval_file}; copying to container..."
         )
         copy_to_container(container, eval_file, Path("/eval.sh"))
 
         # Run eval script, write output to logs
-        result = exec_run_with_timeout(container, "/bin/bash /eval.sh", timeout=timeout)
-        test_output = result.decode("utf-8")
+        # eval_cmd = "/bin/bash /eval.sh"
+        # if timeout:
+        #     eval_cmd = f"timeout -s SIGKILL {timeout} {eval_cmd}"
+        # result = container.exec_run(eval_cmd)
+        test_output, timed_out, total_runtime = exec_run_with_timeout(container, "/bin/bash /eval.sh", timeout)
         test_output_path = log_dir / "test_output.txt"
+        logger.info(f'Test runtime: {total_runtime:_.2f} seconds')
         with open(test_output_path, "w") as f:
             f.write(test_output)
-        logger.info(f"Test output for {instance_id} written to {test_output_path}")
+            logger.info(f"Test output for {instance_id} written to {test_output_path}")
+            if timed_out:
+                f.write(f"\n\nTimeout error: {timeout} seconds exceeded.")
+                raise EvaluationError(
+                    instance_id,
+                    f"Test timed out after {timeout} seconds.",
+                    logger,
+                )
 
         # Get git diff after running eval script
         git_diff_output_after = (
@@ -189,23 +200,26 @@ def run_instance(
             f.write(json.dumps(report, indent=4))
         return instance_id, report
     except EvaluationError as e:
-        error_msg = (f"EvaluationError {instance_id}: {e}\n"
-                     f"{traceback.format_exc()}\n"
-                     f"Check ({logger.log_file}) for more information.")
+        error_msg = traceback.format_exc()
+        logger.info(error_msg)
+        print(e)
+    except BuildImageError as e:
+        error_msg = traceback.format_exc()
         logger.info(error_msg)
-        print(error_msg)
+        print(e)
     except Exception as e:
         error_msg = (f"Error in evaluating model for {instance_id}: {e}\n"
                      f"{traceback.format_exc()}\n"
                      f"Check ({logger.log_file}) for more information.")
-        logger.info(error_msg)
-        print(error_msg)
+        logger.error(error_msg)
     finally:
         # Remove instance container + image, close logger
         cleanup_container(client, container, logger)
         if rm_image:
             remove_image(client, test_spec.instance_image_key, logger)
         close_logger(logger)
+    return
+
 
 def run_instances(
         predictions: dict,