Skip to content

Commit

Permalink
Merge pull request #183 from princeton-nlp/fix-timeout
Browse files Browse the repository at this point in the history
Fix timeout
  • Loading branch information
john-b-yang authored Jul 15, 2024
2 parents c6ded6e + dd4c457 commit f7556db
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 22 deletions.
5 changes: 2 additions & 3 deletions swebench/harness/docker_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,14 @@
class BuildImageError(Exception):
    """Error raised when building a Docker image fails.

    The rendered message names the image, repeats the underlying error
    message once, and points the reader at the build log file.

    Args:
        image_name: Name of the image whose build failed.
        message: Description of the failure.
        logger: Logger used for the build; its ``log_file`` attribute is
            recorded so the message can point at the log.
    """

    def __init__(self, image_name, message, logger):
        super().__init__(message)
        # Capture the base message once so __str__ does not need to call
        # back into Exception.__str__ each time it is rendered.
        self.super_str = super().__str__()
        self.image_name = image_name
        self.log_path = logger.log_file
        self.logger = logger

    def __str__(self):
        # Deliberately free of side effects: str() may be invoked many times
        # (print, logging, f-strings) and outside any ``except`` block, where
        # traceback.format_exc() would only yield "NoneType: None". Logging
        # the traceback is left to the code that catches the exception.
        return (
            f"Error building image {self.image_name}: {self.super_str}\n"
            f"Check ({self.log_path}) for more information."
        )

Expand Down
18 changes: 13 additions & 5 deletions swebench/harness/docker_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import signal
import tarfile
import threading
import time
import traceback
from pathlib import Path

Expand Down Expand Up @@ -181,21 +182,25 @@ def exec_run_with_timeout(container, cmd, timeout: int|None=60):
timeout (int): Timeout in seconds.
"""
# Local variables to store the result of executing the command
exec_result = None
exec_result = ''
exec_id = None
exception = None
timed_out = False

# Wrapper function to run the command
def run_command():
nonlocal exec_result, exec_id, exception
try:
exec_id = container.client.api.exec_create(container.id, cmd)["Id"]
exec_result = container.client.api.exec_start(exec_id)
exec_stream = container.client.api.exec_start(exec_id, stream=True)
for chunk in exec_stream:
exec_result += chunk.decode()
except Exception as e:
exception = e

# Start the command in a separate thread
thread = threading.Thread(target=run_command)
start_time = time.time()
thread.start()
thread.join(timeout)

Expand All @@ -204,9 +209,12 @@ def run_command():

# If the thread is still alive, the command timed out
if thread.is_alive():
raise TimeoutError(f"Command '{cmd}' timed out after {timeout} seconds")

return exec_result
if exec_id is not None:
exec_pid = container.client.api.exec_inspect(exec_id)["Pid"]
container.exec_run(f"kill -TERM {exec_pid}", detach=True)
timed_out = True
end_time = time.time()
return exec_result, timed_out, end_time - start_time


def find_dependent_images(client: docker.DockerClient, image_name: str):
Expand Down
42 changes: 28 additions & 14 deletions swebench/harness/run_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
clean_images,
)
from swebench.harness.docker_build import (
BuildImageError,
build_container,
build_env_images,
close_logger,
Expand All @@ -39,15 +40,14 @@
class EvaluationError(Exception):
    """Error raised when evaluating a single instance fails.

    The rendered message names the instance, repeats the underlying error
    message once, and points the reader at the instance's log file.

    Args:
        instance_id: Identifier of the instance being evaluated.
        message: Description of the failure.
        logger: Logger used for the instance; its ``log_file`` attribute is
            recorded so the message can point at the log.
    """

    def __init__(self, instance_id, message, logger):
        super().__init__(message)
        # Capture the base message once so __str__ does not need to call
        # back into Exception.__str__ each time it is rendered.
        self.super_str = super().__str__()
        self.instance_id = instance_id
        self.log_file = logger.log_file
        self.logger = logger

    def __str__(self):
        # Deliberately free of side effects: str() may be invoked many times
        # (print, logging, f-strings) and outside any ``except`` block, where
        # traceback.format_exc() would only yield "NoneType: None". Logging
        # the traceback is left to the code that catches the exception.
        return (
            f"Evaluation error for {self.instance_id}: {self.super_str}\n"
            f"Check ({self.log_file}) for more information."
        )

Expand All @@ -59,7 +59,7 @@ def run_instance(
force_rebuild: bool,
client: docker.DockerClient,
run_id: str,
timeout: int = None,
timeout: int | None = None,
):
"""
Run a single instance with the given prediction.
Expand Down Expand Up @@ -149,17 +149,28 @@ def run_instance(
eval_file = Path(log_dir / "eval.sh")
eval_file.write_text(test_spec.eval_script)
logger.info(
f"Eval script for {instance_id} written to {patch_file}, now applying to container..."
f"Eval script for {instance_id} written to {eval_file}; copying to container..."
)
copy_to_container(container, eval_file, Path("/eval.sh"))

# Run eval script, write output to logs
result = exec_run_with_timeout(container, "/bin/bash /eval.sh", timeout=timeout)
test_output = result.decode("utf-8")
# eval_cmd = "/bin/bash /eval.sh"
# if timeout:
# eval_cmd = f"timeout -s SIGKILL {timeout} {eval_cmd}"
# result = container.exec_run(eval_cmd)
test_output, timed_out, total_runtime = exec_run_with_timeout(container, "/bin/bash /eval.sh", timeout)
test_output_path = log_dir / "test_output.txt"
logger.info(f'Test runtime: {total_runtime:_.2f} seconds')
with open(test_output_path, "w") as f:
f.write(test_output)
logger.info(f"Test output for {instance_id} written to {test_output_path}")
logger.info(f"Test output for {instance_id} written to {test_output_path}")
if timed_out:
f.write(f"\n\nTimeout error: {timeout} seconds exceeded.")
raise EvaluationError(
instance_id,
f"Test timed out after {timeout} seconds.",
logger,
)

# Get git diff after running eval script
git_diff_output_after = (
Expand Down Expand Up @@ -189,23 +200,26 @@ def run_instance(
f.write(json.dumps(report, indent=4))
return instance_id, report
except EvaluationError as e:
error_msg = (f"EvaluationError {instance_id}: {e}\n"
f"{traceback.format_exc()}\n"
f"Check ({logger.log_file}) for more information.")
error_msg = traceback.format_exc()
logger.info(error_msg)
print(e)
except BuildImageError as e:
error_msg = traceback.format_exc()
logger.info(error_msg)
print(error_msg)
print(e)
except Exception as e:
error_msg = (f"Error in evaluating model for {instance_id}: {e}\n"
f"{traceback.format_exc()}\n"
f"Check ({logger.log_file}) for more information.")
logger.info(error_msg)
print(error_msg)
logger.error(error_msg)
finally:
# Remove instance container + image, close logger
cleanup_container(client, container, logger)
if rm_image:
remove_image(client, test_spec.instance_image_key, logger)
close_logger(logger)
return


def run_instances(
predictions: dict,
Expand Down

0 comments on commit f7556db

Please sign in to comment.