Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add sanitization option to host outputs #191

Open
wants to merge 7 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion src/jobflow_remote/config/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,11 @@ class WorkerBase(BaseModel):
"username instead that from the list of job ids. May be necessary for some "
"scheduler_type (e.g. SGE)",
)
# Opt-in flag; the worker passes it to its host as the ``sanitize`` argument.
sanitize_command: bool = Field(
    default=False,
    # The trailing space matters: adjacent literals are concatenated verbatim.
    description="Sanitize the output of commands in case of failures due to spurious text produced "
    "by the worker shell.",
)
model_config = ConfigDict(extra="forbid")

@field_validator("scheduler_type")
Expand Down Expand Up @@ -252,7 +257,9 @@ def get_host(self) -> BaseHost:
-------
The LocalHost.
"""
return LocalHost(timeout_execute=self.timeout_execute)
return LocalHost(
timeout_execute=self.timeout_execute, sanitize=self.sanitize_command
)

@property
def cli_info(self) -> dict:
Expand Down Expand Up @@ -402,6 +409,7 @@ def get_host(self) -> BaseHost:
shell_cmd=self.shell_cmd,
login_shell=self.login_shell,
interactive_login=self.interactive_login,
sanitize=self.sanitize_command,
)

@property
Expand Down
4 changes: 1 addition & 3 deletions src/jobflow_remote/jobs/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,9 +177,7 @@ def estimated_run_time(self) -> Optional[float]:
The estimated run time in seconds.
"""
if self.start_time:
return (
datetime.now(tz=self.start_time.tzinfo) - self.start_time
).total_seconds()
return (datetime.utcnow() - self.start_time).total_seconds()

return None

Expand Down
102 changes: 99 additions & 3 deletions src/jobflow_remote/remote/host/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from __future__ import annotations

import abc
import logging
import re
import traceback
from typing import TYPE_CHECKING

Expand All @@ -10,9 +12,26 @@
from pathlib import Path


logger = logging.getLogger(__name__)

SANITIZE_KEY = r"_-_-_-_-_### JFREMOTE SANITIZE ###_-_-_-_-_"


class BaseHost(MSONable):
"""Base Host class."""

def __init__(self, sanitize: bool = False):
    """
    Store the sanitization settings of the host.

    Parameters
    ----------
    sanitize
        If True, a marker string is echoed before and after each executed
        command, so that the relevant part of the output can be isolated
        from spurious text coming from the host shell.
    """
    # Compiled on first access by the ``sanitize_regex`` property.
    self._sanitize_regex: re.Pattern | None = None
    self.sanitize = sanitize

@abc.abstractmethod
def execute(
self,
Expand All @@ -28,7 +47,8 @@ def execute(
Command to execute, as a str or list of str
workdir: str or None
path where the command will be executed.

timeout
Timeout for the execution of the commands.
"""
raise NotImplementedError

Expand Down Expand Up @@ -74,8 +94,19 @@ def test(self) -> str | None:
try:
cmd = "echo 'test'"
stdout, stderr, returncode = self.execute(cmd)
if returncode != 0 or stdout.strip() != "test":
msg = f"Command was executed but some error occurred.\nstdoud: {stdout}\nstderr: {stderr}"
if returncode != 0:
msg = f"Command was executed but return code was different from zero.\nstdoud: {stdout}\nstderr: {stderr}"
elif stdout.strip() != "test" or stderr.strip() != "":
msg = (
"Command was executed but the output is not the expected one (i.e. a single 'test' "
f"string in both stdout and stderr).\nstdoud: {stdout}\nstderr: {stderr}"
)
if not self.sanitize:
msg += (
"\nIf the output contains additional text the problem may be solved by setting "
"the 'sanitize_command' option to True in the project configuration."
)

except Exception:
exc = traceback.format_exc()
msg = f"Error while executing command:\n {exc}"
Expand Down Expand Up @@ -124,6 +155,71 @@ def interactive_login(self) -> bool:
"""
return False

@property
def sanitize_regex(self) -> re.Pattern:
    """
    Compiled pattern capturing the text enclosed by two SANITIZE_KEY markers.

    The pattern is built on first access and cached on the instance. Its
    single group holds the text following the first marker, up to the next
    marker or, if there is none, up to the end of the string.
    """
    if self._sanitize_regex:
        return self._sanitize_regex

    escaped_key = re.escape(SANITIZE_KEY)
    # Each marker may be followed by the newline emitted by "echo": the -n
    # option that would suppress it seems to not be supported on all
    # systems, so the line ending is matched optionally instead.
    pattern = f"{escaped_key}\r?\n?(.*?)(?:{escaped_key}\r?\n?|$)"
    self._sanitize_regex = re.compile(pattern, re.DOTALL)
    return self._sanitize_regex

def sanitize_command(self, cmd: str) -> str:
    """
    Wrap a command between two marker echoes when sanitization is enabled.

    The same `SANITIZE_KEY` marker is echoed before and after the command
    and duplicated to stderr through ``tee``, so that both output streams
    are delimited and can be trimmed afterwards. When sanitization is
    disabled the command is returned untouched.

    Parameters
    ----------
    cmd
        The command string to be wrapped

    Returns
    -------
    str
        The wrapped command, or the original one if sanitization is off
    """
    if not self.sanitize:
        return cmd

    echo_cmd = f'echo "{SANITIZE_KEY}" | tee /dev/stderr'
    # NOTE(review): the pieces are chained with ";", so the shell presumably
    # reports the exit status of the trailing echo pipeline rather than the
    # one of `cmd` - confirm that callers checking the return code are fine.
    return f"{echo_cmd};{cmd};{echo_cmd}"

def sanitize_output(self, output: str) -> str:
    """
    Extract the portion of a command output enclosed by the SANITIZE_KEY markers.

    Nothing is done when sanitization is disabled. Otherwise the text
    between the first two markers is returned; if the closing marker is
    missing, everything after the first marker is returned; if no marker is
    found at all, a warning is logged and the output is returned unchanged.

    Parameters
    ----------
    output
        The raw output of the command

    Returns
    -------
    str
        The trimmed output
    """
    if not self.sanitize:
        return output

    found = self.sanitize_regex.search(output)
    if found:
        return found.group(1)

    logger.warning(
        f"Even if sanitization was required, there was no match for the output: {output}. Returning the complete output"
    )
    return output


class HostError(Exception):
    """Host-specific exception type (where it is raised is outside this excerpt)."""
12 changes: 10 additions & 2 deletions src/jobflow_remote/remote/host/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@


class LocalHost(BaseHost):
def __init__(self, timeout_execute: int = None) -> None:
def __init__(self, timeout_execute: int = None, sanitize: bool = False) -> None:
self.timeout_execute = timeout_execute
super().__init__(sanitize=sanitize)

def __eq__(self, other):
return isinstance(other, LocalHost)
Expand All @@ -34,6 +35,10 @@ def execute(
----------
command: str or list of str
Command to execute, as a str or list of str
workdir: str or None
path where the command will be executed.
timeout
Timeout for the execution of the commands.

Returns
-------
Expand All @@ -46,13 +51,16 @@ def execute(
"""
if isinstance(command, (list, tuple)):
command = " ".join(command)
command = self.sanitize_command(command)
workdir = str(workdir) if workdir else Path.cwd()
timeout = timeout or self.timeout_execute
with cd(workdir):
proc = subprocess.run(
command, capture_output=True, shell=True, timeout=timeout, check=False
)
return proc.stdout.decode(), proc.stderr.decode(), proc.returncode
stdout = self.sanitize_output(proc.stdout.decode())
stderr = self.sanitize_output(proc.stderr.decode())
return stdout, stderr, proc.returncode

def mkdir(
self, directory: str | Path, recursive: bool = True, exist_ok: bool = True
Expand Down
9 changes: 8 additions & 1 deletion src/jobflow_remote/remote/host/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def __init__(
login_shell=True,
retry_on_closed_connection=True,
interactive_login=False,
sanitize: bool = False,
) -> None:
self.host = host
self.user = user
Expand All @@ -59,6 +60,7 @@ def __init__(
self.retry_on_closed_connection = retry_on_closed_connection
self._interactive_login = interactive_login
self._create_connection()
super().__init__(sanitize=sanitize)

def _create_connection(self) -> None:
if self.interactive_login:
Expand Down Expand Up @@ -175,6 +177,8 @@ def execute(
if isinstance(command, (list, tuple)):
command = " ".join(command)

command = self.sanitize_command(command)

# TODO: check if this works:
if not workdir:
workdir = "."
Expand All @@ -201,7 +205,10 @@ def execute(
timeout=timeout,
)

return out.stdout, out.stderr, out.exited
stdout = self.sanitize_output(out.stdout)
stderr = self.sanitize_output(out.stderr)

return stdout, stderr, out.exited

def mkdir(
self, directory: str | Path, recursive: bool = True, exist_ok: bool = True
Expand Down
33 changes: 33 additions & 0 deletions tests/db/remote/host/test_local.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from unittest.mock import patch


@patch("subprocess.run")
def test_sanitize(mock_run):
    """Check that LocalHost wraps the command and trims the noisy stdout."""
    from jobflow_remote.remote.host.base import SANITIZE_KEY
    from jobflow_remote.remote.host.local import LocalHost

    cmd = "echo 'test'"
    echo_cmd = f'echo "{SANITIZE_KEY}" | tee /dev/stderr'
    expected_cmd = f"{echo_cmd};{cmd};{echo_cmd}"

    # Fake process result: the real output is surrounded by the markers and
    # by extra text standing in for what a noisy shell would print.
    completed = mock_run.return_value
    completed.returncode = 0
    completed.stdout = (
        f"SOME NOISE --{SANITIZE_KEY}\ntest{SANITIZE_KEY}\nSOME appended TEXT"
    ).encode()
    completed.stderr = b""

    host = LocalHost(sanitize=True)
    stdout, stderr, _ = host.execute(cmd)

    # The command handed to the patched subprocess.run must be the wrapped one.
    mock_run.assert_called_once_with(
        expected_cmd,
        capture_output=True,
        shell=True,  # noqa: S604
        timeout=None,
        check=False,
    )

    assert stdout == "test"
    assert stderr == ""
40 changes: 40 additions & 0 deletions tests/db/remote/host/test_remote.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from unittest.mock import MagicMock, patch


@patch("fabric.Connection.run")
@patch("fabric.Connection.cd")
def test_sanitize(mock_cd, mock_run):
    """Check that RemoteHost wraps the command and trims the noisy stdout."""
    from jobflow_remote.remote.host.base import SANITIZE_KEY
    from jobflow_remote.remote.host.remote import RemoteHost

    cmd = "echo 'test'"
    echo_cmd = f'echo "{SANITIZE_KEY}" | tee /dev/stderr'
    expected_cmd = f"{echo_cmd};{cmd};{echo_cmd}"

    # Turn the patched Connection.cd into a context manager that does nothing.
    cd_context = mock_cd.return_value
    cd_context.__enter__ = MagicMock()
    cd_context.__exit__ = MagicMock()

    # Fake result of the patched Connection.run: the real output is
    # surrounded by the markers and by extra text standing in for shell noise.
    run_result = mock_run.return_value
    run_result.stdout = (
        f"SOME NOISE --{SANITIZE_KEY}\ntest{SANITIZE_KEY}\nSOME appended TEXT"
    )
    run_result.stderr = ""

    host = RemoteHost(
        host="localhost",
        retry_on_closed_connection=False,
        sanitize=True,
        shell_cmd=None,
    )
    # Replace the connection check with a stub that always reports True.
    host._check_connected = lambda: True

    stdout, stderr, _ = host.execute(cmd)

    # The command handed to the patched Connection.run must be the wrapped one.
    mock_run.assert_called_once_with(expected_cmd, timeout=None, hide=True, warn=True)

    assert stdout == "test"
    assert stderr == ""
20 changes: 20 additions & 0 deletions tests/integration/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,13 @@ def write_tmp_settings(
work_dir=str(workdir),
resources={},
),
"test_sanitize_local_worker": dict(
type="local",
scheduler_type="shell",
work_dir=str(workdir),
resources={},
sanitize_command=True,
),
"test_remote_worker": dict(
type="remote",
host="localhost",
Expand Down Expand Up @@ -273,6 +280,19 @@ def write_tmp_settings(
resources={},
max_jobs=2,
),
"test_sanitize_remote_worker": dict(
type="remote",
host="localhost",
port=slurm_ssh_port,
scheduler_type="slurm",
work_dir="/home/jobflow/jfr",
user="jobflow",
password="jobflow",
pre_run="source /home/jobflow/.venv/bin/activate",
resources={"partition": "debug", "ntasks": 1, "time": "00:01:00"},
connect_kwargs={"allow_agent": False, "look_for_keys": False},
sanitize_command=True,
),
},
exec_config={"test": {"export": {"TESTING_ENV_VAR": random_project_name}}},
runner=dict(
Expand Down
Loading