From a6d2fd24f1a58e96fa28836b7d4bfd2d39fff91a Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Fri, 25 Aug 2023 13:50:31 +0200 Subject: [PATCH 01/21] Copy ssh transfer to sftp transfer --- .../testing/sftp/Dockerfile-sftp-server | 8 + src/scitacean/testing/sftp/__init__.py | 122 +++++ src/scitacean/testing/sftp/_pytest_helpers.py | 44 ++ src/scitacean/testing/sftp/_sftp.py | 162 ++++++ .../sftp/docker-compose-sftp-server.yaml | 26 + src/scitacean/testing/sftp/fixtures.py | 226 ++++++++ .../testing/sftp/sftp_server_seed/table.csv | 2 + .../testing/sftp/sftp_server_seed/text.txt | 1 + src/scitacean/transfer/sftp.py | 493 ++++++++++++++++++ tests/conftest.py | 3 + tests/transfer/sftp_test.py | 404 ++++++++++++++ 11 files changed, 1491 insertions(+) create mode 100644 src/scitacean/testing/sftp/Dockerfile-sftp-server create mode 100644 src/scitacean/testing/sftp/__init__.py create mode 100644 src/scitacean/testing/sftp/_pytest_helpers.py create mode 100644 src/scitacean/testing/sftp/_sftp.py create mode 100644 src/scitacean/testing/sftp/docker-compose-sftp-server.yaml create mode 100644 src/scitacean/testing/sftp/fixtures.py create mode 100644 src/scitacean/testing/sftp/sftp_server_seed/table.csv create mode 100644 src/scitacean/testing/sftp/sftp_server_seed/text.txt create mode 100644 src/scitacean/transfer/sftp.py create mode 100644 tests/transfer/sftp_test.py diff --git a/src/scitacean/testing/sftp/Dockerfile-sftp-server b/src/scitacean/testing/sftp/Dockerfile-sftp-server new file mode 100644 index 00000000..9501e3a6 --- /dev/null +++ b/src/scitacean/testing/sftp/Dockerfile-sftp-server @@ -0,0 +1,8 @@ +FROM ghcr.io/linuxserver/openssh-server +RUN echo -e "Match Group *,!root \n\ + ForceCommand internal-sftp \n\ + PasswordAuthentication yes \n\ + PermitTunnel no \n\ + AllowAgentForwarding no \n\ + AllowTcpForwarding no \n\ + X11Forwarding no" >> /etc/ssh/sshd_config diff --git a/src/scitacean/testing/sftp/__init__.py b/src/scitacean/testing/sftp/__init__.py new file mode 100644 index 00000000..2ff3043a --- /dev/null +++ b/src/scitacean/testing/sftp/__init__.py @@ -0,0 +1,122 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2022 Scitacean contributors (https://github.com/SciCatProject/scitacean) +"""Helpers for running tests with an SFTP server. + +This subpackage is primarily meant for testing +:class:`scitacean.testing.sftp.SFTPFileTransfer`. +But it can also be used to test downstream code that uses the SFTP file transfer. + +The `pytest `_ fixtures in this package manage an SFTP server +running in a docker container on the local machine. +They, therefore, require docker to be installed and running. + +Use the :func:`scitacean.testing.sftp.fixtures.sftp_fileserver` +fixture to manage the server and use +:func:`scitacean.testing.sftp.fixtures.sftp_access` +to get all required access parameters. +See below for examples. + +Attention +--------- +The fixtures support `pytest-xdist `_ +but only if all workers run on the local machine (the default). + +It may still happen that tests fail due to the complexity of synchronizing start up +and shut down of the SFTP server between workers. + +See Also +-------- +`Testing <../../user-guide/testing.ipynb>`_ user guide. + +Examples +-------- +In order to test the SFTP file transfer directly, use the provided fixtures to +open a connection manually. +Here, requesting the ``require_sftp_fileserver`` fixture ensures that the server +is running during the test, or that the test gets skipped if SFTP tests are disabled. +Passing the ``connect`` argument as shown ensures that the file transfer +connects to the test server with the correct parameters. + +.. code-block:: python + + from scitacean.transfer.sftp import SFTPFileTransfer + + def test_sftp_upload( + sftp_access, + sftp_connect_with_username_password, + require_sftp_fileserver, + sftp_data_dir, + ): + sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) + ds = Dataset(...) + with sftp.connect_for_upload( + dataset=ds, + connect=sftp_connect_with_username_password + ) as connection: + # do upload + # assert that the file has been copied to sftp_data_dir + +Testing the SFTP transfer together with a client requires some additional setup. +See ``test_client_with_sftp`` in +`sftp_test.py `_ +for an example. + +Implementation notes +-------------------- +When the server fixture is first used, it initializes the server using these steps: + +1. Create a temporary directory with contents:: + + tmpdir + ├ docker-compose.yaml + ├ .env (specifies paths for docker volumes) + ├ counter (number of workers currently using the server) + ├ counter.lock (file lock) + └ data (storage of files) + └ seed (populated from scitacean/testing/sftp/sftp_server_seed) + +2. Start docker. +3. Make data writable by the user in docker. + This changes the ownership of data on the host to root (on some machines). + +The docker container and its volumes are removed at the end of the tests. +The fixture also tries to remove the temporary directory. +This can fail as the owner of its contents (in particular data) +may have been changed to root. +So cleanup can fail and leave the directory behind. + +Use the seed directory (``sftp_data_dir/"seed"``) to test downloads. +Corresponds to ``/data/seed`` on the server. + +Use the base data directory (``sftp_data_dir``) to test uploads. +Corresponds to ``/data`` on the server. + +The counter and counter.lock files are used to synchronize starting and stopping +of the docker container between processes. +This is required when ``pytest-xdist`` is used. +Otherwise, those files will not be present. +""" + +from ._pytest_helpers import add_pytest_option, sftp_enabled, skip_if_not_sftp +from ._sftp import ( + IgnorePolicy, + SFTPAccess, + SFTPUser, + can_connect, + configure, + local_access, + wait_until_sftp_server_is_live, +) + +__all__ = [ + "add_pytest_option", + "can_connect", + "configure", + "local_access", + "sftp_enabled", + "skip_if_not_sftp", + "wait_until_sftp_server_is_live", + "IgnorePolicy", + "SFTPAccess", + "SFTPUser", +] diff --git a/src/scitacean/testing/sftp/_pytest_helpers.py b/src/scitacean/testing/sftp/_pytest_helpers.py new file mode 100644 index 00000000..dfff5061 --- /dev/null +++ b/src/scitacean/testing/sftp/_pytest_helpers.py @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2023 SciCat Project (https://github.com/SciCatProject/scitacean) + +from typing import Optional + +import pytest + +_COMMAND_LINE_OPTION: Optional[str] = None + + +def add_pytest_option(parser: pytest.Parser, option: str = "--sftp-tests") -> None: + """Add a command-line option to pytest to toggle SFTP tests. + + Parameters + ---------- + parser: + Pytest's command-line argument parser. + option: + Name of the command-line option. + """ + parser.addoption( + option, + action="store_true", + default=False, + help="Select whether to run tests with an SFTP fileserver", + ) + global _COMMAND_LINE_OPTION + _COMMAND_LINE_OPTION = option + + +def skip_if_not_sftp(request: pytest.FixtureRequest) -> None: + """Mark the current test to be skipped if SFTP tests are disabled.""" + if not sftp_enabled(request): + pytest.skip( + "Tests against an SFTP file server are disabled, " + f"use {_COMMAND_LINE_OPTION} to enable them" + ) + + +def sftp_enabled(request: pytest.FixtureRequest) -> bool: + """Return True if SFTP tests are enabled.""" + return _COMMAND_LINE_OPTION is not None and bool( + request.config.getoption(_COMMAND_LINE_OPTION) + ) diff --git a/src/scitacean/testing/sftp/_sftp.py b/src/scitacean/testing/sftp/_sftp.py new file mode 100644 index 00000000..67722e88 --- /dev/null +++ b/src/scitacean/testing/sftp/_sftp.py @@ -0,0 +1,162 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2023 SciCat Project (https://github.com/SciCatProject/scitacean) +import importlib.resources +import time +from dataclasses import dataclass +from functools import lru_cache +from pathlib import Path +from typing import Any, Dict, Iterable, Tuple, Union + +import paramiko +import yaml + + +@dataclass +class SFTPUser: + username: str + password: str + + +@dataclass +class SFTPAccess: + host: str + port: int + user: SFTPUser + + +def _read_resource_text(filename: str) -> str: + if hasattr(importlib.resources, "files"): + # Use new API added in Python 3.9 + return ( + importlib.resources.files("scitacean.testing.sftp") + .joinpath(filename) + .read_text() + ) + # Old API, deprecated as of Python 3.11 + return importlib.resources.read_text("scitacean.testing.sftp", filename) + + +def _read_resource_yaml(filename: str) -> Any: + return yaml.safe_load(_read_resource_text(filename)) + + +@lru_cache(maxsize=1) +def _docker_compose_file() -> Dict[str, Any]: + return _read_resource_yaml("docker-compose-sftp-server.yaml") # type: ignore[no-any-return] + + +@lru_cache(maxsize=1) +def _docker_file() -> str: + return _read_resource_text("Dockerfile-sftp-server") + + +def _seed_files() -> Iterable[Tuple[str, str]]: + if hasattr(importlib.resources, "files"): + # Use new API added in Python 3.9 + yield from ( + (file.name, file.read_text()) + for file in importlib.resources.files("scitacean.testing.sftp") + .joinpath("sftp_server_seed") + .iterdir() + ) + else: + # Old API, deprecated as of Python 3.11 + with importlib.resources.path( + "scitacean.testing.sftp", "sftp_server_seed" + ) as seed_dir: + for path in seed_dir.iterdir(): + yield path.name, path.read_text() + + +def local_access() -> SFTPAccess: + config = _docker_compose_file() + service = config["services"]["scitacean-test-sftp-server"] + env = {k: v for k, v in map(lambda s: s.split("="), service["environment"])} + return SFTPAccess( + host="localhost", + port=service["ports"][0].split(":")[0], + user=SFTPUser( + username=env["USER_NAME"], + password=env["USER_PASSWORD"], + ), + ) + + +def _copy_seed(target_seed_dir: Path) -> None: + for name, content in _seed_files(): + target_seed_dir.joinpath(name).write_text(content) + + +def configure(target_dir: Union[Path, str]) -> Path: + """Generate a config file for docker compose and copy seed data.""" + target_dir = Path(target_dir) + target_seed_dir = target_dir / "data" / "seed" + target_seed_dir.mkdir(parents=True) + _copy_seed(target_seed_dir) + + config_target = target_dir / "docker-compose.yaml" + config_target.write_text(yaml.dump(_docker_compose_file())) + target_dir.joinpath("Dockerfile-sftp-server").write_text(_docker_file()) + + target_dir.joinpath(".env").write_text( + f"""DATA_DIR={target_dir / 'data'} +SEED_DIR={target_seed_dir}""" + ) + + return config_target + + +def can_connect(sftp_access: SFTPAccess) -> bool: + try: + _make_client(sftp_access) + except paramiko.SSHException: + return False + return True + + +def wait_until_sftp_server_is_live( + sftp_access: SFTPAccess, max_time: float, n_tries: int +) -> None: + # The container takes a while to be fully live. + for _ in range(n_tries): + if can_connect(sftp_access): + return + time.sleep(max_time / n_tries) + if not can_connect(sftp_access): + raise RuntimeError("Cannot connect to SFTP server") + + +def cleanup_data_dir( + sftp_access: SFTPAccess, sftp_connect_with_username_password: Any +) -> None: + # Delete all directories created by tests. + # These are owned by root on the host and cannot be deleted by Python's tempfile. + connection = sftp_connect_with_username_password( + host=sftp_access.host, port=sftp_access.port + ) + connection.run( + "find /data -not -path '/data' -not -path '/data/seed' | xargs rm -rf", + hide=True, + in_stream=False, + ) + + +def _make_client(sftp_access: SFTPAccess) -> paramiko.SSHClient: + client = paramiko.SSHClient() + client.set_missing_host_key_policy(IgnorePolicy()) + client.connect( + hostname=sftp_access.host, + port=sftp_access.port, + username=sftp_access.user.username, + password=sftp_access.user.password, + allow_agent=False, + look_for_keys=False, + ) + return client + + +# Every time we create a container, it gets a new host key. +# So simply accept any host keys. +class IgnorePolicy(paramiko.MissingHostKeyPolicy): + def missing_host_key(self, client: Any, hostname: Any, key: Any) -> None: + return diff --git a/src/scitacean/testing/sftp/docker-compose-sftp-server.yaml b/src/scitacean/testing/sftp/docker-compose-sftp-server.yaml new file mode 100644 index 00000000..4d2f6103 --- /dev/null +++ b/src/scitacean/testing/sftp/docker-compose-sftp-server.yaml @@ -0,0 +1,26 @@ +version: "2.1" +services: + scitacean-test-sftp-server: + build: + context: . + dockerfile: Dockerfile-sftp-server + container_name: scitacean-test-sftp + hostname: scitacean-test-sftp-server + environment: + - PUID=1000 + - PGID=1000 + - TZ=CET # Not UTC on purpose to test timezone detection +# - PUBLIC_KEY=yourpublickey #optional +# - PUBLIC_KEY_FILE=/path/to/file #optional +# - PUBLIC_KEY_DIR=/path/to/directory/containing/_only_/pubkeys #optional +# - PUBLIC_KEY_URL=https://github.com/username.keys #optional + - SUDO_ACCESS=false + - PASSWORD_ACCESS=true + - USER_NAME=the-scitacean + - USER_PASSWORD=sup3r-str0ng + volumes: # configured in Python + - ${DATA_DIR}:/data + - ${SEED_DIR}:/data/seed + ports: + - "2222:2222" + restart: unless-stopped diff --git a/src/scitacean/testing/sftp/fixtures.py b/src/scitacean/testing/sftp/fixtures.py new file mode 100644 index 00000000..ed2cbc8a --- /dev/null +++ b/src/scitacean/testing/sftp/fixtures.py @@ -0,0 +1,226 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2023 SciCat Project (https://github.com/SciCatProject/scitacean) +# mypy: disable-error-code="no-untyped-def" +"""Pytest fixtures to manage and access a local SFTP server.""" + +import logging +from pathlib import Path +from typing import Callable, Generator, Optional + +import fabric +import fabric.config +import pytest + +from ..._internal import docker +from .._pytest_helpers import init_work_dir, root_tmp_dir +from ._pytest_helpers import sftp_enabled, skip_if_not_sftp +from ._sftp import ( + IgnorePolicy, + SFTPAccess, + configure, + local_access, + wait_until_sftp_server_is_live, +) + + +@pytest.fixture(scope="session") +def sftp_access(request: pytest.FixtureRequest) -> SFTPAccess: + """Fixture that returns SFTP access parameters. + + Returns + ------- + : + A URL and user to connect to the testing SFTP server. + The user has access to all initial files registered in the + database and permissions to create new files. + """ + skip_if_not_sftp(request) + return local_access() + + +@pytest.fixture(scope="session") +def sftp_base_dir( + request: pytest.FixtureRequest, tmp_path_factory: pytest.TempPathFactory +) -> Optional[Path]: + """Fixture that returns the base working directory for the SFTP server setup. + + Returns + ------- + : + A path to a directory on the host machine. + The directory gets populated by the + :func:`scitacean.testing.sftp.fixtures.sftp_fileserver` fixture. + It contains the docker configuration and volumes. + + Returns ``None`` if SFTP tests are disabled + """ + if not sftp_enabled(request): + return None + return root_tmp_dir(request, tmp_path_factory) / "scitacean-sftp" + + +@pytest.fixture(scope="session") +def sftp_data_dir(sftp_base_dir: Optional[Path]) -> Optional[Path]: + """Fixture that returns the data directory for the SFTP server setup. + + Returns + ------- + : + A path to a directory on the host machine. + The directory is mounted as ``/data`` on the server. + + Returns ``None`` if SFTP tests are disabled + """ + if sftp_base_dir is None: + return None + return sftp_base_dir / "data" + + +@pytest.fixture() +def require_sftp_fileserver(request, sftp_fileserver) -> None: + """Fixture to declare that a test needs a local SFTP server. + + Like :func:`scitacean.testing.sftp.sftp_fileserver` + but this skips the test if SFTP tests are disabled. + """ + skip_if_not_sftp(request) + + +@pytest.fixture(scope="session") +def sftp_fileserver( + request: pytest.FixtureRequest, + sftp_access: SFTPAccess, + sftp_base_dir: Optional[Path], + sftp_data_dir: Optional[Path], + sftp_connect_with_username_password, +) -> Generator[bool, None, None]: + """Fixture to declare that a test needs a local SFTP server. + + If SFTP tests are enabled, this fixture configures and starts an SFTP server + in a docker container the first time a test requests it. + The server and container will be stopped and removed at the end of the test session. + + Does nothing if the SFTP tests are disabled. + + Returns + ------- + : + True if SFTP tests are enabled and False otherwise. + """ + if sftp_base_dir is None: + yield False + return + + target_dir, counter = init_work_dir(request, sftp_base_dir, name=None) + + try: + with counter.increment() as count: + if count == 1: + _sftp_docker_up(target_dir, sftp_access) + elif not _sftp_server_is_running(): + raise RuntimeError("Expected SFTP server to be running") + yield True + finally: + with counter.decrement() as count: + if count == 0: + _sftp_docker_down(target_dir) + + +@pytest.fixture(scope="session") +def sftp_connection_config() -> fabric.config.Config: + """Fixture that returns the configuration for fabric.Connection for tests. + + Can be used to open SFTP connections if ``SFTPFileTransfer`` is not enough. + """ + config = fabric.config.Config() + config["load_sftp_configs"] = False + config["connect_kwargs"] = { + "allow_agent": False, + "look_for_keys": False, + } + return config + + +@pytest.fixture(scope="session") +def sftp_connect_with_username_password( + sftp_access: SFTPAccess, sftp_connection_config: fabric.config.Config +) -> Callable[..., fabric.Connection]: + """Fixture that returns a function to create a connection to the testing SFTP server. + + Uses username+password and rejects any other authentication attempt. + + Returns + ------- + : + A function to pass as the ``connect`` argument to + :meth:`scitacean.transfer.sftp.SFTPFileTransfer.connect_for_download` + or :meth:`scitacean.transfer.sftp.SFTPFileTransfer.connect_for_upload`. + + Examples + -------- + Explicitly connect to the + + .. code-block:: python + + def test_sftp(sftp_access, sftp_connect_with_username_password, sftp_fileserver): + sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) + with sftp.connect_for_download( + connect=sftp_connect_with_username_password + ) as connection: + # use connection + """ + + def connect(host: str, port: int, **kwargs): + if kwargs: + raise ValueError( + "sftp_connect_with_username_password must only be" + f" used without extra arguments. Got {kwargs=}" + ) + connection = fabric.Connection( + host=host, + port=port, + user=sftp_access.user.username, + config=sftp_connection_config, + connect_kwargs={ + "password": sftp_access.user.password, + **sftp_connection_config.connect_kwargs, + }, + ) + connection.client.set_missing_host_key_policy(IgnorePolicy()) + return connection + + return connect + + +def _sftp_docker_up(target_dir: Path, sftp_access: SFTPAccess) -> None: + if _sftp_server_is_running(): + raise RuntimeError("SFTP docker container is already running") + docker_compose_file = target_dir / "docker-compose.yaml" + log = logging.getLogger("scitacean.testing") + log.info("Starting docker container with SFTP server from %s", docker_compose_file) + configure(target_dir) + docker.docker_compose_up(docker_compose_file) + log.info("Waiting for SFTP docker to become accessible") + wait_until_sftp_server_is_live(sftp_access=sftp_access, max_time=20, n_tries=20) + log.info("Successfully connected to SFTP server") + # Give the user write access. + docker.docker_compose_run( + docker_compose_file, "scitacean-test-sftp-server", "chown", "1000:1000", "/data" + ) + + +def _sftp_docker_down(target_dir: Path) -> None: + # Check if container is running because the fixture can call this function + # if there was an exception in _sftp_docker_up. + # In that case, there is nothing to tear down. + if _sftp_server_is_running(): + docker_compose_file = target_dir / "docker-compose.yaml" + log = logging.getLogger("scitacean.testing") + log.info( + "Stopping docker container with SFTP server from %s", docker_compose_file + ) + docker.docker_compose_down(docker_compose_file) + + +def _sftp_server_is_running() -> bool: + return docker.container_is_running("scitacean-test-sftp") diff --git a/src/scitacean/testing/sftp/sftp_server_seed/table.csv b/src/scitacean/testing/sftp/sftp_server_seed/table.csv new file mode 100644 index 00000000..76891270 --- /dev/null +++ b/src/scitacean/testing/sftp/sftp_server_seed/table.csv @@ -0,0 +1,2 @@ +7,2 +5,2 diff --git a/src/scitacean/testing/sftp/sftp_server_seed/text.txt b/src/scitacean/testing/sftp/sftp_server_seed/text.txt new file mode 100644 index 00000000..499ea0ea --- /dev/null +++ b/src/scitacean/testing/sftp/sftp_server_seed/text.txt @@ -0,0 +1 @@ +This is some text for testing. diff --git a/src/scitacean/transfer/sftp.py b/src/scitacean/transfer/sftp.py new file mode 100644 index 00000000..c42b2c4b --- /dev/null +++ b/src/scitacean/transfer/sftp.py @@ -0,0 +1,493 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2023 SciCat Project (https://github.com/SciCatProject/scitacean) + +import os +from contextlib import contextmanager +from datetime import datetime, timedelta +from getpass import getpass +from pathlib import Path +from typing import Any, Callable, Iterator, List, Optional, Tuple, Union + +from dateutil.tz import tzoffset + +# Note that invoke and paramiko are dependencies of fabric. +from fabric import Connection +from invoke.exceptions import UnexpectedExit +from paramiko import SFTPClient +from paramiko.ssh_exception import AuthenticationException, PasswordRequiredException + +from ..dataset import Dataset +from ..error import FileUploadError +from ..file import File +from ..filesystem import RemotePath +from ..logging import get_logger +from .util import source_folder_for + +# TODO pass pid in put/revert? +# downloading does not need a pid, so it should not be required in the constructor/ + + +class SFTPDownloadConnection: + def __init__(self, *, connection: Connection) -> None: + self._connection = connection + + def download_files(self, *, remote: List[RemotePath], local: List[Path]) -> None: + """Download files from the given remote path.""" + for r, l in zip(remote, local): + self.download_file(remote=r, local=l) + + def download_file(self, *, remote: RemotePath, local: Path) -> None: + get_logger().info( + "Downloading file %s from host %s to %s", + remote, + self._connection.host, + local, + ) + self._connection.get(remote=remote.posix, local=os.fspath(local)) + + +class SFTPUploadConnection: + def __init__(self, *, connection: Connection, source_folder: RemotePath) -> None: + self._connection = connection + self._source_folder = source_folder + # self._remote_timezone = self._get_remote_timezone() + + @property + def _sftp(self) -> SFTPClient: + return self._connection.sftp() # type: ignore[no-any-return] + + @property + def source_folder(self) -> RemotePath: + return self._source_folder + + def remote_path(self, filename: Union[str, RemotePath]) -> RemotePath: + return self.source_folder / filename + + def _make_source_folder(self) -> None: + try: + self._connection.run( + f"mkdir -p {self.source_folder.posix}", hide=True, in_stream=False + ) + except OSError as exc: + raise FileUploadError( + f"Failed to create source folder {self.source_folder}: {exc.args}" + ) from None + + def upload_files(self, *files: File) -> List[File]: + """Upload files to the remote folder.""" + self._make_source_folder() + uploaded = [] + try: + for file in files: + up, exc = self._upload_file(file) + uploaded.append(up) # need to add this file in order to revert it + if exc is not None: + raise exc + except Exception: + self.revert_upload(*uploaded) + raise + return uploaded + + def _upload_file(self, file: File) -> Tuple[File, Optional[Exception]]: + if file.local_path is None: + raise ValueError( + f"Cannot upload file to {file.remote_path}, " + "the file has no local path" + ) + remote_path = self.remote_path(file.remote_path) + get_logger().info( + "Uploading file %s to %s on host %s", + file.local_path, + remote_path, + self._connection.host, + ) + st = self._sftp.put( + remotepath=remote_path.posix, localpath=os.fspath(file.local_path) + ) + if (exc := self._validate_upload(file)) is not None: + return file, exc + creation_time = ( + datetime.fromtimestamp(st.st_mtime, tz=self._remote_timezone) + if st.st_mtime + else None + ) + return ( + file.uploaded( + remote_gid=str(st.st_gid), + remote_uid=str(st.st_uid), + remote_creation_time=creation_time, + remote_perm=str(st.st_mode), + remote_size=st.st_size, + ), + None, + ) + + def _validate_upload(self, file: File) -> Optional[Exception]: + if (checksum := self._compute_checksum(file)) is None: + return None + if checksum != file.checksum(): + return FileUploadError( + f"Upload of file {file.remote_path} failed: " + f"Checksum of uploaded file ({checksum}) does not " + f"match checksum of local file ({file.checksum()}) " + f"using algorithm {file.checksum_algorithm}" + ) + return None + + def _compute_checksum(self, file: File) -> Optional[str]: + if (hash_exe := _coreutils_checksum_for(file)) is None: + return None + try: + res = self._connection.run( + f"{hash_exe} {self.remote_path(file.remote_path).posix}", + hide=True, + in_stream=False, + ) + except UnexpectedExit as exc: + if exc.result.return_code == 127: + get_logger().warning( + "Cannot validate checksum of uploaded file %s because checksum " + "algorithm '%s' is not implemented on the server.", + file.remote_path, + file.checksum_algorithm, + ) + return None + raise + return res.stdout.split(" ", 1)[0] # type: ignore[no-any-return] + + def _get_remote_timezone(self) -> tzoffset: + cmd = 'date +"%Z|%::z"' + try: + tz_str = self._connection.run( + cmd, hide=True, in_stream=False + ).stdout.strip() + except UnexpectedExit as exc: + raise FileUploadError( + f"Failed to get timezone of fileserver: {exc.args}" + ) from None + tz = _parse_remote_timezone(tz_str) + get_logger().info("Detected timezone of fileserver: %s", tz) + return tz + + def revert_upload(self, *files: File) -> None: + """Remove uploaded files from the remote folder.""" + for file in files: + self._revert_upload_single(remote=file.remote_path, local=file.local_path) + + if _folder_is_empty(self._connection, self.source_folder): + try: + get_logger().info( + "Removing empty remote directory %s on host %s", + self.source_folder, + self._connection.host, + ) + self._sftp.rmdir(self.source_folder.posix) + except UnexpectedExit as exc: + get_logger().warning( + "Failed to remove empty remote directory %s on host:\n%s", + self.source_folder, + self._connection.host, + exc.result, + ) + + def _revert_upload_single( + self, *, remote: RemotePath, local: Optional[Path] + ) -> None: + remote_path = self.remote_path(remote) + get_logger().info( + "Reverting upload of file %s to %s on host %s", + local, + remote_path, + self._connection.host, + ) + + try: + self._sftp.remove(remote_path.posix) + except UnexpectedExit as exc: + get_logger().warning( + "Error reverting file %s:\n%s", remote_path, exc.result + ) + return + + +class SFTPFileTransfer: + """Upload / download files using SFTP. + + Configuration & Authentication + ------------------------------ + The file transfer connects to the server at the address given + as the ``host`` constructor argument. + This may be + + - a full url such as ``some.fileserver.edu``, + - an IP address like ``127.0.0.1``, + - or a host defined in the user's openSFTP config file. + + The file transfer can authenticate using username+password. + It will ask for those on the command line. + However, it is **highly recommended to set up a key and use an SFTP agent!** + This increases security as Scitacean no longer has to handle credentials itself. + And it is required for automated programs where a user cannot enter credentials + on a command line. + + Upload folder + ------------- + The file transfer can take an optional ``source_folder`` as a constructor argument. + If it is given, ``SFTPFileTransfer`` uploads all files to it and ignores the + source folder set in the dataset. + If it is not given, ``SFTPFileTransfer`` uses the dataset's source folder. + + The source folder argument to ``SFTPFileTransfer`` may be a Python format string. + In that case, all format fields are replaced by the corresponding fields + of the dataset. + All non-ASCII characters and most special ASCII characters are replaced. + This should avoid broken paths from essentially random contents in datasets. + + Examples + -------- + Given + + .. code-block:: python + + dset = Dataset(type="raw", + name="my-dataset", + source_folder="/dataset/source", + ) + + This uploads to ``/dataset/source``: + + .. code-block:: python + + file_transfer = SFTPFileTransfer(host="fileserver") + + This uploads to ``/transfer/folder``: + + .. code-block:: python + + file_transfer = SFTPFileTransfer(host="fileserver", + source_folder="transfer/folder") + + This uploads to ``/transfer/my-dataset``: + (Note that ``{name}`` is replaced by ``dset.name``.) + + .. code-block:: python + + file_transfer = SFTPFileTransfer(host="fileserver", + source_folder="transfer/{name}") + + A useful approach is to include a unique ID in the source folder, for example + ``"/some/base/folder/{uid}"``, to avoid clashes between different datasets. + Scitacean will fill in the ``"{uid}"`` placeholder with a new UUID4. + """ + + def __init__( + self, + *, + host: str, + port: Optional[int] = None, + source_folder: Optional[Union[str, RemotePath]] = None, + ) -> None: + """Construct a new SFTP file transfer. + + Parameters + ---------- + host: + URL or name of the server to connect to. + port: + Port of the server. + source_folder: + Upload files to this folder if set. + Otherwise, upload to the dataset's source_folder. + Ignored when downloading files. + """ + self._host = host + self._port = port + self._source_folder_pattern = ( + RemotePath(source_folder) + if isinstance(source_folder, str) + else source_folder + ) + + def source_folder_for(self, dataset: Dataset) -> RemotePath: + """Return the source folder used for the given dataset.""" + return source_folder_for(dataset, self._source_folder_pattern) + + @contextmanager + def connect_for_download( + self, connect: Optional[Callable[..., Connection]] = None + ) -> Iterator[SFTPDownloadConnection]: + """Create a connection for downloads, use as a context manager. + + Parameters + ---------- + connect: + A function that creates and returns a :class:`fabric.connection.Connection` + object. + Will first be called with only ``host`` and ``port``. + If this fails (by raising + :class:`paramiko.ssh_exception.AuthenticationException`), the function is + called with ``host``, ``port``, and, optionally, ``user`` and + ``connection_kwargs`` depending on the authentication method. + Raising :class:`paramiko.ssh_exception.AuthenticationException` in the 2nd + call or any other exception in the 1st signals failure of + ``connect_for_download``. + """ + con = _connect(self._host, self._port, connect=connect) + try: + yield SFTPDownloadConnection(connection=con) + finally: + con.close() + + @contextmanager + def connect_for_upload( + self, dataset: Dataset, connect: Optional[Callable[..., Connection]] = None + ) -> Iterator[SFTPUploadConnection]: + """Create a connection for uploads, use as a context manager. + + Parameters + ---------- + dataset: + The connection will be used to upload files of this dataset. + Used to determine the target folder. + connect: + A function that creates and returns a :class:`fabric.connection.Connection` + object. + Will first be called with only ``host`` and ``port``. + If this fails (by raising + :class:`paramiko.ssh_exception.AuthenticationException`), the function is + called with ``host``, ``port``, and, optionally, ``user`` and + ``connection_kwargs`` depending on the authentication method. + Raising :class:`paramiko.ssh_exception.AuthenticationException` in the 2nd + call or any other exception in the 1st signals failure of + ``connect_for_upload``. + """ + source_folder = self.source_folder_for(dataset) + con = _connect(self._host, self._port, connect=connect) + try: + yield SFTPUploadConnection( + connection=con, + source_folder=source_folder, + ) + finally: + con.close() + + +def _ask_for_key_passphrase() -> str: + return getpass("The private key is encrypted, enter passphrase: ") + + +def _ask_for_credentials(host: str) -> Tuple[str, str]: + print(f"You need to authenticate to access {host}") # noqa: T201 + username = input("Username: ") + password = getpass("Password: ") + return username, password + + +def _generic_connect( + host: str, + port: Optional[int], + connect: Optional[Callable[..., Connection]], + **kwargs: Any, +) -> Connection: + if connect is None: + con = Connection(host=host, port=port, **kwargs) + else: + con = connect(host=host, port=port, **kwargs) + con.open() + return con + + +def _unauthenticated_connect( + host: str, port: Optional[int], connect: Optional[Callable[..., Connection]] +) -> Connection: + return _generic_connect(host=host, port=port, connect=connect) + + +def _authenticated_connect( + host: str, + port: Optional[int], + connect: Optional[Callable[..., Connection]], + exc: AuthenticationException, +) -> Connection: + # TODO fail fast if output going to file + if isinstance(exc, PasswordRequiredException) and "encrypted" in exc.args[0]: + # TODO does not work anymore, exception is always AuthenticationException + return _generic_connect( + host=host, + port=port, + connect=connect, + connect_kwargs={"passphrase": _ask_for_key_passphrase()}, + ) + else: + username, password = _ask_for_credentials(host) + return _generic_connect( + host=host, + port=port, + connect=connect, + user=username, + connect_kwargs={"password": password}, + ) + + +def _connect( + host: str, port: Optional[int], connect: Optional[Callable[..., Connection]] +) -> Connection: + try: + try: + return _unauthenticated_connect(host, port, connect) + except AuthenticationException as exc: + return _authenticated_connect(host, port, connect, exc) + except Exception as exc: + # We pass secrets as arguments to functions called in this block, and those + # can be leaked through exception handlers. So catch all exceptions + # and strip the backtrace up to this point to hide those secrets. + raise type(exc)(exc.args) from None + except BaseException as exc: + raise type(exc)(exc.args) from None + + +def _folder_is_empty(con: Connection, path: RemotePath) -> bool: + try: + ls: str = con.run(f"ls {path.posix}", hide=True, in_stream=False).stdout + return ls == "" + except UnexpectedExit: + return False # no further processing is needed in this case + + +def _coreutils_checksum_for(file: File) -> Optional[str]: + # blake2s is not supported because `b2sum -l 256` produces a different digest + # and I don't know why. + supported = { + "md5": "md5sum -b", + "sha256": "sha256sum -b", + "sha384": "sha384sum -b", + "sha512": "sha512sum -b", + "blake2b": "b2sum -l 512 -b", + } + algorithm = file.checksum_algorithm + if algorithm == "blake2s" or algorithm not in supported: + get_logger().warning( + "Cannot validate checksum of uploaded file %s because checksum algorithm " + "'%s' is not supported by scitacean for remote files.", + file.remote_path, + file.checksum_algorithm, + ) + return None + return supported[algorithm] + + +# Using `date +"%Z"` returns a timezone abbreviation like CET or EST. +# dateutil.tz.gettz can parse this abbreviation and return a tzinfo object. +# However, on Windows, it returns `None` if the string refers to the local timezone. +# This is indistinguishable from an unrecognised timezone, +# where gettz also returns `None`. +# To avoid this, use an explicit offset obtained from `date +"%::z"`. +# The timezone name is only used for debugging and not interpreted by +# dateutil or datetime. +def _parse_remote_timezone(tz_str: str) -> tzoffset: + # tz_str is expected to be of the form + # |:: + # as produced by `date +"%Z|%::z"` + name, offset = tz_str.split("|") + hours, minutes, seconds = map(int, offset.split(":")) + return tzoffset(name, timedelta(hours=hours, minutes=minutes, seconds=seconds)) diff --git a/tests/conftest.py b/tests/conftest.py index 89893bbc..7841902f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,10 +5,12 @@ import pytest from scitacean.testing.backend import add_pytest_option as add_backend_option +from scitacean.testing.sftp import add_pytest_option as add_sftp_option from scitacean.testing.ssh import add_pytest_option as add_ssh_option pytest_plugins = ( "scitacean.testing.backend.fixtures", + "scitacean.testing.sftp.fixtures", "scitacean.testing.ssh.fixtures", ) @@ -27,4 +29,5 @@ def pytest_addoption(parser: pytest.Parser) -> None: add_backend_option(parser) + add_sftp_option(parser) add_ssh_option(parser) diff --git a/tests/transfer/sftp_test.py b/tests/transfer/sftp_test.py new file mode 100644 index 00000000..7e3e4b89 --- /dev/null +++ b/tests/transfer/sftp_test.py @@ -0,0 +1,404 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2022 Scitacean contributors (https://github.com/SciCatProject/scitacean) +# mypy: disable-error-code="no-untyped-def, return-value, arg-type, union-attr" + +import dataclasses +import tempfile +from contextlib import contextmanager +from datetime import datetime, timedelta, timezone +from pathlib import Path +from typing import Callable, Iterator, Optional + +import fabric +import paramiko +import pytest +from fabric import Connection + +from scitacean import Dataset, File, FileUploadError, RemotePath +from scitacean.testing.client import FakeClient +from scitacean.testing.sftp import IgnorePolicy, skip_if_not_sftp +from scitacean.transfer.sftp import ( + SFTPDownloadConnection, + SFTPFileTransfer, + SFTPUploadConnection, +) + + +@pytest.fixture(scope="session", autouse=True) +def server(request, sftp_fileserver): + skip_if_not_sftp(request) + + +def test_download_one_file(sftp_access, sftp_connect_with_username_password, tmp_path): + sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) + with sftp.connect_for_download(connect=sftp_connect_with_username_password) as con: + con.download_files( + remote=[RemotePath("/data/seed/text.txt")], local=[tmp_path / "text.txt"] + ) + assert ( + tmp_path.joinpath("text.txt").read_text() == "This is some text for testing.\n" + ) + + +def test_download_two_files(sftp_access, sftp_connect_with_username_password, tmp_path): + sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) + with sftp.connect_for_download(connect=sftp_connect_with_username_password) as con: + con.download_files( + remote=[ + RemotePath("/data/seed/table.csv"), + RemotePath("/data/seed/text.txt"), + ], + local=[tmp_path / "local-table.csv", tmp_path / "text.txt"], + ) + assert tmp_path.joinpath("local-table.csv").read_text() == "7,2\n5,2\n" + assert ( + tmp_path.joinpath("text.txt").read_text() == "This is some text for testing.\n" + ) + + +def test_upload_one_file_source_folder_in_dataset( + sftp_access, sftp_connect_with_username_password, tmp_path, sftp_data_dir +): + ds = Dataset(type="raw", source_folder=RemotePath("/data/upload")) + tmp_path.joinpath("file0.txt").write_text("File to test upload123") + + sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) + with sftp.connect_for_upload( + dataset=ds, connect=sftp_connect_with_username_password + ) as con: + assert con.source_folder == RemotePath("/data/upload") + con.upload_files( + File.from_local(path=tmp_path / "file0.txt", remote_path="upload_0.txt") + ) + + assert ( + sftp_data_dir.joinpath("upload", "upload_0.txt").read_text() + == "File to test upload123" + ) + + +# def test_upload_one_file_source_folder_in_transfer( +# sftp_access, sftp_connect_with_username_password, tmp_path, sftp_data_dir +# ): +# ds = Dataset(type="raw", owner="librarian") +# with open(tmp_path / "file1.txt", "w") as f: +# f.write("File no. 2") +# +# sftp = SFTPFileTransfer( +# host=sftp_access.host, +# port=sftp_access.port, +# source_folder="/data/upload/{owner}", +# ) +# with sftp.connect_for_upload( +# dataset=ds, connect=sftp_connect_with_username_password +# ) as con: +# assert con.source_folder == RemotePath("/data/upload/librarian") +# con.upload_files( +# File.from_local( +# path=tmp_path / "file1.txt", remote_path=RemotePath("upload_1.txt") +# ) +# ) +# +# with open(sftp_data_dir / "upload" / "librarian" / "upload_1.txt", "r") as f: +# assert f.read() == "File no. 2" +# +# +# def test_upload_two_files( +# sftp_access, sftp_connect_with_username_password, tmp_path, sftp_data_dir +# ): +# ds = Dataset(type="raw", source_folder=RemotePath("/data/upload2")) +# with open(tmp_path / "file2.1.md", "w") as f: +# f.write("First part of file 2") +# with open(tmp_path / "file2.2.md", "w") as f: +# f.write("Second part of file 2") +# +# sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) +# with sftp.connect_for_upload( +# dataset=ds, connect=sftp_connect_with_username_password +# ) as con: +# assert con.source_folder == RemotePath("/data/upload2") +# con.upload_files( +# File.from_local(path=tmp_path / "file2.1.md", base_path=tmp_path), +# File.from_local(path=tmp_path / "file2.2.md", base_path=tmp_path), +# ) +# +# with open(sftp_data_dir / "upload2" / "file2.1.md", "r") as f: +# assert f.read() == "First part of file 2" +# with open(sftp_data_dir / "upload2" / "file2.2.md", "r") as f: +# assert f.read() == "Second part of file 2" +# +# +# def test_revert_all_uploaded_files_single( +# sftp_access, sftp_connect_with_username_password, tmp_path, sftp_data_dir +# ): +# ds = Dataset(type="raw", source_folder=RemotePath("/data/revert-all-test-1")) +# with open(tmp_path / "file3.txt", "w") as f: +# f.write("File that should get reverted") +# +# sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) +# with sftp.connect_for_upload( +# dataset=ds, connect=sftp_connect_with_username_password +# ) as con: +# file = File.from_local(path=tmp_path / "file3.txt", base_path=tmp_path) +# con.upload_files(file) +# con.revert_upload(file) +# +# assert "revert-all-test-1" not in (p.name for p in sftp_data_dir.iterdir()) +# +# +# def test_revert_all_uploaded_files_two( +# sftp_access, sftp_connect_with_username_password, tmp_path, sftp_data_dir +# ): +# ds = Dataset(type="raw", source_folder=RemotePath("/data/revert-all-test-2")) +# with open(tmp_path / "file3.1.txt", "w") as f: +# f.write("File that should get reverted 1") +# with open(tmp_path / "file3.2.txt", "w") as f: +# f.write("File that should get reverted 2") +# +# sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) +# with sftp.connect_for_upload( +# dataset=ds, connect=sftp_connect_with_username_password +# ) as con: +# file1 = File.from_local(path=tmp_path / "file3.1.txt", base_path=tmp_path) +# file2 = File.from_local(path=tmp_path / "file3.2.txt", base_path=tmp_path) +# con.upload_files(file1, file2) +# con.revert_upload(file1, file2) +# +# assert "revert-all-test-2" not in (p.name for p in sftp_data_dir.iterdir()) +# +# +# def test_revert_one_uploaded_file( +# sftp_access, sftp_connect_with_username_password, tmp_path, sftp_data_dir +# ): +# ds = Dataset(type="raw", source_folder=RemotePath("/data/revert-test")) +# with open(tmp_path / "file4.txt", "w") as f: +# f.write("File that should get reverted") +# with open(tmp_path / "file5.txt", "w") as f: +# f.write("File that should be kept") +# +# sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) +# with sftp.connect_for_upload( +# dataset=ds, connect=sftp_connect_with_username_password +# ) as con: +# file4 = File.from_local(path=tmp_path / "file4.txt", base_path=tmp_path) +# file5 = File.from_local(path=tmp_path / "file5.txt", base_path=tmp_path) +# con.upload_files(file4, file5) +# con.revert_upload(file4) +# +# assert "file4.txt" not in (p.name for p in (sftp_data_dir / "revert-test").iterdir()) +# with open(sftp_data_dir / "revert-test" / "file5.txt", "r") as f: +# assert f.read() == "File that should be kept" +# +# +# def test_stat_uploaded_file( +# sftp_access, sftp_connect_with_username_password, tmp_path, sftp_data_dir +# ): +# ds = Dataset(type="raw", source_folder=RemotePath("/data/upload6")) +# with open(tmp_path / "file6.txt", "w") as f: +# f.write("File to test upload no 6") +# +# sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) +# with sftp.connect_for_upload( +# dataset=ds, connect=sftp_connect_with_username_password +# ) as con: +# [uploaded] = con.upload_files( +# File.from_local(path=tmp_path / "file6.txt", remote_path="upload_6.txt") +# ) +# +# st = (sftp_data_dir / "upload6" / "upload_6.txt").stat() +# assert uploaded.size == st.st_size +# +# # Set in docker-compose +# assert uploaded.remote_uid == "1000" +# assert uploaded.remote_gid == "1000" +# +# uploaded = dataclasses.replace(uploaded, local_path=None) +# assert datetime.now(tz=timezone.utc) - uploaded.creation_time < timedelta(seconds=5) +# +# +# class CorruptingSFTP(paramiko.SFTPClient): +# """Appends bytes to uploaded files to simulate a broken transfer.""" +# +# def put(self, localpath, remotepath, callback=None, confirm=True): +# with open(localpath, "r") as f: +# content = f.read() +# with tempfile.TemporaryDirectory() as tempdir: +# corrupted_path = Path(tempdir) / "corrupted" +# with open(corrupted_path, "w") as f: +# f.write(content + "\nevil bytes") +# super().put(str(corrupted_path), remotepath, callback, confirm) +# +# +# class CorruptingTransfer(paramiko.Transport): +# """Uses CorruptingSFTP to simulate a broken connection.""" +# +# def open_sftp_client(self) -> paramiko.SFTPClient: +# return CorruptingSFTP.from_transport(self) +# +# +# @pytest.fixture() +# def sftp_corrupting_connect(sftp_access, sftp_connection_config): +# def connect(host: str, port: int, **kwargs): +# if kwargs: +# raise ValueError( +# "connect_with_username_password must only be" +# f" used without extra arguments. Got {kwargs=}" +# ) +# connection = fabric.Connection( +# host=host, +# port=port, +# user=sftp_access.user.username, +# config=sftp_connection_config, +# connect_kwargs={ +# "password": sftp_access.user.password, +# "transport_factory": CorruptingTransfer, +# **sftp_connection_config.connect_kwargs, +# }, +# ) +# connection.client.set_missing_host_key_policy(IgnorePolicy()) +# return connection +# +# return connect +# +# +# def test_upload_file_detects_checksum_mismatch( +# sftp_access, sftp_corrupting_connect, tmp_path, sftp_data_dir +# ): +# ds = Dataset( +# type="raw", +# source_folder=RemotePath("/data/upload7"), +# checksum_algorithm="blake2b", +# ) +# with open(tmp_path / "file7.txt", "w") as f: +# f.write("File to test upload no 7") +# +# sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) +# with sftp.connect_for_upload(dataset=ds, connect=sftp_corrupting_connect) as con: +# with pytest.raises(FileUploadError): +# con.upload_files( +# dataclasses.replace( +# File.from_local( +# path=tmp_path / "file7.txt", +# remote_path=RemotePath("upload_7.txt"), +# ), +# checksum_algorithm="blake2b", +# ) +# ) +# +# assert "upload7" not in (p.name for p in sftp_data_dir.iterdir()) +# +# +# class RaisingSFTP(paramiko.SFTPClient): +# def put(self, localpath, remotepath, callback=None, confirm=True): +# raise RuntimeError("Upload disabled") +# +# +# class RaisingTransfer(paramiko.Transport): +# def open_sftp_client(self) -> paramiko.SFTPClient: +# return RaisingSFTP.from_transport(self) +# +# +# @pytest.fixture() +# def sftp_raising_connect(sftp_access, sftp_connection_config): +# def connect(host: str, port: int, **kwargs): +# if kwargs: +# raise ValueError( +# "connect_with_username_password must only be" +# f" used without extra arguments. Got {kwargs=}" +# ) +# connection = fabric.Connection( +# host=host, +# port=port, +# user=sftp_access.user.username, +# config=sftp_connection_config, +# connect_kwargs={ +# "password": sftp_access.user.password, +# "transport_factory": RaisingTransfer, +# **sftp_connection_config.connect_kwargs, +# }, +# ) +# connection.client.set_missing_host_key_policy(IgnorePolicy()) +# return connection +# +# return connect +# +# +# def test_upload_file_reverts_if_upload_fails( +# sftp_access, sftp_raising_connect, tmp_path, sftp_data_dir +# ): +# ds = Dataset(type="raw", source_folder=RemotePath("/data/upload8")) +# with open(tmp_path / "file8.txt", "w") as f: +# f.write("File to test upload no 8") +# +# sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) +# with sftp.connect_for_upload(dataset=ds, connect=sftp_raising_connect) as con: +# with pytest.raises(RuntimeError): +# con.upload_files( +# File.from_local( +# path=tmp_path / "file8.txt", +# remote_path=RemotePath("upload_8.txt"), +# ) +# ) +# +# assert "upload8" not in (p.name for p in sftp_data_dir.iterdir()) +# +# +# class SFTPTestFileTransfer(SFTPFileTransfer): +# def __init__(self, connect, **kwargs): +# super().__init__(**kwargs) +# self.connect = connect +# +# @contextmanager +# def connect_for_download( +# self, connect: Optional[Callable[..., Connection]] = None +# ) -> Iterator[SFTPDownloadConnection]: +# connect = connect if connect is not None else self.connect +# with super().connect_for_download(connect=connect) as connection: +# yield connection +# +# @contextmanager +# def connect_for_upload( +# self, dataset: Dataset, connect: Optional[Callable[..., Connection]] = None +# ) -> Iterator[SFTPUploadConnection]: +# connect = connect if connect is not None else self.connect +# with super().connect_for_upload(dataset=dataset, connect=connect) as connection: +# yield connection +# +# +# # This test is referenced in the docs. +# def test_client_with_sftp( +# require_sftp_fileserver, +# sftp_access, +# sftp_connect_with_username_password, +# sftp_data_dir, +# tmp_path, +# ): +# tmp_path.joinpath("file1.txt").write_text("File contents") +# +# client = FakeClient.without_login( +# url="", +# file_transfer=SFTPTestFileTransfer( +# connect=sftp_connect_with_username_password, +# host=sftp_access.host, +# port=sftp_access.port, +# ), +# ) +# ds = Dataset( +# access_groups=["group1"], +# contact_email="p.stibbons@uu.am", +# creation_location="UU", +# creation_time=datetime(2023, 6, 23, 10, 0, 0), +# owner="PonderStibbons", +# owner_group="uu", +# principal_investigator="MustrumRidcully", +# source_folder="/data", +# type="raw", +# ) +# ds.add_local_files(tmp_path / "file1.txt", base_path=tmp_path) +# finalized = client.upload_new_dataset_now(ds) +# +# downloaded = client.get_dataset(finalized.pid) +# downloaded = client.download_files(downloaded, target=tmp_path / "download") +# +# assert sftp_data_dir.joinpath("file1.txt").read_text() == "File contents" +# assert downloaded.files[0].local_path.read_text() == "File contents" From 9b7e39c61c5630fdc6e31b3e0a48607e19bd9a81 Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Fri, 25 Aug 2023 13:54:17 +0200 Subject: [PATCH 02/21] Use local now as creation time --- src/scitacean/transfer/sftp.py | 43 ++-------------------------------- 1 file changed, 2 insertions(+), 41 deletions(-) diff --git a/src/scitacean/transfer/sftp.py b/src/scitacean/transfer/sftp.py index c42b2c4b..a9587a0f 100644 --- a/src/scitacean/transfer/sftp.py +++ b/src/scitacean/transfer/sftp.py @@ -3,13 +3,11 @@ import os from contextlib import contextmanager -from datetime import datetime, timedelta +from datetime import datetime, timezone from getpass import getpass from pathlib import Path from typing import Any, Callable, Iterator, List, Optional, Tuple, Union -from dateutil.tz import tzoffset - # Note that invoke and paramiko are dependencies of fabric. from fabric import Connection from invoke.exceptions import UnexpectedExit @@ -50,7 +48,6 @@ class SFTPUploadConnection: def __init__(self, *, connection: Connection, source_folder: RemotePath) -> None: self._connection = connection self._source_folder = source_folder - # self._remote_timezone = self._get_remote_timezone() @property def _sftp(self) -> SFTPClient: @@ -106,16 +103,11 @@ def _upload_file(self, file: File) -> Tuple[File, Optional[Exception]]: ) if (exc := self._validate_upload(file)) is not None: return file, exc - creation_time = ( - datetime.fromtimestamp(st.st_mtime, tz=self._remote_timezone) - if st.st_mtime - else None - ) return ( file.uploaded( remote_gid=str(st.st_gid), remote_uid=str(st.st_uid), - remote_creation_time=creation_time, + remote_creation_time=datetime.now().astimezone(timezone.utc), remote_perm=str(st.st_mode), remote_size=st.st_size, ), @@ -155,20 +147,6 @@ def _compute_checksum(self, file: File) -> Optional[str]: raise return res.stdout.split(" ", 1)[0] # type: ignore[no-any-return] - def _get_remote_timezone(self) -> tzoffset: - cmd = 'date +"%Z|%::z"' - try: - tz_str = self._connection.run( - cmd, hide=True, in_stream=False - ).stdout.strip() - except UnexpectedExit as exc: - raise FileUploadError( - f"Failed to get timezone of fileserver: {exc.args}" - ) from None - tz = _parse_remote_timezone(tz_str) - get_logger().info("Detected timezone of fileserver: %s", tz) - return tz - def revert_upload(self, *files: File) -> None: """Remove uploaded files from the remote folder.""" for file in files: @@ -474,20 +452,3 @@ def _coreutils_checksum_for(file: File) -> Optional[str]: ) return None return supported[algorithm] - - -# Using `date +"%Z"` returns a timezone abbreviation like CET or EST. -# dateutil.tz.gettz can parse this abbreviation and return a tzinfo object. -# However, on Windows, it returns `None` if the string refers to the local timezone. -# This is indistinguishable from an unrecognised timezone, -# where gettz also returns `None`. -# To avoid this, use an explicit offset obtained from `date +"%::z"`. -# The timezone name is only used for debugging and not interpreted by -# dateutil or datetime. -def _parse_remote_timezone(tz_str: str) -> tzoffset: - # tz_str is expected to be of the form - # |:: - # as produced by `date +"%Z|%::z"` - name, offset = tz_str.split("|") - hours, minutes, seconds = map(int, offset.split(":")) - return tzoffset(name, timedelta(hours=hours, minutes=minutes, seconds=seconds)) From c23a1629d33bfe3f1272738018280afd0ae2029c Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Fri, 25 Aug 2023 14:32:49 +0200 Subject: [PATCH 03/21] Add RemotePath.parent --- src/scitacean/filesystem.py | 9 +++++++++ tests/filesystem_test.py | 12 ++++++++++++ 2 files changed, 21 insertions(+) diff --git a/src/scitacean/filesystem.py b/src/scitacean/filesystem.py index 59223a2d..74cf5185 100644 --- a/src/scitacean/filesystem.py +++ b/src/scitacean/filesystem.py @@ -134,6 +134,15 @@ def suffix(self) -> Optional[str]: return None return "." + parts[1] + @property + def parent(self) -> RemotePath: + """The logical parent of the path.""" + parts = self._path.rstrip("/").rsplit("/", 1) + base = "/" if self._path.startswith("/") else "." + if len(parts) == 1: + return RemotePath(base) + return RemotePath(parts[0] or base) + def truncated(self, max_length: int = 255) -> RemotePath: """Return a new remote path with all path segments truncated. diff --git a/tests/filesystem_test.py b/tests/filesystem_test.py index 084c6de3..cb424d4d 100644 --- a/tests/filesystem_test.py +++ b/tests/filesystem_test.py @@ -143,6 +143,18 @@ def test_remote_path_suffix(): assert RemotePath("source/file").suffix is None +def test_remote_path_parent(): + assert RemotePath("/").parent == RemotePath("/") + assert RemotePath("/folder").parent == RemotePath("/") + assert RemotePath("/folder/").parent == RemotePath("/") + assert RemotePath("/folder/sub").parent == RemotePath("/folder") + + assert RemotePath("").parent == RemotePath(".") + assert RemotePath(".").parent == RemotePath(".") + assert RemotePath("relative").parent == RemotePath(".") + assert RemotePath("relative/sub").parent == RemotePath("relative") + + def test_remote_path_truncated(): assert RemotePath("something-long.txt").truncated(10) == "someth.txt" assert RemotePath("longlonglong/short").truncated(5) == "longl/short" From ac895f341ec33d9198c6be0282662fc392adabbb Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Fri, 25 Aug 2023 15:10:06 +0200 Subject: [PATCH 04/21] Make SFTP file transfer pass the tests --- src/scitacean/transfer/sftp.py | 100 +++-- tests/transfer/sftp_test.py | 689 +++++++++++++++++---------------- 2 files changed, 412 insertions(+), 377 deletions(-) diff --git a/src/scitacean/transfer/sftp.py b/src/scitacean/transfer/sftp.py index a9587a0f..24967dbb 100644 --- a/src/scitacean/transfer/sftp.py +++ b/src/scitacean/transfer/sftp.py @@ -11,7 +11,7 @@ # Note that invoke and paramiko are dependencies of fabric. from fabric import Connection from invoke.exceptions import UnexpectedExit -from paramiko import SFTPClient +from paramiko import SFTPAttributes, SFTPClient from paramiko.ssh_exception import AuthenticationException, PasswordRequiredException from ..dataset import Dataset @@ -21,9 +21,6 @@ from ..logging import get_logger from .util import source_folder_for -# TODO pass pid in put/revert? -# downloading does not need a pid, so it should not be required in the constructor/ - class SFTPDownloadConnection: def __init__(self, *, connection: Connection) -> None: @@ -62,9 +59,7 @@ def remote_path(self, filename: Union[str, RemotePath]) -> RemotePath: def _make_source_folder(self) -> None: try: - self._connection.run( - f"mkdir -p {self.source_folder.posix}", hide=True, in_stream=False - ) + _mkdir_remote(self._sftp, self.source_folder) except OSError as exc: raise FileUploadError( f"Failed to create source folder {self.source_folder}: {exc.args}" @@ -127,32 +122,18 @@ def _validate_upload(self, file: File) -> Optional[Exception]: return None def _compute_checksum(self, file: File) -> Optional[str]: - if (hash_exe := _coreutils_checksum_for(file)) is None: + if file.checksum_algorithm is None: return None - try: - res = self._connection.run( - f"{hash_exe} {self.remote_path(file.remote_path).posix}", - hide=True, - in_stream=False, - ) - except UnexpectedExit as exc: - if exc.result.return_code == 127: - get_logger().warning( - "Cannot validate checksum of uploaded file %s because checksum " - "algorithm '%s' is not implemented on the server.", - file.remote_path, - file.checksum_algorithm, - ) - return None - raise - return res.stdout.split(" ", 1)[0] # type: ignore[no-any-return] + return _compute_remote_checksum( + self._sftp, self.remote_path(file.remote_path), file.checksum_algorithm + ) def revert_upload(self, *files: File) -> None: """Remove uploaded files from the remote folder.""" for file in files: self._revert_upload_single(remote=file.remote_path, local=file.local_path) - if _folder_is_empty(self._connection, self.source_folder): + if _remote_folder_is_empty(self._sftp, self.source_folder): try: get_logger().info( "Removing empty remote directory %s on host %s", @@ -424,31 +405,46 @@ def _connect( raise type(exc)(exc.args) from None -def _folder_is_empty(con: Connection, path: RemotePath) -> bool: - try: - ls: str = con.run(f"ls {path.posix}", hide=True, in_stream=False).stdout - return ls == "" - except UnexpectedExit: - return False # no further processing is needed in this case - - -def _coreutils_checksum_for(file: File) -> Optional[str]: - # blake2s is not supported because `b2sum -l 256` produces a different digest - # and I don't know why. - supported = { - "md5": "md5sum -b", - "sha256": "sha256sum -b", - "sha384": "sha384sum -b", - "sha512": "sha512sum -b", - "blake2b": "b2sum -l 512 -b", - } - algorithm = file.checksum_algorithm - if algorithm == "blake2s" or algorithm not in supported: - get_logger().warning( - "Cannot validate checksum of uploaded file %s because checksum algorithm " - "'%s' is not supported by scitacean for remote files.", - file.remote_path, - file.checksum_algorithm, +def _remote_folder_is_empty(sftp: SFTPClient, path: RemotePath) -> bool: + return not sftp.listdir(path.posix) + + +def _mkdir_remote(sftp: SFTPClient, path: RemotePath) -> None: + if (parent := path.parent) not in (".", "/"): + _mkdir_remote(sftp, parent) + + st_stat = _try_remote_stat(sftp, path) + if st_stat is None: + sftp.mkdir(path.posix) + elif not _is_remote_dir(st_stat): + raise FileExistsError( + f"Cannot make directory because path points to a file: {path}" ) + + +def _try_remote_stat(sftp: SFTPClient, path: RemotePath) -> Optional[SFTPAttributes]: + try: + return sftp.stat(path.posix) + except FileNotFoundError: return None - return supported[algorithm] + + +def _is_remote_dir(st_stat: SFTPAttributes) -> bool: + try: + return st_stat.st_mode & 0o040000 == 0o040000 + except FileNotFoundError: + return False + + +def _compute_remote_checksum( + sftp: SFTPClient, path: RemotePath, checksum_algorithm: str +) -> Optional[str]: + with sftp.open(path.posix, "r") as f: + try: + return f.check(checksum_algorithm).decode("utf-8") + except OSError as exc: + # Many servers don't support this. + # See https://docs.paramiko.org/en/latest/api/sftp.html + if "Operation unsupported" in exc.args: + return None + raise diff --git a/tests/transfer/sftp_test.py b/tests/transfer/sftp_test.py index 7e3e4b89..1a6c6f8b 100644 --- a/tests/transfer/sftp_test.py +++ b/tests/transfer/sftp_test.py @@ -24,6 +24,8 @@ ) +# TODo deep source folder +# TODO existing source folder @pytest.fixture(scope="session", autouse=True) def server(request, sftp_fileserver): skip_if_not_sftp(request) @@ -77,328 +79,365 @@ def test_upload_one_file_source_folder_in_dataset( ) -# def test_upload_one_file_source_folder_in_transfer( -# sftp_access, sftp_connect_with_username_password, tmp_path, sftp_data_dir -# ): -# ds = Dataset(type="raw", owner="librarian") -# with open(tmp_path / "file1.txt", "w") as f: -# f.write("File no. 2") -# -# sftp = SFTPFileTransfer( -# host=sftp_access.host, -# port=sftp_access.port, -# source_folder="/data/upload/{owner}", -# ) -# with sftp.connect_for_upload( -# dataset=ds, connect=sftp_connect_with_username_password -# ) as con: -# assert con.source_folder == RemotePath("/data/upload/librarian") -# con.upload_files( -# File.from_local( -# path=tmp_path / "file1.txt", remote_path=RemotePath("upload_1.txt") -# ) -# ) -# -# with open(sftp_data_dir / "upload" / "librarian" / "upload_1.txt", "r") as f: -# assert f.read() == "File no. 2" -# -# -# def test_upload_two_files( -# sftp_access, sftp_connect_with_username_password, tmp_path, sftp_data_dir -# ): -# ds = Dataset(type="raw", source_folder=RemotePath("/data/upload2")) -# with open(tmp_path / "file2.1.md", "w") as f: -# f.write("First part of file 2") -# with open(tmp_path / "file2.2.md", "w") as f: -# f.write("Second part of file 2") -# -# sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) -# with sftp.connect_for_upload( -# dataset=ds, connect=sftp_connect_with_username_password -# ) as con: -# assert con.source_folder == RemotePath("/data/upload2") -# con.upload_files( -# File.from_local(path=tmp_path / "file2.1.md", base_path=tmp_path), -# File.from_local(path=tmp_path / "file2.2.md", base_path=tmp_path), -# ) -# -# with open(sftp_data_dir / "upload2" / "file2.1.md", "r") as f: -# assert f.read() == "First part of file 2" -# with open(sftp_data_dir / "upload2" / "file2.2.md", "r") as f: -# assert f.read() == "Second part of file 2" -# -# -# def test_revert_all_uploaded_files_single( -# sftp_access, sftp_connect_with_username_password, tmp_path, sftp_data_dir -# ): -# ds = Dataset(type="raw", source_folder=RemotePath("/data/revert-all-test-1")) -# with open(tmp_path / "file3.txt", "w") as f: -# f.write("File that should get reverted") -# -# sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) -# with sftp.connect_for_upload( -# dataset=ds, connect=sftp_connect_with_username_password -# ) as con: -# file = File.from_local(path=tmp_path / "file3.txt", base_path=tmp_path) -# con.upload_files(file) -# con.revert_upload(file) -# -# assert "revert-all-test-1" not in (p.name for p in sftp_data_dir.iterdir()) -# -# -# def test_revert_all_uploaded_files_two( -# sftp_access, sftp_connect_with_username_password, tmp_path, sftp_data_dir -# ): -# ds = Dataset(type="raw", source_folder=RemotePath("/data/revert-all-test-2")) -# with open(tmp_path / "file3.1.txt", "w") as f: -# f.write("File that should get reverted 1") -# with open(tmp_path / "file3.2.txt", "w") as f: -# f.write("File that should get reverted 2") -# -# sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) -# with sftp.connect_for_upload( -# dataset=ds, connect=sftp_connect_with_username_password -# ) as con: -# file1 = File.from_local(path=tmp_path / "file3.1.txt", base_path=tmp_path) -# file2 = File.from_local(path=tmp_path / "file3.2.txt", base_path=tmp_path) -# con.upload_files(file1, file2) -# con.revert_upload(file1, file2) -# -# assert "revert-all-test-2" not in (p.name for p in sftp_data_dir.iterdir()) -# -# -# def test_revert_one_uploaded_file( -# sftp_access, sftp_connect_with_username_password, tmp_path, sftp_data_dir -# ): -# ds = Dataset(type="raw", source_folder=RemotePath("/data/revert-test")) -# with open(tmp_path / "file4.txt", "w") as f: -# f.write("File that should get reverted") -# with open(tmp_path / "file5.txt", "w") as f: -# f.write("File that should be kept") -# -# sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) -# with sftp.connect_for_upload( -# dataset=ds, connect=sftp_connect_with_username_password -# ) as con: -# file4 = File.from_local(path=tmp_path / "file4.txt", base_path=tmp_path) -# file5 = File.from_local(path=tmp_path / "file5.txt", base_path=tmp_path) -# con.upload_files(file4, file5) -# con.revert_upload(file4) -# -# assert "file4.txt" not in (p.name for p in (sftp_data_dir / "revert-test").iterdir()) -# with open(sftp_data_dir / "revert-test" / "file5.txt", "r") as f: -# assert f.read() == "File that should be kept" -# -# -# def test_stat_uploaded_file( -# sftp_access, sftp_connect_with_username_password, tmp_path, sftp_data_dir -# ): -# ds = Dataset(type="raw", source_folder=RemotePath("/data/upload6")) -# with open(tmp_path / "file6.txt", "w") as f: -# f.write("File to test upload no 6") -# -# sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) -# with sftp.connect_for_upload( -# dataset=ds, connect=sftp_connect_with_username_password -# ) as con: -# [uploaded] = con.upload_files( -# File.from_local(path=tmp_path / "file6.txt", remote_path="upload_6.txt") -# ) -# -# st = (sftp_data_dir / "upload6" / "upload_6.txt").stat() -# assert uploaded.size == st.st_size -# -# # Set in docker-compose -# assert uploaded.remote_uid == "1000" -# assert uploaded.remote_gid == "1000" -# -# uploaded = dataclasses.replace(uploaded, local_path=None) -# assert datetime.now(tz=timezone.utc) - uploaded.creation_time < timedelta(seconds=5) -# -# -# class CorruptingSFTP(paramiko.SFTPClient): -# """Appends bytes to uploaded files to simulate a broken transfer.""" -# -# def put(self, localpath, remotepath, callback=None, confirm=True): -# with open(localpath, "r") as f: -# content = f.read() -# with tempfile.TemporaryDirectory() as tempdir: -# corrupted_path = Path(tempdir) / "corrupted" -# with open(corrupted_path, "w") as f: -# f.write(content + "\nevil bytes") -# super().put(str(corrupted_path), remotepath, callback, confirm) -# -# -# class CorruptingTransfer(paramiko.Transport): -# """Uses CorruptingSFTP to simulate a broken connection.""" -# -# def open_sftp_client(self) -> paramiko.SFTPClient: -# return CorruptingSFTP.from_transport(self) -# -# -# @pytest.fixture() -# def sftp_corrupting_connect(sftp_access, sftp_connection_config): -# def connect(host: str, port: int, **kwargs): -# if kwargs: -# raise ValueError( -# "connect_with_username_password must only be" -# f" used without extra arguments. Got {kwargs=}" -# ) -# connection = fabric.Connection( -# host=host, -# port=port, -# user=sftp_access.user.username, -# config=sftp_connection_config, -# connect_kwargs={ -# "password": sftp_access.user.password, -# "transport_factory": CorruptingTransfer, -# **sftp_connection_config.connect_kwargs, -# }, -# ) -# connection.client.set_missing_host_key_policy(IgnorePolicy()) -# return connection -# -# return connect -# -# -# def test_upload_file_detects_checksum_mismatch( -# sftp_access, sftp_corrupting_connect, tmp_path, sftp_data_dir -# ): -# ds = Dataset( -# type="raw", -# source_folder=RemotePath("/data/upload7"), -# checksum_algorithm="blake2b", -# ) -# with open(tmp_path / "file7.txt", "w") as f: -# f.write("File to test upload no 7") -# -# sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) -# with sftp.connect_for_upload(dataset=ds, connect=sftp_corrupting_connect) as con: -# with pytest.raises(FileUploadError): -# con.upload_files( -# dataclasses.replace( -# File.from_local( -# path=tmp_path / "file7.txt", -# remote_path=RemotePath("upload_7.txt"), -# ), -# checksum_algorithm="blake2b", -# ) -# ) -# -# assert "upload7" not in (p.name for p in sftp_data_dir.iterdir()) -# -# -# class RaisingSFTP(paramiko.SFTPClient): -# def put(self, localpath, remotepath, callback=None, confirm=True): -# raise RuntimeError("Upload disabled") -# -# -# class RaisingTransfer(paramiko.Transport): -# def open_sftp_client(self) -> paramiko.SFTPClient: -# return RaisingSFTP.from_transport(self) -# -# -# @pytest.fixture() -# def sftp_raising_connect(sftp_access, sftp_connection_config): -# def connect(host: str, port: int, **kwargs): -# if kwargs: -# raise ValueError( -# "connect_with_username_password must only be" -# f" used without extra arguments. Got {kwargs=}" -# ) -# connection = fabric.Connection( -# host=host, -# port=port, -# user=sftp_access.user.username, -# config=sftp_connection_config, -# connect_kwargs={ -# "password": sftp_access.user.password, -# "transport_factory": RaisingTransfer, -# **sftp_connection_config.connect_kwargs, -# }, -# ) -# connection.client.set_missing_host_key_policy(IgnorePolicy()) -# return connection -# -# return connect -# -# -# def test_upload_file_reverts_if_upload_fails( -# sftp_access, sftp_raising_connect, tmp_path, sftp_data_dir -# ): -# ds = Dataset(type="raw", source_folder=RemotePath("/data/upload8")) -# with open(tmp_path / "file8.txt", "w") as f: -# f.write("File to test upload no 8") -# -# sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) -# with sftp.connect_for_upload(dataset=ds, connect=sftp_raising_connect) as con: -# with pytest.raises(RuntimeError): -# con.upload_files( -# File.from_local( -# path=tmp_path / "file8.txt", -# remote_path=RemotePath("upload_8.txt"), -# ) -# ) -# -# assert "upload8" not in (p.name for p in sftp_data_dir.iterdir()) -# -# -# class SFTPTestFileTransfer(SFTPFileTransfer): -# def __init__(self, connect, **kwargs): -# super().__init__(**kwargs) -# self.connect = connect -# -# @contextmanager -# def connect_for_download( -# self, connect: Optional[Callable[..., Connection]] = None -# ) -> Iterator[SFTPDownloadConnection]: -# connect = connect if connect is not None else self.connect -# with super().connect_for_download(connect=connect) as connection: -# yield connection -# -# @contextmanager -# def connect_for_upload( -# self, dataset: Dataset, connect: Optional[Callable[..., Connection]] = None -# ) -> Iterator[SFTPUploadConnection]: -# connect = connect if connect is not None else self.connect -# with super().connect_for_upload(dataset=dataset, connect=connect) as connection: -# yield connection -# -# -# # This test is referenced in the docs. -# def test_client_with_sftp( -# require_sftp_fileserver, -# sftp_access, -# sftp_connect_with_username_password, -# sftp_data_dir, -# tmp_path, -# ): -# tmp_path.joinpath("file1.txt").write_text("File contents") -# -# client = FakeClient.without_login( -# url="", -# file_transfer=SFTPTestFileTransfer( -# connect=sftp_connect_with_username_password, -# host=sftp_access.host, -# port=sftp_access.port, -# ), -# ) -# ds = Dataset( -# access_groups=["group1"], -# contact_email="p.stibbons@uu.am", -# creation_location="UU", -# creation_time=datetime(2023, 6, 23, 10, 0, 0), -# owner="PonderStibbons", -# owner_group="uu", -# principal_investigator="MustrumRidcully", -# source_folder="/data", -# type="raw", -# ) -# ds.add_local_files(tmp_path / "file1.txt", base_path=tmp_path) -# finalized = client.upload_new_dataset_now(ds) -# -# downloaded = client.get_dataset(finalized.pid) -# downloaded = client.download_files(downloaded, target=tmp_path / "download") -# -# assert sftp_data_dir.joinpath("file1.txt").read_text() == "File contents" -# assert downloaded.files[0].local_path.read_text() == "File contents" +def test_upload_one_file_source_folder_in_transfer( + sftp_access, sftp_connect_with_username_password, tmp_path, sftp_data_dir +): + ds = Dataset(type="raw", owner="librarian") + tmp_path.joinpath("file1.txt").write_text("File no. 2") + + sftp = SFTPFileTransfer( + host=sftp_access.host, + port=sftp_access.port, + source_folder="/data/upload/{owner}", + ) + with sftp.connect_for_upload( + dataset=ds, connect=sftp_connect_with_username_password + ) as con: + assert con.source_folder == RemotePath("/data/upload/librarian") + con.upload_files( + File.from_local( + path=tmp_path / "file1.txt", remote_path=RemotePath("upload_1.txt") + ) + ) + + assert ( + sftp_data_dir.joinpath("upload", "librarian", "upload_1.txt").read_text() + == "File no. 2" + ) + + +def test_upload_two_files( + sftp_access, sftp_connect_with_username_password, tmp_path, sftp_data_dir +): + ds = Dataset(type="raw", source_folder=RemotePath("/data/upload2")) + tmp_path.joinpath("file2.1.md").write_text("First part of file 2") + tmp_path.joinpath("file2.2.md").write_text("Second part of file 2") + + sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) + with sftp.connect_for_upload( + dataset=ds, connect=sftp_connect_with_username_password + ) as con: + assert con.source_folder == RemotePath("/data/upload2") + con.upload_files( + File.from_local(path=tmp_path / "file2.1.md", base_path=tmp_path), + File.from_local(path=tmp_path / "file2.2.md", base_path=tmp_path), + ) + + assert ( + sftp_data_dir.joinpath("upload2", "file2.1.md").read_text() + == "First part of file 2" + ) + assert ( + sftp_data_dir.joinpath("upload2", "file2.2.md").read_text() + == "Second part of file 2" + ) + + +def test_upload_one_file_existing_source_folder( + sftp_access, sftp_connect_with_username_password, tmp_path, sftp_data_dir +): + ds = Dataset(type="raw", source_folder=RemotePath("/data/upload-multiple")) + tmp_path.joinpath("file3.1.md").write_text("First part of file 3") + tmp_path.joinpath("file3.2.md").write_text("Second part of file 3") + + # First upload to ensure the folder exists. + sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) + with sftp.connect_for_upload( + dataset=ds, connect=sftp_connect_with_username_password + ) as con: + assert con.source_folder == RemotePath("/data/upload-multiple") + con.upload_files( + File.from_local(path=tmp_path / "file3.1.md", base_path=tmp_path), + ) + + # Second upload to test uploading to existing folder. + sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) + with sftp.connect_for_upload( + dataset=ds, connect=sftp_connect_with_username_password + ) as con: + assert con.source_folder == RemotePath("/data/upload-multiple") + con.upload_files( + File.from_local(path=tmp_path / "file3.2.md", base_path=tmp_path), + ) + + assert ( + sftp_data_dir.joinpath("upload-multiple", "file3.1.md").read_text() + == "First part of file 3" + ) + assert ( + sftp_data_dir.joinpath("upload-multiple", "file3.2.md").read_text() + == "Second part of file 3" + ) + + +def test_revert_all_uploaded_files_single( + sftp_access, sftp_connect_with_username_password, tmp_path, sftp_data_dir +): + ds = Dataset(type="raw", source_folder=RemotePath("/data/revert-all-test-1")) + tmp_path.joinpath("file3.txt").write_text("File that should get reverted") + + sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) + with sftp.connect_for_upload( + dataset=ds, connect=sftp_connect_with_username_password + ) as con: + file = File.from_local(path=tmp_path / "file3.txt", base_path=tmp_path) + con.upload_files(file) + con.revert_upload(file) + + assert "revert-all-test-1" not in (p.name for p in sftp_data_dir.iterdir()) + + +def test_revert_all_uploaded_files_two( + sftp_access, sftp_connect_with_username_password, tmp_path, sftp_data_dir +): + ds = Dataset(type="raw", source_folder=RemotePath("/data/revert-all-test-2")) + tmp_path.joinpath("file3.1.txt").write_text("File that should get reverted 1") + tmp_path.joinpath("file3.2.txt").write_text("File that should get reverted 2") + + sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) + with sftp.connect_for_upload( + dataset=ds, connect=sftp_connect_with_username_password + ) as con: + file1 = File.from_local(path=tmp_path / "file3.1.txt", base_path=tmp_path) + file2 = File.from_local(path=tmp_path / "file3.2.txt", base_path=tmp_path) + con.upload_files(file1, file2) + con.revert_upload(file1, file2) + + assert "revert-all-test-2" not in (p.name for p in sftp_data_dir.iterdir()) + + +def test_revert_one_uploaded_file( + sftp_access, sftp_connect_with_username_password, tmp_path, sftp_data_dir +): + ds = Dataset(type="raw", source_folder=RemotePath("/data/revert-test")) + tmp_path.joinpath("file4.txt").write_text("File that should get reverted") + tmp_path.joinpath("file5.txt").write_text("File that should be kept") + + sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) + with sftp.connect_for_upload( + dataset=ds, connect=sftp_connect_with_username_password + ) as con: + file4 = File.from_local(path=tmp_path / "file4.txt", base_path=tmp_path) + file5 = File.from_local(path=tmp_path / "file5.txt", base_path=tmp_path) + con.upload_files(file4, file5) + con.revert_upload(file4) + + assert "file4.txt" not in ( + p.name for p in (sftp_data_dir / "revert-test").iterdir() + ) + assert ( + sftp_data_dir.joinpath("revert-test", "file5.txt").read_text() + == "File that should be kept" + ) + + +def test_stat_uploaded_file( + sftp_access, sftp_connect_with_username_password, tmp_path, sftp_data_dir +): + ds = Dataset(type="raw", source_folder=RemotePath("/data/upload6")) + tmp_path.joinpath("file6.txt").write_text("File to test upload no 6") + + sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) + with sftp.connect_for_upload( + dataset=ds, connect=sftp_connect_with_username_password + ) as con: + [uploaded] = con.upload_files( + File.from_local(path=tmp_path / "file6.txt", remote_path="upload_6.txt") + ) + + st = (sftp_data_dir / "upload6" / "upload_6.txt").stat() + assert uploaded.size == st.st_size + + # Set in docker-compose + assert uploaded.remote_uid == "1000" + assert uploaded.remote_gid == "1000" + + uploaded = dataclasses.replace(uploaded, local_path=None) + assert datetime.now(tz=timezone.utc) - uploaded.creation_time < timedelta(seconds=5) + + +class CorruptingSFTP(paramiko.SFTPClient): + """Appends bytes to uploaded files to simulate a broken transfer.""" + + def put(self, localpath, remotepath, callback=None, confirm=True): + with open(localpath, "r") as f: + content = f.read() + with tempfile.TemporaryDirectory() as tempdir: + corrupted_path = Path(tempdir) / "corrupted" + with open(corrupted_path, "w") as f: + f.write(content + "\nevil bytes") + super().put(str(corrupted_path), remotepath, callback, confirm) + + +class CorruptingTransfer(paramiko.Transport): + """Uses CorruptingSFTP to simulate a broken connection.""" + + def open_sftp_client(self) -> paramiko.SFTPClient: + return CorruptingSFTP.from_transport(self) + + +@pytest.fixture() +def sftp_corrupting_connect(sftp_access, sftp_connection_config): + def connect(host: str, port: int, **kwargs): + if kwargs: + raise ValueError( + "connect_with_username_password must only be" + f" used without extra arguments. Got {kwargs=}" + ) + connection = fabric.Connection( + host=host, + port=port, + user=sftp_access.user.username, + config=sftp_connection_config, + connect_kwargs={ + "password": sftp_access.user.password, + "transport_factory": CorruptingTransfer, + **sftp_connection_config.connect_kwargs, + }, + ) + connection.client.set_missing_host_key_policy(IgnorePolicy()) + return connection + + return connect + + +@pytest.mark.skip(reason="Checksums not supported on the test server") +def test_upload_file_detects_checksum_mismatch( + sftp_access, sftp_corrupting_connect, tmp_path, sftp_data_dir +): + ds = Dataset( + type="raw", + source_folder=RemotePath("/data/upload7"), + checksum_algorithm="blake2b", + ) + tmp_path.joinpath("file7.txt").write_text("File to test upload no 7") + + sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) + with sftp.connect_for_upload(dataset=ds, connect=sftp_corrupting_connect) as con: + with pytest.raises(FileUploadError): + con.upload_files( + dataclasses.replace( + File.from_local( + path=tmp_path / "file7.txt", + remote_path=RemotePath("upload_7.txt"), + ), + checksum_algorithm="blake2b", + ) + ) + + assert "upload7" not in (p.name for p in sftp_data_dir.iterdir()) + + +class RaisingSFTP(paramiko.SFTPClient): + def put(self, localpath, remotepath, callback=None, confirm=True): + raise RuntimeError("Upload disabled") + + +class RaisingTransfer(paramiko.Transport): + def open_sftp_client(self) -> paramiko.SFTPClient: + return RaisingSFTP.from_transport(self) + + +@pytest.fixture() +def sftp_raising_connect(sftp_access, sftp_connection_config): + def connect(host: str, port: int, **kwargs): + if kwargs: + raise ValueError( + "connect_with_username_password must only be" + f" used without extra arguments. Got {kwargs=}" + ) + connection = fabric.Connection( + host=host, + port=port, + user=sftp_access.user.username, + config=sftp_connection_config, + connect_kwargs={ + "password": sftp_access.user.password, + "transport_factory": RaisingTransfer, + **sftp_connection_config.connect_kwargs, + }, + ) + connection.client.set_missing_host_key_policy(IgnorePolicy()) + return connection + + return connect + + +def test_upload_file_reverts_if_upload_fails( + sftp_access, sftp_raising_connect, tmp_path, sftp_data_dir +): + ds = Dataset(type="raw", source_folder=RemotePath("/data/upload8")) + tmp_path.joinpath("file8.txt").write_text("File to test upload no 8") + + sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) + with sftp.connect_for_upload(dataset=ds, connect=sftp_raising_connect) as con: + with pytest.raises(RuntimeError): + con.upload_files( + File.from_local( + path=tmp_path / "file8.txt", + remote_path=RemotePath("upload_8.txt"), + ) + ) + + assert "upload8" not in (p.name for p in sftp_data_dir.iterdir()) + + +class SFTPTestFileTransfer(SFTPFileTransfer): + def __init__(self, connect, **kwargs): + super().__init__(**kwargs) + self.connect = connect + + @contextmanager + def connect_for_download( + self, connect: Optional[Callable[..., Connection]] = None + ) -> Iterator[SFTPDownloadConnection]: + connect = connect if connect is not None else self.connect + with super().connect_for_download(connect=connect) as connection: + yield connection + + @contextmanager + def connect_for_upload( + self, dataset: Dataset, connect: Optional[Callable[..., Connection]] = None + ) -> Iterator[SFTPUploadConnection]: + connect = connect if connect is not None else self.connect + with super().connect_for_upload(dataset=dataset, connect=connect) as connection: + yield connection + + +# This test is referenced in the docs. +def test_client_with_sftp( + require_sftp_fileserver, + sftp_access, + sftp_connect_with_username_password, + sftp_data_dir, + tmp_path, +): + tmp_path.joinpath("file1.txt").write_text("File contents") + + client = FakeClient.without_login( + url="", + file_transfer=SFTPTestFileTransfer( + connect=sftp_connect_with_username_password, + host=sftp_access.host, + port=sftp_access.port, + ), + ) + ds = Dataset( + access_groups=["group1"], + contact_email="p.stibbons@uu.am", + creation_location="UU", + creation_time=datetime(2023, 6, 23, 10, 0, 0), + owner="PonderStibbons", + owner_group="uu", + principal_investigator="MustrumRidcully", + source_folder="/data", + type="raw", + ) + ds.add_local_files(tmp_path / "file1.txt", base_path=tmp_path) + finalized = client.upload_new_dataset_now(ds) + + downloaded = client.get_dataset(finalized.pid) + downloaded = client.download_files(downloaded, target=tmp_path / "download") + + assert sftp_data_dir.joinpath("file1.txt").read_text() == "File contents" + assert downloaded.files[0].local_path.read_text() == "File contents" From b029f55524ca6c690f4c6fd673999d7788fa04ba Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Fri, 25 Aug 2023 15:11:15 +0200 Subject: [PATCH 05/21] Add __all__ --- src/scitacean/transfer/sftp.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/scitacean/transfer/sftp.py b/src/scitacean/transfer/sftp.py index 24967dbb..07ad04f9 100644 --- a/src/scitacean/transfer/sftp.py +++ b/src/scitacean/transfer/sftp.py @@ -448,3 +448,6 @@ def _compute_remote_checksum( if "Operation unsupported" in exc.args: return None raise + + +__all__ = ["SFTPFileTransfer", "SFTPUploadConnection", "SFTPDownloadConnection"] From 46a9b710e63d4c7b18f850316084c29e2aeccc64 Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Fri, 25 Aug 2023 15:16:01 +0200 Subject: [PATCH 06/21] Pass connect function in init --- src/scitacean/transfer/sftp.py | 53 +++++-------- tests/transfer/sftp_test.py | 133 +++++++++++++++++++-------------- 2 files changed, 94 insertions(+), 92 deletions(-) diff --git a/src/scitacean/transfer/sftp.py b/src/scitacean/transfer/sftp.py index 07ad04f9..67b0c02c 100644 --- a/src/scitacean/transfer/sftp.py +++ b/src/scitacean/transfer/sftp.py @@ -245,6 +245,7 @@ def __init__( host: str, port: Optional[int] = None, source_folder: Optional[Union[str, RemotePath]] = None, + connect: Optional[Callable[..., Connection]] = None, ) -> None: """Construct a new SFTP file transfer. @@ -258,6 +259,17 @@ def __init__( Upload files to this folder if set. Otherwise, upload to the dataset's source_folder. Ignored when downloading files. + connect: + A function that creates and returns a :class:`fabric.connection.Connection` + object. + Will first be called with only ``host`` and ``port``. + If this fails (by raising + :class:`paramiko.ssh_exception.AuthenticationException`), the function is + called with ``host``, ``port``, and, optionally, ``user`` and + ``connection_kwargs`` depending on the authentication method. + Raising :class:`paramiko.ssh_exception.AuthenticationException` in the 2nd + call or any other exception in the 1st signals failure of + ``connect_for_download``. """ self._host = host self._port = port @@ -266,41 +278,23 @@ def __init__( if isinstance(source_folder, str) else source_folder ) + self._connect = connect def source_folder_for(self, dataset: Dataset) -> RemotePath: """Return the source folder used for the given dataset.""" return source_folder_for(dataset, self._source_folder_pattern) @contextmanager - def connect_for_download( - self, connect: Optional[Callable[..., Connection]] = None - ) -> Iterator[SFTPDownloadConnection]: - """Create a connection for downloads, use as a context manager. - - Parameters - ---------- - connect: - A function that creates and returns a :class:`fabric.connection.Connection` - object. - Will first be called with only ``host`` and ``port``. - If this fails (by raising - :class:`paramiko.ssh_exception.AuthenticationException`), the function is - called with ``host``, ``port``, and, optionally, ``user`` and - ``connection_kwargs`` depending on the authentication method. - Raising :class:`paramiko.ssh_exception.AuthenticationException` in the 2nd - call or any other exception in the 1st signals failure of - ``connect_for_download``. - """ - con = _connect(self._host, self._port, connect=connect) + def connect_for_download(self) -> Iterator[SFTPDownloadConnection]: + """Create a connection for downloads, use as a context manager.""" + con = _connect(self._host, self._port, connect=self._connect) try: yield SFTPDownloadConnection(connection=con) finally: con.close() @contextmanager - def connect_for_upload( - self, dataset: Dataset, connect: Optional[Callable[..., Connection]] = None - ) -> Iterator[SFTPUploadConnection]: + def connect_for_upload(self, dataset: Dataset) -> Iterator[SFTPUploadConnection]: """Create a connection for uploads, use as a context manager. Parameters @@ -308,20 +302,9 @@ def connect_for_upload( dataset: The connection will be used to upload files of this dataset. Used to determine the target folder. - connect: - A function that creates and returns a :class:`fabric.connection.Connection` - object. - Will first be called with only ``host`` and ``port``. - If this fails (by raising - :class:`paramiko.ssh_exception.AuthenticationException`), the function is - called with ``host``, ``port``, and, optionally, ``user`` and - ``connection_kwargs`` depending on the authentication method. - Raising :class:`paramiko.ssh_exception.AuthenticationException` in the 2nd - call or any other exception in the 1st signals failure of - ``connect_for_upload``. """ source_folder = self.source_folder_for(dataset) - con = _connect(self._host, self._port, connect=connect) + con = _connect(self._host, self._port, connect=self._connect) try: yield SFTPUploadConnection( connection=con, diff --git a/tests/transfer/sftp_test.py b/tests/transfer/sftp_test.py index 1a6c6f8b..c41ba542 100644 --- a/tests/transfer/sftp_test.py +++ b/tests/transfer/sftp_test.py @@ -7,12 +7,11 @@ from contextlib import contextmanager from datetime import datetime, timedelta, timezone from pathlib import Path -from typing import Callable, Iterator, Optional +from typing import Iterator import fabric import paramiko import pytest -from fabric import Connection from scitacean import Dataset, File, FileUploadError, RemotePath from scitacean.testing.client import FakeClient @@ -32,8 +31,12 @@ def server(request, sftp_fileserver): def test_download_one_file(sftp_access, sftp_connect_with_username_password, tmp_path): - sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) - with sftp.connect_for_download(connect=sftp_connect_with_username_password) as con: + sftp = SFTPFileTransfer( + host=sftp_access.host, + port=sftp_access.port, + connect=sftp_connect_with_username_password, + ) + with sftp.connect_for_download() as con: con.download_files( remote=[RemotePath("/data/seed/text.txt")], local=[tmp_path / "text.txt"] ) @@ -43,8 +46,12 @@ def test_download_one_file(sftp_access, sftp_connect_with_username_password, tmp def test_download_two_files(sftp_access, sftp_connect_with_username_password, tmp_path): - sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) - with sftp.connect_for_download(connect=sftp_connect_with_username_password) as con: + sftp = SFTPFileTransfer( + host=sftp_access.host, + port=sftp_access.port, + connect=sftp_connect_with_username_password, + ) + with sftp.connect_for_download() as con: con.download_files( remote=[ RemotePath("/data/seed/table.csv"), @@ -64,10 +71,12 @@ def test_upload_one_file_source_folder_in_dataset( ds = Dataset(type="raw", source_folder=RemotePath("/data/upload")) tmp_path.joinpath("file0.txt").write_text("File to test upload123") - sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) - with sftp.connect_for_upload( - dataset=ds, connect=sftp_connect_with_username_password - ) as con: + sftp = SFTPFileTransfer( + host=sftp_access.host, + port=sftp_access.port, + connect=sftp_connect_with_username_password, + ) + with sftp.connect_for_upload(dataset=ds) as con: assert con.source_folder == RemotePath("/data/upload") con.upload_files( File.from_local(path=tmp_path / "file0.txt", remote_path="upload_0.txt") @@ -89,10 +98,9 @@ def test_upload_one_file_source_folder_in_transfer( host=sftp_access.host, port=sftp_access.port, source_folder="/data/upload/{owner}", + connect=sftp_connect_with_username_password, ) - with sftp.connect_for_upload( - dataset=ds, connect=sftp_connect_with_username_password - ) as con: + with sftp.connect_for_upload(dataset=ds) as con: assert con.source_folder == RemotePath("/data/upload/librarian") con.upload_files( File.from_local( @@ -113,10 +121,12 @@ def test_upload_two_files( tmp_path.joinpath("file2.1.md").write_text("First part of file 2") tmp_path.joinpath("file2.2.md").write_text("Second part of file 2") - sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) - with sftp.connect_for_upload( - dataset=ds, connect=sftp_connect_with_username_password - ) as con: + sftp = SFTPFileTransfer( + host=sftp_access.host, + port=sftp_access.port, + connect=sftp_connect_with_username_password, + ) + with sftp.connect_for_upload(dataset=ds) as con: assert con.source_folder == RemotePath("/data/upload2") con.upload_files( File.from_local(path=tmp_path / "file2.1.md", base_path=tmp_path), @@ -141,20 +151,24 @@ def test_upload_one_file_existing_source_folder( tmp_path.joinpath("file3.2.md").write_text("Second part of file 3") # First upload to ensure the folder exists. - sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) - with sftp.connect_for_upload( - dataset=ds, connect=sftp_connect_with_username_password - ) as con: + sftp = SFTPFileTransfer( + host=sftp_access.host, + port=sftp_access.port, + connect=sftp_connect_with_username_password, + ) + with sftp.connect_for_upload(dataset=ds) as con: assert con.source_folder == RemotePath("/data/upload-multiple") con.upload_files( File.from_local(path=tmp_path / "file3.1.md", base_path=tmp_path), ) # Second upload to test uploading to existing folder. - sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) - with sftp.connect_for_upload( - dataset=ds, connect=sftp_connect_with_username_password - ) as con: + sftp = SFTPFileTransfer( + host=sftp_access.host, + port=sftp_access.port, + connect=sftp_connect_with_username_password, + ) + with sftp.connect_for_upload(dataset=ds) as con: assert con.source_folder == RemotePath("/data/upload-multiple") con.upload_files( File.from_local(path=tmp_path / "file3.2.md", base_path=tmp_path), @@ -176,10 +190,12 @@ def test_revert_all_uploaded_files_single( ds = Dataset(type="raw", source_folder=RemotePath("/data/revert-all-test-1")) tmp_path.joinpath("file3.txt").write_text("File that should get reverted") - sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) - with sftp.connect_for_upload( - dataset=ds, connect=sftp_connect_with_username_password - ) as con: + sftp = SFTPFileTransfer( + host=sftp_access.host, + port=sftp_access.port, + connect=sftp_connect_with_username_password, + ) + with sftp.connect_for_upload(dataset=ds) as con: file = File.from_local(path=tmp_path / "file3.txt", base_path=tmp_path) con.upload_files(file) con.revert_upload(file) @@ -194,10 +210,12 @@ def test_revert_all_uploaded_files_two( tmp_path.joinpath("file3.1.txt").write_text("File that should get reverted 1") tmp_path.joinpath("file3.2.txt").write_text("File that should get reverted 2") - sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) - with sftp.connect_for_upload( - dataset=ds, connect=sftp_connect_with_username_password - ) as con: + sftp = SFTPFileTransfer( + host=sftp_access.host, + port=sftp_access.port, + connect=sftp_connect_with_username_password, + ) + with sftp.connect_for_upload(dataset=ds) as con: file1 = File.from_local(path=tmp_path / "file3.1.txt", base_path=tmp_path) file2 = File.from_local(path=tmp_path / "file3.2.txt", base_path=tmp_path) con.upload_files(file1, file2) @@ -213,10 +231,12 @@ def test_revert_one_uploaded_file( tmp_path.joinpath("file4.txt").write_text("File that should get reverted") tmp_path.joinpath("file5.txt").write_text("File that should be kept") - sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) - with sftp.connect_for_upload( - dataset=ds, connect=sftp_connect_with_username_password - ) as con: + sftp = SFTPFileTransfer( + host=sftp_access.host, + port=sftp_access.port, + connect=sftp_connect_with_username_password, + ) + with sftp.connect_for_upload(dataset=ds) as con: file4 = File.from_local(path=tmp_path / "file4.txt", base_path=tmp_path) file5 = File.from_local(path=tmp_path / "file5.txt", base_path=tmp_path) con.upload_files(file4, file5) @@ -237,10 +257,12 @@ def test_stat_uploaded_file( ds = Dataset(type="raw", source_folder=RemotePath("/data/upload6")) tmp_path.joinpath("file6.txt").write_text("File to test upload no 6") - sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) - with sftp.connect_for_upload( - dataset=ds, connect=sftp_connect_with_username_password - ) as con: + sftp = SFTPFileTransfer( + host=sftp_access.host, + port=sftp_access.port, + connect=sftp_connect_with_username_password, + ) + with sftp.connect_for_upload(dataset=ds) as con: [uploaded] = con.upload_files( File.from_local(path=tmp_path / "file6.txt", remote_path="upload_6.txt") ) @@ -312,8 +334,10 @@ def test_upload_file_detects_checksum_mismatch( ) tmp_path.joinpath("file7.txt").write_text("File to test upload no 7") - sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) - with sftp.connect_for_upload(dataset=ds, connect=sftp_corrupting_connect) as con: + sftp = SFTPFileTransfer( + host=sftp_access.host, port=sftp_access.port, connect=sftp_corrupting_connect + ) + with sftp.connect_for_upload(dataset=ds) as con: with pytest.raises(FileUploadError): con.upload_files( dataclasses.replace( @@ -369,8 +393,10 @@ def test_upload_file_reverts_if_upload_fails( ds = Dataset(type="raw", source_folder=RemotePath("/data/upload8")) tmp_path.joinpath("file8.txt").write_text("File to test upload no 8") - sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) - with sftp.connect_for_upload(dataset=ds, connect=sftp_raising_connect) as con: + sftp = SFTPFileTransfer( + host=sftp_access.host, port=sftp_access.port, connect=sftp_raising_connect + ) + with sftp.connect_for_upload(dataset=ds) as con: with pytest.raises(RuntimeError): con.upload_files( File.from_local( @@ -383,24 +409,17 @@ def test_upload_file_reverts_if_upload_fails( class SFTPTestFileTransfer(SFTPFileTransfer): - def __init__(self, connect, **kwargs): + def __init__(self, **kwargs): super().__init__(**kwargs) - self.connect = connect @contextmanager - def connect_for_download( - self, connect: Optional[Callable[..., Connection]] = None - ) -> Iterator[SFTPDownloadConnection]: - connect = connect if connect is not None else self.connect - with super().connect_for_download(connect=connect) as connection: + def connect_for_download(self) -> Iterator[SFTPDownloadConnection]: + with super().connect_for_download() as connection: yield connection @contextmanager - def connect_for_upload( - self, dataset: Dataset, connect: Optional[Callable[..., Connection]] = None - ) -> Iterator[SFTPUploadConnection]: - connect = connect if connect is not None else self.connect - with super().connect_for_upload(dataset=dataset, connect=connect) as connection: + def connect_for_upload(self, dataset: Dataset) -> Iterator[SFTPUploadConnection]: + with super().connect_for_upload(dataset=dataset) as connection: yield connection From 3a1a298d5b3f5fdb724d79e1183e3b069fa8f94a Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Fri, 25 Aug 2023 15:22:17 +0200 Subject: [PATCH 07/21] Add release note --- docs/release-notes.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/release-notes.rst b/docs/release-notes.rst index 2c3b7403..9c335a9e 100644 --- a/docs/release-notes.rst +++ b/docs/release-notes.rst @@ -43,6 +43,8 @@ Features * File downloads now use the checksum algorithm stored in SciCat when possible. So it is no longer necessary to specify it by hand in many cases. +* Added ``SFTPFileTransfer`` which is similar to ``SSHFileTransfer`` but relies only on SFTP. + ``SSHFileTransfer`` may be removed in the future. Breaking changes ~~~~~~~~~~~~~~~~ From 801be1a4bfc698dee6ca9f493f8307e51883d5dd Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Fri, 25 Aug 2023 15:32:36 +0200 Subject: [PATCH 08/21] Observe max line length --- src/scitacean/testing/sftp/_sftp.py | 4 +++- src/scitacean/testing/sftp/fixtures.py | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/scitacean/testing/sftp/_sftp.py b/src/scitacean/testing/sftp/_sftp.py index 67722e88..da20bbcd 100644 --- a/src/scitacean/testing/sftp/_sftp.py +++ b/src/scitacean/testing/sftp/_sftp.py @@ -42,7 +42,9 @@ def _read_resource_yaml(filename: str) -> Any: @lru_cache(maxsize=1) def _docker_compose_file() -> Dict[str, Any]: - return _read_resource_yaml("docker-compose-sftp-server.yaml") # type: ignore[no-any-return] + return _read_resource_yaml( + "docker-compose-sftp-server.yaml" + ) # type: ignore[no-any-return] @lru_cache(maxsize=1) diff --git a/src/scitacean/testing/sftp/fixtures.py b/src/scitacean/testing/sftp/fixtures.py index ed2cbc8a..c4ea2942 100644 --- a/src/scitacean/testing/sftp/fixtures.py +++ b/src/scitacean/testing/sftp/fixtures.py @@ -145,7 +145,7 @@ def sftp_connection_config() -> fabric.config.Config: def sftp_connect_with_username_password( sftp_access: SFTPAccess, sftp_connection_config: fabric.config.Config ) -> Callable[..., fabric.Connection]: - """Fixture that returns a function to create a connection to the testing SFTP server. + """Fixture that returns a function to connect to the testing SFTP server. Uses username+password and rejects any other authentication attempt. @@ -162,7 +162,9 @@ def sftp_connect_with_username_password( .. code-block:: python - def test_sftp(sftp_access, sftp_connect_with_username_password, sftp_fileserver): + def test_sftp(sftp_access, + sftp_connect_with_username_password, + sftp_fileserver): sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) with sftp.connect_for_download( connect=sftp_connect_with_username_password From 5bbd2b67e3706ab3087611f2929f8f251eda1126 Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Mon, 28 Aug 2023 09:36:55 +0200 Subject: [PATCH 09/21] Run SFTP tests in tox --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index a8274fa7..6134be1a 100644 --- a/tox.ini +++ b/tox.ini @@ -5,7 +5,7 @@ isolated_build = true [testenv] deps = -r requirements/test.txt commands = - full: python -m pytest --backend-tests --ssh-tests + full: python -m pytest --backend-tests --ssh-tests --sftp-tests !full: python -m pytest [testenv:pydantic2] From 66a56cc366a3a7272603bff8974f5db60232063e Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Mon, 28 Aug 2023 09:38:48 +0200 Subject: [PATCH 10/21] Add paramiko as dependency --- pyproject.toml | 1 + requirements-pydantic2/base.in | 1 + requirements-pydantic2/base.txt | 57 +++++++++++++++++++++++++++++++++ requirements-pydantic2/test.txt | 37 +++++++++++++++++++++ requirements/base.in | 1 + 5 files changed, 97 insertions(+) create mode 100644 requirements-pydantic2/base.txt create mode 100644 requirements-pydantic2/test.txt diff --git a/pyproject.toml b/pyproject.toml index d8a16979..cd3959ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ dynamic = ["version"] [project.optional-dependencies] ssh = ["fabric"] +sftp = ["paramiko"] test = ["filelock", "hypothesis", "pyyaml"] [tool.setuptools_scm] diff --git a/requirements-pydantic2/base.in b/requirements-pydantic2/base.in index 33ff6e2c..e104e3af 100644 --- a/requirements-pydantic2/base.in +++ b/requirements-pydantic2/base.in @@ -1,5 +1,6 @@ email-validator fabric +paramiko pydantic == 2 python-dateutil requests diff --git a/requirements-pydantic2/base.txt b/requirements-pydantic2/base.txt new file mode 100644 index 00000000..ffd6730d --- /dev/null +++ b/requirements-pydantic2/base.txt @@ -0,0 +1,57 @@ +# SHA1:e21461097cae87bbf0d2ae4f359f5573f0959e67 +# +# This file is autogenerated by pip-compile-multi +# To update, run: +# +# pip-compile-multi +# +annotated-types==0.5.0 + # via pydantic +bcrypt==4.0.1 + # via paramiko +certifi==2023.5.7 + # via requests +cffi==1.15.1 + # via + # cryptography + # pynacl +charset-normalizer==3.1.0 + # via requests +cryptography==41.0.1 + # via paramiko +decorator==5.1.1 + # via fabric +dnspython==2.3.0 + # via email-validator +email-validator==2.0.0.post2 + # via -r requirements-pydantic2/base.in +fabric==3.1.0 + # via -r requirements-pydantic2/base.in +idna==3.4 + # via + # email-validator + # requests +invoke==2.1.3 + # via fabric +paramiko==3.2.0 + # via fabric +pycparser==2.21 + # via cffi +pydantic==2.0 + # via -r requirements-pydantic2/base.in +pydantic-core==2.0.1 + # via pydantic +pynacl==1.5.0 + # via paramiko +python-dateutil==2.8.2 + # via -r requirements-pydantic2/base.in +requests==2.31.0 + # via -r requirements-pydantic2/base.in +six==1.16.0 + # via python-dateutil +typing-extensions==4.7.1 + # via + # pydantic + # pydantic-core +urllib3==2.0.3 + # via requests diff --git a/requirements-pydantic2/test.txt b/requirements-pydantic2/test.txt new file mode 100644 index 00000000..1429615f --- /dev/null +++ b/requirements-pydantic2/test.txt @@ -0,0 +1,37 @@ +# SHA1:cefe43dc5cb9d9d7430647ec8f8955a6f1576e53 +# +# This file is autogenerated by pip-compile-multi +# To update, run: +# +# pip-compile-multi +# +-r base.txt +attrs==23.1.0 + # via hypothesis +execnet==1.9.0 + # via pytest-xdist +filelock==3.12.2 + # via -r requirements-pydantic2/test.in +hypothesis==6.80.0 + # via -r requirements-pydantic2/test.in +iniconfig==2.0.0 + # via pytest +packaging==23.1 + # via pytest +pluggy==1.2.0 + # via pytest +pyfakefs==5.2.2 + # via -r requirements-pydantic2/test.in +pytest==7.4.0 + # via + # -r requirements-pydantic2/test.in + # pytest-randomly + # pytest-xdist +pytest-randomly==3.12.0 + # via -r requirements-pydantic2/test.in +pytest-xdist==3.3.1 + # via -r requirements-pydantic2/test.in +pyyaml==6.0 + # via -r requirements-pydantic2/test.in +sortedcontainers==2.4.0 + # via hypothesis diff --git a/requirements/base.in b/requirements/base.in index d7343624..a3e8e358 100644 --- a/requirements/base.in +++ b/requirements/base.in @@ -1,5 +1,6 @@ email-validator fabric +paramiko pydantic < 2 # autodoc_pydantic is not compatible with pydantic 2 python-dateutil requests From 38bda9abe5214846e96d6c43bb5afbef47b3682a Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Mon, 28 Aug 2023 10:46:54 +0200 Subject: [PATCH 11/21] Do not ask for passwords in SFTP --- src/scitacean/testing/sftp/fixtures.py | 15 ++--- src/scitacean/transfer/sftp.py | 82 +++++--------------------- 2 files changed, 19 insertions(+), 78 deletions(-) diff --git a/src/scitacean/testing/sftp/fixtures.py b/src/scitacean/testing/sftp/fixtures.py index c4ea2942..58e91d47 100644 --- a/src/scitacean/testing/sftp/fixtures.py +++ b/src/scitacean/testing/sftp/fixtures.py @@ -165,19 +165,14 @@ def sftp_connect_with_username_password( def test_sftp(sftp_access, sftp_connect_with_username_password, sftp_fileserver): - sftp = SFTPFileTransfer(host=sftp_access.host, port=sftp_access.port) - with sftp.connect_for_download( - connect=sftp_connect_with_username_password - ) as connection: + sftp = SFTPFileTransfer(host=sftp_access.host, + port=sftp_access.port, + connect=sftp_connect_with_username_password) + with sftp.connect_for_download() as connection: # use connection """ - def connect(host: str, port: int, **kwargs): - if kwargs: - raise ValueError( - "sftp_connect_with_username_password must only be" - f" used without extra arguments. Got {kwargs=}" - ) + def connect(host: str, port: int): connection = fabric.Connection( host=host, port=port, diff --git a/src/scitacean/transfer/sftp.py b/src/scitacean/transfer/sftp.py index 67b0c02c..19f51698 100644 --- a/src/scitacean/transfer/sftp.py +++ b/src/scitacean/transfer/sftp.py @@ -4,15 +4,13 @@ import os from contextlib import contextmanager from datetime import datetime, timezone -from getpass import getpass from pathlib import Path -from typing import Any, Callable, Iterator, List, Optional, Tuple, Union +from typing import Callable, Iterator, List, Optional, Tuple, Union # Note that invoke and paramiko are dependencies of fabric. from fabric import Connection from invoke.exceptions import UnexpectedExit from paramiko import SFTPAttributes, SFTPClient -from paramiko.ssh_exception import AuthenticationException, PasswordRequiredException from ..dataset import Dataset from ..error import FileUploadError @@ -245,7 +243,7 @@ def __init__( host: str, port: Optional[int] = None, source_folder: Optional[Union[str, RemotePath]] = None, - connect: Optional[Callable[..., Connection]] = None, + connect: Optional[Callable[[str, Optional[int]], Connection]] = None, ) -> None: """Construct a new SFTP file transfer. @@ -260,16 +258,11 @@ def __init__( Otherwise, upload to the dataset's source_folder. Ignored when downloading files. connect: - A function that creates and returns a :class:`fabric.connection.Connection` - object. - Will first be called with only ``host`` and ``port``. - If this fails (by raising - :class:`paramiko.ssh_exception.AuthenticationException`), the function is - called with ``host``, ``port``, and, optionally, ``user`` and - ``connection_kwargs`` depending on the authentication method. - Raising :class:`paramiko.ssh_exception.AuthenticationException` in the 2nd - call or any other exception in the 1st signals failure of - ``connect_for_download``. + If this argument is set, it will be called to establish a connection + to the server instead of the builtin method. + The connection will be opened (by calling ``con.open()``) automatically. + The function arguments are ``host`` and ``port`` as determined by the + arguments to ``__init__`` shown above. """ self._host = host self._port = port @@ -314,78 +307,31 @@ def connect_for_upload(self, dataset: Dataset) -> Iterator[SFTPUploadConnection] con.close() -def _ask_for_key_passphrase() -> str: - return getpass("The private key is encrypted, enter passphrase: ") - - -def _ask_for_credentials(host: str) -> Tuple[str, str]: - print(f"You need to authenticate to access {host}") # noqa: T201 - username = input("Username: ") - password = getpass("Password: ") - return username, password - - -def _generic_connect( +def _do_connect( host: str, port: Optional[int], - connect: Optional[Callable[..., Connection]], - **kwargs: Any, + connect: Optional[Callable[[str, Optional[int]], Connection]], ) -> Connection: if connect is None: - con = Connection(host=host, port=port, **kwargs) + con = Connection(host=host, port=port) else: - con = connect(host=host, port=port, **kwargs) + con = connect(host, port) con.open() return con -def _unauthenticated_connect( - host: str, port: Optional[int], connect: Optional[Callable[..., Connection]] -) -> Connection: - return _generic_connect(host=host, port=port, connect=connect) - - -def _authenticated_connect( +def _connect( host: str, port: Optional[int], - connect: Optional[Callable[..., Connection]], - exc: AuthenticationException, -) -> Connection: - # TODO fail fast if output going to file - if isinstance(exc, PasswordRequiredException) and "encrypted" in exc.args[0]: - # TODO does not work anymore, exception is always AuthenticationException - return _generic_connect( - host=host, - port=port, - connect=connect, - connect_kwargs={"passphrase": _ask_for_key_passphrase()}, - ) - else: - username, password = _ask_for_credentials(host) - return _generic_connect( - host=host, - port=port, - connect=connect, - user=username, - connect_kwargs={"password": password}, - ) - - -def _connect( - host: str, port: Optional[int], connect: Optional[Callable[..., Connection]] + connect: Optional[Callable[[str, Optional[int]], Connection]], ) -> Connection: try: - try: - return _unauthenticated_connect(host, port, connect) - except AuthenticationException as exc: - return _authenticated_connect(host, port, connect, exc) + return _do_connect(host, port, connect) except Exception as exc: # We pass secrets as arguments to functions called in this block, and those # can be leaked through exception handlers. So catch all exceptions # and strip the backtrace up to this point to hide those secrets. raise type(exc)(exc.args) from None - except BaseException as exc: - raise type(exc)(exc.args) from None def _remote_folder_is_empty(sftp: SFTPClient, path: RemotePath) -> bool: From d965358f941294964e5b136fc0c967856ac7fa3e Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Mon, 28 Aug 2023 11:55:05 +0200 Subject: [PATCH 12/21] Handle st_mode = None --- src/scitacean/transfer/sftp.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/scitacean/transfer/sftp.py b/src/scitacean/transfer/sftp.py index 19f51698..b17390c8 100644 --- a/src/scitacean/transfer/sftp.py +++ b/src/scitacean/transfer/sftp.py @@ -359,10 +359,9 @@ def _try_remote_stat(sftp: SFTPClient, path: RemotePath) -> Optional[SFTPAttribu def _is_remote_dir(st_stat: SFTPAttributes) -> bool: - try: - return st_stat.st_mode & 0o040000 == 0o040000 - except FileNotFoundError: - return False + if st_stat.st_mode is None: + return True # Assume it is a dir and let downstream code fail if it isn't. + return st_stat.st_mode & 0o040000 == 0o040000 def _compute_remote_checksum( From 9245b05a9c5f78461718dc9ab6ecae13fea54a08 Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Mon, 28 Aug 2023 11:55:31 +0200 Subject: [PATCH 13/21] Do not use fabric for SFTP --- src/scitacean/testing/sftp/fixtures.py | 52 +++++---------- src/scitacean/transfer/sftp.py | 91 +++++++++++++------------- tests/transfer/sftp_test.py | 59 +++++++---------- 3 files changed, 84 insertions(+), 118 deletions(-) diff --git a/src/scitacean/testing/sftp/fixtures.py b/src/scitacean/testing/sftp/fixtures.py index 58e91d47..8eeb84ca 100644 --- a/src/scitacean/testing/sftp/fixtures.py +++ b/src/scitacean/testing/sftp/fixtures.py @@ -7,9 +7,8 @@ from pathlib import Path from typing import Callable, Generator, Optional -import fabric -import fabric.config import pytest +from paramiko import SFTPClient, SSHClient from ..._internal import docker from .._pytest_helpers import init_work_dir, root_tmp_dir @@ -126,39 +125,23 @@ def sftp_fileserver( _sftp_docker_down(target_dir) -@pytest.fixture(scope="session") -def sftp_connection_config() -> fabric.config.Config: - """Fixture that returns the configuration for fabric.Connection for tests. - - Can be used to open SFTP connections if ``SFTPFileTransfer`` is not enough. - """ - config = fabric.config.Config() - config["load_sftp_configs"] = False - config["connect_kwargs"] = { - "allow_agent": False, - "look_for_keys": False, - } - return config - - @pytest.fixture(scope="session") def sftp_connect_with_username_password( - sftp_access: SFTPAccess, sftp_connection_config: fabric.config.Config -) -> Callable[..., fabric.Connection]: + sftp_access: SFTPAccess, +) -> Callable[[str, int], SFTPClient]: """Fixture that returns a function to connect to the testing SFTP server. - Uses username+password and rejects any other authentication attempt. + Uses username+password from the test config. Returns ------- : - A function to pass as the ``connect`` argument to - :meth:`scitacean.transfer.sftp.SFTPFileTransfer.connect_for_download` - or :meth:`scitacean.transfer.sftp.SFTPFileTransfer.connect_for_upload`. + A function to pass as the ``connect`` argument when constructing a + :class:`scitacean.transfer.sftp.SFTPFileTransfer`. Examples -------- - Explicitly connect to the + Explicitly connect to the test server: .. code-block:: python @@ -172,19 +155,18 @@ def test_sftp(sftp_access, # use connection """ - def connect(host: str, port: int): - connection = fabric.Connection( - host=host, + def connect(host: str, port: int) -> SFTPClient: + client = SSHClient() + client.set_missing_host_key_policy(IgnorePolicy()) + client.connect( + hostname=host, port=port, - user=sftp_access.user.username, - config=sftp_connection_config, - connect_kwargs={ - "password": sftp_access.user.password, - **sftp_connection_config.connect_kwargs, - }, + username=sftp_access.user.username, + password=sftp_access.user.password, + allow_agent=False, + look_for_keys=False, ) - connection.client.set_missing_host_key_policy(IgnorePolicy()) - return connection + return client.open_sftp() return connect diff --git a/src/scitacean/transfer/sftp.py b/src/scitacean/transfer/sftp.py index b17390c8..4fd2de33 100644 --- a/src/scitacean/transfer/sftp.py +++ b/src/scitacean/transfer/sftp.py @@ -7,10 +7,8 @@ from pathlib import Path from typing import Callable, Iterator, List, Optional, Tuple, Union -# Note that invoke and paramiko are dependencies of fabric. -from fabric import Connection from invoke.exceptions import UnexpectedExit -from paramiko import SFTPAttributes, SFTPClient +from paramiko import SFTPAttributes, SFTPClient, SSHClient from ..dataset import Dataset from ..error import FileUploadError @@ -21,8 +19,9 @@ class SFTPDownloadConnection: - def __init__(self, *, connection: Connection) -> None: - self._connection = connection + def __init__(self, *, sftp_client: SFTPClient, host: str) -> None: + self._sftp_client = sftp_client + self._host = host def download_files(self, *, remote: List[RemotePath], local: List[Path]) -> None: """Download files from the given remote path.""" @@ -33,20 +32,19 @@ def download_file(self, *, remote: RemotePath, local: Path) -> None: get_logger().info( "Downloading file %s from host %s to %s", remote, - self._connection.host, + self._host, local, ) - self._connection.get(remote=remote.posix, local=os.fspath(local)) + self._sftp_client.get(remotepath=remote.posix, localpath=os.fspath(local)) class SFTPUploadConnection: - def __init__(self, *, connection: Connection, source_folder: RemotePath) -> None: - self._connection = connection + def __init__( + self, *, sftp_client: SFTPClient, source_folder: RemotePath, host: str + ) -> None: + self._sftp_client = sftp_client self._source_folder = source_folder - - @property - def _sftp(self) -> SFTPClient: - return self._connection.sftp() # type: ignore[no-any-return] + self._host = host @property def source_folder(self) -> RemotePath: @@ -57,7 +55,7 @@ def remote_path(self, filename: Union[str, RemotePath]) -> RemotePath: def _make_source_folder(self) -> None: try: - _mkdir_remote(self._sftp, self.source_folder) + _mkdir_remote(self._sftp_client, self.source_folder) except OSError as exc: raise FileUploadError( f"Failed to create source folder {self.source_folder}: {exc.args}" @@ -89,9 +87,9 @@ def _upload_file(self, file: File) -> Tuple[File, Optional[Exception]]: "Uploading file %s to %s on host %s", file.local_path, remote_path, - self._connection.host, + self._host, ) - st = self._sftp.put( + st = self._sftp_client.put( remotepath=remote_path.posix, localpath=os.fspath(file.local_path) ) if (exc := self._validate_upload(file)) is not None: @@ -123,7 +121,9 @@ def _compute_checksum(self, file: File) -> Optional[str]: if file.checksum_algorithm is None: return None return _compute_remote_checksum( - self._sftp, self.remote_path(file.remote_path), file.checksum_algorithm + self._sftp_client, + self.remote_path(file.remote_path), + file.checksum_algorithm, ) def revert_upload(self, *files: File) -> None: @@ -131,19 +131,19 @@ def revert_upload(self, *files: File) -> None: for file in files: self._revert_upload_single(remote=file.remote_path, local=file.local_path) - if _remote_folder_is_empty(self._sftp, self.source_folder): + if _remote_folder_is_empty(self._sftp_client, self.source_folder): try: get_logger().info( "Removing empty remote directory %s on host %s", self.source_folder, - self._connection.host, + self._host, ) - self._sftp.rmdir(self.source_folder.posix) + self._sftp_client.rmdir(self.source_folder.posix) except UnexpectedExit as exc: get_logger().warning( "Failed to remove empty remote directory %s on host:\n%s", self.source_folder, - self._connection.host, + self._host, exc.result, ) @@ -155,11 +155,11 @@ def _revert_upload_single( "Reverting upload of file %s to %s on host %s", local, remote_path, - self._connection.host, + self._host, ) try: - self._sftp.remove(remote_path.posix) + self._sftp_client.remove(remote_path.posix) except UnexpectedExit as exc: get_logger().warning( "Error reverting file %s:\n%s", remote_path, exc.result @@ -243,7 +243,7 @@ def __init__( host: str, port: Optional[int] = None, source_folder: Optional[Union[str, RemotePath]] = None, - connect: Optional[Callable[[str, Optional[int]], Connection]] = None, + connect: Optional[Callable[[str, Optional[int]], SFTPClient]] = None, ) -> None: """Construct a new SFTP file transfer. @@ -258,9 +258,8 @@ def __init__( Otherwise, upload to the dataset's source_folder. Ignored when downloading files. connect: - If this argument is set, it will be called to establish a connection - to the server instead of the builtin method. - The connection will be opened (by calling ``con.open()``) automatically. + If this argument is set, it will be called to create a client + for the server instead of the builtin method. The function arguments are ``host`` and ``port`` as determined by the arguments to ``__init__`` shown above. """ @@ -280,11 +279,11 @@ def source_folder_for(self, dataset: Dataset) -> RemotePath: @contextmanager def connect_for_download(self) -> Iterator[SFTPDownloadConnection]: """Create a connection for downloads, use as a context manager.""" - con = _connect(self._host, self._port, connect=self._connect) + sftp_client = _connect(self._host, self._port, connect=self._connect) try: - yield SFTPDownloadConnection(connection=con) + yield SFTPDownloadConnection(sftp_client=sftp_client, host=self._host) finally: - con.close() + sftp_client.close() @contextmanager def connect_for_upload(self, dataset: Dataset) -> Iterator[SFTPUploadConnection]: @@ -297,36 +296,34 @@ def connect_for_upload(self, dataset: Dataset) -> Iterator[SFTPUploadConnection] Used to determine the target folder. """ source_folder = self.source_folder_for(dataset) - con = _connect(self._host, self._port, connect=self._connect) + sftp_client = _connect(self._host, self._port, connect=self._connect) try: yield SFTPUploadConnection( - connection=con, - source_folder=source_folder, + sftp_client=sftp_client, source_folder=source_folder, host=self._host ) finally: - con.close() + sftp_client.close() -def _do_connect( - host: str, - port: Optional[int], - connect: Optional[Callable[[str, Optional[int]], Connection]], -) -> Connection: - if connect is None: - con = Connection(host=host, port=port) +def _default_connect(host: str, port: Optional[int]) -> SFTPClient: + client = SSHClient() + client.load_system_host_keys() + if port is not None: + client.connect(hostname=host, port=port) else: - con = connect(host, port) - con.open() - return con + client.connect(hostname=host) + return client.open_sftp() def _connect( host: str, port: Optional[int], - connect: Optional[Callable[[str, Optional[int]], Connection]], -) -> Connection: + connect: Optional[Callable[[str, Optional[int]], SFTPClient]], +) -> SFTPClient: try: - return _do_connect(host, port, connect) + if connect is None: + connect = _default_connect + return connect(host, port) except Exception as exc: # We pass secrets as arguments to functions called in this block, and those # can be leaked through exception handlers. So catch all exceptions diff --git a/tests/transfer/sftp_test.py b/tests/transfer/sftp_test.py index c41ba542..5b310a63 100644 --- a/tests/transfer/sftp_test.py +++ b/tests/transfer/sftp_test.py @@ -9,7 +9,6 @@ from pathlib import Path from typing import Iterator -import fabric import paramiko import pytest @@ -300,25 +299,19 @@ def open_sftp_client(self) -> paramiko.SFTPClient: @pytest.fixture() def sftp_corrupting_connect(sftp_access, sftp_connection_config): - def connect(host: str, port: int, **kwargs): - if kwargs: - raise ValueError( - "connect_with_username_password must only be" - f" used without extra arguments. Got {kwargs=}" - ) - connection = fabric.Connection( - host=host, + def connect(host: str, port: int) -> paramiko.SFTPClient: + client = paramiko.SSHClient() + client.set_missing_host_key_policy(IgnorePolicy()) + client.connect( + hostname=host, port=port, - user=sftp_access.user.username, - config=sftp_connection_config, - connect_kwargs={ - "password": sftp_access.user.password, - "transport_factory": CorruptingTransfer, - **sftp_connection_config.connect_kwargs, - }, + username=sftp_access.user.username, + password=sftp_access.user.password, + allow_agent=False, + look_for_keys=False, + transport_factory=CorruptingTransfer, ) - connection.client.set_missing_host_key_policy(IgnorePolicy()) - return connection + return client.open_sftp() return connect @@ -363,26 +356,20 @@ def open_sftp_client(self) -> paramiko.SFTPClient: @pytest.fixture() -def sftp_raising_connect(sftp_access, sftp_connection_config): - def connect(host: str, port: int, **kwargs): - if kwargs: - raise ValueError( - "connect_with_username_password must only be" - f" used without extra arguments. Got {kwargs=}" - ) - connection = fabric.Connection( - host=host, +def sftp_raising_connect(sftp_access): + def connect(host: str, port: int) -> paramiko.SFTPClient: + client = paramiko.SSHClient() + client.set_missing_host_key_policy(IgnorePolicy()) + client.connect( + hostname=host, port=port, - user=sftp_access.user.username, - config=sftp_connection_config, - connect_kwargs={ - "password": sftp_access.user.password, - "transport_factory": RaisingTransfer, - **sftp_connection_config.connect_kwargs, - }, + username=sftp_access.user.username, + password=sftp_access.user.password, + allow_agent=False, + look_for_keys=False, + transport_factory=RaisingTransfer, ) - connection.client.set_missing_host_key_policy(IgnorePolicy()) - return connection + return client.open_sftp() return connect From 5aad13bd83d6005994523ec19e7c44671fb16c09 Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Mon, 28 Aug 2023 12:28:45 +0200 Subject: [PATCH 14/21] Do not run ssh tests in tox --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 6134be1a..46bb49d5 100644 --- a/tox.ini +++ b/tox.ini @@ -5,7 +5,7 @@ isolated_build = true [testenv] deps = -r requirements/test.txt commands = - full: python -m pytest --backend-tests --ssh-tests --sftp-tests + full: python -m pytest --backend-tests --sftp-tests !full: python -m pytest [testenv:pydantic2] From 105065b1c8506d5f42cbc46617c1e3cab31a09fd Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Mon, 28 Aug 2023 12:43:09 +0200 Subject: [PATCH 15/21] Use SFTP in docs --- docs/developer/testing.rst | 20 ++++----- docs/user-guide/downloading.ipynb | 11 ++--- docs/user-guide/testing.ipynb | 70 +++++++++++++++---------------- docs/user-guide/uploading.ipynb | 4 +- 4 files changed, 51 insertions(+), 54 deletions(-) diff --git a/docs/developer/testing.rst b/docs/developer/testing.rst index 6eb7e131..8e1f08dc 100644 --- a/docs/developer/testing.rst +++ b/docs/developer/testing.rst @@ -21,22 +21,22 @@ In addition, the fakes are relatively complex and may diverge from the real impl To mitigate these problems, the fake client is tested alongside a real client. But to (mostly) avoid the downsides stated in the beginning, the real client is connected to a local SciCat server. -See ``tests/common/backend.py`` and ``tests/conftest.py`` for the concrete setup. -The backend is launched in a docker container based on the images provided by `scicatlive `_. -Tests in ``tests/client_tests.py`` are then run with both the fake and the real client to ensure that both produce the same results. +See ``src/testing/backend.py`` and ``tests/conftest.py`` for the concrete setup. +The backend is launched in a docker container with the image of the latest release of the SciCat backend. +Tests in ``tests/client`` are then run with both the fake and the real client to ensure that both produce the same results. Use ``--backend-tests`` with ``pytest`` to run these tests. -SSH file transfer ------------------ +SFTP file transfer +------------------ -Testing :class:`scitacean.transfer.ssh.SSHFileTransfer` requires an SSH server. -``tests/common/ssh_server.py`` contains helpers for managing one via Docker. -Tests can use it by depending on the ``ssh_fileserver`` fixture. -See the documentation in ``tests/common/ssh_server.py`` for how it works. +Testing :class:`scitacean.transfer.sftp.SFTPFileTransfer` requires an SFTP server. +``tests/common/sftp_server.py`` contains helpers for managing one via Docker. +Tests can use it by depending on the ``sftp_fileserver`` fixture. +See the documentation in ``tests/common/sftp_server.py`` for how it works. Note that those tests may leave a small directory behind. This is an issue with file ownership and permissions caused by making the Docker volumes writable. It should not be a problem in practice as those files will only be in temporary directories which should get cleaned up at reboot. -Use ``--ssh-tests`` with ``pytest`` to run these tests. +Use ``--sftp-tests`` with ``pytest`` to run these tests. diff --git a/docs/user-guide/downloading.ipynb b/docs/user-guide/downloading.ipynb index 8ff2da21..62b3002f 100644 --- a/docs/user-guide/downloading.ipynb +++ b/docs/user-guide/downloading.ipynb @@ -12,10 +12,10 @@ "\n", "```python\n", "from scitacean import Client\n", - "from scitacean.transfer.ssh import SSHFileTransfer\n", + "from scitacean.transfer.sftp import SFTPFileTransfer\n", "client = Client.from_token(url=\"https://scicat.ess.eu/api/v3\",\n", " token=...,\n", - " file_transfer=SSHFileTransfer(\n", + " file_transfer=SFTPFileTransfer(\n", " host=\"login.esss.dk\"\n", " ))\n", "```\n", @@ -40,13 +40,10 @@ "\n", "\n", "While the client itself is responsible for talking to SciCat, a `file_transfer` object is required to download data files.\n", - "Currently, only `SSHFileTransfer` is implemented.\n", - "It downloads / uploads files via SSH.\n", - "It will almost definitely change in the future!\n", + "Here, we use `SFTPFileTransfer` which downloads / uploads files via SFTP.\n", "\n", "The file transfer needs to authenticate separately from the SciCat connection.\n", - "It will ask for username and password on the command if needed.\n", - "But it is recommended to set up an SSH agent with a key and customize the `host` argument to point to the configured connection.\n", + "By default, it requires an SSH agent to be running an set up for the selected `host`.\n", "\n", "For the purposes of this guide, we don't want to connect to a real SciCat server in order to avoid the complications associated with that.\n", "So we set up a fake client that only pretends to connect to SciCat and file servers.\n", diff --git a/docs/user-guide/testing.ipynb b/docs/user-guide/testing.ipynb index fc4a51c0..a45ef168 100644 --- a/docs/user-guide/testing.ipynb +++ b/docs/user-guide/testing.ipynb @@ -18,7 +18,7 @@ "They can be mixed and matched freely with the real client and file transfers.\n", "But it is generally recommended to combine them.\n", "\n", - "Secondly, SciCat servers and fileservers are managed by the [scicat_backend](../generated/modules/scitacean.testing.backend.fixtures.scicat_backend.rst) and [ssh_fileserver](../generated/modules/scitacean.testing.ssh.fixtures.ssh_fileserver.rst) pytest fixtures.\n", + "Secondly, SciCat servers and fileservers are managed by the [scicat_backend](../generated/modules/scitacean.testing.backend.fixtures.scicat_backend.rst) and [sftp_fileserver](../generated/modules/scitacean.testing.sftp.fixtures.sftp_fileserver.rst) pytest fixtures.\n", "\n", "First, create a test dataset and file." ] @@ -554,15 +554,16 @@ "id": "97930d15-517a-4cea-bf2b-38ed700220b3", "metadata": {}, "source": [ - "## Local SSH fileserver\n", + "## Local SFTP fileserver\n", "\n", - "[scitacean.testing.ssh](../generated/modules/scitacean.testing.ssh.rst) provides tools to set up an SSH server in a Docker container on the local machine.\n", - "It is primarily intended to be used via the [pytest](https://docs.pytest.org/) fixtures in [scitacean.testing.ssh.fixtures](../generated/modules/scitacean.testing.ssh.fixtures.rst).\n", + "[scitacean.testing.sftp](../generated/modules/scitacean.testing.sftp.rst) provides tools to set up an SFTP server in a Docker container on the local machine.\n", + "It is primarily intended to be used via the [pytest](https://docs.pytest.org/) fixtures in [scitacean.testing.sftp.fixtures](../generated/modules/scitacean.testing.sftp.fixtures.rst).\n", "\n", - "The fixtures can configure, spin up, and seed an SSH server in a Docker container.\n", + "The fixtures can configure, spin up, and seed an SFTP server in a Docker container.\n", "They also clean up after the test session by stopping the Docker container.\n", + "(Scritly speaking, the server is an SSH server but all users except root are restricted to SFTP.)\n", "\n", - "Note the caveats in [scitacean.testing.ssh](../generated/modules/scitacean.testing.ssh.rst) about clean up and use of `pytest-xdist`.\n", + "Note the caveats in [scitacean.testing.sftp](../generated/modules/scitacean.testing.sftp.rst) about clean up and use of `pytest-xdist`.\n", "\n", "### Set up\n", "\n", @@ -570,7 +571,7 @@ "Then, configure pytest by\n", "\n", "- registering the fixtures and\n", - "- adding a command line option to enable ssh tests.\n", + "- adding a command line option to enable sftp tests.\n", "\n", "To this end, add the following in your `conftest.py`: (Or merge it into the setup for backend tests from above.)" ] @@ -583,15 +584,15 @@ "outputs": [], "source": [ "import pytest\n", - "from scitacean.testing.ssh import add_pytest_option as add_ssh_option\n", + "from scitacean.testing.sftp import add_pytest_option as add_sftp_option\n", "\n", "\n", "pytest_plugins = (\n", - " \"scitacean.testing.ssh.fixtures\",\n", + " \"scitacean.testing.sftp.fixtures\",\n", ")\n", "\n", "def pytest_addoption(parser: pytest.Parser) -> None:\n", - " add_backend_option(parser)" + " add_sftp_option(parser)" ] }, { @@ -599,10 +600,10 @@ "id": "0d92f3dd-986c-4f2c-acec-00b6e55ae87b", "metadata": {}, "source": [ - "The SSH server will only be launched when the corresponding command line option is given.\n", - "By default, this is `--ssh-tests` but it can be changed via the `option` argument of `add_pytest_option`.\n", + "The SFTP server will only be launched when the corresponding command line option is given.\n", + "By default, this is `--sftp-tests` but it can be changed via the `option` argument of `add_pytest_option`.\n", "\n", - "### Use SSH in tests\n", + "### Use SFTP in tests\n", "\n", "Tests that require the server can now request it as a fixture:" ] @@ -614,7 +615,7 @@ "metadata": {}, "outputs": [], "source": [ - "def test_something_with_ssh(require_ssh_fileserver):\n", + "def test_something_with_sftp(require_sftp_fileserver):\n", " # test something\n", " ..." ] @@ -624,12 +625,12 @@ "id": "3126d0d9-727a-4d55-aff1-6b31fd5df91f", "metadata": {}, "source": [ - "The `require_ssh_fileserver` fixture will ensure that the SSH server is running during the test.\n", - "If SSH tests have not been enabled by the command line option, the test will be skipped.\n", + "The `require_sftp_fileserver` fixture will ensure that the SFTP server is running during the test.\n", + "If SFTP tests have not been enabled by the command line option, the test will be skipped.\n", "\n", "Connecting to the server is not as straight forward as for the SciCat backend.\n", "It requires passing a special `connect` function to the file transfer.\n", - "This can be done by requesting `ssh_connect_with_username_password`.\n", + "This can be done by requesting `sftp_connect_with_username_password`.\n", "For example, the following opens a connection to the server to upload a file:" ] }, @@ -640,23 +641,22 @@ "metadata": {}, "outputs": [], "source": [ - "from scitacean.transfer.ssh import SSHFileTransfer\n", + "from scitacean.transfer.sftp import SFTPFileTransfer\n", "\n", - "def test_ssh_upload(\n", - " ssh_access,\n", - " ssh_connect_with_username_password,\n", - " require_ssh_fileserver,\n", - " ssh_data_dir,\n", + "def test_sftp_upload(\n", + " sftp_access,\n", + " sftp_connect_with_username_password,\n", + " require_sftp_fileserver,\n", + " sftp_data_dir,\n", "):\n", - " ssh = SSHFileTransfer(host=ssh_access.host, port=ssh_access.port)\n", + " sftp = SFTPFileTransfer(host=sftp_access.host,\n", + " port=sftp_access.port,\n", + " connect=sftp_connect_with_username_password)\n", " ds = Dataset(...)\n", - " with ssh.connect_for_upload(\n", - " dataset=ds,\n", - " connect=ssh_connect_with_username_password\n", - " ) as connection:\n", + " with sftp.connect_for_upload(dataset=ds) as connection:\n", " # do upload\n", " ...\n", - " # assert that the file has been copied to ssh_data_dir\n", + " # assert that the file has been copied to sftp_data_dir\n", " ..." ] }, @@ -666,17 +666,17 @@ "metadata": {}, "source": [ "Uploaded files are readable on the host.\n", - "So the test can read from `ssh_data_dir` to check if the upload succeeded.\n", + "So the test can read from `sftp_data_dir` to check if the upload succeeded.\n", "This directory is mounted as `/data` on the server.\n", "\n", - "Using an SSH file transfer with `Client` requires some extra steps.\n", - "An example is given by `test_client_with_ssh` in https://github.com/SciCatProject/scitacean/blob/main/tests/transfer/ssh_test.py.\n", - "It uses a subclass of `SSHFileTransfer` to pass `ssh_connect_with_username_password` to the connection as `Client` cannot do this itself. \n", + "Using an SFTP file transfer with `Client` requires some extra steps.\n", + "An example is given by `test_client_with_sftp` in https://github.com/SciCatProject/scitacean/blob/main/tests/transfer/sftp_test.py.\n", + "It uses a subclass of `SFTPFileTransfer` to pass `sftp_connect_with_username_password` to the connection as `Client` cannot do this itself.\n", "\n", "### Seed data\n", "\n", - "The server's filesystem gets seeded with some files from https://github.com/SciCatProject/scitacean/tree/main/src/scitacean/testing/ssh/ssh_server_seed.\n", - "Those files are copied to `ssh_data_dir` on the host which is mounted to `/data/seed` on the server." + "The server's filesystem gets seeded with some files from https://github.com/SciCatProject/scitacean/tree/main/src/scitacean/testing/sftp/sftp_server_seed.\n", + "Those files are copied to `sftp_data_dir` on the host which is mounted to `/data/seed` on the server." ] }, { diff --git a/docs/user-guide/uploading.ipynb b/docs/user-guide/uploading.ipynb index d5325692..6134ae83 100644 --- a/docs/user-guide/uploading.ipynb +++ b/docs/user-guide/uploading.ipynb @@ -12,10 +12,10 @@ "We connect to SciCat and a file server using a [Client](../generated/classes/scitacean.Client.rst):\n", "```python\n", "from scitacean import Client\n", - "from scitacean.transfer.ssh import SSHFileTransfer\n", + "from scitacean.transfer.sftp import SFTPFileTransfer\n", "client = Client.from_token(url=\"https://scicat.ess.eu/api/v3\",\n", " token=...,\n", - " file_transfer=SSHFileTransfer(\n", + " file_transfer=SFTPFileTransfer(\n", " host=\"login.esss.dk\"\n", " ))\n", "```\n", From b5fca42dfb78da1743801a7c6f8c36e96ddc52d6 Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Mon, 28 Aug 2023 12:49:28 +0200 Subject: [PATCH 16/21] Deprecate SSHFileTransfer --- pyproject.toml | 2 ++ src/scitacean/__init__.py | 2 ++ src/scitacean/transfer/ssh.py | 10 ++++++++-- src/scitacean/warning.py | 15 +++++++++++++++ 4 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 src/scitacean/warning.py diff --git a/pyproject.toml b/pyproject.toml index cd3959ff..d29eba6e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,8 @@ filterwarnings = [ "error", # Many tests don't set a checksum, so File raises this warning. "ignore:Cannot check if local file:UserWarning", + # Internal deprecations. + "ignore:SSHFileTransfer is deprecated:scitacean.VisibleDeprecationWarning", # From fabric / invoke "ignore:_SixMetaPathImporter:ImportWarning", "ignore:the imp module is deprecated in favour of importlib:DeprecationWarning", diff --git a/src/scitacean/__init__.py b/src/scitacean/__init__.py index 532d3a4b..6fbde532 100644 --- a/src/scitacean/__init__.py +++ b/src/scitacean/__init__.py @@ -16,6 +16,7 @@ from .filesystem import RemotePath from .model import DatasetType from .pid import PID +from .warning import VisibleDeprecationWarning __all__ = ( "Client", @@ -29,4 +30,5 @@ "RemotePath", "ScicatCommError", "ScicatLoginError", + "VisibleDeprecationWarning", ) diff --git a/src/scitacean/transfer/ssh.py b/src/scitacean/transfer/ssh.py index 419a8e52..24155f99 100644 --- a/src/scitacean/transfer/ssh.py +++ b/src/scitacean/transfer/ssh.py @@ -2,6 +2,7 @@ # Copyright (c) 2023 SciCat Project (https://github.com/SciCatProject/scitacean) import os +import warnings from contextlib import contextmanager from datetime import datetime, timedelta from getpass import getpass @@ -21,10 +22,15 @@ from ..file import File from ..filesystem import RemotePath from ..logging import get_logger +from ..warning import VisibleDeprecationWarning from .util import source_folder_for -# TODO pass pid in put/revert? -# downloading does not need a pid, so it should not be required in the constructor/ +warnings.warn( + "SSHFileTransfer is deprecated and scheduled for removal in Scitacean v23.11.0." + "Use SFTPFileTransfer instead.", + VisibleDeprecationWarning, + stacklevel=0, +) class SSHDownloadConnection: diff --git a/src/scitacean/warning.py b/src/scitacean/warning.py new file mode 100644 index 00000000..ef2d76c5 --- /dev/null +++ b/src/scitacean/warning.py @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2023 SciCat Project (https://github.com/SciCatProject/scitacean) + +"""Custom warning classes.""" + + +class VisibleDeprecationWarning(UserWarning): + """Visible deprecation warning. + + By default, Python and in particular Jupyter will not show deprecation + warnings, so this class can be used when a very visible warning is helpful. + """ + + +VisibleDeprecationWarning.__module__ = "scitacean" From de358a42061c50b358b5d37dce568222cd7fea52 Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Mon, 28 Aug 2023 12:52:07 +0200 Subject: [PATCH 17/21] Don't compute checksum during uploads --- src/scitacean/transfer/sftp.py | 62 +++++----------------------------- tests/transfer/sftp_test.py | 31 +---------------- 2 files changed, 10 insertions(+), 83 deletions(-) diff --git a/src/scitacean/transfer/sftp.py b/src/scitacean/transfer/sftp.py index 4fd2de33..fe676487 100644 --- a/src/scitacean/transfer/sftp.py +++ b/src/scitacean/transfer/sftp.py @@ -5,7 +5,7 @@ from contextlib import contextmanager from datetime import datetime, timezone from pathlib import Path -from typing import Callable, Iterator, List, Optional, Tuple, Union +from typing import Callable, Iterator, List, Optional, Union from invoke.exceptions import UnexpectedExit from paramiko import SFTPAttributes, SFTPClient, SSHClient @@ -66,17 +66,13 @@ def upload_files(self, *files: File) -> List[File]: self._make_source_folder() uploaded = [] try: - for file in files: - up, exc = self._upload_file(file) - uploaded.append(up) # need to add this file in order to revert it - if exc is not None: - raise exc + uploaded.extend(self._upload_file(file) for file in files) except Exception: self.revert_upload(*uploaded) raise return uploaded - def _upload_file(self, file: File) -> Tuple[File, Optional[Exception]]: + def _upload_file(self, file: File) -> File: if file.local_path is None: raise ValueError( f"Cannot upload file to {file.remote_path}, " @@ -92,38 +88,12 @@ def _upload_file(self, file: File) -> Tuple[File, Optional[Exception]]: st = self._sftp_client.put( remotepath=remote_path.posix, localpath=os.fspath(file.local_path) ) - if (exc := self._validate_upload(file)) is not None: - return file, exc - return ( - file.uploaded( - remote_gid=str(st.st_gid), - remote_uid=str(st.st_uid), - remote_creation_time=datetime.now().astimezone(timezone.utc), - remote_perm=str(st.st_mode), - remote_size=st.st_size, - ), - None, - ) - - def _validate_upload(self, file: File) -> Optional[Exception]: - if (checksum := self._compute_checksum(file)) is None: - return None - if checksum != file.checksum(): - return FileUploadError( - f"Upload of file {file.remote_path} failed: " - f"Checksum of uploaded file ({checksum}) does not " - f"match checksum of local file ({file.checksum()}) " - f"using algorithm {file.checksum_algorithm}" - ) - return None - - def _compute_checksum(self, file: File) -> Optional[str]: - if file.checksum_algorithm is None: - return None - return _compute_remote_checksum( - self._sftp_client, - self.remote_path(file.remote_path), - file.checksum_algorithm, + return file.uploaded( + remote_gid=str(st.st_gid), + remote_uid=str(st.st_uid), + remote_creation_time=datetime.now().astimezone(timezone.utc), + remote_perm=str(st.st_mode), + remote_size=st.st_size, ) def revert_upload(self, *files: File) -> None: @@ -361,18 +331,4 @@ def _is_remote_dir(st_stat: SFTPAttributes) -> bool: return st_stat.st_mode & 0o040000 == 0o040000 -def _compute_remote_checksum( - sftp: SFTPClient, path: RemotePath, checksum_algorithm: str -) -> Optional[str]: - with sftp.open(path.posix, "r") as f: - try: - return f.check(checksum_algorithm).decode("utf-8") - except OSError as exc: - # Many servers don't support this. - # See https://docs.paramiko.org/en/latest/api/sftp.html - if "Operation unsupported" in exc.args: - return None - raise - - __all__ = ["SFTPFileTransfer", "SFTPUploadConnection", "SFTPDownloadConnection"] diff --git a/tests/transfer/sftp_test.py b/tests/transfer/sftp_test.py index 5b310a63..341adc8b 100644 --- a/tests/transfer/sftp_test.py +++ b/tests/transfer/sftp_test.py @@ -12,7 +12,7 @@ import paramiko import pytest -from scitacean import Dataset, File, FileUploadError, RemotePath +from scitacean import Dataset, File, RemotePath from scitacean.testing.client import FakeClient from scitacean.testing.sftp import IgnorePolicy, skip_if_not_sftp from scitacean.transfer.sftp import ( @@ -316,35 +316,6 @@ def connect(host: str, port: int) -> paramiko.SFTPClient: return connect -@pytest.mark.skip(reason="Checksums not supported on the test server") -def test_upload_file_detects_checksum_mismatch( - sftp_access, sftp_corrupting_connect, tmp_path, sftp_data_dir -): - ds = Dataset( - type="raw", - source_folder=RemotePath("/data/upload7"), - checksum_algorithm="blake2b", - ) - tmp_path.joinpath("file7.txt").write_text("File to test upload no 7") - - sftp = SFTPFileTransfer( - host=sftp_access.host, port=sftp_access.port, connect=sftp_corrupting_connect - ) - with sftp.connect_for_upload(dataset=ds) as con: - with pytest.raises(FileUploadError): - con.upload_files( - dataclasses.replace( - File.from_local( - path=tmp_path / "file7.txt", - remote_path=RemotePath("upload_7.txt"), - ), - checksum_algorithm="blake2b", - ) - ) - - assert "upload7" not in (p.name for p in sftp_data_dir.iterdir()) - - class RaisingSFTP(paramiko.SFTPClient): def put(self, localpath, remotepath, callback=None, confirm=True): raise RuntimeError("Upload disabled") From b05fc0d6b4f782f575478ccc82643081b25f8d28 Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Mon, 28 Aug 2023 12:57:45 +0200 Subject: [PATCH 18/21] Remove obsolete TODOs --- tests/transfer/sftp_test.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/transfer/sftp_test.py b/tests/transfer/sftp_test.py index 341adc8b..c61b22e0 100644 --- a/tests/transfer/sftp_test.py +++ b/tests/transfer/sftp_test.py @@ -22,8 +22,6 @@ ) -# TODo deep source folder -# TODO existing source folder @pytest.fixture(scope="session", autouse=True) def server(request, sftp_fileserver): skip_if_not_sftp(request) From ab3d042ee03432bd42d2d3e5a99501fbdc542aee Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Mon, 28 Aug 2023 13:01:56 +0200 Subject: [PATCH 19/21] Update dependencies --- requirements-pydantic2/base.txt | 26 ++++++++++++++++---------- requirements-pydantic2/test.txt | 12 ++++++------ requirements/base.txt | 6 ++++-- requirements/ci.txt | 6 +++--- requirements/dev.txt | 8 ++++---- requirements/docs.txt | 4 ++-- requirements/static.txt | 2 +- requirements/test.txt | 6 +++--- 8 files changed, 39 insertions(+), 31 deletions(-) diff --git a/requirements-pydantic2/base.txt b/requirements-pydantic2/base.txt index ffd6730d..223e7ab8 100644 --- a/requirements-pydantic2/base.txt +++ b/requirements-pydantic2/base.txt @@ -1,4 +1,4 @@ -# SHA1:e21461097cae87bbf0d2ae4f359f5573f0959e67 +# SHA1:2b55aaac9edd959da15c313d2f5f84f4e2cc5f2a # # This file is autogenerated by pip-compile-multi # To update, run: @@ -9,32 +9,36 @@ annotated-types==0.5.0 # via pydantic bcrypt==4.0.1 # via paramiko -certifi==2023.5.7 +certifi==2023.7.22 # via requests cffi==1.15.1 # via # cryptography # pynacl -charset-normalizer==3.1.0 +charset-normalizer==3.2.0 # via requests -cryptography==41.0.1 +cryptography==41.0.3 # via paramiko decorator==5.1.1 # via fabric -dnspython==2.3.0 +deprecated==1.2.14 + # via fabric +dnspython==2.4.2 # via email-validator email-validator==2.0.0.post2 # via -r requirements-pydantic2/base.in -fabric==3.1.0 +fabric==3.2.1 # via -r requirements-pydantic2/base.in idna==3.4 # via # email-validator # requests -invoke==2.1.3 - # via fabric -paramiko==3.2.0 +invoke==2.2.0 # via fabric +paramiko==3.3.1 + # via + # -r requirements-pydantic2/base.in + # fabric pycparser==2.21 # via cffi pydantic==2.0 @@ -53,5 +57,7 @@ typing-extensions==4.7.1 # via # pydantic # pydantic-core -urllib3==2.0.3 +urllib3==2.0.4 # via requests +wrapt==1.15.0 + # via deprecated diff --git a/requirements-pydantic2/test.txt b/requirements-pydantic2/test.txt index 1429615f..51f86f72 100644 --- a/requirements-pydantic2/test.txt +++ b/requirements-pydantic2/test.txt @@ -8,30 +8,30 @@ -r base.txt attrs==23.1.0 # via hypothesis -execnet==1.9.0 +execnet==2.0.2 # via pytest-xdist filelock==3.12.2 # via -r requirements-pydantic2/test.in -hypothesis==6.80.0 +hypothesis==6.82.6 # via -r requirements-pydantic2/test.in iniconfig==2.0.0 # via pytest packaging==23.1 # via pytest -pluggy==1.2.0 +pluggy==1.3.0 # via pytest -pyfakefs==5.2.2 +pyfakefs==5.2.4 # via -r requirements-pydantic2/test.in pytest==7.4.0 # via # -r requirements-pydantic2/test.in # pytest-randomly # pytest-xdist -pytest-randomly==3.12.0 +pytest-randomly==3.15.0 # via -r requirements-pydantic2/test.in pytest-xdist==3.3.1 # via -r requirements-pydantic2/test.in -pyyaml==6.0 +pyyaml==6.0.1 # via -r requirements-pydantic2/test.in sortedcontainers==2.4.0 # via hypothesis diff --git a/requirements/base.txt b/requirements/base.txt index 606cd943..b7101a09 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -1,4 +1,4 @@ -# SHA1:a579aa51495a128838b22e1692012f3918753a8c +# SHA1:f8c625c62cb057cead0d70b0a458c8aed4f43459 # # This file is autogenerated by pip-compile-multi # To update, run: @@ -34,7 +34,9 @@ idna==3.4 invoke==2.2.0 # via fabric paramiko==3.3.1 - # via fabric + # via + # -r requirements/base.in + # fabric pycparser==2.21 # via cffi pydantic==1.10.12 diff --git a/requirements/ci.txt b/requirements/ci.txt index 9eb1c122..c04f174d 100644 --- a/requirements/ci.txt +++ b/requirements/ci.txt @@ -25,11 +25,11 @@ platformdirs==3.10.0 # via # tox # virtualenv -pluggy==1.2.0 +pluggy==1.3.0 # via tox -pyproject-api==1.5.3 +pyproject-api==1.5.4 # via tox -tox==4.9.0 +tox==4.10.0 # via -r requirements/ci.in virtualenv==20.24.3 # via tox diff --git a/requirements/dev.txt b/requirements/dev.txt index ce049f20..1c98de92 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -23,7 +23,7 @@ async-lru==2.0.4 # via jupyterlab black==23.7.0 # via -r requirements/dev.in -click==8.1.6 +click==8.1.7 # via # black # pip-compile-multi @@ -45,7 +45,7 @@ jupyter-events==0.7.0 # via jupyter-server jupyter-lsp==2.2.0 # via jupyterlab -jupyter-server==2.7.1 +jupyter-server==2.7.2 # via # jupyter-lsp # jupyterlab @@ -111,9 +111,9 @@ uri-template==1.3.0 # via jsonschema webcolors==1.13 # via jsonschema -websocket-client==1.6.1 +websocket-client==1.6.2 # via jupyter-server -wheel==0.41.1 +wheel==0.41.2 # via pip-tools # The following packages are considered to be unsafe in a requirements file: diff --git a/requirements/docs.txt b/requirements/docs.txt index 633a4476..45d46671 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -105,7 +105,7 @@ nbformat==5.9.2 # nbclient # nbconvert # nbsphinx -nbsphinx==0.9.2 +nbsphinx==0.9.3 # via -r requirements/docs.in nest-asyncio==1.5.7 # via ipykernel @@ -183,7 +183,7 @@ sphinxcontrib-jsmath==1.0.1 # via sphinx sphinxcontrib-qthelp==1.0.6 # via sphinx -sphinxcontrib-serializinghtml==1.1.8 +sphinxcontrib-serializinghtml==1.1.9 # via sphinx stack-data==0.6.2 # via ipython diff --git a/requirements/static.txt b/requirements/static.txt index 8362c2f8..23276f06 100644 --- a/requirements/static.txt +++ b/requirements/static.txt @@ -11,7 +11,7 @@ distlib==0.3.7 # via virtualenv filelock==3.12.2 # via virtualenv -identify==2.5.26 +identify==2.5.27 # via pre-commit nodeenv==1.8.0 # via pre-commit diff --git a/requirements/test.txt b/requirements/test.txt index 1f71a303..7cb6019a 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -12,15 +12,15 @@ execnet==2.0.2 # via pytest-xdist filelock==3.12.2 # via -r requirements/test.in -hypothesis==6.82.4 +hypothesis==6.82.6 # via -r requirements/test.in iniconfig==2.0.0 # via pytest packaging==23.1 # via pytest -pluggy==1.2.0 +pluggy==1.3.0 # via pytest -pyfakefs==5.2.3 +pyfakefs==5.2.4 # via -r requirements/test.in pytest==7.4.0 # via From ca740cf2b1355688a26495861c104c4e5c2e7f94 Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Mon, 28 Aug 2023 14:04:21 +0200 Subject: [PATCH 20/21] Use port=22 by default --- src/scitacean/transfer/sftp.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/scitacean/transfer/sftp.py b/src/scitacean/transfer/sftp.py index fe676487..fa59bf45 100644 --- a/src/scitacean/transfer/sftp.py +++ b/src/scitacean/transfer/sftp.py @@ -211,7 +211,7 @@ def __init__( self, *, host: str, - port: Optional[int] = None, + port: int = 22, source_folder: Optional[Union[str, RemotePath]] = None, connect: Optional[Callable[[str, Optional[int]], SFTPClient]] = None, ) -> None: @@ -275,19 +275,16 @@ def connect_for_upload(self, dataset: Dataset) -> Iterator[SFTPUploadConnection] sftp_client.close() -def _default_connect(host: str, port: Optional[int]) -> SFTPClient: +def _default_connect(host: str, port: int) -> SFTPClient: client = SSHClient() client.load_system_host_keys() - if port is not None: - client.connect(hostname=host, port=port) - else: - client.connect(hostname=host) + client.connect(hostname=host, port=port) return client.open_sftp() def _connect( host: str, - port: Optional[int], + port: int, connect: Optional[Callable[[str, Optional[int]], SFTPClient]], ) -> SFTPClient: try: From d75d420af3eed75727f8befe39435854eee75edc Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Mon, 28 Aug 2023 14:13:59 +0200 Subject: [PATCH 21/21] Add release note --- docs/release-notes.rst | 3 ++- src/scitacean/transfer/ssh.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/release-notes.rst b/docs/release-notes.rst index 9c335a9e..756ad142 100644 --- a/docs/release-notes.rst +++ b/docs/release-notes.rst @@ -44,7 +44,6 @@ Features * File downloads now use the checksum algorithm stored in SciCat when possible. So it is no longer necessary to specify it by hand in many cases. * Added ``SFTPFileTransfer`` which is similar to ``SSHFileTransfer`` but relies only on SFTP. - ``SSHFileTransfer`` may be removed in the future. Breaking changes ~~~~~~~~~~~~~~~~ @@ -60,6 +59,8 @@ Documentation Deprecations ~~~~~~~~~~~~ +* Deprecated ``SSHFileTransfer`` in favor of ``SFTPFileTransfer``. + Stability, Maintainability, and Testing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/scitacean/transfer/ssh.py b/src/scitacean/transfer/ssh.py index 24155f99..20c28921 100644 --- a/src/scitacean/transfer/ssh.py +++ b/src/scitacean/transfer/ssh.py @@ -284,6 +284,8 @@ class SSHFileTransfer: A useful approach is to include a unique ID in the source folder, for example ``"/some/base/folder/{uid}"``, to avoid clashes between different datasets. Scitacean will fill in the ``"{uid}"`` placeholder with a new UUID4. + + .. deprecated:: 23.08.0 """ def __init__(