From ffb30bfae1bae3155f41ff3e15f7e2595fbf30a8 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Thu, 20 Jul 2023 16:01:00 -0600 Subject: [PATCH 1/7] adds double dile staging prevention --- src/dcqc/file.py | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/src/dcqc/file.py b/src/dcqc/file.py index f949764..3450bd6 100644 --- a/src/dcqc/file.py +++ b/src/dcqc/file.py @@ -12,11 +12,12 @@ from __future__ import annotations import os +import glob from collections.abc import Collection, Mapping from copy import deepcopy from dataclasses import dataclass from pathlib import Path -from tempfile import mkdtemp +from tempfile import mkdtemp, gettempdir from typing import Any, ClassVar, Optional from warnings import warn @@ -308,6 +309,25 @@ def is_file_local(self) -> bool: """ return self._local_path is not None + def file_already_staged(self) -> list[Path]: + """Check if the target file has already been staged to the remote directory. + + Returns: + staged_file_paths (list): List of already staged file paths. Empty list if file has not been staged. + + Raises: + FileExistsError: If the file has already been staged more than once. This would cause a name collision in Nextflow. + """ + path_str = os.path.join(gettempdir(), self.tmp_dir + "*", self.name) + staged_file_strs = glob.glob(path_str) + staged_file_paths = [Path(path) for path in staged_file_strs] + if len(staged_file_paths) > 1: + message = ( + f"File has already been staged multiple times: {staged_file_paths}" + ) + raise FileExistsError(message) + return staged_file_paths + def stage( self, destination: Optional[Path] = None, @@ -338,8 +358,14 @@ def stage( if self._local_path is not None: return self._local_path else: - destination_str = mkdtemp(prefix=self.tmp_dir) - destination = Path(destination_str) + # if file has already been staged + staged_files = self.file_already_staged() + if not staged_files: + destination_str = mkdtemp(prefix=self.tmp_dir) + destination = Path(destination_str) + else: + destination = staged_files[0] + return destination # By this point, destination is defined (not None) if destination.is_dir(): From 7501d57d4fe51c5a14d2d354e985ed62d3e281b6 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Thu, 20 Jul 2023 16:06:22 -0600 Subject: [PATCH 2/7] change method name --- src/dcqc/file.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dcqc/file.py b/src/dcqc/file.py index 3450bd6..496ae4a 100644 --- a/src/dcqc/file.py +++ b/src/dcqc/file.py @@ -309,7 +309,7 @@ def is_file_local(self) -> bool: """ return self._local_path is not None - def file_already_staged(self) -> list[Path]: + def already_staged(self) -> list[Path]: """Check if the target file has already been staged to the remote directory. Returns: @@ -359,7 +359,7 @@ def stage( return self._local_path else: # if file has already been staged - staged_files = self.file_already_staged() + staged_files = self.already_staged() if not staged_files: destination_str = mkdtemp(prefix=self.tmp_dir) destination = Path(destination_str) From 1952dee87fa8de5803218458bc35557589496717 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Fri, 21 Jul 2023 13:09:29 -0600 Subject: [PATCH 3/7] pre-commit fix --- src/dcqc/file.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/dcqc/file.py b/src/dcqc/file.py index 496ae4a..a1c55be 100644 --- a/src/dcqc/file.py +++ b/src/dcqc/file.py @@ -11,13 +11,13 @@ from __future__ import annotations -import os import glob +import os from collections.abc import Collection, Mapping from copy import deepcopy from dataclasses import dataclass from pathlib import Path -from tempfile import mkdtemp, gettempdir +from tempfile import gettempdir, mkdtemp from typing import Any, ClassVar, Optional from warnings import warn @@ -313,10 +313,12 @@ def already_staged(self) -> list[Path]: """Check if the target file has already been staged to the remote directory. Returns: - staged_file_paths (list): List of already staged file paths. Empty list if file has not been staged. + staged_file_paths (list): List of already staged file paths. + Empty list if file has not been staged. Raises: - FileExistsError: If the file has already been staged more than once. This would cause a name collision in Nextflow. + FileExistsError: If the file has already been staged more than once. + This would cause a name collision in Nextflow. """ path_str = os.path.join(gettempdir(), self.tmp_dir + "*", self.name) staged_file_strs = glob.glob(path_str) From 7182247af442a8f80f9505ac7bc7124a061f02cb Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Fri, 21 Jul 2023 13:27:46 -0600 Subject: [PATCH 4/7] adds local_path setting --- src/dcqc/file.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/dcqc/file.py b/src/dcqc/file.py index a1c55be..40039c5 100644 --- a/src/dcqc/file.py +++ b/src/dcqc/file.py @@ -367,6 +367,7 @@ def stage( destination = Path(destination_str) else: destination = staged_files[0] + self._local_path = destination return destination # By this point, destination is defined (not None) From d1781a6105e76e41d4714e56d91b9f88b8531d94 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Fri, 21 Jul 2023 14:00:38 -0600 Subject: [PATCH 5/7] adds test for multiple files staged error --- tests/test_file.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/tests/test_file.py b/tests/test_file.py index 6e96adc..bcde76d 100644 --- a/tests/test_file.py +++ b/tests/test_file.py @@ -1,5 +1,8 @@ +import os +import shutil + from pathlib import Path -from tempfile import TemporaryDirectory +from tempfile import TemporaryDirectory, gettempdir import pytest @@ -62,6 +65,27 @@ def test_that_a_local_temporary_path_is_created_when_staging_a_remote_file(test_ assert remote_file.local_path == staged_path +def test_that_error_is_raised_when_a_file_has_been_staged_multiple_times(test_files): + duplicate_files = [ + os.path.join(gettempdir(), "dcqc-staged-test1/test.txt"), + os.path.join(gettempdir(), "dcqc-staged-test2/test.txt"), + ] + for file_path in duplicate_files: + parent_dir = os.path.dirname(file_path) + if not os.path.exists(parent_dir): + os.makedirs(parent_dir) + if not os.path.exists(file_path): + with open(file_path, "w"): + pass + remote_file = test_files["remote"] + with pytest.raises(FileExistsError): + remote_file.stage() + for file_path in duplicate_files: + directory_path = os.path.dirname(file_path) + if os.path.exists(directory_path): + shutil.rmtree(directory_path) + + def test_that_a_remote_file_is_created_when_staged_with_a_destination(test_files): remote_file = test_files["remote"] with TemporaryDirectory() as tmp_dir: From 064d2e348dd9e8d896ba1aade89adbe7a00622f1 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Fri, 21 Jul 2023 14:02:41 -0600 Subject: [PATCH 6/7] pre-commit fix --- tests/test_file.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_file.py b/tests/test_file.py index bcde76d..9403257 100644 --- a/tests/test_file.py +++ b/tests/test_file.py @@ -1,6 +1,5 @@ import os import shutil - from pathlib import Path from tempfile import TemporaryDirectory, gettempdir From 6b852fa3fcd1511994dd9937d41302b2f8bd1158 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Mon, 24 Jul 2023 15:02:25 -0600 Subject: [PATCH 7/7] updates tests to account for already staged files --- src/dcqc/file.py | 2 +- tests/test_file.py | 68 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 54 insertions(+), 16 deletions(-) diff --git a/src/dcqc/file.py b/src/dcqc/file.py index 40039c5..dcd29c5 100644 --- a/src/dcqc/file.py +++ b/src/dcqc/file.py @@ -360,7 +360,7 @@ def stage( if self._local_path is not None: return self._local_path else: - # if file has already been staged + # check if file has already been staged staged_files = self.already_staged() if not staged_files: destination_str = mkdtemp(prefix=self.tmp_dir) diff --git a/tests/test_file.py b/tests/test_file.py index 9403257..68b5eec 100644 --- a/tests/test_file.py +++ b/tests/test_file.py @@ -1,13 +1,55 @@ +import glob import os import shutil from pathlib import Path from tempfile import TemporaryDirectory, gettempdir +from typing import List import pytest from dcqc.file import File, FileType +def create_duplicate_files(file_num) -> List[str]: + """Create duplicate files (empty txt) for testing. + + Args: + file_num (int): number of files to create + + Returns: + file_path_list (List[str]): list of file paths + """ + file_path_list = [ + os.path.join(gettempdir(), f"dcqc-staged-test{i}/test.txt") + for i in range(file_num) + ] + + for file_path in file_path_list: + parent_dir = os.path.dirname(file_path) + if not os.path.exists(parent_dir): + os.makedirs(parent_dir) + if not os.path.exists(file_path): + with open(file_path, "w"): + pass + + return file_path_list + + +def remove_staged_files(): + """Removes all staged files and their parent directories + which follow the 'dcqc-staged-*' pattern. + + To be used at the end of all tests which result in such + files being created. + """ + path_str = os.path.join(gettempdir(), "dcqc-staged-" + "*", "test.txt") + staged_file_strs = glob.glob(path_str) + for staged_file_str in staged_file_strs: + directory_path = os.path.dirname(staged_file_str) + if os.path.exists(directory_path): + shutil.rmtree(directory_path) + + def test_for_an_error_if_registering_a_duplicate_file_type(): with pytest.raises(ValueError): FileType("txt", (".foo",)) @@ -62,27 +104,23 @@ def test_that_a_local_temporary_path_is_created_when_staging_a_remote_file(test_ staged_path = remote_file.stage() assert staged_path.exists() assert remote_file.local_path == staged_path + remove_staged_files() def test_that_error_is_raised_when_a_file_has_been_staged_multiple_times(test_files): - duplicate_files = [ - os.path.join(gettempdir(), "dcqc-staged-test1/test.txt"), - os.path.join(gettempdir(), "dcqc-staged-test2/test.txt"), - ] - for file_path in duplicate_files: - parent_dir = os.path.dirname(file_path) - if not os.path.exists(parent_dir): - os.makedirs(parent_dir) - if not os.path.exists(file_path): - with open(file_path, "w"): - pass + create_duplicate_files(2) remote_file = test_files["remote"] with pytest.raises(FileExistsError): remote_file.stage() - for file_path in duplicate_files: - directory_path = os.path.dirname(file_path) - if os.path.exists(directory_path): - shutil.rmtree(directory_path) + remove_staged_files() + + +def test_that_file_is_not_staged_when_it_already_has_been_staged(test_files): + duplicate_file = create_duplicate_files(1)[0] + remote_file = test_files["remote"] + destination = remote_file.stage() + assert destination == Path(duplicate_file) + remove_staged_files() def test_that_a_remote_file_is_created_when_staged_with_a_destination(test_files):