From 5e9a597acd3a8a3b937026b54ca339dc9495cc30 Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Wed, 15 Jan 2025 13:38:32 +0100 Subject: [PATCH] remove code duplication in dataset --- rocrate/model/dataset.py | 48 +++++++++++----------------------------- rocrate/rocrate.py | 1 - 2 files changed, 13 insertions(+), 36 deletions(-) diff --git a/rocrate/model/dataset.py b/rocrate/model/dataset.py index dfb9048c..976762ee 100644 --- a/rocrate/model/dataset.py +++ b/rocrate/model/dataset.py @@ -48,42 +48,21 @@ def format_id(self, identifier): return identifier.rstrip("/") + "/" def write(self, base_path): - out_path = Path(base_path) / self.id - if is_url(str(self.source)): - if self.validate_url and not self.fetch_remote: - with urlopen(self.source) as _: - self._jsonld['sdDatePublished'] = iso_now() - if self.fetch_remote: - self.__get_parts(out_path) - else: - if self.source is None: - out_path.mkdir(parents=True, exist_ok=True) - else: - if not Path(self.source).exists(): - raise FileNotFoundError( - errno.ENOENT, os.strerror(errno.ENOENT), str(self.source) - ) - out_path.mkdir(parents=True, exist_ok=True) - if not self.crate.source: - self.crate._copy_unlisted(self.source, out_path) - - def __get_parts(self, out_path): - out_path.mkdir(parents=True, exist_ok=True) - base = self.source.rstrip("/") - for entry in self._jsonld.get("hasPart", []): - try: - part = entry["@id"] - except KeyError: - continue - if is_url(part) or part.startswith("/"): - raise RuntimeError(f"'{self.source}': part '{part}' is not a relative path") - part_uri = f"{base}/{part}" - part_out_path = out_path / part - with urlopen(part_uri) as r, open(part_out_path, 'wb') as f: - shutil.copyfileobj(r, f) + out_path = Path(base_path) + + out_file_path, out_file = None, None + for path, chunk in self.stream(): + if path != out_file_path: + if out_file: + out_file.close() + out_file_path = out_path / path + out_file_path.parent.mkdir(parents=True, exist_ok=True) + out_file = open(out_file_path, "wb") + out_file.write(chunk) + if out_file is not None: + out_file.close() def stream(self) -> Generator[tuple[str, bytes], None, None]: - # TODO code duplication from self.write. Refactor to use stream? out_path = Path(self.id) # relative output path if is_url(str(self.source)): if self.validate_url and not self.fetch_remote: @@ -107,7 +86,6 @@ def stream(self) -> Generator[tuple[str, bytes], None, None]: errno.ENOENT, os.strerror(errno.ENOENT), str(self.source) ) if not self.crate.source: - # code copied from rocrate.__copy_unlisted for root, _, files in os.walk(self.source): root = Path(root) for name in files: diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index 3e535a7b..df2c538a 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -476,7 +476,6 @@ def write_zip(self, out_path): out_file.write(chunk) return out_path - # TODO use context manager? https://docs.python.org/3/library/contextlib.html def stream_zip(self): """ Creates a stream of bytes representing the RO-Crate as a ZIP file.""" buffer = MemoryBuffer()