Skip to content

Commit

Permalink
remove code duplication in dataset
Browse files (browse the repository at this point in the history)
  • Loading branch information
dnlbauer committed Jan 15, 2025
1 parent 7f663ff commit 5e9a597
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 36 deletions.
48 changes: 13 additions & 35 deletions rocrate/model/dataset.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -48,42 +48,21 @@ def format_id(self, identifier):
return identifier.rstrip("/") + "/"

def write(self, base_path):
out_path = Path(base_path) / self.id
if is_url(str(self.source)):
if self.validate_url and not self.fetch_remote:
with urlopen(self.source) as _:
self._jsonld['sdDatePublished'] = iso_now()
if self.fetch_remote:
self.__get_parts(out_path)
else:
if self.source is None:
out_path.mkdir(parents=True, exist_ok=True)
else:
if not Path(self.source).exists():
raise FileNotFoundError(
errno.ENOENT, os.strerror(errno.ENOENT), str(self.source)
)
out_path.mkdir(parents=True, exist_ok=True)
if not self.crate.source:
self.crate._copy_unlisted(self.source, out_path)

def __get_parts(self, out_path):
out_path.mkdir(parents=True, exist_ok=True)
base = self.source.rstrip("/")
for entry in self._jsonld.get("hasPart", []):
try:
part = entry["@id"]
except KeyError:
continue
if is_url(part) or part.startswith("/"):
raise RuntimeError(f"'{self.source}': part '{part}' is not a relative path")
part_uri = f"{base}/{part}"
part_out_path = out_path / part
with urlopen(part_uri) as r, open(part_out_path, 'wb') as f:
shutil.copyfileobj(r, f)
out_path = Path(base_path)

out_file_path, out_file = None, None
for path, chunk in self.stream():
if path != out_file_path:
if out_file:
out_file.close()
out_file_path = out_path / path
out_file_path.parent.mkdir(parents=True, exist_ok=True)
out_file = open(out_file_path, "wb")
out_file.write(chunk)
if out_file is not None:
out_file.close()

def stream(self) -> Generator[tuple[str, bytes], None, None]:
# TODO code duplication from self.write. Refactor to use stream?
out_path = Path(self.id) # relative output path
if is_url(str(self.source)):
if self.validate_url and not self.fetch_remote:
Expand All @@ -107,7 +86,6 @@ def stream(self) -> Generator[tuple[str, bytes], None, None]:
errno.ENOENT, os.strerror(errno.ENOENT), str(self.source)
)
if not self.crate.source:
# code copied from rocrate.__copy_unlisted
for root, _, files in os.walk(self.source):
root = Path(root)
for name in files:
Expand Down
1 change: 0 additions & 1 deletion rocrate/rocrate.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -476,7 +476,6 @@ def write_zip(self, out_path):
out_file.write(chunk)
return out_path

# TODO use context manager? https://docs.python.org/3/library/contextlib.html
def stream_zip(self):
""" Creates a stream of bytes representing the RO-Crate as a ZIP file."""
buffer = MemoryBuffer()
Expand Down

0 comments on commit 5e9a597

Please sign in to comment.