From ef851dcdf7cac93bd853cc688811941eeef40eec Mon Sep 17 00:00:00 2001 From: Koncopd Date: Mon, 16 Dec 2024 16:24:51 +0100 Subject: [PATCH 01/13] simplify sync code --- lamindb_setup/core/upath.py | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/lamindb_setup/core/upath.py b/lamindb_setup/core/upath.py index 1a33403e..070229c5 100644 --- a/lamindb_setup/core/upath.py +++ b/lamindb_setup/core/upath.py @@ -350,27 +350,19 @@ def synchronize( exists = True cloud_mts = timestamp else: - # hf requires special treatment - if protocol == "hf": - try: - stat_hf = self.stat().as_info() - is_dir = stat_hf["type"] == "directory" - exists = True - if not is_dir: - cloud_mts = stat_hf["last_commit"].date.timestamp() - except FileNotFoundError: - exists = False - else: - # perform only one network request to check existence, type and timestamp - try: - cloud_mts = self.modified.timestamp() - is_dir = False - exists = True - except FileNotFoundError: - exists = False - except IsADirectoryError: - is_dir = True - exists = True + try: + path_stat = self.stat() + path_info = path_stat.as_info() + exists = True + is_dir = path_info["type"] == "directory" + if not is_dir: + # hf requires special treatment + if protocol == "hf": + cloud_mts = path_info["last_commit"].date.timestamp() + else: + cloud_mts = path_stat.st_mtime + except FileNotFoundError: + exists = False if not exists: warn_or_error = f"The original path {self} does not exist anymore." From 1b3b9b0e4ff01e782eb901e8827721fb6886f3a5 Mon Sep 17 00:00:00 2001 From: Koncopd Date: Mon, 16 Dec 2024 18:15:59 +0100 Subject: [PATCH 02/13] alow to sync with size instead of timestamp --- lamindb_setup/core/upath.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/lamindb_setup/core/upath.py b/lamindb_setup/core/upath.py index 070229c5..9326f795 100644 --- a/lamindb_setup/core/upath.py +++ b/lamindb_setup/core/upath.py @@ -190,7 +190,13 @@ def relative_update(self, inc=1): pass def update_relative_value(self, inc=1): - self.value += inc + if inc != 0: + self.value += inc + # this is specific to http filesystem + # for some reason the last update is 0 always + # here 100% is forced manually in this case + elif self.value >= 0.999: + self.value = self.size self.call() def branch(self, path_1, path_2, kwargs): @@ -351,16 +357,16 @@ def synchronize( cloud_mts = timestamp else: try: - path_stat = self.stat() - path_info = path_stat.as_info() + cloud_stat = self.stat() + cloud_info = cloud_stat.as_info() exists = True - is_dir = path_info["type"] == "directory" + is_dir = cloud_info["type"] == "directory" if not is_dir: # hf requires special treatment if protocol == "hf": - cloud_mts = path_info["last_commit"].date.timestamp() + cloud_mts = cloud_info["last_commit"].date.timestamp() else: - cloud_mts = path_stat.st_mtime + cloud_mts = cloud_stat.st_mtime except FileNotFoundError: exists = False @@ -443,8 +449,16 @@ def synchronize( callback, print_progress, objectpath.name, "synchronizing" ) if objectpath.exists(): - local_mts_obj = objectpath.stat().st_mtime # type: ignore - need_synchronize = cloud_mts > local_mts_obj + if cloud_mts != 0: + local_mts_obj = objectpath.stat().st_mtime + need_synchronize = cloud_mts > local_mts_obj + else: + # this is true for http for example + # where size is present but st_mtime is not + # we assume that any change without the change in size is unlikely + cloud_size = cloud_stat.st_size + local_size_obj = objectpath.stat().st_size + need_synchronize = cloud_size != local_size_obj else: objectpath.parent.mkdir(parents=True, exist_ok=True) need_synchronize = True @@ -456,7 +470,8 @@ def synchronize( self.download_to( objectpath, recursive=False, print_progress=False, callback=callback ) - os.utime(objectpath, times=(cloud_mts, cloud_mts)) + if cloud_mts != 0: + os.utime(objectpath, times=(cloud_mts, cloud_mts)) else: # nothing happens if parent_update is not defined # because of Callback.no_op From 637388575c9c5e1a06b5679dfb685b70f74c57d8 Mon Sep 17 00:00:00 2001 From: Koncopd Date: Mon, 16 Dec 2024 18:56:48 +0100 Subject: [PATCH 03/13] correct local path for http urls --- lamindb_setup/core/_settings.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/lamindb_setup/core/_settings.py b/lamindb_setup/core/_settings.py index e1a137cb..a04728e9 100644 --- a/lamindb_setup/core/_settings.py +++ b/lamindb_setup/core/_settings.py @@ -200,9 +200,14 @@ def cloud_to_local_no_update( # cache_key is ignored if filepath is a local path if not isinstance(filepath, LocalPathClasses): # settings is defined further in this file - local_filepath = settings.cache_dir / ( - filepath.path if cache_key is None else cache_key # type: ignore - ) + if cache_key is None: + local_key = filepath.path # type: ignore + protocol = filepath.protocol # type: ignore + if protocol in {"http", "https"}: + local_key = local_key.removeprefix(protocol + "://") + else: + local_key = cache_key + local_filepath = settings.cache_dir / local_key else: local_filepath = filepath return local_filepath From 15734ee9a8563ab141346c4763eb5e76a67645c0 Mon Sep 17 00:00:00 2001 From: Koncopd Date: Mon, 16 Dec 2024 19:52:50 +0100 Subject: [PATCH 04/13] test --- docs/hub-prod/test-cloud-sync.ipynb | 54 +++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/docs/hub-prod/test-cloud-sync.ipynb b/docs/hub-prod/test-cloud-sync.ipynb index 41e7fd0e..19677de8 100644 --- a/docs/hub-prod/test-cloud-sync.ipynb +++ b/docs/hub-prod/test-cloud-sync.ipynb @@ -149,6 +149,22 @@ "assert settings.paths.cloud_to_local_no_update(dir_sync.as_posix(), cache_key=\"dir_cache/key\") == settings.cache_dir / \"dir_cache/key\"" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "df6a9be4", + "metadata": {}, + "outputs": [], + "source": [ + "# for http urls\n", + "http_path = UPath(\"https://raw.githubusercontent.com/laminlabs/lamindb-setup/refs/heads/main/README.md\")\n", + "http_key = \"raw.githubusercontent.com/laminlabs/lamindb-setup/refs/heads/main/README.md\"\n", + "\n", + "assert settings.paths.cloud_to_local_no_update(http_path) == settings.cache_dir / http_key\n", + "assert settings.paths.cloud_to_local_no_update(str(http_path)) == settings.cache_dir / http_key\n", + "assert settings.paths.cloud_to_local_no_update(http_path, cache_key=\"check/README.md\") == settings.cache_dir / \"check/README.md\"" + ] + }, { "cell_type": "markdown", "id": "0b79f2f7", @@ -191,6 +207,44 @@ "dir_sync_local.rmdir()" ] }, + { + "cell_type": "markdown", + "id": "d2246f90", + "metadata": {}, + "source": [ + "Test `cloud_to_local` for http" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc1b7736", + "metadata": {}, + "outputs": [], + "source": [ + "http_local = settings.paths.cloud_to_local(http_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca9f0ba8", + "metadata": {}, + "outputs": [], + "source": [ + "assert http_local.stat().st_size == http_path.stat().st_size" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9da41c21", + "metadata": {}, + "outputs": [], + "source": [ + "http_local.unlink()" + ] + }, { "cell_type": "markdown", "id": "574c3f95", From c8d24f6d3c2a1b9b8fe0f0e187c31a08ad08e0d4 Mon Sep 17 00:00:00 2001 From: Koncopd Date: Mon, 16 Dec 2024 19:54:55 +0100 Subject: [PATCH 05/13] comment --- lamindb_setup/core/upath.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lamindb_setup/core/upath.py b/lamindb_setup/core/upath.py index 9326f795..9fc19fee 100644 --- a/lamindb_setup/core/upath.py +++ b/lamindb_setup/core/upath.py @@ -384,6 +384,7 @@ def synchronize( return None # synchronization logic for directories + # to synchronize directories, it should be possible to get modification times if is_dir: files = self.fs.find(str(self), detail=True) if protocol == "s3": From 5600c23e76e743b97507bc2f003f5eaeae843153 Mon Sep 17 00:00:00 2001 From: Koncopd Date: Mon, 16 Dec 2024 20:17:45 +0100 Subject: [PATCH 06/13] more checks --- docs/hub-prod/test-cloud-sync.ipynb | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/docs/hub-prod/test-cloud-sync.ipynb b/docs/hub-prod/test-cloud-sync.ipynb index 19677de8..12c814a9 100644 --- a/docs/hub-prod/test-cloud-sync.ipynb +++ b/docs/hub-prod/test-cloud-sync.ipynb @@ -152,12 +152,27 @@ { "cell_type": "code", "execution_count": null, - "id": "df6a9be4", + "id": "eda84820", "metadata": {}, "outputs": [], "source": [ "# for http urls\n", "http_path = UPath(\"https://raw.githubusercontent.com/laminlabs/lamindb-setup/refs/heads/main/README.md\")\n", + "assert http_path.protocol == \"https\"\n", + "\n", + "http_stat = http_path.stat()\n", + "assert http_stat.st_size != 0\n", + "assert http_stat.st_mtime == 0\n", + "assert http_stat.as_info()[\"type\"] == \"file\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df6a9be4", + "metadata": {}, + "outputs": [], + "source": [ "http_key = \"raw.githubusercontent.com/laminlabs/lamindb-setup/refs/heads/main/README.md\"\n", "\n", "assert settings.paths.cloud_to_local_no_update(http_path) == settings.cache_dir / http_key\n", @@ -232,6 +247,7 @@ "metadata": {}, "outputs": [], "source": [ + "assert isinstance(http_local, LocalPathClasses)\n", "assert http_local.stat().st_size == http_path.stat().st_size" ] }, From 01e5e7efb4fba2427ae1f57af63331dd0b5fc5fc Mon Sep 17 00:00:00 2001 From: Koncopd Date: Mon, 16 Dec 2024 20:31:34 +0100 Subject: [PATCH 07/13] check double sync --- docs/hub-prod/test-cloud-sync.ipynb | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/hub-prod/test-cloud-sync.ipynb b/docs/hub-prod/test-cloud-sync.ipynb index 12c814a9..39f72705 100644 --- a/docs/hub-prod/test-cloud-sync.ipynb +++ b/docs/hub-prod/test-cloud-sync.ipynb @@ -251,6 +251,18 @@ "assert http_local.stat().st_size == http_path.stat().st_size" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ae7cd2b", + "metadata": {}, + "outputs": [], + "source": [ + "http_local_mtime = http_local.stat().st_mtime\n", + "# no changes here because the file exists already\n", + "assert settings.paths.cloud_to_local(http_path).stat().st_mtime == http_local_mtime" + ] + }, { "cell_type": "code", "execution_count": null, From 1c14b3a9e3be3ff23ef399dbfe8b31122193647a Mon Sep 17 00:00:00 2001 From: Koncopd Date: Mon, 16 Dec 2024 21:11:17 +0100 Subject: [PATCH 08/13] allow http storages --- lamindb_setup/core/_settings_storage.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lamindb_setup/core/_settings_storage.py b/lamindb_setup/core/_settings_storage.py index 125ec0b9..3b7ba78e 100644 --- a/lamindb_setup/core/_settings_storage.py +++ b/lamindb_setup/core/_settings_storage.py @@ -114,7 +114,7 @@ def init_storage( root_str = f"s3://lamin-{region}/{uid}" else: root_str = f"s3://lamin-hosted-test/{uid}" - elif root_str.startswith(("gs://", "s3://", "hf://")): + elif root_str.startswith(("gs://", "s3://", "hf://", "http://", "https://")): pass else: # local path try: @@ -227,7 +227,7 @@ def _mark_storage_root(self) -> UPath: @property def record(self) -> Any: - """Storage record in current instance.""" + """Storage record in the current instance.""" if self._record is None: # dynamic import because of import order from lnschema_core.models import Storage @@ -299,10 +299,10 @@ def region(self) -> str | None: return self._region @property - def type(self) -> Literal["local", "s3", "gs"]: + def type(self) -> Literal["local", "s3", "gs", "hf", "http", "https"]: """AWS S3 vs. Google Cloud vs. local. - Returns the protocol as a string: "local", "s3", "gs". + Returns the protocol as a string: "local", "s3", "gs", "http", "https". """ import fsspec @@ -345,5 +345,5 @@ def key_to_filepath(self, filekey: UPathStr) -> UPath: return self.root / filekey def local_filepath(self, filekey: UPathStr) -> UPath: - """Local (cache) filepath from filekey: `local(filepath(...))`.""" + """Local (cache) filepath from filekey.""" return self.cloud_to_local(self.key_to_filepath(filekey)) From dc36903203fecca976bc5256dfbe81ce7c63c364 Mon Sep 17 00:00:00 2001 From: Koncopd Date: Mon, 16 Dec 2024 21:57:43 +0100 Subject: [PATCH 09/13] do not use http etag for now --- lamindb_setup/core/upath.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lamindb_setup/core/upath.py b/lamindb_setup/core/upath.py index 9fc19fee..4f92ab29 100644 --- a/lamindb_setup/core/upath.py +++ b/lamindb_setup/core/upath.py @@ -747,7 +747,9 @@ def get_stat_file_cloud(stat: dict) -> tuple[int, str | None, str | None]: hash = b16_to_b64(stat["blob_id"]) hash_type = "sha1" # s3 - elif "ETag" in stat: + # StorageClass is checked to be sure that it is indeed s3 + # because http also has ETag + elif "ETag" in stat and "StorageClass" in stat: etag = stat["ETag"] # small files if "-" not in etag: From 992ec40e4e3e4070decab55be5ca49b0567f07ae Mon Sep 17 00:00:00 2001 From: Koncopd Date: Tue, 17 Dec 2024 11:28:03 +0100 Subject: [PATCH 10/13] correctly check protocols in init_storage --- lamindb_setup/core/_settings_storage.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/lamindb_setup/core/_settings_storage.py b/lamindb_setup/core/_settings_storage.py index 3b7ba78e..073ffdb0 100644 --- a/lamindb_setup/core/_settings_storage.py +++ b/lamindb_setup/core/_settings_storage.py @@ -7,6 +7,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Literal +import fsspec from lamin_utils import logger from ._aws_credentials import HOSTED_REGIONS, get_aws_credentials_manager @@ -114,16 +115,12 @@ def init_storage( root_str = f"s3://lamin-{region}/{uid}" else: root_str = f"s3://lamin-hosted-test/{uid}" - elif root_str.startswith(("gs://", "s3://", "hf://", "http://", "https://")): - pass - else: # local path - try: - _ = Path(root_str) - except Exception as e: - logger.error( - "`storage` is not a valid local, GCP storage, AWS S3 path or Hugging Face path" - ) - raise e + elif (input_protocol := fsspec.utils.get_protocol(root_str)) not in ( + valid_protocols := ("file", "gs", "s3", "hf", "http", "https") + ): + raise ValueError( + f"Protocol {input_protocol} is not supported, valid prootocols are {', '.join(('local',) + valid_protocols[1:])}" + ) ssettings = StorageSettings( uid=uid, root=root_str, From 34e94e8e27e783dc1de7f3f39730d91d588d9e4b Mon Sep 17 00:00:00 2001 From: Koncopd Date: Tue, 17 Dec 2024 11:35:19 +0100 Subject: [PATCH 11/13] test --- lamindb_setup/core/_settings_storage.py | 3 ++- tests/hub-local/test_all.py | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/lamindb_setup/core/_settings_storage.py b/lamindb_setup/core/_settings_storage.py index 073ffdb0..b7e7eff3 100644 --- a/lamindb_setup/core/_settings_storage.py +++ b/lamindb_setup/core/_settings_storage.py @@ -118,8 +118,9 @@ def init_storage( elif (input_protocol := fsspec.utils.get_protocol(root_str)) not in ( valid_protocols := ("file", "gs", "s3", "hf", "http", "https") ): + valid_protocols = ("local",) + valid_protocols[1:] # local instead of file raise ValueError( - f"Protocol {input_protocol} is not supported, valid prootocols are {', '.join(('local',) + valid_protocols[1:])}" + f"Protocol {input_protocol} is not supported, valid protocols are {', '.join(valid_protocols)}" ) ssettings = StorageSettings( uid=uid, diff --git a/tests/hub-local/test_all.py b/tests/hub-local/test_all.py index 981c24d9..65ffdd76 100644 --- a/tests/hub-local/test_all.py +++ b/tests/hub-local/test_all.py @@ -360,3 +360,9 @@ def test_init_storage_with_non_existing_bucket(create_testadmin1_session): )[0] ) assert error.exconly().endswith("Not Found") + + +def test_init_storage_incorrect_protocol(): + with pytest.raises(ValueError) as error: + init_storage_base("incorrect-protocol://some-path/some-path-level") + assert "Protocol incorrect-protocol is not supported" in error.exconly() From 524ca006eeab9ad759f323175fcb2b04145b1bd8 Mon Sep 17 00:00:00 2001 From: Koncopd Date: Tue, 17 Dec 2024 11:58:14 +0100 Subject: [PATCH 12/13] list valid protocols --- lamindb_setup/core/_settings_storage.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lamindb_setup/core/_settings_storage.py b/lamindb_setup/core/_settings_storage.py index b7e7eff3..e8c180f0 100644 --- a/lamindb_setup/core/_settings_storage.py +++ b/lamindb_setup/core/_settings_storage.py @@ -25,6 +25,10 @@ IS_INITIALIZED_KEY = ".lamindb/_is_initialized" +# a list of supported fsspec protocols +# rename file to local before showing to a user +VALID_PROTOCOLS = ("file", "gs", "s3", "hf", "http", "https") + def base62(n_char: int) -> str: """Like nanoid without hyphen and underscore.""" @@ -115,10 +119,8 @@ def init_storage( root_str = f"s3://lamin-{region}/{uid}" else: root_str = f"s3://lamin-hosted-test/{uid}" - elif (input_protocol := fsspec.utils.get_protocol(root_str)) not in ( - valid_protocols := ("file", "gs", "s3", "hf", "http", "https") - ): - valid_protocols = ("local",) + valid_protocols[1:] # local instead of file + elif (input_protocol := fsspec.utils.get_protocol(root_str)) not in VALID_PROTOCOLS: + valid_protocols = ("local",) + VALID_PROTOCOLS[1:] # show local instead of file raise ValueError( f"Protocol {input_protocol} is not supported, valid protocols are {', '.join(valid_protocols)}" ) From 07298fdeb89582c4004b73410fd97bad1c5e1db5 Mon Sep 17 00:00:00 2001 From: Koncopd Date: Tue, 17 Dec 2024 12:01:19 +0100 Subject: [PATCH 13/13] comment --- lamindb_setup/core/_settings_storage.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lamindb_setup/core/_settings_storage.py b/lamindb_setup/core/_settings_storage.py index e8c180f0..a68ab917 100644 --- a/lamindb_setup/core/_settings_storage.py +++ b/lamindb_setup/core/_settings_storage.py @@ -307,6 +307,7 @@ def type(self) -> Literal["local", "s3", "gs", "hf", "http", "https"]: import fsspec convert = {"file": "local"} + # init_storage checks that the root protocol belongs to VALID_PROTOCOLS protocol = fsspec.utils.get_protocol(self.root_as_str) return convert.get(protocol, protocol) # type: ignore