diff --git a/README.rst b/README.rst
index 0d9f9a0..0386865 100644
--- a/README.rst
+++ b/README.rst
@@ -263,17 +263,23 @@ datalogistik_metadata.ini
     Schema of the table.
 
 ``url``
-    Download url in case this is a single-file table.
-
-``base_url``
-    Base download url in case this is a multi-file table. Each file will append
-    their `rel_path` to this to form the full download url.
+    Download url for the table. This can be either:
+    * A URL pointing to the data to be downloaded for the table (either a single
+      file or a directory that contains many files to be downloaded)
+    * A base URL that is concatenated with the ``rel_url_path`` of each entry in the
+      ``files`` attribute, for multi-file tables that list their files individually
 
 ``files``
     A list of files in this table. Each entry in the list has the following properties:
 
     ``rel_path``
-        Path to the file, relative to the directory of this table.
+        Path to the file(s), relative to the directory of this table. This is the
+        location of the file(s) on disk in the cache.
+
+    ``rel_url_path``
+        URL path to the file(s), relative to the remote directory where the table is
+        stored. This is used only when downloading the file, and is only necessary
+        when a multi-file table lists the files that make up the table individually.
 
     ``file_size``
         Size of the file.
diff --git a/datalogistik/dataset.py b/datalogistik/dataset.py
index 0c6f2a8..f412cae 100644
--- a/datalogistik/dataset.py
+++ b/datalogistik/dataset.py
@@ -357,7 +357,6 @@ def validate_table_files(self, table):
                 f"No metadata found for table {table}, could not perform validation (assuming valid)"
             )
             return True
-        new_file_listing = self.create_file_listing(table)
 
         # we can't perform a simple equality check on the whole listing,
         # because the orig_file_listing does not contain the metadata file.
@@ -372,6 +371,8 @@
             found = None
             for new_file in new_file_listing:
                 if new_file["rel_path"] == orig_file["rel_path"]:
+                    # drop the rel_url_path for comparison, because it's not relevant here
+                    orig_file.pop("rel_url_path", None)
                     found = new_file
                     break
 
@@ -448,7 +449,7 @@ def download(self):
             msg = (
                 "No table entries were found. "
                 "To download a dataset, at least 1 table entry must exist "
-                "that has a 'url' or 'base_url' property."
+                "that has a 'url' property."
             )
             log.error(msg)
             raise ValueError(msg)
@@ -460,51 +461,48 @@ def download(self):
         # For now, we always download all tables. So we need to loop through each table
         for table in self.tables:
-
-            # There are 2 possible types downloads:
-            # 1 - The table entry has a url property. Either this table is a single-file, or it is a single
-            #     download that will produce multiple files.
-            # 2 - multi file (either single or multi table). The table entry has a base_url property,
-            #     and each file has a rel_path property. This is appended to the base_url to form
-            #     the download link. The files will be placed in the table directory (generated from the table name).
             # create table dir
             table_path = self.ensure_table_loc(table)
-            # Type 1
-            if table.url:
-                # Note that table_path will be a file if this is a single-file table,
-                # and a dir if it is a multi-file table (download_file will produce multiple files)
-                util.download_file(table.url, output_path=table_path)
-                util.set_readonly(table_path)
-
-            # Type 2
-            elif table.base_url:
-                if len(table.files) <= 1:
-                    msg = f"Single-file table '{table.table}' has 'base_url' property set. It should only have a 'url'."
+            for file in table.files:
+                # Validate that there is a url at all
+                if not table.url:
+                    msg = (
+                        f"Could not find a url property for Table '{table.table}'."
+                    )
                     log.error(msg)
-                    raise ValueError(msg)
-                for file in table.files:
-                    # contains the suffix for the download url
-                    rel_path = file.get("rel_path")
-                    if not rel_path:
-                        msg = f"Missing rel_path property for multi-file table '{table.table}'."
+                    raise RuntimeError(msg)
+
+                # contains the suffix for the download url
+                rel_url_path = file.get("rel_url_path")
+
+                # if the rel_url_path is not already at the end of the url, join them
+                table_url = table.url
+                if rel_url_path and not table_url.endswith(rel_url_path):
+                    table_url = table_url + "/" + rel_url_path
+
+                download_path = table_path
+
+                # Set the rel_path
+                file["rel_path"] = download_path.name
+
+                # if this is a multi-file table, then we need to validate that
+                # there are rel_url_paths and append them. All files constituting a
+                # table must be in a dir named table.name (created by ensure_table_loc);
+                # note that the resulting dir structure is not necessarily flat,
+                # because the table can have multiple levels of partitioning.
+                if len(table.files) > 1 or table.multi_file:
+                    if not rel_url_path:
+                        msg = f"Missing rel_url_path property for multi-file table '{table.table}'."
                         log.error(msg)
                         raise ValueError(msg)
+                    download_path = download_path / rel_url_path
 
-                    # All files constituting a table must be in a dir with name table.name (created by ensure_table_loc)
-                    # note that the resulting dir structure is not necessarily flat,
-                    # because the table can have multiple levels of partitioning.
-                    download_path = table_path / rel_path
-                    url = table.base_url + "/" + rel_path
-
-                    util.download_file(url, output_path=download_path)
-                    util.set_readonly(download_path)
+                    # but for multi-file tables, we override this with the rel_url_path
+                    file["rel_path"] = rel_url_path
 
-            else:
-                msg = f"Could not find a url or base_url property for Table '{table.table}'."
-                log.error(msg)
-                raise RuntimeError(msg)
+                util.download_file(table_url, output_path=download_path)
+                util.set_readonly(download_path)
 
             # Try validation in case the dataset info contained checksums
             if not self.validate_table_files(table):
diff --git a/datalogistik/table.py b/datalogistik/table.py
index eb9a0ef..f5b4245 100644
--- a/datalogistik/table.py
+++ b/datalogistik/table.py
@@ -33,4 +33,3 @@ class Table:
     header_line: Optional[bool] = None
     dim: Optional[List] = field(default_factory=list)
     url: Optional[str] = None
-    base_url: Optional[str] = None
diff --git a/repo.json b/repo.json
index b2e30b7..4d7d49c 100644
--- a/repo.json
+++ b/repo.json
@@ -12,7 +12,6 @@
         "header_line": false,
         "files": [
           {
-            "rel_path": "2016Q4.csv.gz",
             "file_size": 262125134,
             "md5": "6e8ff7ae76033c452a9b72350461a4e6"
           }
         ]
@@ -77,7 +76,6 @@
         ],
         "files": [
           {
-            "rel_path": "nyctaxi_2010-01.csv.gz",
             "file_size": 591876633,
             "md5": "22bd86bf54da91faf26fa8a243644a9f"
           }
         ]
@@ -98,7 +96,6 @@
         ],
         "files": [
           {
-            "rel_path": "chi_traffic_2020_Q1.parquet",
             "file_size": 182895135,
             "md5": "b53ea738eda7ee051bcf33b3056b83f6"
           }
         ]
@@ -119,7 +116,6 @@
         ],
         "files": [
           {
-            "rel_path": "type_strings.parquet",
             "file_size": 87174822,
             "md5": "927c0cf481fb3c4870806b096b525d90"
           }
         ]
@@ -140,7 +136,6 @@
         ],
         "files": [
           {
-            "rel_path": "type_dict.parquet",
             "file_size": 2890770,
             "md5": "4186ad2f3072db6558aba3c535da1b97"
           }
         ]
@@ -161,7 +156,6 @@
         ],
         "files": [
           {
-            "rel_path": "type_integers.parquet",
             "file_size": 15882666,
             "md5": "7d44caa83025c9c7ef2ded67081d6d68"
           }
         ]
@@ -182,7 +176,6 @@
         ],
         "files": [
           {
-            "rel_path": "type_floats.parquet",
             "file_size": 23851672,
             "md5": "d083e46ddeff5cc5a3be3ea5c2f7f9d7"
           }
         ]
@@ -203,7 +196,6 @@
         ],
         "files": [
           {
-            "rel_path": "type_nested.parquet",
             "file_size": 130538033,
             "md5": "4bcf6735b00b32fccb6da338262def23"
           }
         ]
@@ -224,7 +216,6 @@
         ],
         "files": [
           {
-            "rel_path": "type_simple_features.parquet",
             "file_size": 28637722,
             "md5": "7776bea2a08e0466f77058676e2bb567"
           }
         ]
@@ -232,4 +223,4 @@
       }
     ]
   }
-]
+]
\ No newline at end of file
diff --git a/tests/fixtures/test_cache/taxi_2013/face7ed/datalogistik_metadata.ini b/tests/fixtures/test_cache/taxi_2013/face7ed/datalogistik_metadata.ini
index 3f47b6b..37e636d 100644
--- a/tests/fixtures/test_cache/taxi_2013/face7ed/datalogistik_metadata.ini
+++ b/tests/fixtures/test_cache/taxi_2013/face7ed/datalogistik_metadata.ini
@@ -7,7 +7,7 @@
     "tables": [
         {
             "table": "taxi_2013",
-            "base_url": "http://www.example.com",
+            "url": "http://www.example.com",
             "files": [
                 {
                     "file_size": 5653,
diff --git a/tests/test_datalogistik.py b/tests/test_datalogistik.py
index d0c3ea1..1597c34 100644
--- a/tests/test_datalogistik.py
+++ b/tests/test_datalogistik.py
@@ -297,9 +297,12 @@ def test_compress(comp_string):
 
 
 # Integration-style tests
+@pytest.mark.skipif(sys.platform == "win32", reason="windows path separator")
 def test_main(capsys):
     # This should be in the cache already, so no conversion needed
-    exact_dataset = dataset.Dataset(name="fanniemae_sample", format="csv", delim="|")
+    exact_dataset = dataset.Dataset(
+        name="fanniemae_sample", format="csv", delim="|", compression="gzip"
+    )
 
     with pytest.raises(SystemExit) as e:
         datalogistik.main(exact_dataset)
@@ -310,6 +313,11 @@ def test_main(capsys):
     assert captured["name"] == "fanniemae_sample"
     assert captured["format"] == "csv"
     assert isinstance(captured["tables"], dict)
+    # this is the path from the fixtures; if this doesn't match, we've actually converted and not just found the extant one
+    assert (
+        captured["tables"]["fanniemae_sample"]["path"]
+        == "tests/fixtures/test_cache/fanniemae_sample/a77e575/fanniemae_sample.csv.gz"
+    )
 
 
 @pytest.mark.skipif(sys.platform == "win32", reason="windows errors on the cleanup")
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index 74b8247..a85c80d 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import copy
 import json
 import os
 import pathlib
@@ -249,17 +250,71 @@ def test_validate():
     multi_file_ds.tables[0].files[0]["md5"] = orig_md5  # restore for following tests
 
 
-def test_download_dataset(monkeypatch):
+@pytest.mark.parametrize(
+    "table_url,rel_url_path,final_url",
+    [
+        (
+            "https://ursa-qa.s3.amazonaws.com/chitraffic/chi_traffic_2020_Q1.parquet",
+            None,
+            "https://ursa-qa.s3.amazonaws.com/chitraffic/chi_traffic_2020_Q1.parquet",
+        ),
+        # we can append rel_url_path
+        (
+            "https://ursa-qa.s3.amazonaws.com/chitraffic",
+            "chi_traffic_2020_Q1.parquet",
+            "https://ursa-qa.s3.amazonaws.com/chitraffic/chi_traffic_2020_Q1.parquet",
+        ),
+        # we don't duplicate rel_url_path when it's already there
+        (
+            "https://ursa-qa.s3.amazonaws.com/chitraffic/chi_traffic_2020_Q1.parquet",
+            "chi_traffic_2020_Q1.parquet",
+            "https://ursa-qa.s3.amazonaws.com/chitraffic/chi_traffic_2020_Q1.parquet",
+        ),
+        # we can have a table name that is different from rel_url_path
+        (
+            "https://ursa-qa.s3.amazonaws.com/chitraffic",
+            "traffic.parquet",
+            "https://ursa-qa.s3.amazonaws.com/chitraffic/traffic.parquet",
+        ),
+    ],
+)
+def test_download_single_dataset(monkeypatch, table_url, rel_url_path, final_url):
     def _fake_download(url, output_path):
-        assert (
-            url
-            == "https://ursa-qa.s3.amazonaws.com/chitraffic/chi_traffic_2020_Q1.parquet"
-        )
+        assert url == final_url
+
         assert output_path == pathlib.Path(
             "tests/fixtures/test_cache/chi_traffic_2020_Q1/raw/chi_traffic_2020_Q1.parquet"
         )
 
     monkeypatch.setattr("datalogistik.util.download_file", _fake_download)
+
+    # We probably should move write_metadata out from download, but let's mock it for now
+    def _fake_write_metadata(self):
+        return True
+
+    monkeypatch.setattr(Dataset, "write_metadata", _fake_write_metadata)
+
+    # This probably also means that set_readonly isn't in the right place, but let's mock it away nonetheless
+    def _fake_set_readonly(path):
+        return True
+
+    monkeypatch.setattr("datalogistik.util.set_readonly", _fake_set_readonly)
+
+    # Don't even set rel_url_path if it's None:
+    if rel_url_path is None:
+        files_dict = {}
+    else:
+        files_dict = {"rel_url_path": rel_url_path}
+
+    ds_variant_available = Dataset(
+        name="chi_traffic_2020_Q1",
+        format="parquet",
+        compression=None,
+        tables=[Table(table="chi_traffic_2020_Q1", url=table_url, files=[files_dict])],
+    )
+    ds_variant_available.download()
+
+    # but a variant with no table entries errors as well
     ds_variant_not_available = Dataset(
         name="chi_traffic_2020_Q1",
         format="csv",
@@ -268,7 +323,46 @@ def _fake_download(url, output_path):
     with pytest.raises(ValueError):
         ds_variant_not_available.download()
 
-    # download a dataset with wrong checksums, verify that validation fails
+
+def test_multi_file_download_dataset(monkeypatch):
+    def _fake_multi_download(url, output_path):
+        file_numbers = [1, 10, 11, 12, 2, 3, 4, 5, 6, 7, 8, 9]
+        file_number = file_numbers[_fake_multi_download.file_index]
+        assert url == f"http://www.example.com/taxi_2013_{file_number}.csv.gz"
+        assert output_path == pathlib.Path(
+            f"tests/fixtures/test_cache/taxi_2013/face7ed/taxi_2013/taxi_2013_{file_number}.csv.gz"
+        )
+        _fake_multi_download.file_index += 1
+
+    _fake_multi_download.file_index = 0  # init "static variable"
+    monkeypatch.setattr("datalogistik.util.download_file", _fake_multi_download)
+
+    # We probably should move write_metadata out from download, but let's mock it for now
+    def _fake_write_metadata(self):
+        return True
+
+    monkeypatch.setattr(Dataset, "write_metadata", _fake_write_metadata)
+
+    # This probably also means that set_readonly isn't in the right place, but let's mock it away nonetheless
+    def _fake_set_readonly(path):
+        return True
+
+    monkeypatch.setattr("datalogistik.util.set_readonly", _fake_set_readonly)
+
+    # add rel_url_path as if this dataset were coming from a repo file that has it
+    ds = copy.deepcopy(multi_file_ds)
+    files = ds.tables[0].files
+    # add rel_url_path
+    [fl.update({"rel_url_path": fl["rel_path"]}) for fl in files]
+    # remove rel_path
+    [fl.pop("rel_path") for fl in files]
+
+    ds.tables[0].files = files
+
+    ds.download()
+
+
+def test_failed_validation_download_dataset(monkeypatch):
     with tempfile.TemporaryDirectory() as tmpcachedir:
         tmpcachepath = pathlib.Path(tmpcachedir)
 
@@ -298,7 +392,8 @@ def _simulate_download_by_copying(url, output_path):
     # Note that if we were not working in a tmp dir, the dir should have been deleted
     # and the exception message would be: "File integrity check for newly downloaded dataset failed."
 
-    # Test a succeeding download
+
+def test_validated_download_dataset(monkeypatch):
     with tempfile.TemporaryDirectory() as tmpcachedir:
         tmpcachepath = pathlib.Path(tmpcachedir)
         dataset = Dataset.from_json(
@@ -324,20 +419,6 @@ def _simulate_download_by_copying(url, output_path):
                 tmp_ds.ensure_table_loc()
             ) == util.calculate_checksum(dataset.ensure_table_loc())
 
-        # multi-file download
-        def _fake_multi_download(url, output_path):
-            file_numbers = [1, 10, 11, 12, 2, 3, 4, 5, 6, 7, 8, 9]
-            file_number = file_numbers[_fake_multi_download.file_index]
-            assert url == f"http://www.example.com/taxi_2013_{file_number}.csv.gz"
-            assert output_path == pathlib.Path(
-                f"tests/fixtures/test_cache/taxi_2013/face7ed/taxi_2013/taxi_2013_{file_number}.csv.gz"
-            )
-            _fake_multi_download.file_index += 1
-
-        _fake_multi_download.file_index = 0  # init "static variable"
-        monkeypatch.setattr("datalogistik.util.download_file", _fake_multi_download)
-        multi_file_ds.download()
-
 
 def test_to_json():
     json_string = simple_parquet_ds.to_json()
@@ -498,21 +579,6 @@ def test_find_dataset():
     assert dataset_search.find_exact_dataset(ds_variant_not_found) is None
 
 
-# def test_find_close_dataset():
-#     ds_variant_not_found = Dataset(
-#         name="chi_traffic_sample", format="csv", compression="gzip"
-#     )
-#     close = find_close_dataset(ds_variant_not_found)
-#     # We prefer Parquet if we have it
-#     assert close.format == "parquet"
-#     assert simple_parquet_ds == close
-
-#     ds_variant_not_found = Dataset(name="nyctaxi_sample", format="parquet")
-#     close = find_close_dataset(ds_variant_not_found)
-#     # But can fall back
-#     assert close.format == "csv"
-
-
 def test_find_close_dataset_sf_mismatch(monkeypatch):
     # Mock the generation, cause all we care about here is that that would be called
     good_return = Dataset(name="tpc-h", scale_factor=10, format="tpc-raw")
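For reference, the URL resolution exercised by test_download_single_dataset and test_multi_file_download_dataset above boils down to the following sketch. The resolve_download_url helper is hypothetical (it does not exist in this patch, which inlines the same logic in Dataset.download); it only illustrates how a table's single url property now covers both the single-file case and the multi-file rel_url_path case.

# Hypothetical sketch (not part of the patch): how a table's url and an optional
# per-file rel_url_path combine into the final download url in Dataset.download.
def resolve_download_url(table_url, rel_url_path=None):
    # Append rel_url_path unless the url already ends with it, so a url that
    # already spells out the file name is used unchanged.
    if rel_url_path and not table_url.endswith(rel_url_path):
        return table_url + "/" + rel_url_path
    return table_url


# Single-file table: the url already points at the file.
assert (
    resolve_download_url(
        "https://ursa-qa.s3.amazonaws.com/chitraffic/chi_traffic_2020_Q1.parquet"
    )
    == "https://ursa-qa.s3.amazonaws.com/chitraffic/chi_traffic_2020_Q1.parquet"
)

# Multi-file table: each file's rel_url_path is appended to the base url.
assert (
    resolve_download_url("http://www.example.com", "taxi_2013_1.csv.gz")
    == "http://www.example.com/taxi_2013_1.csv.gz"
)

# The rel_url_path is not appended again when the url already ends with it.
assert (
    resolve_download_url(
        "https://ursa-qa.s3.amazonaws.com/chitraffic/chi_traffic_2020_Q1.parquet",
        "chi_traffic_2020_Q1.parquet",
    )
    == "https://ursa-qa.s3.amazonaws.com/chitraffic/chi_traffic_2020_Q1.parquet"
)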