diff --git a/idc_index/index.py b/idc_index/index.py index 3c055c7..fe84ddb 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -779,6 +779,11 @@ def _validate_update_manifest_and_get_download_size( # Rename the column manifest_df.columns = ["manifest_cp_cmd"] + # remove all rows that do not contain an S3 URL + manifest_df = manifest_df[ + manifest_df["manifest_cp_cmd"].str.contains(r"s3://", na=False) + ] + # create a copy of the index index_df_copy = self.index[ [ @@ -916,7 +921,9 @@ def _validate_update_manifest_and_get_download_size( REGEXP_EXTRACT(manifest_cp_cmd, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) AS manifest_crdc_series_uuid, REGEXP_REPLACE(regexp_replace(manifest_cp_cmd, 'cp ', ''), '\\s[^\\s]*$', '') AS s3_url, FROM - manifest_df ) + manifest_df + WHERE + REGEXP_REPLACE(regexp_replace(manifest_cp_cmd, 'cp ', ''), '\\s[^\\s]*$', '') IS NOT NULL) SELECT seriesInstanceuid, index_crdc_series_uuid, diff --git a/tests/study_manifest_aws.s5cmd b/tests/study_manifest_aws.s5cmd index 86631b3..765586d 100644 --- a/tests/study_manifest_aws.s5cmd +++ b/tests/study_manifest_aws.s5cmd @@ -1,6 +1,7 @@ # To download the files in this manifest, first install s5cmd (https://github.com/peak/s5cmd), # then run the following command: # s5cmd --no-sign-request --endpoint-url https://s3.amazonaws.com run study_manifest_aws.s5cmd +study_manifest_cp_command cp s3://idc-open-data/28621ba9-1aca-4aab-a2a1-f6d2c3e2ab19/* . cp s3://idc-open-data/f0b76401-c6d1-4b61-a5fd-3fa596e6cc41/* . cp s3://idc-open-data/4ea3bbe6-98da-4b92-abe6-2ee18927e3c9/* .