From 9cd35402e8630b50a71517be6a9b52f7dff2458b Mon Sep 17 00:00:00 2001 From: kim Date: Tue, 26 Mar 2024 16:44:52 -0700 Subject: [PATCH 1/4] adds find_urls() method, searching by extension and/or pattern across https or s3 urls --- asf_search/ASFProduct.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/asf_search/ASFProduct.py b/asf_search/ASFProduct.py index 29cde330..4d35c8c0 100644 --- a/asf_search/ASFProduct.py +++ b/asf_search/ASFProduct.py @@ -3,6 +3,7 @@ import warnings from shapely.geometry import shape, Point, Polygon, mapping import json +import re from urllib import parse @@ -204,7 +205,28 @@ def _get_s3_urls(self) -> List[str]: s3_urls = self._get_access_urls(['GET DATA', 'EXTENDED METADATA', 'GET DATA VIA DIRECT ACCESS']) return [url for url in s3_urls if url.startswith('s3://')] + def find_urls(self, extension: str = None, pattern: str = r'.*', directAccess: bool = False) -> List[str]: + """ + Searches for all urls matching a given extension and/or pattern + param extension: the file extension to search for. 
(Defaults to `None`) + - Example: '.tiff' + param pattern: A regex pattern to search each url for. (Defaults to `r'.*'`) + - Example: `r'(QA_)+'` to find urls with 'QA_' at least once + param directAccess: should search in s3 bucket urls (Defaults to `False`) + """ + search_list = self._get_s3_urls() if directAccess else self._get_additional_urls() + + def _get_extension(file_url: str): + path = parse.urlparse(file_url).path + return os.path.splitext(path)[-1] + + if extension is not None: + search_list = [url for url in search_list if _get_extension(url) == extension] + + regexp = re.compile(pattern=pattern) + return [url for url in search_list if regexp.search(url) is not None] + def centroid(self) -> Point: """ Finds the centroid of a product From 3d7b4e74fb71cbf63e6e635abeb7081c7a1b8f32 Mon Sep 17 00:00:00 2001 From: kim Date: Tue, 26 Mar 2024 16:54:37 -0700 Subject: [PATCH 2/4] adds asfsearchresults equivalent --- asf_search/ASFSearchResults.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/asf_search/ASFSearchResults.py b/asf_search/ASFSearchResults.py index 77ef7f94..96f1ae3b 100644 --- a/asf_search/ASFSearchResults.py +++ b/asf_search/ASFSearchResults.py @@ -1,6 +1,7 @@ from collections import UserList from multiprocessing import Pool import json +from typing import List from asf_search import ASFSession, ASFSearchOptions from asf_search.download.file_download_type import FileDownloadType from asf_search.exceptions import ASFSearchError @@ -41,6 +42,15 @@ def jsonlite(self): def jsonlite2(self): return results_to_jsonlite2(self) + def find_urls(self, extension: str = None, pattern: str = r'.*', directAccess: bool = False) -> List[str]: + """Returns a list of all https or s3 urls from all results matching an extension and/or regex pattern""" + urls = [] + + for product in self: + urls.extend(product.find_urls(extension=extension, pattern=pattern, directAccess=directAccess)) + + return urls + def __str__(self): return json.dumps(self.geojson(), 
indent=2, sort_keys=True) From 62d748c948699de6259ba929d3e0a0b972bd6a0e Mon Sep 17 00:00:00 2001 From: kim Date: Wed, 27 Mar 2024 08:12:14 -0700 Subject: [PATCH 3/4] adds more comments to ASFSearchResults.find_urls --- asf_search/ASFSearchResults.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/asf_search/ASFSearchResults.py b/asf_search/ASFSearchResults.py index 96f1ae3b..c07e575a 100644 --- a/asf_search/ASFSearchResults.py +++ b/asf_search/ASFSearchResults.py @@ -43,7 +43,13 @@ def jsonlite2(self): return results_to_jsonlite2(self) def find_urls(self, extension: str = None, pattern: str = r'.*', directAccess: bool = False) -> List[str]: - """Returns a list of all https or s3 urls from all results matching an extension and/or regex pattern""" + """Returns a flat list of all https or s3 urls from all results matching an extension and/or regex pattern + param extension: the file extension to search for. (Defaults to `None`) + - Example: '.tiff' + param pattern: A regex pattern to search each url for. (Defaults to `r'.*'`) + - Example: `r'(QA_)+'` to find urls with 'QA_' at least once + param directAccess: should search in s3 bucket urls (Defaults to `False`) + """ urls = [] for product in self: From 8acf348b61c43db0fc8d6bf324c565353cff7a8d Mon Sep 17 00:00:00 2001 From: kim Date: Wed, 27 Mar 2024 08:21:47 -0700 Subject: [PATCH 4/4] codefactor fix --- asf_search/ASFProduct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asf_search/ASFProduct.py b/asf_search/ASFProduct.py index 4d35c8c0..09005fb1 100644 --- a/asf_search/ASFProduct.py +++ b/asf_search/ASFProduct.py @@ -214,7 +214,7 @@ def find_urls(self, extension: str = None, pattern: str = r'.*', directAccess: b - Example: `r'(QA_)+'` to find urls with 'QA_' at least once param directAccess: should search in s3 bucket urls (Defaults to `False`) """ - search_list = self._get_s3_urls() if directAccess else self._get_additional_urls() + search_list = self._get_s3_urls() if 
directAccess else self._get_additional_urls() def _get_extension(file_url: str): path = parse.urlparse(file_url).path