Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: Find Urls by Extensions and Patterns #288

Draft
wants to merge 9 commits into
base: master
Choose a base branch
from
23 changes: 23 additions & 0 deletions asf_search/ASFProduct.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import warnings
from shapely.geometry import shape, Point, Polygon, mapping
import json
import re

from urllib import parse

Expand Down Expand Up @@ -290,6 +291,28 @@ def _get_additional_urls(self) -> List[str]:
and url != self.properties['url']
]

def find_urls(self, extension: str = None, pattern: str = r'.*', directAccess: bool = False) -> List[str]:
    """
    Searches for all urls matching a given extension and/or pattern

    param extension: the file extension to search for. (Defaults to `None`, which skips the extension filter)
        - Example: '.tiff' (a missing leading dot is added, so 'tiff' is treated the same way)
    param pattern: A regex pattern to search each url for. (Defaults to `r'.*'`, which matches every url)
        - Example: `r'(QA_)+'` to find urls with 'QA_' at least once
    param directAccess: should search in s3 bucket urls (Defaults to `False`)

    returns: the candidate urls, in their original order, whose path extension equals `extension`
        (when one is given) and in which `pattern` is found
    raises: `re.error` if `pattern` is not a valid regular expression
    """
    search_list = self._get_s3_urls() if directAccess else self._get_additional_urls()

    def _get_extension(file_url: str) -> str:
        # Only the path component is inspected, so a query string or fragment
        # containing dots cannot be mistaken for the file extension.
        path = parse.urlparse(file_url).path
        return os.path.splitext(path)[-1]

    if extension is not None:
        # os.path.splitext always reports a non-empty extension with its leading
        # dot, so a dotless value such as 'tiff' could never match as written.
        # The empty string is left alone: it selects urls with no extension.
        if extension and not extension.startswith('.'):
            extension = '.' + extension
        search_list = [url for url in search_list if _get_extension(url) == extension]

    regexp = re.compile(pattern)

    return [url for url in search_list if regexp.search(url) is not None]

def _get_s3_urls(self) -> List[str]:
s3_urls = self._get_access_urls(
['GET DATA', 'EXTENDED METADATA', 'GET DATA VIA DIRECT ACCESS']
Expand Down
16 changes: 16 additions & 0 deletions asf_search/ASFSearchResults.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from collections import UserList
from multiprocessing import Pool
import json
from typing import List
from asf_search import ASFSession, ASFSearchOptions
from asf_search.download.file_download_type import FileDownloadType
from asf_search.exceptions import ASFSearchError
Expand Down Expand Up @@ -42,6 +43,21 @@ def jsonlite(self):
def jsonlite2(self):
    """Return the output of ``results_to_jsonlite2`` applied to these results."""
    converted = results_to_jsonlite2(self)
    return converted

def find_urls(self, extension: str = None, pattern: str = r'.*', directAccess: bool = False) -> List[str]:
    """Returns a flat list of all https or s3 urls from all results matching an extension and/or regex pattern

    Every argument is forwarded unchanged to each product's own `find_urls`.

    param extension: the file extension to search for. (Defaults to `None`)
        - Example: '.tiff'
    param pattern: A regex pattern to search each url for. (Defaults to `r'.*'`)
        - Example: `r'(QA_)+'` to find urls with 'QA_' at least once
    param directAccess: should search in s3 bucket urls (Defaults to `False`)

    returns: the urls reported by each product, concatenated in result order
        (an empty list when there are no results)
    """
    return [
        url
        for product in self
        for url in product.find_urls(extension=extension, pattern=pattern, directAccess=directAccess)
    ]

def __str__(self):
    """Return ``self.geojson()`` dumped as JSON with a 2-space indent and sorted keys."""
    geojson_data = self.geojson()
    return json.dumps(geojson_data, sort_keys=True, indent=2)

Expand Down
Loading