Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changes and updates for metadata search #29

Merged
merged 13 commits into from
Feb 27, 2024
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# System
.DS_Store

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
3 changes: 3 additions & 0 deletions config/config.ini
jgrethe marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@ default_profile=ci

[prod]
pennsieve_profile_name=prod
scicrunch_api_key=YOUR_KEY_HERE

[dev]
pennsieve_profile_name=test
scicrunch_api_key=YOUR_KEY_HERE

[ci]
pennsieve_profile_name=ci
scicrunch_api_key=YOUR_KEY_HERE
81 changes: 81 additions & 0 deletions docs/examples_metadata.py
jgrethe marked this conversation as resolved.
Show resolved Hide resolved
jgrethe marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Set of example call to the metadata services within the SPARC client
# Make sure to get an API key and add to your config/config.ini file
# Add key to the scicrunch_api_key attribute
# Instructions for getting an API key can be found at: https://fdilab.gitbook.io/api-handbook/sparc-k-core-api-overview/getting-started-with-sparc-apis

import json

from sparc.client import SparcClient

client = SparcClient(connect=False, config_file='../config/config.ini')

# Connect
response = client.metadata.connect()

if response == "https://scicrunch.org/api/1/elastic":
test_pass = True
else:
test_pass = False
jgrethe marked this conversation as resolved.
Show resolved Hide resolved

print( str(test_pass) )

# Get Info
response = client.metadata.info()

if response == "https://scicrunch.org/api/1/elastic":
test_pass = True
else:
test_pass = False
jgrethe marked this conversation as resolved.
Show resolved Hide resolved

print( str(test_pass) )

# ES list datasets
response = {}
response = client.metadata.list_datasets()

check_response = response['hits']['total']
if check_response > 200:
test_pass = True
else:
test_pass = False
jgrethe marked this conversation as resolved.
Show resolved Hide resolved

print( str(test_pass) )

# ES search via default
response = {}
response = client.metadata.search_datasets()

check_response = response['hits']['total']
if check_response > 200:
test_pass = True
else:
test_pass = False
jgrethe marked this conversation as resolved.
Show resolved Hide resolved

print( str(test_pass) )

# ES search via JSON string
response = {}
response = client.metadata.search_datasets("{\"query\": {\"terms\": {\"_id\": [ \"136\", \"95\" ] } } }")

check_response = response['hits']['total']
if check_response == 2:
test_pass = True
else:
test_pass = False
jgrethe marked this conversation as resolved.
Show resolved Hide resolved

print( str(test_pass) )

# ES search via JSON object
response = {}
body = "{\"query\": {\"terms\": {\"_id\": [ \"136\", \"95\" ] } } }"
body_json = json.loads(body)

response = client.metadata.search_datasets(body_json)

check_response = response['hits']['total']
if check_response == 2:
test_pass = True
else:
test_pass = False
jgrethe marked this conversation as resolved.
Show resolved Hide resolved

print( str(test_pass) )
8 changes: 8 additions & 0 deletions docs/sparc.client.services.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ sparc.client.services.pennsieve module
:undoc-members:
:show-inheritance:

sparc.client.services.metadata module
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. automodule:: sparc.client.services.metadata
:members:
:undoc-members:
:show-inheritance:

Module contents
~~~~~~~~~~~~~~~

Expand Down
278 changes: 278 additions & 0 deletions src/sparc/client/services/metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,278 @@
import json
import logging
from configparser import SectionProxy
from typing import List, Optional, Union

import requests
from requests.adapters import HTTPAdapter, Retry

from ._default import ServiceBase


class MetadataService(ServiceBase):
"""A wrapper for the Elasticsearch Metadata library

Parameters:
-----------
config : dict
A configuration containing necessary API key (scicrunch_api_key).
connect : bool
Not needed with REST metadata services.

Attributes:
-----------
default_headers : dict
A dictionary with headers to make HTTP requests.
host_api : str
A default HTTP address of the SciCrunch Elasticsearch API endpoint.

Methods:
--------
get_profile() -> str
Returns the currently used API Key.
set_profile() -> str
Changes the API Key.
close() : None
Not needed with REST metadata services.
getURL(...) : dict
Supporting function to retrieve data from REST endpoint via GET
This support Elasticsearch URL based queries
postURL(...) : dict
Supporting function to retrieve data from REST endpoint
This supports Elasticsearch JSON queries
list_datasets(...) : dict
Returns a dictionary with datasets metadata.
search_datasets(...) : dict
Returns a dictionary with datasets matching search criteria.

"""

default_headers = {
"Content-Type": "application/json",
"Accept": "application/json; charset=utf-8",
}

host_api = "https://scicrunch.org/api/1/elastic"

scicrunch_api_key: str = None
profile_name: str = None

def __init__(
self, config: Optional[Union[dict, SectionProxy]] = None, connect: bool = False
) -> None:
logging.info("Initializing SPARC K-Core Elasticsearch services...")
logging.debug(str(config))

if config is not None:
self.scicrunch_api_key = config.get("scicrunch_api_key")
logging.info("SciCrunch API Key: Found")
self.profile_name = config.get("pennsieve_profile_name")
logging.info("Profile: " + self.profile_name)
else:
logging.warning("SciCrunch API Key: Not Found")
logging.info("Profile: none")
if connect:
self.connect()

def connect(self) -> str:
"""Not needed as metadata services are REST service calls"""
logging.info("Metadata REST services available...")

self.host_api = "https://scicrunch.org/api/1/elastic"
return self.host_api

def info(self) -> str:
"""Returns information about the metadata search services."""

self.host_api = "https://scicrunch.org/api/1/elastic"
jgrethe marked this conversation as resolved.
Show resolved Hide resolved
return self.host_api

def get_profile(self) -> str:
"""Returns currently used API key.

Returns:
--------
A string with API Key.
"""
return self.scicrunch_api_key

def set_profile(self, api_key: str) -> str:
"""Changes the API key to the specified name.

Parameters:
-----------
api_key : str
The API key to use.

Returns:
--------
A string with confirmation of API key switch.
"""
self.scicrunch_api_key = api_key
return self.scicrunch_api_key

def close(self) -> None:
"""Not needed as metadata services are REST service calls"""
return self.host_api

#####################################################################
# Supporting Functions

#####################################################################
# Function to GET content from URL with retries
def getURL(self, url, headers="NONE"):
result = "[ERROR]"
url_session = requests.Session()

retries = Retry(
total=6, backoff_factor=1, status_forcelist=[403, 404, 413, 429, 500, 502, 503, 504]
)

url_session.mount("https://", HTTPAdapter(max_retries=retries))

success = 1
jgrethe marked this conversation as resolved.
Show resolved Hide resolved

try:
if headers == "NONE":
url_result = url_session.get(url)
else:
url_result = url_session.get(url, headers=headers)

if url_result.status_code == 410:
logging.warning("Retrieval Status 410 - URL Unpublished:" + url)
else:
url_result.raise_for_status()

except requests.exceptions.HTTPError as errh:
logging.error("Retrieving URL - HTTP Error:", errh)
success = 0
except requests.exceptions.ConnectionError as errc:
logging.error("Retrieving URL - Error Connecting:", errc)
success = 0
except requests.exceptions.Timeout as errt:
logging.error("Retrieving URL - Timeout Error:", errt)
success = 0
except requests.exceptions.RequestException as err:
logging.error("Retrieving URL - Something Else", err)
success = 0

url_session.close()

if success == 1:
result = url_result
else:
result = {}
jgrethe marked this conversation as resolved.
Show resolved Hide resolved

return result.json()

#####################################################################
# Function to retrieve content via POST from URL with retries
def postURL(self, url, body, headers="NONE"):
result = "[ERROR]"
url_session = requests.Session()
jgrethe marked this conversation as resolved.
Show resolved Hide resolved

retries = Retry(
total=6, backoff_factor=1, status_forcelist=[403, 404, 413, 429, 500, 502, 503, 504]
)

url_session.mount("https://", HTTPAdapter(max_retries=retries))

try:
if type(body) is dict:
body_json = body
else:
body_json = json.loads(body)
except:
logging.error("Elasticsearch query body can not be read")

success = 1

try:
if headers == "NONE":
url_result = url_session.post(url, json=body_json)
else:
url_result = url_session.post(url, json=body_json, headers=headers)

if url_result.status_code == 410:
logging.warning("Retrieval Status 410 - URL Unpublished:" + url)
else:
url_result.raise_for_status()

except requests.exceptions.HTTPError as errh:
logging.error("Retrieving URL - HTTP Error:", errh)
success = 0
except requests.exceptions.ConnectionError as errc:
logging.error("Retrieving URL - Error Connecting:", errc)
success = 0
except requests.exceptions.Timeout as errt:
logging.error("Retrieving URL - Timeout Error:", errt)
success = 0
except requests.exceptions.RequestException as err:
logging.error("Retrieving URL - Something Else", err)
success = 0

url_session.close()

if success == 1:
result = url_result
else:
result = {}

return result.json()

#####################################################################
# Metadata Search Functions

def list_datasets(self, limit: int = 10, offset: int = 0) -> list:
"""Lists datasets and associated metadata.

Parameters:
-----------
limit : int
Max number of datasets returned.
offset : int
Offset used for pagination of results.

Returns:
--------
A json with the results.

"""
self.host_api = "https://scicrunch.org/api/1/elastic/SPARC_Algolia_pr/_search"
jgrethe marked this conversation as resolved.
Show resolved Hide resolved

list_url = (
self.host_api
+ "?"
+ "from="
+ str(offset)
+ "&size="
+ str(limit)
+ "&key="
+ self.scicrunch_api_key
)

list_results = self.getURL(list_url, headers=self.default_headers)
return list_results

def search_datasets(self, query: str = '{"query": { "match_all": {}}}') -> list:
"""Gets datasets matching specified query.

This function provides

Parameters:
-----------
query : str
Elasticsearch JSON query.

Returns:
--------
A json with the results.

"""

self.host_api = "https://scicrunch.org/api/1/elastic/SPARC_Algolia_pr/_search"

list_url = self.host_api + "?" + "key=" + self.scicrunch_api_key

list_results = self.postURL(list_url, body=query, headers=self.default_headers)
return list_results