
Add misc_utils.to_camel_case. Add qa_utils.is_subdict. Add ff_utils.get_schema and ff_utils.get_schemas.
netsettler committed Aug 21, 2023
1 parent 3972a56 commit ee6dff3
Showing 8 changed files with 394 additions and 40 deletions.
17 changes: 17 additions & 0 deletions CHANGELOG.rst
@@ -7,6 +7,23 @@ Change Log
----------


7.9.0
=====

* In ``misc_utils``:

* New function ``to_camel_case`` that can take either snake_case or CamelCase input.

* In ``qa_utils``:

* New function ``is_subdict`` for asymmetric testing of dictionary equivalence.

* In ``ff_utils``:

* New function ``get_schema`` that will pull down an individual schema definition.
* New function ``get_schemas`` that will pull down all schema definitions.


7.8.0
=====

62 changes: 48 additions & 14 deletions dcicutils/ff_utils.py
@@ -6,7 +6,6 @@
import time

from collections import namedtuple
from dcicutils.lang_utils import disjoined_list
from elasticsearch.exceptions import AuthorizationException
from typing import Optional, Dict, List
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse
@@ -16,7 +15,8 @@
AnyAuthDict, AuthDict, SimpleAuthPair, AuthData, AnyAuthData, PortalEnvName,
# S3BucketName, S3KeyName,
)
from .misc_utils import PRINT
from .lang_utils import disjoined_list
from .misc_utils import PRINT, to_camel_case


# TODO (C4-92, C4-102): Probably to centralize this information in env_utils. Also figure out relation to CGAP.
@@ -281,7 +281,7 @@ def get_metadata(obj_id, key=None, ff_env=None, check_queue=False, add_on=''):
Function to get metadata for a given obj_id (uuid or @id, most likely).
Either takes a dictionary form authentication (MUST include 'server')
or a string fourfront-environment.
Also a boolean 'check_queue', which if True
Also, a boolean 'check_queue', which, if True,
will use information from the queues and/or datastore=database to
ensure that the metadata is accurate.
Takes an optional string add_on that should contain things like
@@ -421,7 +421,7 @@ def search_result_generator(page_generator):
now return A,C,D but we already had the first page, so we request data starting at position 3
for the second page and get E,G,I. That means our sequence of return values would be A,C,E,E,G,I,K,M,
or, in other words, showing a duplication. To avoid this, we keep track of the IDs we've seen
and show only the first case of each element, so A,C,E,G,I,K,M. (We won't see the D but we weren't
and show only the first case of each element, so A,C,E,G,I,K,M. (We won't see the D, but we weren't
going to see it anyway, and it wasn't available the time we started, so the timing was already close.)
Unfortunately, we aren't so lucky for deletion, though that happens more rarely. That will cause
@@ -687,7 +687,7 @@ def get_associated_qc_metrics(uuid, key=None, ff_env=None, include_processed_fil

resp = get_metadata(uuid, key=key, ff_env=ff_env)

# Checks wheter the input is a experiment or experimentset otherwise throws an error
# Checks whether the input is an Experiment or ExperimentSet, and otherwise throws an error.
if 'ExperimentSet' not in resp['@type']:
raise TypeError('Expected ExperimentSet')

@@ -862,15 +862,15 @@ def get_es_metadata(uuids, es_client=None, filters=None, sources=None, chunk_siz
sources = ['embedded.files.uuid']
i.e. getting all fields for lab in embedded frame
sources = ['embedded.lab.*']
i.e. for getting a only object frame
i.e. for getting only an object frame
sources = ['object.*']
chunk_size:
Integer chunk_size may be used to control the number of uuids that are
passed to Elasticsearch in each query; setting this too high may cause
ES reads to timeout.
is_generator:
Boolean is_generator will return a generator for individual results if True;
if False (default), returns a list of results.
if False (the default), returns a list of results.
key: authentication key for ff_env (see get_authentication_with_server)
ff_env: authentication by env (needs system variables)
"""
@@ -941,6 +941,40 @@ def _get_es_metadata(uuids, es_client, filters, sources, chunk_size, auth):
yield hit['_source'] # yield individual items from ES


def get_schemas(key=None, ff_env=None) -> Dict[str, Dict]:
    """
    Gets a dictionary of all schema definitions.
    Args:
        key (dict): standard ff_utils authentication key
        ff_env (str): standard ff environment string
    Returns:
        dict: a mapping from schema names to schema definitions
    """
    auth = get_authentication_with_server(key, ff_env)
    schemas = get_metadata('profiles/', key=auth, add_on='frame=raw')
    return schemas


def get_schema(name, key=None, ff_env=None) -> Dict:
    """
    Gets the schema definition with the given name.
    Args:
        name (str): a schema name (CamelCase or snake_case)
        key (dict): standard ff_utils authentication key
        ff_env (str): standard ff environment string
    Returns:
        dict: the schema definition with the given name
    """
    auth = get_authentication_with_server(key, ff_env)
    url = f"profiles/{to_camel_case(name)}.json"
    schema = get_metadata(url, key=auth, add_on='frame=raw')
    return schema
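
A usage sketch for the two new helpers (the key contents and server below are hypothetical placeholders):

    # Hypothetical usage sketch; key contents and server are placeholders.
    key = {'key': 'XXXXXXXX', 'secret': 'YYYYYYYY', 'server': 'https://portal.example.org'}
    all_schemas = get_schemas(key=key)                   # e.g. {'Biosample': {...}, 'ExperimentSet': {...}, ...}
    biosample_schema = get_schema('biosample', key=key)  # snake_case input is camelized to 'Biosample'
    assert biosample_schema == all_schemas['Biosample']
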


def get_schema_names(key=None, ff_env=None):
    """
    Create a dictionary of all schema names to item class names
@@ -1034,7 +1068,7 @@ def remove_keys(my_dict, remove_list):
chunk = 100 # chunk the requests - don't want to hurt es performance

while uuid_list:
uuids_to_check = [] # uuids to add to uuid_list if not if not in item_uuids
uuids_to_check = [] # uuids to add to uuid_list if not in item_uuids

# get the next page of data, recreating the es_client if need be
try:
@@ -1121,7 +1155,7 @@ def _get_page(*, page, key=None, ff_env=None):

def get_health_page(key=None, ff_env=None):
"""
Simple function to return the json for a FF health page
Simple function to return the json for an environment's health page
"""
return _get_page(page='/health', key=key, ff_env=ff_env)

@@ -1143,7 +1177,7 @@ def get_indexing_status(key=None, ff_env=None):
def get_counts_summary(env):
""" Returns a named tuple given an FF name to check representing the counts state.
CountSummary
are_even: boolean on whether or not counts are even
are_even: boolean that is True if counts are even and False otherwise
summary_total: raw value of counts
"""
totals = get_counts_page(ff_env=env)
@@ -1182,7 +1216,7 @@ def execute_search(self, index, query, is_generator=False, page_size=200):
:arg index: index to search under
:arg query: query to run
:arg is_generator: boolean on whether or not to use a generator
:arg is_generator: boolean that is True if a generator is requested and otherwise False
:arg page_size: if using a generator, how many results to give per request
:returns: list of results of query or None
@@ -1194,7 +1228,7 @@ def execute_search(self, index, query, is_generator=False, page_size=200):

def search_es_metadata(index, query, key=None, ff_env=None, is_generator=False):
"""
Executes a lucene search query on on the ES Instance for this
Executes a lucene search query on the ES Instance for this
environment.
NOTE: It is okay to use this function directly but for repeat usage please use
@@ -1204,7 +1238,7 @@ def search_es_metadata(index, query, key=None, ff_env=None, is_generator=False):
:arg query: dictionary of query
:arg key: optional, 2-tuple authentication key (access_key_id, secret)
:arg ff_env: ff_env to use
:arg is_generator: boolean on whether or not to use a generator
:arg is_generator: boolean that is True if a generator is requested and otherwise False
:returns: list of results of query or None
"""
@@ -1484,7 +1518,7 @@ def dump_results_to_json(store, folder):


def parse_s3_bucket_and_key_url(url: str) -> (str, str):
""" Parses the given s3 URL into its pair of bucket, key
""" Parses the given s3 URL into its pair of (bucket, key).
Note that this function works the way it does because of how these
urls end up in our database. Eventually we should clean this up.
Format:
10 changes: 10 additions & 0 deletions dcicutils/misc_utils.py
@@ -1331,6 +1331,16 @@ def snake_case_to_camel_case(s, separator='_'):
    return s.title().replace(separator, '')


def to_camel_case(s):
    """
    Converts a string that might be in snake_case or CamelCase into CamelCase.
    """
    if s[:1].isupper() and '_' not in s:
        return s
    else:
        return snake_case_to_camel_case(s)
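
Expected behavior, as a quick sketch:

    # Illustrative examples (not part of the commit's test suite):
    assert to_camel_case('experiment_set') == 'ExperimentSet'  # snake_case is converted
    assert to_camel_case('ExperimentSet') == 'ExperimentSet'   # CamelCase passes through unchanged
    assert to_camel_case('biosample') == 'Biosample'           # a lowercase word is capitalized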


def capitalize1(s):
    """
    Capitalizes the first letter of a string and leaves the others alone.
102 changes: 89 additions & 13 deletions dcicutils/qa_utils.py
@@ -10,6 +10,7 @@
import functools
import hashlib
import io
import json
import logging
import os
import pytest
@@ -720,7 +721,7 @@ def mock_action_handler(self, wrapped_action, *args, **kwargs):
texts = remove_suffix('\n', text).split('\n')
last_text = texts[-1]
result = wrapped_action(text, **kwargs) # noQA - This call to print is low-level implementation
# This only captures non-file output output.
# This only captures non-file output.
file = kwargs.get('file')
if file is None:
file = sys.stdout
@@ -853,7 +854,7 @@ def __init__(self, *, region_name=None, boto3=None, **kwargs):
self._aws_secret_access_key = kwargs.get("aws_secret_access_key")
self._aws_region = region_name

# These is specific for testing.
# This is specific for testing.
self._aws_credentials_dir = None

# FYI: Some things to note about how boto3 (and probably any AWS client) reads AWS credentials/region.
@@ -915,7 +916,7 @@ def put_credentials_for_testing(self,
self._aws_secret_access_key = aws_secret_access_key
self._aws_region = region_name

# These is specific for testing.
# This is specific for testing.
self._aws_credentials_dir = aws_credentials_dir

@staticmethod
@@ -2270,8 +2271,7 @@ def create_object_for_testing(self, object_content: str, *, Bucket: str, Key: st
def upload_fileobj(self, Fileobj, Bucket, Key, **kwargs): # noqa - Uppercase argument names are chosen by AWS
self.check_for_kwargs_required_by_mock("upload_fileobj", Bucket=Bucket, Key=Key, **kwargs)
data = Fileobj.read()
PRINT("Uploading %s (%s bytes) to bucket %s key %s"
% (Fileobj, len(data), Bucket, Key))
PRINT(f"Uploading {Fileobj} ({len(data)} bytes) to bucket {Bucket} key {Key}")
with self.s3_files.open(os.path.join(Bucket, Key), 'wb') as fp:
fp.write(data)

@@ -2284,8 +2284,7 @@ def download_fileobj(self, Bucket, Key, Fileobj, **kwargs): # noqa - Uppercase
self.check_for_kwargs_required_by_mock("download_fileobj", Bucket=Bucket, Key=Key, **kwargs)
with self.s3_files.open(os.path.join(Bucket, Key), 'rb') as fp:
data = fp.read()
PRINT("Downloading bucket %s key %s (%s bytes) to %s"
% (Bucket, Key, len(data), Fileobj))
PRINT(f"Downloading bucket {Bucket} key {Key} ({len(data)} bytes) to {Fileobj}")
Fileobj.write(data)

def download_file(self, Bucket, Key, Filename, **kwargs): # noqa - Uppercase argument names are chosen by AWS
@@ -2382,7 +2381,7 @@ def head_bucket(self, Bucket): # noQA - AWS argument naming style
raise ClientError(operation_name='HeadBucket',
error_response={ # noQA - PyCharm wrongly complains about this dictionary
"Error": {"Code": "404", "Message": "Not Found"},
"ResponseMetadata": {"HTTPStatusCode": 404},
"ResponseMetadata": self.compute_mock_response_metadata(http_status_code=404),
})

def get_object_tagging(self, Bucket, Key):
@@ -2645,7 +2644,7 @@ def list_objects(self, Bucket, Prefix=None): # noQA - AWS argument naming style
}

def list_objects_v2(self, Bucket): # noQA - AWS argument naming style
# This is different but similar to list_objects. However we don't really care about that.
# This is different but similar to list_objects. However, we don't really care about that.
return self.list_objects(Bucket=Bucket)

def copy_object(self, CopySource, Bucket, Key, CopySourceVersionId=None,
@@ -2698,7 +2697,7 @@ def _copy_object(self, CopySource, Bucket, Key, CopySourceVersionId, StorageClas
new_storage_class = target_storage_class
if (copy_in_place
and GlacierUtils.transition_involves_glacier_restoration(source_storage_class, target_storage_class)):
new_storage_class = None # For a restoration, the don't update the glacier data. It's restored elsewhere.
new_storage_class = None # For a restoration, don't update the glacier data. It's restored elsewhere.
target_attribute_block.restore_temporarily(delay_seconds=self.RESTORATION_DELAY_SECONDS,
duration_days=1, storage_class=target_storage_class)
PRINT(f"Set up restoration {target_attribute_block.restoration}")
@@ -2806,6 +2805,7 @@ def _delete_versioned_object(self, s3_filename, version_id) -> Dict[str, Any]:

def restore_object(self, Bucket, Key, RestoreRequest, VersionId: Optional[str] = None,
StorageClass: Optional[S3StorageClass] = None):
# TODO: VersionId is unused in the arglist. Is that OK? -kmp 19-Aug-2023
duration_days: int = RestoreRequest.get('Days')
storage_class: S3StorageClass = StorageClass or self.storage_class
s3_filename = f"{Bucket}/{Key}"
@@ -3047,8 +3047,8 @@ def known_bug_expected(jira_ticket=None, fixed=False, error_class=None):
with known_bug_expected(jira_ticket="TST-00001", error_class=RuntimeError, fixed=True):
... stuff that fails ...
If the previously-expected error (now thought to be fixed) happens, an error will result so it's easy to tell
if there's been a regression.
If the previously-expected error (now thought to be fixed) happens, an error will result
so that it's easy to tell if there's been a regression.
Parameters:
@@ -3088,7 +3088,7 @@ def client_failer(operation_name, code=400):
def fail(message, code=code):
raise ClientError(
{ # noQA - PyCharm wrongly complains about this dictionary
"Error": {"Message": message, "Code": code}
"Error": {"Message": message, "Code": code} # noQA - Boto3 declares a string here but allows int code
},
operation_name=operation_name)
return fail
Expand Down Expand Up @@ -3473,3 +3473,79 @@ def mocked_input(*args, **kwargs):
inputs.append(item)
yield
assert not inputs, "Did not use all inputs."


def is_subdict(json1, json2, desc1="json1", desc2="json2", verbose=True):
    """
    Does asymmetric testing of dictionary equivalence, assuring json2 has all the content of json1,
    even if not vice versa. In other words, the dictionary structure is equivalent to the extent
    that (recursively) all dictionary keys on the left occur in the right-hand side, even if not
    necessarily all dictionary keys on the right occur in the left.
    For example,
        x = {"foo": 3}
        y = {"foo": 3, "bar": 4}
        is_subdict(x, y) is True
        is_subdict(y, x) is False
    The desc1 and desc2 can be provided to help with verbose mode, identifying what is on the left
    and what is on the right.
    :param json1: a JSON structure, the outer part of which is a dictionary
    :param json2: a JSON structure, the outer part of which is a dictionary
    :param desc1: a name or brief description for the json1 (default "json1")
    :param desc2: a name or brief description for the json2 (default "json2")
    :param verbose: a boolean (default True) that controls whether the comparison is verbose,
        showing output that explains failures or near-failures when True and suppressing it when False
    """

    def out(x):
        if verbose:
            PRINT(x)

    def sorted_set_repr(x):
        return f"{{{repr(sorted(x))[1:-1]}}}"

    def recurse(json1, json2, path=""):
        if isinstance(json1, dict) and isinstance(json2, dict):
            k1 = set(json1.keys())
            k2 = set(json2.keys())
            result = k1 <= k2
            if result:
                if k1 != k2:
                    out(f"Non-fatal keyword mismatch at {path!r}:")
                    out(f" {desc1} keys: {sorted_set_repr(k1)}")
                    out(f" {desc2} keys: {sorted_set_repr(k2)}")
                result = all(recurse(value, json2[key], path=f"{path}.{key}")
                             for key, value in json1.items())
                if not result:
                    # out(f"Recursive failure at {path!r} in object comparison")
                    pass
            else:
                out(f"Failed at {path!r} in object comparison due to key set mismatch:")
                out(f" {desc1} keys: {sorted_set_repr(k1)}")
                out(f" {desc2} keys: {sorted_set_repr(k2)}")
        elif isinstance(json1, list) and isinstance(json2, list):
            len1 = len(json1)
            len2 = len(json2)
            result = len1 == len2
            if not result:
                out(f"Failed at {path!r} in list comparison due to length mismatch: {len1} vs {len2}")
            else:
                result = all(recurse(json1[i], json2[i], path=f"{path}[{i}]") for i in range(len1))
                if not result:
                    # out(f"Recursive failure at {path!r} in list comparison")
                    pass
        elif type(json1) == type(json2):
            result = json1 == json2
            if not result:
                out(f"Failed at {path!r} due to value mismatch: {json.dumps(json1)} != {json.dumps(json2)}")
        else:
            result = False
        if not result:
            out(f"Mismatch at {path}.")
            out(f" {desc1}: {json1}")
            out(f" {desc2}: {json2}")
        return result

    return recurse(json1, json2)
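
A quick sketch of the asymmetry, using hypothetical data:

    # Hypothetical data illustrating the asymmetry of is_subdict.
    expected = {"name": "alpha", "meta": {"version": 1}}
    actual = {"name": "alpha", "meta": {"version": 1, "date": "2023-08-21"}, "extra": True}
    assert is_subdict(expected, actual)      # everything in expected occurs in actual
    assert not is_subdict(actual, expected)  # but not vice versa: 'extra' and 'date' are absent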
