From ee6dff38c05df948c8b6658e7e921c54e14dfeea Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 21 Aug 2023 06:48:00 -0400 Subject: [PATCH 1/9] Add misc_utils.to_camelcase. Add qa_utils.is_subdict. Add ff_utils.get_schema and ff_utils.get_schemas. --- CHANGELOG.rst | 17 +++++ dcicutils/ff_utils.py | 62 ++++++++++++---- dcicutils/misc_utils.py | 10 +++ dcicutils/qa_utils.py | 102 ++++++++++++++++++++++---- pyproject.toml | 2 +- test/test_ff_utils.py | 157 +++++++++++++++++++++++++++++++++++++--- test/test_misc_utils.py | 19 ++++- test/test_qa_utils.py | 65 ++++++++++++++++- 8 files changed, 394 insertions(+), 40 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 8cae87006..b74dac182 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,23 @@ Change Log ---------- +7.9.0 +===== + +* In ``misc_utils``: + + * New function ``to_camelcase`` that can take either snake_case or CamelCase input. + +* In ``qa_utils``: + + * New function ``is_subdict`` for asymmetric testing of dictionary equivalence. + +* In ``ff_utils``: + + * New function ``get_schema`` that will pull down an individual schema definition. + * New function ``get_schemas`` that will pull down all schema definitions. + + 7.8.0 ===== diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py index 37a0439db..101db132e 100644 --- a/dcicutils/ff_utils.py +++ b/dcicutils/ff_utils.py @@ -6,7 +6,6 @@ import time from collections import namedtuple -from dcicutils.lang_utils import disjoined_list from elasticsearch.exceptions import AuthorizationException from typing import Optional, Dict, List from urllib.parse import parse_qs, urlencode, urlparse, urlunparse @@ -16,7 +15,8 @@ AnyAuthDict, AuthDict, SimpleAuthPair, AuthData, AnyAuthData, PortalEnvName, # S3BucketName, S3KeyName, ) -from .misc_utils import PRINT +from .lang_utils import disjoined_list +from .misc_utils import PRINT, to_camel_case # TODO (C4-92, C4-102): Probably to centralize this information in env_utils. Also figure out relation to CGAP. @@ -281,7 +281,7 @@ def get_metadata(obj_id, key=None, ff_env=None, check_queue=False, add_on=''): Function to get metadata for a given obj_id (uuid or @id, most likely). Either takes a dictionary form authentication (MUST include 'server') or a string fourfront-environment. - Also a boolean 'check_queue', which if True + Also, a boolean 'check_queue', which, if True, will use information from the queues and/or datastore=database to ensure that the metadata is accurate. Takes an optional string add_on that should contain things like @@ -421,7 +421,7 @@ def search_result_generator(page_generator): now return A,C,D but we already had the first page, so we request data starting at position 3 for the second page and get E,G,I. That means our sequence of return values would be A,C,E,E,G,I,K,M, or, in other words, showing a duplication. To avoid this, we keep track of the IDs we've seen - and show only the first case of each element, so A,C,E,G,I,K,M. (We won't see the D but we weren't + and show only the first case of each element, so A,C,E,G,I,K,M. (We won't see the D, but we weren't going to see it anyway, and it wasn't available the time we started, so the timing was already close.) Unfortunately, we aren't so lucky for deletion, though that happens more rarely. 
That will cause
@@ -687,7 +687,7 @@ def get_associated_qc_metrics(uuid, key=None, ff_env=None, include_processed_fil
 
     resp = get_metadata(uuid, key=key, ff_env=ff_env)
 
-    # Checks wheter the input is a experiment or experimentset otherwise throws an error
+    # Checks whether the input is an Experiment or ExperimentSet, and otherwise throws an error.
     if 'ExperimentSet' not in resp['@type']:
         raise TypeError('Expected ExperimentSet')
 
@@ -862,7 +862,7 @@ def get_es_metadata(uuids, es_client=None, filters=None, sources=None, chunk_siz
             sources = ['embedded.files.uuid']
         i.e. getting all fields for lab in embedded frame
             sources = ['embedded.lab.*']
-        i.e. for getting a only object frame
+        i.e. for getting only an object frame
             sources = ['object.*']
     chunk_size: Integer
         chunk_size may be used to control the number of uuids that are
@@ -870,7 +870,7 @@ def get_es_metadata(uuids, es_client=None, filters=None, sources=None, chunk_siz
         ES reads to timeout.
     is_generator: Boolean
         is_generator will return a generator for individual results if True;
-        if False (default), returns a list of results.
+        if False (the default), returns a list of results.
     key: authentication key for ff_env (see get_authentication_with_server)
     ff_env: authentication by env (needs system variables)
     """
@@ -941,6 +941,40 @@ def _get_es_metadata(uuids, es_client, filters, sources, chunk_size, auth):
             yield hit['_source']  # yield individual items from ES
 
 
+def get_schemas(key=None, ff_env=None) -> Dict[str, Dict]:
+    """
+    Gets a dictionary of all schema definitions
+
+    Args:
+        key (dict): standard ff_utils authentication key
+        ff_env (str): standard ff environment string
+
+    Returns:
+        dict: a mapping from keys that are schema names to schema definitions
+    """
+    auth = get_authentication_with_server(key, ff_env)
+    schemas = get_metadata('profiles/', key=auth, add_on='frame=raw')
+    return schemas
+
+
+def get_schema(name, key=None, ff_env=None) -> Dict:
+    """
+    Gets the schema definition with the given name.
+
+    Args:
+        name (str): a schema name (CamelCase or snake_case), or None
+        key (dict): standard ff_utils authentication key
+        ff_env (str): standard ff environment string
+
+    Returns:
+        dict: the schema definition with the given name
+    """
+    auth = get_authentication_with_server(key, ff_env)
+    url = f"profiles/{to_camel_case(name)}.json"
+    schema = get_metadata(url, key=auth, add_on='frame=raw')
+    return schema
+
+
 def get_schema_names(key=None, ff_env=None):
     """
     Create a dictionary of all schema names to item class names
@@ -1034,7 +1068,7 @@ def remove_keys(my_dict, remove_list):
     chunk = 100  # chunk the requests - don't want to hurt es performance
 
     while uuid_list:
-        uuids_to_check = []  # uuids to add to uuid_list if not if not in item_uuids
+        uuids_to_check = []  # uuids to add to uuid_list if not in item_uuids
 
         # get the next page of data, recreating the es_client if need be
         try:
@@ -1121,7 +1155,7 @@ def _get_page(*, page, key=None, ff_env=None):
 
 def get_health_page(key=None, ff_env=None):
     """
-    Simple function to return the json for a FF health page
+    Simple function to return the json for an environment's health page
     """
     return _get_page(page='/health', key=key, ff_env=ff_env)
 
@@ -1143,7 +1177,7 @@ def get_indexing_status(key=None, ff_env=None):
 def get_counts_summary(env):
     """ Returns a named tuple given an FF name to check representing the counts state.
CountSummary - are_even: boolean on whether or not counts are even + are_even: boolean that is True if counts are even and False otherwise summary_total: raw value of counts """ totals = get_counts_page(ff_env=env) @@ -1182,7 +1216,7 @@ def execute_search(self, index, query, is_generator=False, page_size=200): :arg index: index to search under :arg query: query to run - :arg is_generator: boolean on whether or not to use a generator + :arg is_generator: boolean that is True if a generator is requested and otherwise False :arg page_size: if using a generator, how many results to give per request :returns: list of results of query or None @@ -1194,7 +1228,7 @@ def execute_search(self, index, query, is_generator=False, page_size=200): def search_es_metadata(index, query, key=None, ff_env=None, is_generator=False): """ - Executes a lucene search query on on the ES Instance for this + Executes a lucene search query on the ES Instance for this environment. NOTE: It is okay to use this function directly but for repeat usage please use @@ -1204,7 +1238,7 @@ def search_es_metadata(index, query, key=None, ff_env=None, is_generator=False): :arg query: dictionary of query :arg key: optional, 2-tuple authentication key (access_key_id, secret) :arg ff_env: ff_env to use - :arg is_generator: boolean on whether or not to use a generator + :arg is_generator: boolean that is True if a generator is requested and otherwise False :returns: list of results of query or None """ @@ -1484,7 +1518,7 @@ def dump_results_to_json(store, folder): def parse_s3_bucket_and_key_url(url: str) -> (str, str): - """ Parses the given s3 URL into its pair of bucket, key + """ Parses the given s3 URL into its pair of (bucket, key). Note that this function works the way it does because of how these urls end up in our database. Eventually we should clean this up. Format: diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index fd0747d43..88c228c7f 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -1331,6 +1331,16 @@ def snake_case_to_camel_case(s, separator='_'): return s.title().replace(separator, '') +def to_camel_case(s): + """ + Converts a string that might be in snake_case or CamelCase into CamelCase. + """ + if s[:1].isupper() and '_' not in s: + return s + else: + return snake_case_to_camel_case(s) + + def capitalize1(s): """ Capitalizes the first letter of a string and leaves the others alone. diff --git a/dcicutils/qa_utils.py b/dcicutils/qa_utils.py index 6a04fa510..472b198eb 100644 --- a/dcicutils/qa_utils.py +++ b/dcicutils/qa_utils.py @@ -10,6 +10,7 @@ import functools import hashlib import io +import json import logging import os import pytest @@ -720,7 +721,7 @@ def mock_action_handler(self, wrapped_action, *args, **kwargs): texts = remove_suffix('\n', text).split('\n') last_text = texts[-1] result = wrapped_action(text, **kwargs) # noQA - This call to print is low-level implementation - # This only captures non-file output output. + # This only captures non-file output. file = kwargs.get('file') if file is None: file = sys.stdout @@ -853,7 +854,7 @@ def __init__(self, *, region_name=None, boto3=None, **kwargs): self._aws_secret_access_key = kwargs.get("aws_secret_access_key") self._aws_region = region_name - # These is specific for testing. + # This is specific for testing. self._aws_credentials_dir = None # FYI: Some things to note about how boto3 (and probably any AWS client) reads AWS credentials/region. 
@@ -915,7 +916,7 @@ def put_credentials_for_testing(self, self._aws_secret_access_key = aws_secret_access_key self._aws_region = region_name - # These is specific for testing. + # This is specific for testing. self._aws_credentials_dir = aws_credentials_dir @staticmethod @@ -2270,8 +2271,7 @@ def create_object_for_testing(self, object_content: str, *, Bucket: str, Key: st def upload_fileobj(self, Fileobj, Bucket, Key, **kwargs): # noqa - Uppercase argument names are chosen by AWS self.check_for_kwargs_required_by_mock("upload_fileobj", Bucket=Bucket, Key=Key, **kwargs) data = Fileobj.read() - PRINT("Uploading %s (%s bytes) to bucket %s key %s" - % (Fileobj, len(data), Bucket, Key)) + PRINT(f"Uploading {Fileobj} ({len(data)} bytes) to bucket {Bucket} key {Key}") with self.s3_files.open(os.path.join(Bucket, Key), 'wb') as fp: fp.write(data) @@ -2284,8 +2284,7 @@ def download_fileobj(self, Bucket, Key, Fileobj, **kwargs): # noqa - Uppercase self.check_for_kwargs_required_by_mock("download_fileobj", Bucket=Bucket, Key=Key, **kwargs) with self.s3_files.open(os.path.join(Bucket, Key), 'rb') as fp: data = fp.read() - PRINT("Downloading bucket %s key %s (%s bytes) to %s" - % (Bucket, Key, len(data), Fileobj)) + PRINT(f"Downloading bucket {Bucket} key {Key} ({len(data)} bytes) to {Fileobj}") Fileobj.write(data) def download_file(self, Bucket, Key, Filename, **kwargs): # noqa - Uppercase argument names are chosen by AWS @@ -2382,7 +2381,7 @@ def head_bucket(self, Bucket): # noQA - AWS argument naming style raise ClientError(operation_name='HeadBucket', error_response={ # noQA - PyCharm wrongly complains about this dictionary "Error": {"Code": "404", "Message": "Not Found"}, - "ResponseMetadata": {"HTTPStatusCode": 404}, + "ResponseMetadata": self.compute_mock_response_metadata(http_status_code=404), }) def get_object_tagging(self, Bucket, Key): @@ -2645,7 +2644,7 @@ def list_objects(self, Bucket, Prefix=None): # noQA - AWS argument naming style } def list_objects_v2(self, Bucket): # noQA - AWS argument naming style - # This is different but similar to list_objects. However we don't really care about that. + # This is different but similar to list_objects. However, we don't really care about that. return self.list_objects(Bucket=Bucket) def copy_object(self, CopySource, Bucket, Key, CopySourceVersionId=None, @@ -2698,7 +2697,7 @@ def _copy_object(self, CopySource, Bucket, Key, CopySourceVersionId, StorageClas new_storage_class = target_storage_class if (copy_in_place and GlacierUtils.transition_involves_glacier_restoration(source_storage_class, target_storage_class)): - new_storage_class = None # For a restoration, the don't update the glacier data. It's restored elsewhere. + new_storage_class = None # For a restoration, don't update the glacier data. It's restored elsewhere. target_attribute_block.restore_temporarily(delay_seconds=self.RESTORATION_DELAY_SECONDS, duration_days=1, storage_class=target_storage_class) PRINT(f"Set up restoration {target_attribute_block.restoration}") @@ -2806,6 +2805,7 @@ def _delete_versioned_object(self, s3_filename, version_id) -> Dict[str, Any]: def restore_object(self, Bucket, Key, RestoreRequest, VersionId: Optional[str] = None, StorageClass: Optional[S3StorageClass] = None): + # TODO: VersionId is unused in the arglist. Is that OK? 
-kmp 19-Aug-2023 duration_days: int = RestoreRequest.get('Days') storage_class: S3StorageClass = StorageClass or self.storage_class s3_filename = f"{Bucket}/{Key}" @@ -3047,8 +3047,8 @@ def known_bug_expected(jira_ticket=None, fixed=False, error_class=None): with known_bug_expected(jira_ticket="TST-00001", error_class=RuntimeError, fixed=True): ... stuff that fails ... - If the previously-expected error (now thought to be fixed) happens, an error will result so it's easy to tell - if there's been a regression. + If the previously-expected error (now thought to be fixed) happens, an error will result + so that it's easy to tell if there's been a regression. Parameters: @@ -3088,7 +3088,7 @@ def client_failer(operation_name, code=400): def fail(message, code=code): raise ClientError( { # noQA - PyCharm wrongly complains about this dictionary - "Error": {"Message": message, "Code": code} + "Error": {"Message": message, "Code": code} # noQA - Boto3 declares a string here but allows int code }, operation_name=operation_name) return fail @@ -3473,3 +3473,79 @@ def mocked_input(*args, **kwargs): inputs.append(item) yield assert not inputs, "Did not use all inputs." + + +def is_subdict(json1, json2, desc1="json1", desc2="json2", verbose=True): + """ + Does asymmetric testing of dictionary equivalence, assuring json2 has all the content of json1, + even if not vice versa. In other words, the dictionary structure is equivalent to the extent + that (recursively) all dictionary keys on the left occur in the right hand side even if not + necessarily all dictionary keys on the right occur in the left. + + For example, + x = {"foo": 3} + y = {"foo": 3, "bar": 4} + is_subdict(x, y) is True + is_subdict(y, x) is False + + The desc1 and desc2 can be provided to help with verbose mode, identifying what is on the left + and what is on the right. + + :param json1: a JSON structure, the outer part of which is a dictionary + :param json2: a JSON structure, the outer part of which is a dictionary + :param desc1: a name or brief description for the json1 (default "json1") + :param desc2: a name or brief description for the json2 (default "json2") + :param verbose: a boolean (default True) that controls whether the comparison is verbose, showing + output explaining failures or near-failures when True, and otherwise, if False, not showing such output. 
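+
+    A couple more illustrative cases (a sketch of the intended semantics, not doctests):
+        is_subdict({"a": {"b": 1}}, {"a": {"b": 1, "c": 2}}) is True    # left keys are a recursive subset
+        is_subdict({"a": [1, 2]}, {"a": [1, 2, 3]}) is False            # lists must match in length, not by subset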
+ """ + + + def out(x): + if verbose: + PRINT(x) + + def sorted_set_repr(x): + return f"{{{repr(sorted(x))[1:-1]}}}" + + def recurse(json1, json2, path=""): + if isinstance(json1, dict) and isinstance(json2, dict): + k1 = set(json1.keys()) + k2 = set(json2.keys()) + result = k1 <= k2 + if result: + if k1 != k2: + out(f"Non-fatal keyword mismatch at {path!r}:") + out(f" {desc1} keys: {sorted_set_repr(k1)}") + out(f" {desc2} keys: {sorted_set_repr(k2)}") + result = all(recurse(value, json2[key], path=f"{path}.{key}") + for key, value in json1.items()) + if not result: + # out(f"Recursive failure at {path!r} in object comparison") + pass + else: + out(f"Failed at {path!r} in object comparison due to key set mismatch:") + out(f" {desc1} keys: {sorted_set_repr(k1)}") + out(f" {desc2} keys: {sorted_set_repr(k2)}") + elif isinstance(json1, list) and isinstance(json2, list): + len1 = len(json1) + len2 = len(json2) + result = len1 == len2 + if not result: + out(f"Failed at {path!r} in list comparison due to length mismatch: {len1} vs {len2}") + else: + result = all(recurse(json1[i], json2[i], path=f"{path}[{i}]") for i in range(len1)) + if not result: + # out(f"Recursive failure at {path!r} in list comparison") + pass + elif type(json1) == type(json2): + result = json1 == json2 + if not result: + out(f"Failed at {path!r} due to value mismatch: {json.dumps(json1)} != {json.dumps(json2)}") + else: + result = False + if not result: + out(f"Mismatch at {path}.") + out(f" {desc1}: {json1}") + out(f" {desc2}: {json2}") + return result + return recurse(json1, json2) diff --git a/pyproject.toml b/pyproject.toml index f31d26f91..70f90b624 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.8.0" +version = "7.9.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" diff --git a/test/test_ff_utils.py b/test/test_ff_utils.py index 3037e0ad3..145d3bc86 100644 --- a/test/test_ff_utils.py +++ b/test/test_ff_utils.py @@ -1,4 +1,6 @@ +import base64 import copy +import hashlib import json import os import pytest @@ -11,7 +13,7 @@ from dcicutils.ff_mocks import mocked_s3utils_with_sse, TestScenarios, RequestsTestRecorder from dcicutils.misc_utils import make_counter, remove_prefix, remove_suffix, check_true from dcicutils.qa_utils import ( - check_duplicated_items_by_key, ignored, raises_regexp, MockResponse, MockBoto3, MockBotoSQSClient, + check_duplicated_items_by_key, ignored, raises_regexp, MockResponse, MockBoto3, MockBotoSQSClient, is_subdict, ) from types import GeneratorType from unittest import mock @@ -177,7 +179,7 @@ def test_url_params_functions(): def test_unified_authentication_decoding(integrated_ff): """ - Test that we decode the various formats and locations of keys and secrets in an uniform way. + Test that we decode the various formats and locations of keys and secrets in a uniform way. """ any_id, any_secret, any_env = 'any-id', 'any-secret', 'any-env' @@ -369,7 +371,7 @@ def test_unified_authentication_integrated(integrated_ff): @using_fresh_ff_deployed_state_for_testing() def test_unified_authentication_prod_envs_integrated_only(): # This is ONLY an integration test. There is no unit test functionality tested here. - # All of the functionality used here is already tested elsewhere. + # All functionality used here is already tested elsewhere. 
# Fourfront prod ff_prod_auth = ff_utils.unified_authentication(ff_env="data") @@ -473,7 +475,7 @@ def compute_mock_queue_attribute(self, QueueUrl, Attribute): # noQA - Amazon AW @pytest.mark.flaky def test_stuff_in_queues_integrated(integrated_ff): """ - Gotta index a bunch of stuff to make this work + We need to index a bunch of stuff to make this work """ search_res = ff_utils.search_metadata('search/?limit=all&type=File', key=integrated_ff['ff_key']) # just take the first handful @@ -552,7 +554,7 @@ def test_get_metadata_unit(): ts = TestScenarios - # The first part of this function sets up some common tools and then we test various scenarios + # The first part of this function sets up some common tools, and then we test various scenarios. counter = make_counter() # used to generate some sample data in mock calls unsupplied = object() # used in defaulting to prove an argument wasn't called @@ -579,7 +581,7 @@ def mocked_authorized_request(url, auth=None, ff_env=None, verb='GET', (a) that the mock uses a URL on the 'bar' scenario environment ('http://fourfront-bar.example/') so that we know some other authorized request wasn't also attempted that wasn't expecting this mock. (b) that the retry_fxn was not passed, since this mock doesn't know what to do with that - (c) that it's a GET operation, since we're not prepared to store anything and we're testing a getter function + (c) that it's a GET operation, since we're not prepared to store anything, and we're testing a getter function (d) that proper authorization for the 'bar' scenario was given It returns mock data that identifies itself as what was asked for. """ @@ -621,7 +623,7 @@ def test_it(n, check_queue=None, expect_suffix="", **kwargs): res_w_key = ff_utils.get_metadata(test_item, key=ts.bar_env_auth_dict, check_queue=check_queue, **kwargs) # Check that the data flow back from our mock authorized_request call did what we expect assert res_w_key == {'@id': test_item_id, 'mock_data': n} - # Check that the call out to the mock authorized_request is the thing we think. + # Check that the call-out to the mock authorized_request is the thing we think. # In particular, we expect that # (a) this is a GET # (b) it has appropriate auth @@ -1005,8 +1007,8 @@ def check_search_metadata(integrated_ff, url, expect_shortfall=False): if url != '': # replace stub with actual url from integrated_ff url = integrated_ff['ff_key']['server'] + '/' - # Note that we do some some .reset() calls on a mock that are not needed when servicing the integration test, - # but they are harmless and it seemed pointless to make it conditional. -kmp 15-Jan-2021 + # Note that we do some .reset() calls on a mock that are not needed when servicing the integration test, + # but they are harmless, and it seemed pointless to make it conditional. -kmp 15-Jan-2021 InsertingMockedSearchItems.reset() search_res = ff_utils.search_metadata(url + 'search/?limit=all&type=File', key=integrated_ff['ff_key']) assert isinstance(search_res, list) @@ -1290,6 +1292,141 @@ def test_get_health_page(integrated_ff): assert bad_health_res and 'error' in bad_health_res +@pytest.mark.xfail(reason=("Probably long-standing problem, recently discovered." + " System functions OK in spite of it. 
Tracked as C4-1087.")) +@pytest.mark.integrated +def test_get_schema_consistent(integrated_ff): + + def get_it(): + return ff_utils.get_schema('User', + key=integrated_ff['ff_key'], + ff_env=integrated_ff['ff_env']) + + first_try = get_it() + + for i in range(5): + print(f"Try {i}...") + nth_try = get_it() + assert first_try == nth_try + time.sleep(2) + + +@pytest.mark.integrated +@pytest.mark.flaky +def test_get_schema_and_get_schemas(integrated_ff): + + def tidy(x: dict) -> dict: + assert isinstance(x, dict) + # We have observed some variability in the facets and return values of get_metadata, + # so this is intended to paper over that since it seems irrelevant to what we're + # wanting to find out here. We'll track that other bug (C4-1087) elsewhere. + tidied = x.copy() + tidied.pop('facets', None) + tidied.pop('columns', None) + return tidied + + user_schema = ff_utils.get_schema('User', + key=integrated_ff['ff_key'], + ff_env=integrated_ff['ff_env']) + user_schema = tidy(user_schema) + + assert user_schema.get('title') == 'User' + assert user_schema.get('type') == 'object' + assert isinstance(user_schema.get('properties'), dict) + + experiment_set_schema_snake = ff_utils.get_schema('experiment_set', + key=integrated_ff['ff_key'], + ff_env=integrated_ff['ff_env']) + experiment_set_schema_snake = tidy(experiment_set_schema_snake) + assert experiment_set_schema_snake.get('title') == 'Experiment Set' + + experiment_set_schema_camel = ff_utils.get_schema('ExperimentSet', + key=integrated_ff['ff_key'], + ff_env=integrated_ff['ff_env']) + experiment_set_schema_camel = tidy(experiment_set_schema_camel) + assert experiment_set_schema_camel.get('title') == 'Experiment Set' + + all_schemas = ff_utils.get_schemas(key=integrated_ff['ff_key'], + ff_env=integrated_ff['ff_env']) + # We have these already loaded, so check them in full... + user_schema_from_all_schemas = all_schemas.get('User') + user_schema_from_all_schemas = tidy(user_schema_from_all_schemas) + + experiment_set_schema_from_all_schemas = all_schemas.get('ExperimentSet') + experiment_set_schema_from_all_schemas = tidy(experiment_set_schema_from_all_schemas) + + assert is_subdict(desc1=" profiles/User.keys()", + desc2="profiles/['User'].keys()", + json1=user_schema, + json2=user_schema_from_all_schemas) + + assert is_subdict(desc1=" profiles/ExperimentSet.keys()", + desc2="profiles/['ExperimentSet'].keys()", + json1=experiment_set_schema_camel, + json2=experiment_set_schema_from_all_schemas) + + assert is_subdict(desc1=" profiles/experiment_set.keys()", + desc2="profiles/['ExperimentSet'].keys()", + json1=experiment_set_schema_snake, + json2=experiment_set_schema_from_all_schemas) + + # Do some random spot-checking... + assert 'Lab' in all_schemas + assert 'ExperimentType' in all_schemas + + +@pytest.mark.xfail(reason=("Probably long-standing problem, recently discovered." + " System functions OK in spite of it. Tracked as C4-1087.")) +@pytest.mark.integrated +def test_get_metadata_consistent(integrated_ff): + + def get_it(): + return ff_utils.get_metadata('profiles/User.json', + key=integrated_ff['ff_key'], + ff_env=integrated_ff['ff_env']) + + first_try = get_it() + + for i in range(5): + print(f"Try {i}...") + nth_try = get_it() + try: + assert first_try == nth_try + except AssertionError: + is_subdict(first_try, nth_try, verbose=True) # This is called for output side-effect + raise # Go ahead and continue raising. We just wanted more debugging info. 
+    time.sleep(2)
+
+
+@pytest.mark.xfail(reason=("Probably long-standing problem, recently discovered."
+                           " System functions OK in spite of it. Tracked as C4-1087."))
+@pytest.mark.integrated
+def test_get_metadata_consistent_noting_pattern(integrated_ff):
+
+    print()
+
+    def hash(json):
+        return base64.b64encode(hashlib.md5(str(json).encode('utf-8')).digest())
+
+    def get_it():
+        return ff_utils.get_metadata('profiles/User.json',
+                                     key=integrated_ff['ff_key'],
+                                     ff_env=integrated_ff['ff_env'])
+
+    first_try = get_it()
+    print(f"first_try={hash(first_try)}")
+
+    failed = False
+
+    for i in range(2, 10):
+        nth_try = get_it()
+        print(f"Try {i}: {hash(nth_try)}")
+        failed = failed or not is_subdict(nth_try, first_try)
+        time.sleep(2)
+
+    assert not failed
+
+
 @pytest.mark.integrated
 @pytest.mark.flaky
 def test_get_schema_names(integrated_ff):
@@ -1599,7 +1736,7 @@ def get_mocked_result(*, kind, dirname, uuid, ignored_kwargs=None):
 @pytest.mark.unit
 def test_get_qc_metrics_logic_unit():
     """
-    End to end test on 'get_associated_qc_metrics' to check the logic of the fuction to make sure
+    End-to-end test on 'get_associated_qc_metrics' to check the logic of the function to make sure
     it is getting the qc metrics.
     """
     with mock.patch("dcicutils.ff_utils.get_metadata") as mock_get_metadata:
diff --git a/test/test_misc_utils.py b/test/test_misc_utils.py
index 895a25757..a07c6d234 100644
--- a/test/test_misc_utils.py
+++ b/test/test_misc_utils.py
@@ -30,7 +30,7 @@
     classproperty, classproperty_cached, classproperty_cached_each_subclass, Singleton, NamedObject,
     obsolete, ObsoleteError, CycleError, TopologicalSorter, keys_and_values_to_dict, dict_to_keys_and_values,
     is_c4_arn, deduplicate_list, chunked, parse_in_radix, format_in_radix, managed_property, future_datetime,
-    MIN_DATETIME, MIN_DATETIME_UTC, INPUT, builtin_print, map_chunked,
+    MIN_DATETIME, MIN_DATETIME_UTC, INPUT, builtin_print, map_chunked, to_camel_case,
 )
 from dcicutils.qa_utils import (
     Occasionally, ControlledTime, override_environ as qa_override_environ, MockFileSystem, printed_output,
@@ -1978,6 +1978,23 @@ def test_snake_case_to_camel_case_hyphenated(token, expected):
     assert snake_case_to_camel_case(token, separator='-') == expected
 
 
+@pytest.mark.parametrize('token, expected', [
+    ('variant_sample', 'VariantSample'),
+    ('variant', 'Variant'),
+    ('_variant_', 'Variant'),
+    ('__variant', 'Variant'),
+    ('higlass_view_config', 'HiglassViewConfig'),
+    ('a_b_c_d', 'ABCD'),
+    ('', ''),
+    ('oneverylongthing1234567895_d', 'Oneverylongthing1234567895D'),
+    ('x_m_l_container', 'XMLContainer'),
+    ('X_M_L_Container', 'XMLContainer'),
+])
+def test_to_camel_case_hyphenated(token, expected):
+    assert to_camel_case(token) == expected
+    assert to_camel_case(expected) == expected  # make sure it's stable
+
+
 @pytest.mark.parametrize('token, expected', [
     ('', ''),
     ('x', 'X'),
diff --git a/test/test_qa_utils.py b/test/test_qa_utils.py
index d369dc16f..0c2a87bd7 100644
--- a/test/test_qa_utils.py
+++ b/test/test_qa_utils.py
@@ -20,7 +20,7 @@
 from dcicutils.lang_utils import there_are
 from dcicutils.misc_utils import Retry, PRINT, file_contents, REF_TZ, local_attrs, ignored
 from dcicutils.qa_utils import (
-    mock_not_called, override_environ, override_dict, show_elapsed_time, timed,
+    mock_not_called, override_environ, override_dict, show_elapsed_time, timed, is_subdict,
     ControlledTime, Occasionally, RetryManager, MockFileSystem, NotReallyRandom, MockUUIDModule,
     MockedCommandArgs, MockResponse, printed_output, MockBotoS3Client, MockKeysNotImplemented,
MockBoto3, known_bug_expected, raises_regexp, VersionChecker, check_duplicated_items_by_key, guess_local_timezone_for_testing, @@ -2256,3 +2256,66 @@ def test_mock_boto3_iam_role_collection(): collection = MockBoto3IamRoleCollection() assert collection["Roles"] == [] assert collection["Foo"] is None + + +def test_is_subdict(): + + print() # start on fresh line + + for same in [{}, {"foo": 3}, {"foo": [1, "x", {"bar": 17}]}]: + with printed_output() as printed: + assert is_subdict(same, same) + assert printed.lines == [] + + for verbose in [False, True]: + with printed_output() as printed: + assert is_subdict({"foo": 3}, {"foo": 3, "bar": 4}, verbose=verbose) + if verbose: + assert printed.lines == [ + "Non-fatal keyword mismatch at '':", + " json1 keys: {'foo'}", + " json2 keys: {'bar', 'foo'}", + ] + else: + assert printed.lines == [] + + for verbose in [True, False]: + with printed_output() as printed: + assert not is_subdict({"foo": 3, "bar": {"x": 3, "y": 4}}, + {"foo": 3, "bar": {"x": 3, "y": 5, "baz": 0}}, verbose=verbose) + if verbose: + assert printed.lines == [ + "Non-fatal keyword mismatch at '.bar':", + " json1 keys: {'x', 'y'}", + " json2 keys: {'baz', 'x', 'y'}", + "Failed at '.bar.y' due to value mismatch: 4 != 5", + # "Recursive failure at '.bar' in object comparison", + # "Recursive failure at '' in object comparison", + ] + else: + assert printed.lines == [] + + for verbose in [True, False]: + with printed_output() as printed: + assert not is_subdict({"foo": 3, "bar": [1, 2, 3]}, + {"foo": 3, "bar": [1, 2, 3, 4]}, verbose=verbose) + if verbose: + assert printed.lines == [ + "Failed at '.bar' in list comparison due to length mismatch: 3 vs 4", + # "Recursive failure at '' in object comparison" + ] + else: + assert printed.lines == [] + + for verbose in [True, False]: + with printed_output() as printed: + assert not is_subdict({"foo": 3, "baz": [1, 2, 3]}, + {"foo": 3, "bar": [1, 2, 3, 4]}, verbose=verbose) + if verbose: + assert printed.lines == [ + "Failed at '' in object comparison due to key set mismatch:", + " json1 keys: {'baz', 'foo'}", + " json2 keys: {'bar', 'foo'}", + ] + else: + assert printed.lines == [] From b86f96e27783ebe8633bd19cfaacfbe0e0117a37 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 21 Aug 2023 07:02:33 -0400 Subject: [PATCH 2/9] PEP8 --- dcicutils/qa_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dcicutils/qa_utils.py b/dcicutils/qa_utils.py index 472b198eb..1886bb8eb 100644 --- a/dcicutils/qa_utils.py +++ b/dcicutils/qa_utils.py @@ -3499,7 +3499,6 @@ def is_subdict(json1, json2, desc1="json1", desc2="json2", verbose=True): output explaining failures or near-failures when True, and otherwise, if False, not showing such output. """ - def out(x): if verbose: PRINT(x) From a1421cc364ffa76bcce535f9877bc93a2e5a4731 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 21 Aug 2023 09:32:32 -0400 Subject: [PATCH 3/9] Minor tweaks to ff_utils.dump_results_to_json for style reasons, and repairs to its overly complex and error-prone unit test. --- CHANGELOG.rst | 2 ++ dcicutils/ff_utils.py | 13 +++++-- dcicutils/qa_utils.py | 6 ++-- test/test_ff_utils.py | 79 +++++++++++++++++++++++++++++-------------- 4 files changed, 69 insertions(+), 31 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b74dac182..8b4de9ee1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -22,6 +22,8 @@ Change Log * New function ``get_schema`` that will pull down an individual schema definition. 
   * New function ``get_schemas`` that will pull down all schema definitions.
+  * Minor tweaks to ``dump_results_to_json`` for style reasons,
+    and repairs to its overly complex and error-prone unit test.
 
 
 7.8.0
diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py
index 101db132e..a94a1b354 100644
--- a/dcicutils/ff_utils.py
+++ b/dcicutils/ff_utils.py
@@ -1,4 +1,5 @@
 import boto3
+import io
 import json
 import os
 import random
@@ -1482,6 +1483,14 @@ def convert_param(parameter_dict, vals_as_string=False):
                 if v % 1 == 0:
                     v = int(v)
             except ValueError:
+                # TODO: Maybe just catch Exception, not ValueError?
+                #       This is why I hate advice to people that they try to second-guess error types for which they
+                #       don't have express guarantees about the kinds of errors that can happen.
+                #       Does the code author really want to have other errors go unhandled, or did they just think this
+                #       was the only possible error? If it's the only possible error, Exception is a fine way to catch it.
+                #       In fact, float('a') will raise ValueError, but float(['a']) will raise TypeError.
+                #       I think this should probably treat both of those the same, but I'm not going to make that change
+                #       for now. -kmp 21-Aug-2023
                 v = str(v)
             else:
                 v = str(v)
@@ -1512,8 +1521,8 @@ def dump_results_to_json(store, folder):
     if not os.path.exists(folder):
         os.makedirs(folder)
     for a_type in store:
-        filename = folder + '/' + a_type + '.json'
-        with open(filename, 'w') as outfile:
+        filename = os.path.join(folder, a_type + '.json')
+        with io.open(filename, 'w') as outfile:
             json.dump(store[a_type], outfile, indent=4)
 
diff --git a/dcicutils/qa_utils.py b/dcicutils/qa_utils.py
index 1886bb8eb..c1d600b6e 100644
--- a/dcicutils/qa_utils.py
+++ b/dcicutils/qa_utils.py
@@ -3478,9 +3478,9 @@ def mocked_input(*args, **kwargs):
 def is_subdict(json1, json2, desc1="json1", desc2="json2", verbose=True):
     """
     Does asymmetric testing of dictionary equivalence, assuring json2 has all the content of json1,
-    even if not vice versa. In other words, the dictionary structure is equivalent to the extent
-    that (recursively) all dictionary keys on the left occur in the right hand side even if not
-    necessarily all dictionary keys on the right occur in the left.
+    even if not vice versa. In other words, the dictionary structure is equivalent, to the extent
+    that (recursively) all dictionary keys on the left hand side also occur on the right hand side
+    even if not necessarily all dictionary keys on the right occur in the left.
For example, x = {"foo": 3} diff --git a/test/test_ff_utils.py b/test/test_ff_utils.py index 145d3bc86..af708acbb 100644 --- a/test/test_ff_utils.py +++ b/test/test_ff_utils.py @@ -5,13 +5,13 @@ import os import pytest import requests -import shutil +import tempfile import time from botocore.exceptions import ClientError from dcicutils import es_utils, ff_utils, s3_utils from dcicutils.ff_mocks import mocked_s3utils_with_sse, TestScenarios, RequestsTestRecorder -from dcicutils.misc_utils import make_counter, remove_prefix, remove_suffix, check_true +from dcicutils.misc_utils import make_counter, remove_prefix, remove_suffix, check_true, ignorable, file_contents from dcicutils.qa_utils import ( check_duplicated_items_by_key, ignored, raises_regexp, MockResponse, MockBoto3, MockBotoSQSClient, is_subdict, ) @@ -471,6 +471,11 @@ def compute_mock_queue_attribute(self, QueueUrl, Attribute): # noQA - Amazon AW assert ff_utils.stuff_in_queues('fourfront-foo', check_secondary=True) +def test_internal_compute_stuff_in_queues_requires_env(): + with pytest.raises(ValueError): + ff_utils.internal_compute_stuff_in_queues(None, True) + + @pytest.mark.integratedx @pytest.mark.flaky def test_stuff_in_queues_integrated(integrated_ff): @@ -1856,29 +1861,22 @@ def test_delete_field(integrated_ff): assert "Bad status code" in str(exec_info.value) -@pytest.mark.direct_es_query -@pytest.mark.integrated @pytest.mark.file_operation -@pytest.mark.flaky -def test_dump_results_to_json(integrated_ff): - - def clear_folder(folder): - ignored(folder) - try: - shutil.rmtree(test_folder) - except FileNotFoundError: - pass - - test_folder = 'test/test_data' - clear_folder(test_folder) - test_list = ['7f9eb396-5c1a-4c5e-aebf-28ea39d6a50f'] - key, ff_env = integrated_ff['ff_key'], integrated_ff['ff_env'] - store, uuids = ff_utils.expand_es_metadata(test_list, store_frame='object', key=key, ff_env=ff_env) - len_store = len(store) - ff_utils.dump_results_to_json(store, test_folder) - all_files = os.listdir(test_folder) - assert len(all_files) == len_store - clear_folder(test_folder) +@pytest.mark.unit +def test_dump_results_to_json(): + print() + with tempfile.TemporaryDirectory() as test_folder: + store = { # mocked first return value from expanded_es_metadata + 'experiment_hi_c': [{'uuid': '1234', '@id': '/a/b/', "other": "stuff"}], + 'experiment_set': [{'uuid': '12345', '@id': '/c/d/', "other": "stuff"}], + } + ff_utils.dump_results_to_json(store, test_folder) + all_files = os.listdir(test_folder) + assert len(all_files) == len(store) + for file in all_files: + print(f"file={file}") + base, ext = os.path.splitext(os.path.basename(file)) + assert store[base] == json.loads(file_contents(os.path.join(test_folder, file))) @pytest.mark.direct_es_query @@ -1934,8 +1932,37 @@ def test_convert_param(): expected2 = [{'workflow_argument_name': 'param1', 'value': '5'}] converted_params1 = ff_utils.convert_param(params) converted_params2 = ff_utils.convert_param(params, vals_as_string=True) - assert expected1 == converted_params1 - assert expected2 == converted_params2 + assert converted_params1 == expected1 + assert converted_params2 == expected2 + + params = {'param1': 5, 'param2': 6} + expected1 = [{'workflow_argument_name': 'param1', 'value': 5}, + {'workflow_argument_name': 'param2', 'value': 6}] + expected2 = [{'workflow_argument_name': 'param1', 'value': '5'}, + {'workflow_argument_name': 'param2', 'value': '6'}] + converted_params1 = ff_utils.convert_param(params) + converted_params2 = ff_utils.convert_param(params, 
vals_as_string=True)
+    assert converted_params1 == expected1
+    assert converted_params2 == expected2
+
+    params = {'param1': 'a'}
+    expected1 = [{'workflow_argument_name': 'param1', 'value': 'a'}]
+    expected2 = [{'workflow_argument_name': 'param1', 'value': 'a'}]
+    converted_params1 = ff_utils.convert_param(params)
+    converted_params2 = ff_utils.convert_param(params, vals_as_string=True)
+    assert converted_params1 == expected1
+    assert converted_params2 == expected2
+
+    # See my notes in convert_param about the error-handling it does. Not sure this is what's intended.
+    params = {'param1': ['a']}
+    expected1 = [{'workflow_argument_name': 'param1', 'value': "['a']"}]
+    expected2 = [{'workflow_argument_name': 'param1', 'value': "['a']"}]
+    with pytest.raises(Exception):
+        converted_params1 = ff_utils.convert_param(params)
+    converted_params2 = ff_utils.convert_param(params, vals_as_string=True)
+    ignorable(converted_params1, expected1)  # wouldn't be ignored if commented-out line below got uncommented
+    # assert converted_params1 == expected1
+    assert converted_params2 == expected2
 
 
 @pytest.mark.integrated

From 192e0adca067aeb702fbde90a702064879ae52a8 Mon Sep 17 00:00:00 2001
From: Kent Pitman
Date: Mon, 21 Aug 2023 13:32:26 -0400
Subject: [PATCH 4/9] Make a beta.

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 70f90b624..c20bc2ec3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dcicutils"
-version = "7.9.0"
+version = "7.8.0.1b0"
 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
 authors = ["4DN-DCIC Team "]
 license = "MIT"

From a3f8a258a7f80634efff9205dbdea0b12ce55e79 Mon Sep 17 00:00:00 2001
From: Kent Pitman
Date: Mon, 21 Aug 2023 15:25:32 -0400
Subject: [PATCH 5/9] Test coverage.

---
 dcicutils/ff_utils.py | 10 ++++++---
 test/test_ff_utils.py | 50 ++++++++++++++++++++++++++++++-------------
 2 files changed, 42 insertions(+), 18 deletions(-)

diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py
index a94a1b354..382e6a49c 100644
--- a/dcicutils/ff_utils.py
+++ b/dcicutils/ff_utils.py
@@ -1511,13 +1511,17 @@ def generate_rand_accession(project_prefix='4DN', item_prefix='FI'):
 
 def dump_results_to_json(store, folder):
-    """Takes resuls from expand_es_metadata, and dumps them into the given folder in json format.
+    """
+    Takes results from expand_es_metadata, and dumps them into the given folder in json format.
+
+    e.g., given a dictionary {'a': a_dict, 'b': b_dict, 'c': c_dict}
+    it will write files 'a.json', 'b.json', and 'c.json' to the given folder (creating it if need be),
+    containing a_dict, b_dict, and c_dict, respectively.
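+
+    A hypothetical call (sketch only, mirroring the unit test revised below):
+        dump_results_to_json({'user': [{'uuid': '1234'}]}, 'backups')
+        # writes backups/user.json containing that one-element list
+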
+ Args: store (dict): results from expand_es_metadata folder: folder for storing output """ - if folder[-1] == '/': - folder = folder[:-1] if not os.path.exists(folder): os.makedirs(folder) for a_type in store: diff --git a/test/test_ff_utils.py b/test/test_ff_utils.py index af708acbb..5e843e10b 100644 --- a/test/test_ff_utils.py +++ b/test/test_ff_utils.py @@ -1865,18 +1865,24 @@ def test_delete_field(integrated_ff): @pytest.mark.unit def test_dump_results_to_json(): print() - with tempfile.TemporaryDirectory() as test_folder: - store = { # mocked first return value from expanded_es_metadata - 'experiment_hi_c': [{'uuid': '1234', '@id': '/a/b/', "other": "stuff"}], - 'experiment_set': [{'uuid': '12345', '@id': '/c/d/', "other": "stuff"}], - } - ff_utils.dump_results_to_json(store, test_folder) - all_files = os.listdir(test_folder) - assert len(all_files) == len(store) - for file in all_files: - print(f"file={file}") - base, ext = os.path.splitext(os.path.basename(file)) - assert store[base] == json.loads(file_contents(os.path.join(test_folder, file))) + for subdir in [None, 'missing/dir']: + for slashed in [False, True]: + with tempfile.TemporaryDirectory() as test_folder: + if subdir: + test_folder = os.path.join(test_folder, subdir) + if slashed: + test_folder = test_folder.rstrip('/') + '/' + store = { # mocked first return value from expanded_es_metadata + 'experiment_hi_c': [{'uuid': '1234', '@id': '/a/b/', "other": "stuff"}], + 'experiment_set': [{'uuid': '12345', '@id': '/c/d/', "other": "stuff"}], + } + ff_utils.dump_results_to_json(store, test_folder) + all_files = os.listdir(test_folder) + assert len(all_files) == len(store) + for file in all_files: + print(f"test_folder={test_folder} file={file}") + base, ext = os.path.splitext(os.path.basename(file)) + assert store[base] == json.loads(file_contents(os.path.join(test_folder, file))) @pytest.mark.direct_es_query @@ -1991,6 +1997,12 @@ def test_are_counts_even_integrated(integrated_ff): # These are stripped-down versions of actual results that illustrate the kinds of output we might expect. +SAMPLE_COUNTS_PAGE_FAILURE = { + 'error': + ("DEV_ENV_DOMAIN_SUFFIX is not defined." 
+ " It is needed for get_env_real_url('fourfront-wolf')" + " because env fourfront-wolf has no entry in public_url_table.")} + SAMPLE_COUNTS_MISMATCH = { 'title': 'Item Counts', 'db_es_total': 'DB: 54 ES: 57 < ES has 3 more items >', @@ -2018,7 +2030,10 @@ def test_are_counts_even_integrated(integrated_ff): } -@pytest.mark.parametrize('expect_match, sample_counts', [(True, SAMPLE_COUNTS_MATCH), (False, SAMPLE_COUNTS_MISMATCH)]) +@pytest.mark.parametrize('expect_match, sample_counts', + [(True, SAMPLE_COUNTS_MATCH), + (False, SAMPLE_COUNTS_MISMATCH), + (False, SAMPLE_COUNTS_PAGE_FAILURE)]) def test_are_counts_even_unit(expect_match, sample_counts): unsupplied = object() @@ -2046,8 +2061,13 @@ def mocked_authorized_request(url, auth=None, ff_env=None, verb='GET', assert counts_are_even assert 'more items' not in ' '.join(totals) else: - assert not counts_are_even - assert 'more items' in ' '.join(totals) + if isinstance(totals, list): # in the normal case, totals is a list + assert not counts_are_even + assert 'more items' in ' '.join(totals) + else: # when failing to get the page, it's a dictionary + assert not counts_are_even + assert isinstance(totals, dict) + assert 'error' in totals @pytest.mark.parametrize('url, expected_bucket, expected_key', [ From 6dd5ef64e05ac2a2957b62e7029eaa66dd9086bc Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Tue, 22 Aug 2023 13:36:57 -0400 Subject: [PATCH 6/9] Unify get_schemas with get_schema_names in a more elegant way. --- CHANGELOG.rst | 2 ++ dcicutils/ff_utils.py | 71 ++++++++++++++++++++---------------- test/test_ff_utils.py | 83 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 126 insertions(+), 30 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 8b4de9ee1..f073f4a79 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -22,6 +22,8 @@ Change Log * New function ``get_schema`` that will pull down an individual schema definition. * New function ``get_schemas`` that will pull down all schema definitions. + * New argument ``allow_abstract`` to ``get_schema_names`` + for conceptual compatibility with ``get_schemas``. * Minor tweaks to ``dump_results_to_json`` for style reasons, and repairs to its overly complex and error-prone unit test. diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py index 382e6a49c..925f72a2e 100644 --- a/dcicutils/ff_utils.py +++ b/dcicutils/ff_utils.py @@ -17,7 +17,7 @@ # S3BucketName, S3KeyName, ) from .lang_utils import disjoined_list -from .misc_utils import PRINT, to_camel_case +from .misc_utils import PRINT, to_camel_case, remove_suffix # TODO (C4-92, C4-102): Probably to centralize this information in env_utils. Also figure out relation to CGAP. @@ -942,64 +942,75 @@ def _get_es_metadata(uuids, es_client, filters, sources, chunk_size, auth): yield hit['_source'] # yield individual items from ES -def get_schemas(key=None, ff_env=None) -> Dict[str, Dict]: +def get_schema(name, key=None, ff_env=None) -> Dict: """ - Gets a dictionary of all schema definitions + Gets the schema definition with the given name. 
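+    For example (illustrative; both spellings name the same profile):
+        get_schema('experiment_set', key=auth)  # fetches profiles/ExperimentSet.json
+        get_schema('ExperimentSet', key=auth)   # same result, since the name is camel-cased first
+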
     Args:
+        name (str): a schema name (CamelCase or snake_case), or None
         key (dict): standard ff_utils authentication key
         ff_env (str): standard ff environment string
 
     Returns:
-        dict: a mapping from keys that are schema names to schema definitions
+        dict: the schema definition with the given name
     """
     auth = get_authentication_with_server(key, ff_env)
-    schemas = get_metadata('profiles/', key=auth, add_on='frame=raw')
-    return schemas
+    url = f"profiles/{to_camel_case(name)}.json"
+    schema = get_metadata(url, key=auth, add_on='frame=raw')
+    return schema
 
 
-def get_schema(name, key=None, ff_env=None) -> Dict:
+def get_schemas(key=None, ff_env=None, *, allow_abstract=True, require_id=False) -> Dict[str, Dict]:
     """
-    Gets the schema definition with the given name.
+    Gets a dictionary of all schema definitions.
+    By default, this returns all schemas, but the allow_abstract= and require_id= keywords allow limited filtering.
 
     Args:
-        name (str): a schema name (CamelCase or snake_case), or None
-        key (dict): standard ff_utils authentication key
-        ff_env (str): standard ff environment string
+        key (dict): standard ff_utils authentication key
+        ff_env (str): standard ff environment string
+        allow_abstract (boolean): controls whether abstract schemas can be returned (default True, return them)
+        require_id (boolean): controls whether a $id field is required for schema to be included
+                              (default False, include even if no $id)
 
     Returns:
-        dict: the schema definition with the given name
+        dict: a mapping from keys that are schema names to schema definitions
     """
     auth = get_authentication_with_server(key, ff_env)
-    schemas = get_metadata('profiles/', key=auth, add_on='frame=raw')
-    return schemas
+    schemas: Dict[str, Dict] = get_metadata('profiles/', key=auth, add_on='frame=raw')
+    filtered_schemas = {}
+    for schema_name, schema in schemas.items():
+        if allow_abstract or not schema.get('isAbstract'):
+            id_field = schema.get('$id')  # some test schemas in local may not have the $id field
+            if id_field or not require_id:
+                filtered_schemas[schema_name] = schema
+    return filtered_schemas
 
 
-def get_schema_names(key=None, ff_env=None):
+def get_schema_names(key=None, ff_env=None, allow_abstract=False) -> Dict[str, str]:
     """
     Create a dictionary of all schema names to item class names
+
+    By default, names of abstract schemas are not included. The allow_abstract= keyword argument controls this.
+
+    Names that have no $id cannot be included because they lack the relevant information to
+    construct our return value. However, these presumably mostly come up for local debugging and shouldn't matter.
+    (See ff_utils.get_schemas if you want more refined control.)
+
     i.e.
FileFastq: file_fastq Args: - key (dict): standard ff_utils authentication key - ff_env (str): standard ff environment string + key (dict): standard ff_utils authentication key + ff_env (str): standard ff environment string + allow_abstract (boolean): controls whether names of abstract schemas can be returned (default False, omit) Returns: dict: contains key schema names and value item class names """ - auth = get_authentication_with_server(key, ff_env) - schema_name = {} - profiles = get_metadata('profiles/', key=auth, add_on='frame=raw') - for key, value in profiles.items(): - # skip abstract types - if value.get('isAbstract') is True: - continue - # some test schemas in local don't have the id field - schema_filename = value.get('$id') - if schema_filename: - schema_name[key] = schema_filename.split('/')[-1][:-5] - return schema_name + schemas = get_schemas(key=key, ff_env=ff_env, allow_abstract=allow_abstract, require_id=True) + return { + schema_name: remove_suffix(".json", schema.get('$id').split('/')[-1]) + for schema_name, schema in schemas.items() + } def expand_es_metadata(uuid_list, key=None, ff_env=None, store_frame='raw', add_pc_wfr=False, ignore_field=None, diff --git a/test/test_ff_utils.py b/test/test_ff_utils.py index 5e843e10b..7d4254d7a 100644 --- a/test/test_ff_utils.py +++ b/test/test_ff_utils.py @@ -1316,6 +1316,89 @@ def get_it(): time.sleep(2) +def test_get_schemas_options(): + + mocked_schemas = { + 'foo': { + "a": "schema", + "$id": "something/Foo.json", + }, + 'abstract_foo': { + "some": "schema data", + "$id": "something/AbstractFoo.json", + 'isAbstract': True, + }, + 'bar': { + "another": "schema", + "$id": "something/Bar.json", + }, + 'bartemp': { + "some": "hack schema", + }, + 'baz': { + "yet_another": "schema", + "$id": "something/Baz.json", + }, + } + + def mocked_schemas_subset(keys): + return {key: value for key, value in mocked_schemas.items() if key in keys} + + with mock.patch.object(ff_utils, "get_authentication_with_server") as mock_get_authentication_with_server: + + mock_get_authentication_with_server.return_value = 'some-auth' + + with mock.patch.object(ff_utils, "get_metadata") as mock_get_metadata: + + def mocked_get_metadata(url, key, add_on): + assert url == "profiles/" # this is the web API to ask for all profiles + assert key == 'some-auth' # we assume auth is tested elsewhere + assert add_on == "frame=raw" # we presently always send this + return mocked_schemas + + mock_get_metadata.side_effect = mocked_get_metadata + + expected = mocked_schemas + actual = ff_utils.get_schemas() + assert actual == mocked_schemas + + expected = mocked_schemas + actual = ff_utils.get_schemas(allow_abstract=True, require_id=False) + assert actual == expected + # There should be just one abstract in our sample data + assert any(not schema.get('isAbstract') for schema in actual.values()) + assert not all(not schema.get('isAbstract') for schema in actual.values()) + # There should be just one that has no id in our sample data + assert any(schema.get('$id') for schema in actual.values()) + assert not all(schema.get('$id') for schema in actual.values()) + + expected = mocked_schemas_subset({'abstract_foo', 'foo', 'bar', 'baz'}) + actual = ff_utils.get_schemas(allow_abstract=True, require_id=True) + assert actual == expected + # There should be just one abstract in our sample data + assert any(not schema.get('isAbstract') for schema in actual.values()) + assert not all(not schema.get('isAbstract') for schema in actual.values()) + # We have asked that they all have 
an id + assert all(schema.get('$id') for schema in actual.values()) + + expected = mocked_schemas_subset({'foo', 'bar', 'bartemp', 'baz'}) + actual = ff_utils.get_schemas(allow_abstract=False, require_id=False) + assert actual == expected + # We have asked that none are abstract + assert all(not schema.get('isAbstract') for schema in actual.values()) + # There should be just one that has no id in our sample data + assert any(schema.get('$id') for schema in actual.values()) + assert not all(schema.get('$id') for schema in actual.values()) + + expected = mocked_schemas_subset({'foo', 'bar', 'baz'}) + actual = ff_utils.get_schemas(allow_abstract=False, require_id=True) + assert actual == expected + # We have asked that none are abstract + assert all(not schema.get('isAbstract') for schema in actual.values()) + # We have asked that they all have an id + assert all(schema.get('$id') for schema in actual.values()) + + @pytest.mark.integrated @pytest.mark.flaky def test_get_schema_and_get_schemas(integrated_ff): From d7fb58ef672e0483d6d0610807c85a18359d2440 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Tue, 22 Aug 2023 14:33:52 -0400 Subject: [PATCH 7/9] De-beta as 7.9.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c20bc2ec3..70f90b624 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.8.0.1b0" +version = "7.9.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" From 3a84e3f92422a704aca5cfd92f98e0166a50e485 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Tue, 22 Aug 2023 15:49:06 -0400 Subject: [PATCH 8/9] Testing coverage. --- dcicutils/ff_utils.py | 2 +- dcicutils/qa_utils.py | 2 +- test/test_qa_utils.py | 13 +++++++++++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py index 925f72a2e..96f465290 100644 --- a/dcicutils/ff_utils.py +++ b/dcicutils/ff_utils.py @@ -969,7 +969,7 @@ def get_schemas(key=None, ff_env=None, *, allow_abstract=True, require_id=False) key (dict): standard ff_utils authentication key ff_env (str): standard ff environment string allow_abstract (boolean): controls whether abstract schemas can be returned (default True, return them) - require_id (boolean): controls whether a $id field is required for schema to be included + require_id (boolean): controls whether a '$id' field is required for schema to be included (default False, include even if no $id) Returns: diff --git a/dcicutils/qa_utils.py b/dcicutils/qa_utils.py index c1d600b6e..8b440c413 100644 --- a/dcicutils/qa_utils.py +++ b/dcicutils/qa_utils.py @@ -3543,7 +3543,7 @@ def recurse(json1, json2, path=""): else: result = False if not result: - out(f"Mismatch at {path}.") + out(f"Type mismatch ({json1.__class__.__name__} vs {json2.__class__.__name__}) at {path!r}:") out(f" {desc1}: {json1}") out(f" {desc2}: {json2}") return result diff --git a/test/test_qa_utils.py b/test/test_qa_utils.py index 0c2a87bd7..bed502166 100644 --- a/test/test_qa_utils.py +++ b/test/test_qa_utils.py @@ -2319,3 +2319,16 @@ def test_is_subdict(): ] else: assert printed.lines == [] + + for verbose in [True, False]: + with printed_output() as printed: + assert not is_subdict({"foo": [1, 2, 3]}, + {"foo": 3}, verbose=verbose) + if verbose: + assert printed.lines == [ + "Type mismatch (list vs int) at '.foo':", + " json1: [1, 2, 3]", + " json2: 3", + ] + 
else: + assert printed.lines == [] From 56f702aaa381fe96f456f2a8e5558c0f41d40027 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 23 Aug 2023 22:04:34 -0400 Subject: [PATCH 9/9] Mark chardet as an acceptable license for use. --- dcicutils/license_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py index 855fa5c80..db18fd7df 100644 --- a/dcicutils/license_utils.py +++ b/dcicutils/license_utils.py @@ -810,6 +810,12 @@ class C4InfrastructureLicenseChecker(LicenseChecker): 'pytest-timeout', # MIT Licensed ], + # Linking = With Restrictions, Private Use = Yes + # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + 'GNU Lesser General Public License v2 or later (LGPLv2+)': [ + 'chardet' # used at runtime during server operation (ingestion), but not modified or distributed + ], + # Linking = With Restrictions, Private Use = Yes # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses 'GNU Lesser General Public License v3 or later (LGPLv3+)': [
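
A quick usage sketch tying the series together (illustrative only, not part of any patch;
the 'auth' dict is a placeholder in the standard ff_utils key format, which must include
'server', and the expected results simply restate the docstrings and tests above):

    from dcicutils.ff_utils import get_schema, get_schemas, get_schema_names
    from dcicutils.misc_utils import to_camel_case
    from dcicutils.qa_utils import is_subdict

    auth = {'key': '<access-key-id>', 'secret': '<secret>', 'server': 'https://<portal-server>'}  # placeholder

    # to_camel_case accepts snake_case or CamelCase, so schema names can be given either way.
    assert to_camel_case('experiment_set') == to_camel_case('ExperimentSet') == 'ExperimentSet'

    one_schema = get_schema('experiment_set', key=auth)  # GET profiles/ExperimentSet.json (frame=raw)
    all_schemas = get_schemas(key=auth)                  # GET profiles/ (all definitions, frame=raw)
    names = get_schema_names(key=auth)                   # e.g. {'FileFastq': 'file_fastq', ...}; abstract types omitted

    # is_subdict is one-way: every dict key on the left must (recursively) appear on the right.
    is_subdict(one_schema, all_schemas['ExperimentSet'])  # expected True, modulo the C4-1087 variability noted above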