
Add misc_utils.to_camel_case. Add qa_utils.is_subdict. Add ff_utils.get_schema and ff_utils.get_schemas.
netsettler committed Aug 21, 2023
1 parent 3972a56 commit ee6dff3
Showing 8 changed files with 394 additions and 40 deletions.
17 changes: 17 additions & 0 deletions CHANGELOG.rst
@@ -7,6 +7,23 @@ Change Log
----------


7.9.0
=====

* In ``misc_utils``:

* New function ``to_camel_case`` that can take either snake_case or CamelCase input.

* In ``qa_utils``:

* New function ``is_subdict`` for asymmetric testing of dictionary equivalence.

* In ``ff_utils``:

* New function ``get_schema`` that will pull down an individual schema definition.
* New function ``get_schemas`` that will pull down all schema definitions.


7.8.0
=====

62 changes: 48 additions & 14 deletions dcicutils/ff_utils.py
@@ -6,7 +6,6 @@
import time

from collections import namedtuple
from dcicutils.lang_utils import disjoined_list
from elasticsearch.exceptions import AuthorizationException
from typing import Optional, Dict, List
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse
@@ -16,7 +15,8 @@
AnyAuthDict, AuthDict, SimpleAuthPair, AuthData, AnyAuthData, PortalEnvName,
# S3BucketName, S3KeyName,
)
from .misc_utils import PRINT
from .lang_utils import disjoined_list
from .misc_utils import PRINT, to_camel_case


# TODO (C4-92, C4-102): Probably to centralize this information in env_utils. Also figure out relation to CGAP.
@@ -281,7 +281,7 @@ def get_metadata(obj_id, key=None, ff_env=None, check_queue=False, add_on=''):
Function to get metadata for a given obj_id (uuid or @id, most likely).
Either takes a dictionary form authentication (MUST include 'server')
or a string fourfront-environment.
Also a boolean 'check_queue', which if True
Also, a boolean 'check_queue', which, if True,
will use information from the queues and/or datastore=database to
ensure that the metadata is accurate.
Takes an optional string add_on that should contain things like
@@ -421,7 +421,7 @@ def search_result_generator(page_generator):
now return A,C,D but we already had the first page, so we request data starting at position 3
for the second page and get E,G,I. That means our sequence of return values would be A,C,E,E,G,I,K,M,
or, in other words, showing a duplication. To avoid this, we keep track of the IDs we've seen
and show only the first case of each element, so A,C,E,G,I,K,M. (We won't see the D but we weren't
and show only the first case of each element, so A,C,E,G,I,K,M. (We won't see the D, but we weren't
going to see it anyway, and it wasn't available the time we started, so the timing was already close.)
Unfortunately, we aren't so lucky for deletion, though that happens more rarely. That will cause
@@ -687,7 +687,7 @@ def get_associated_qc_metrics(uuid, key=None, ff_env=None, include_processed_fil

resp = get_metadata(uuid, key=key, ff_env=ff_env)

# Checks wheter the input is a experiment or experimentset otherwise throws an error
# Checks whether the input is an Experiment or ExperimentSet, and otherwise throws an error.
if 'ExperimentSet' not in resp['@type']:
raise TypeError('Expected ExperimentSet')

@@ -862,15 +862,15 @@ def get_es_metadata(uuids, es_client=None, filters=None, sources=None, chunk_siz
sources = ['embedded.files.uuid']
i.e. getting all fields for lab in embedded frame
sources = ['embedded.lab.*']
i.e. for getting a only object frame
i.e. for getting only an object frame
sources = ['object.*']
chunk_size:
Integer chunk_size may be used to control the number of uuids that are
passed to Elasticsearch in each query; setting this too high may cause
ES reads to timeout.
is_generator:
Boolean is_generator will return a generator for individual results if True;
if False (default), returns a list of results.
if False (the default), returns a list of results.
key: authentication key for ff_env (see get_authentication_with_server)
ff_env: authentication by env (needs system variables)
"""
@@ -941,6 +941,40 @@ def _get_es_metadata(uuids, es_client, filters, sources, chunk_size, auth):
yield hit['_source'] # yield individual items from ES


def get_schemas(key=None, ff_env=None) -> Dict[str, Dict]:
    """
    Gets a dictionary of all schema definitions.
    Args:
        key (dict): standard ff_utils authentication key
        ff_env (str): standard ff environment string
    Returns:
        dict: a mapping from schema names to schema definitions
    """
    auth = get_authentication_with_server(key, ff_env)
    schemas = get_metadata('profiles/', key=auth, add_on='frame=raw')
    return schemas


def get_schema(name, key=None, ff_env=None) -> Dict:
    """
    Gets the schema definition with the given name.
    Args:
        name (str): a schema name (CamelCase or snake_case)
        key (dict): standard ff_utils authentication key
        ff_env (str): standard ff environment string
    Returns:
        dict: the schema definition with the given name
    """
    auth = get_authentication_with_server(key, ff_env)
    url = f"profiles/{to_camel_case(name)}.json"
    schema = get_metadata(url, key=auth, add_on='frame=raw')
    return schema
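
A usage sketch for the two new helpers (the key contents and server below are hypothetical placeholders):

    # Hypothetical usage sketch; key contents and server are placeholders.
    key = {'key': 'XXXXXXXX', 'secret': 'YYYYYYYY', 'server': 'https://portal.example.org'}
    all_schemas = get_schemas(key=key)                   # e.g. {'Biosample': {...}, 'ExperimentSet': {...}, ...}
    biosample_schema = get_schema('biosample', key=key)  # snake_case input is camelized to 'Biosample'
    assert biosample_schema == all_schemas['Biosample']
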


def get_schema_names(key=None, ff_env=None):
    """
    Create a dictionary of all schema names to item class names
@@ -1034,7 +1068,7 @@ def remove_keys(my_dict, remove_list):
chunk = 100 # chunk the requests - don't want to hurt es performance

while uuid_list:
uuids_to_check = [] # uuids to add to uuid_list if not if not in item_uuids
uuids_to_check = [] # uuids to add to uuid_list if not in item_uuids

# get the next page of data, recreating the es_client if need be
try:
@@ -1121,7 +1155,7 @@ def _get_page(*, page, key=None, ff_env=None):

def get_health_page(key=None, ff_env=None):
"""
Simple function to return the json for a FF health page
Simple function to return the json for an environment's health page
"""
return _get_page(page='/health', key=key, ff_env=ff_env)

@@ -1143,7 +1177,7 @@ def get_indexing_status(key=None, ff_env=None):
def get_counts_summary(env):
""" Returns a named tuple given an FF name to check representing the counts state.
CountSummary
are_even: boolean on whether or not counts are even
are_even: boolean that is True if counts are even and False otherwise
summary_total: raw value of counts
"""
totals = get_counts_page(ff_env=env)
@@ -1182,7 +1216,7 @@ def execute_search(self, index, query, is_generator=False, page_size=200):
:arg index: index to search under
:arg query: query to run
:arg is_generator: boolean on whether or not to use a generator
:arg is_generator: boolean that is True if a generator is requested and otherwise False
:arg page_size: if using a generator, how many results to give per request
:returns: list of results of query or None
@@ -1194,7 +1228,7 @@ def execute_search(self, index, query, is_generator=False, page_size=200):

def search_es_metadata(index, query, key=None, ff_env=None, is_generator=False):
"""
Executes a lucene search query on on the ES Instance for this
Executes a lucene search query on the ES Instance for this
environment.
NOTE: It is okay to use this function directly but for repeat usage please use
@@ -1204,7 +1238,7 @@ def search_es_metadata(index, query, key=None, ff_env=None, is_generator=False):
:arg query: dictionary of query
:arg key: optional, 2-tuple authentication key (access_key_id, secret)
:arg ff_env: ff_env to use
:arg is_generator: boolean on whether or not to use a generator
:arg is_generator: boolean that is True if a generator is requested and otherwise False
:returns: list of results of query or None
"""
@@ -1484,7 +1518,7 @@ def dump_results_to_json(store, folder):


def parse_s3_bucket_and_key_url(url: str) -> (str, str):
""" Parses the given s3 URL into its pair of bucket, key
""" Parses the given s3 URL into its pair of (bucket, key).
Note that this function works the way it does because of how these
urls end up in our database. Eventually we should clean this up.
Format:
10 changes: 10 additions & 0 deletions dcicutils/misc_utils.py
@@ -1331,6 +1331,16 @@ def snake_case_to_camel_case(s, separator='_'):
    return s.title().replace(separator, '')


def to_camel_case(s):
    """
    Converts a string that might be in snake_case or CamelCase into CamelCase.
    """
    if s[:1].isupper() and '_' not in s:
        return s
    else:
        return snake_case_to_camel_case(s)
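
Expected behavior, as a quick sketch:

    # Illustrative examples (not part of the commit's test suite):
    assert to_camel_case('experiment_set') == 'ExperimentSet'  # snake_case is converted
    assert to_camel_case('ExperimentSet') == 'ExperimentSet'   # CamelCase passes through unchanged
    assert to_camel_case('biosample') == 'Biosample'           # a lowercase word is capitalized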


def capitalize1(s):
    """
    Capitalizes the first letter of a string and leaves the others alone.
102 changes: 89 additions & 13 deletions dcicutils/qa_utils.py
@@ -10,6 +10,7 @@
import functools
import hashlib
import io
import json
import logging
import os
import pytest
@@ -720,7 +721,7 @@ def mock_action_handler(self, wrapped_action, *args, **kwargs):
texts = remove_suffix('\n', text).split('\n')
last_text = texts[-1]
result = wrapped_action(text, **kwargs) # noQA - This call to print is low-level implementation
# This only captures non-file output output.
# This only captures non-file output.
file = kwargs.get('file')
if file is None:
file = sys.stdout
@@ -853,7 +854,7 @@ def __init__(self, *, region_name=None, boto3=None, **kwargs):
self._aws_secret_access_key = kwargs.get("aws_secret_access_key")
self._aws_region = region_name

# These is specific for testing.
# This is specific for testing.
self._aws_credentials_dir = None

# FYI: Some things to note about how boto3 (and probably any AWS client) reads AWS credentials/region.
@@ -915,7 +916,7 @@ def put_credentials_for_testing(self,
self._aws_secret_access_key = aws_secret_access_key
self._aws_region = region_name

# These is specific for testing.
# This is specific for testing.
self._aws_credentials_dir = aws_credentials_dir

@staticmethod
@@ -2270,8 +2271,7 @@ def create_object_for_testing(self, object_content: str, *, Bucket: str, Key: st
def upload_fileobj(self, Fileobj, Bucket, Key, **kwargs): # noqa - Uppercase argument names are chosen by AWS
self.check_for_kwargs_required_by_mock("upload_fileobj", Bucket=Bucket, Key=Key, **kwargs)
data = Fileobj.read()
PRINT("Uploading %s (%s bytes) to bucket %s key %s"
% (Fileobj, len(data), Bucket, Key))
PRINT(f"Uploading {Fileobj} ({len(data)} bytes) to bucket {Bucket} key {Key}")
with self.s3_files.open(os.path.join(Bucket, Key), 'wb') as fp:
fp.write(data)

@@ -2284,8 +2284,7 @@ def download_fileobj(self, Bucket, Key, Fileobj, **kwargs): # noqa - Uppercase
self.check_for_kwargs_required_by_mock("download_fileobj", Bucket=Bucket, Key=Key, **kwargs)
with self.s3_files.open(os.path.join(Bucket, Key), 'rb') as fp:
data = fp.read()
PRINT("Downloading bucket %s key %s (%s bytes) to %s"
% (Bucket, Key, len(data), Fileobj))
PRINT(f"Downloading bucket {Bucket} key {Key} ({len(data)} bytes) to {Fileobj}")
Fileobj.write(data)

def download_file(self, Bucket, Key, Filename, **kwargs): # noqa - Uppercase argument names are chosen by AWS
@@ -2382,7 +2381,7 @@ def head_bucket(self, Bucket): # noQA - AWS argument naming style
raise ClientError(operation_name='HeadBucket',
error_response={ # noQA - PyCharm wrongly complains about this dictionary
"Error": {"Code": "404", "Message": "Not Found"},
"ResponseMetadata": {"HTTPStatusCode": 404},
"ResponseMetadata": self.compute_mock_response_metadata(http_status_code=404),
})

def get_object_tagging(self, Bucket, Key):
@@ -2645,7 +2644,7 @@ def list_objects(self, Bucket, Prefix=None): # noQA - AWS argument naming style
}

def list_objects_v2(self, Bucket): # noQA - AWS argument naming style
# This is different but similar to list_objects. However we don't really care about that.
# This is different but similar to list_objects. However, we don't really care about that.
return self.list_objects(Bucket=Bucket)

def copy_object(self, CopySource, Bucket, Key, CopySourceVersionId=None,
@@ -2698,7 +2697,7 @@ def _copy_object(self, CopySource, Bucket, Key, CopySourceVersionId, StorageClas
new_storage_class = target_storage_class
if (copy_in_place
and GlacierUtils.transition_involves_glacier_restoration(source_storage_class, target_storage_class)):
new_storage_class = None # For a restoration, the don't update the glacier data. It's restored elsewhere.
new_storage_class = None # For a restoration, don't update the glacier data. It's restored elsewhere.
target_attribute_block.restore_temporarily(delay_seconds=self.RESTORATION_DELAY_SECONDS,
duration_days=1, storage_class=target_storage_class)
PRINT(f"Set up restoration {target_attribute_block.restoration}")
@@ -2806,6 +2805,7 @@ def _delete_versioned_object(self, s3_filename, version_id) -> Dict[str, Any]:

def restore_object(self, Bucket, Key, RestoreRequest, VersionId: Optional[str] = None,
StorageClass: Optional[S3StorageClass] = None):
# TODO: VersionId is unused in the arglist. Is that OK? -kmp 19-Aug-2023
duration_days: int = RestoreRequest.get('Days')
storage_class: S3StorageClass = StorageClass or self.storage_class
s3_filename = f"{Bucket}/{Key}"
@@ -3047,8 +3047,8 @@ def known_bug_expected(jira_ticket=None, fixed=False, error_class=None):
with known_bug_expected(jira_ticket="TST-00001", error_class=RuntimeError, fixed=True):
... stuff that fails ...
If the previously-expected error (now thought to be fixed) happens, an error will result so it's easy to tell
if there's been a regression.
If the previously-expected error (now thought to be fixed) happens, an error will result
so that it's easy to tell if there's been a regression.
Parameters:
@@ -3088,7 +3088,7 @@ def client_failer(operation_name, code=400):
def fail(message, code=code):
raise ClientError(
{ # noQA - PyCharm wrongly complains about this dictionary
"Error": {"Message": message, "Code": code}
"Error": {"Message": message, "Code": code} # noQA - Boto3 declares a string here but allows int code
},
operation_name=operation_name)
return fail
Expand Down Expand Up @@ -3473,3 +3473,79 @@ def mocked_input(*args, **kwargs):
inputs.append(item)
yield
assert not inputs, "Did not use all inputs."


def is_subdict(json1, json2, desc1="json1", desc2="json2", verbose=True):
    """
    Does asymmetric testing of dictionary equivalence, assuring json2 has all the content of json1,
    even if not vice versa. In other words, the dictionary structure is equivalent to the extent
    that (recursively) all dictionary keys on the left occur in the right-hand side, even if not
    necessarily all dictionary keys on the right occur in the left.
    For example,
        x = {"foo": 3}
        y = {"foo": 3, "bar": 4}
        is_subdict(x, y) is True
        is_subdict(y, x) is False
    The desc1 and desc2 can be provided to help with verbose mode, identifying what is on the left
    and what is on the right.
    :param json1: a JSON structure, the outer part of which is a dictionary
    :param json2: a JSON structure, the outer part of which is a dictionary
    :param desc1: a name or brief description for the json1 (default "json1")
    :param desc2: a name or brief description for the json2 (default "json2")
    :param verbose: a boolean (default True) that controls whether the comparison is verbose,
        showing output that explains failures or near-failures when True and suppressing it when False
    """

    def out(x):
        if verbose:
            PRINT(x)

    def sorted_set_repr(x):
        return f"{{{repr(sorted(x))[1:-1]}}}"

    def recurse(json1, json2, path=""):
        if isinstance(json1, dict) and isinstance(json2, dict):
            k1 = set(json1.keys())
            k2 = set(json2.keys())
            result = k1 <= k2
            if result:
                if k1 != k2:
                    out(f"Non-fatal keyword mismatch at {path!r}:")
                    out(f" {desc1} keys: {sorted_set_repr(k1)}")
                    out(f" {desc2} keys: {sorted_set_repr(k2)}")
                result = all(recurse(value, json2[key], path=f"{path}.{key}")
                             for key, value in json1.items())
                if not result:
                    # out(f"Recursive failure at {path!r} in object comparison")
                    pass
            else:
                out(f"Failed at {path!r} in object comparison due to key set mismatch:")
                out(f" {desc1} keys: {sorted_set_repr(k1)}")
                out(f" {desc2} keys: {sorted_set_repr(k2)}")
        elif isinstance(json1, list) and isinstance(json2, list):
            len1 = len(json1)
            len2 = len(json2)
            result = len1 == len2
            if not result:
                out(f"Failed at {path!r} in list comparison due to length mismatch: {len1} vs {len2}")
            else:
                result = all(recurse(json1[i], json2[i], path=f"{path}[{i}]") for i in range(len1))
                if not result:
                    # out(f"Recursive failure at {path!r} in list comparison")
                    pass
        elif type(json1) == type(json2):
            result = json1 == json2
            if not result:
                out(f"Failed at {path!r} due to value mismatch: {json.dumps(json1)} != {json.dumps(json2)}")
        else:
            result = False
        if not result:
            out(f"Mismatch at {path}.")
            out(f" {desc1}: {json1}")
            out(f" {desc2}: {json2}")
        return result

    return recurse(json1, json2)
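
A quick sketch of the asymmetry, using hypothetical data:

    # Hypothetical data illustrating the asymmetry of is_subdict.
    expected = {"name": "alpha", "meta": {"version": 1}}
    actual = {"name": "alpha", "meta": {"version": 1, "date": "2023-08-21"}, "extra": True}
    assert is_subdict(expected, actual)      # everything in expected occurs in actual
    assert not is_subdict(actual, expected)  # but not vice versa: 'extra' and 'date' are absent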
