Ouranosinc · Nazim-crim · Nov 30, 2023 · Nov 6, 2023 · Nov 6, 2023 · Nov 6, 2023
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -7,7 +7,14 @@ Changes
 `Unreleased <https://github.com/Ouranosinc/cowbird/tree/master>`_ (latest)
 ------------------------------------------------------------------------------------
 
-* Nothing yet.
+* Add optional keys ``field`` and ``regex`` to be used in the ``sync_permissions`` section found in the config.
+  This allows syncing permissions using a field other than ``resource_full_name`` when creating the nametype path
+  from the segments (ex.: ``/field1::type1/field2::type2``). This adds support for using ``resource_display_name``.
+* The ``regex`` is used to extract the desired information from the ``nametype_path`` that should be used to do an
+  exact match. This new search overrides the default way of matching each segment with the ``nametype path``.
+  In the case where a ``regex`` is found in the target segment, the data will be formed using the same ``resource_type``
+  for every match in the same segment. Similarly to using ``- name: "**"`` in the config to match multiple segments,
+  it is possible to use a ``regex`` to match multiple directories in the same segment with ``regex: '(?<=:).*\/?(?=\/)'``.
 
 `2.1.0 <https://github.com/Ouranosinc/cowbird/tree/2.1.0>`_ (2023-09-18)
 ------------------------------------------------------------------------------------

diff --git a/cowbird/api/webhooks/views.py b/cowbird/api/webhooks/views.py
@@ -108,6 +108,7 @@ def post_permission_webhook_view(request: Request) -> AnyResponseType:
                     msg_on_fail=s.PermissionWebhook_POST_BadRequestResponseSchema.description)
     # Use raw value for service name, to avoid errors with `None` values
     # when the permission is not applied to a `service` type resource.
+    resource_display_name = ar.get_multiformat_body(request, "resource_display_name", check_type=(str, type(None)))
     service_name = ar.get_multiformat_body(request, "service_name", check_type=(str, type(None)))
     service_type = ar.get_multiformat_body(request, "service_type")
     resource_id = ar.get_multiformat_body(request, "resource_id", check_type=int)
@@ -127,6 +128,7 @@ def post_permission_webhook_view(request: Request) -> AnyResponseType:
         service_type=service_type,
         resource_id=resource_id,
         resource_full_name=resource_full_name,
+        resource_display_name=resource_display_name,
         name=name,
         access=access,
         scope=scope,

diff --git a/cowbird/config.py b/cowbird/config.py
@@ -191,7 +191,7 @@ def validate_sync_perm_config_schema(sync_cfg: SyncPointConfig) -> None:
             "services": {
                 str: {  # Service type, must correspond to an actual Magpie service type
                     str: [  # Resource key, used to identify the resource here and in the permissions_mapping
-                        {"name": str, "type": str}
+                        {"name": str, "type": str, Optional("field"): str, Optional("regex"): str}
                     ]
                 }
             },

diff --git a/cowbird/permissions_synchronizer.py b/cowbird/permissions_synchronizer.py
@@ -1,6 +1,6 @@
 import re
 from copy import deepcopy
-from typing import TYPE_CHECKING, Callable, Dict, Iterator, List, MutableMapping, Tuple, cast
+from typing import TYPE_CHECKING, Callable, Collection, Dict, Iterator, List, MutableMapping, Tuple, cast
 
 from cowbird.config import (
     BIDIRECTIONAL_ARROW,
@@ -68,12 +68,14 @@ def __init__(self,
                  access: str,
                  scope: str,
                  user: str = None,
-                 group: str = None
+                 group: str = None,
+                 resource_display_name: str = None
                  ) -> None:
         self.service_name = service_name
         self.service_type = service_type
         self.resource_id = resource_id
         self.resource_full_name = resource_full_name
+        self.resource_display_name = resource_display_name
         self.name = name
         self.access = access
         self.scope = scope
@@ -85,6 +87,7 @@ def __eq__(self, other: "Permission") -> bool:  # type: ignore[override]
                 self.service_type == other.service_type and
                 self.resource_id == other.resource_id and
                 self.resource_full_name == other.resource_full_name and
+                self.resource_display_name == other.resource_display_name and
                 self.name == other.name and
                 self.access == other.access and
                 self.scope == other.scope and
@@ -184,6 +187,13 @@ def _generate_regex_from_segments(res_segments: List[ConfigSegment]) -> Tuple[st
         for segment in res_segments:
             matched_groups = re.match(NAMED_TOKEN_REGEX, segment["name"])
             if matched_groups:
+                if segment.get("regex") is not None:
+                    # if a regex is passed, override the current regex and return
+                    regex = segment.get("regex")
+                    res_regex = (
+                        rf"{regex}"
+                    )
+                    return res_regex, -1
                 # match any name with specific type 1 time only
                 res_regex += (
                     rf"/(?P<{matched_groups.groups()[0]}>{SEGMENT_NAME_REGEX})"
@@ -199,6 +209,26 @@ def _generate_regex_from_segments(res_segments: List[ConfigSegment]) -> Tuple[st
         res_regex += r"$"
         return res_regex, named_segments_count
 
+    @staticmethod
+    def _generate_nametype_path_from_segments(res_segments: List[ConfigSegment], src_resource_tree: ResourceTree) -> str:
+        """
+        Generate nametype path (ex.: /name1::type1/name2::type2 where name can be a field found in ResourceSegment).
+
+        :param res_segments: list of config segments; an optional ``field`` key selects the resource field used as name
+        :param src_resource_tree: Resource tree associated with the permission to synchronize
+        """
+        resource_nametype_path = ""
+        index = 0
+        for res in src_resource_tree:
+            if index < len(res_segments):
+                key = res_segments[index].get("field") if res_segments[index].get("field") is not None else "resource_name"
+            else:
+                key = "resource_name"
+            resource_nametype_path += f'/{res[key]}{RES_NAMETYPE_SEPARATOR}{res["resource_type"]}'
+            index = index + 1
+
+        return resource_nametype_path
+
     @staticmethod
     def _remove_type_from_nametype_path(nametype_path: str) -> str:
         """
@@ -210,16 +240,17 @@ def _remove_type_from_nametype_path(nametype_path: str) -> str:
                 formatted_path += "/" + segment.split(RES_NAMETYPE_SEPARATOR)[0]
         return formatted_path
 
-    def _find_matching_res(self, service_type: str, resource_nametype_path: str) -> Tuple[str, Dict[str, str]]:
+    def _find_matching_res(self, permission: Permission, src_resource_tree: ResourceTree) -> Tuple[str, Collection[str]]:
         """
         Finds a resource key that matches the input resource path, in the sync_permissions config. Note that it returns
         the longest match and only the named segments of the path are included in the length value. Any tokenized
         segment is ignored in the length.
 
-        :param service_type: Type of the service associated with the input resource.
-        :param resource_nametype_path: Full resource path name, which includes the type of each segment
-                                       (ex.: /name1::type1/name2::type2)
+        :param permission: Permission of the service associated with the input resource.
+        :param src_resource_tree: Resource tree associated with the permission to synchronize
         """
+
+        service_type = permission.service_type
         if service_type in self.services:
             # Find which resource from the config matches with the input permission's resource tree
             # The length of a match is determined by the number of named segments matching the input resource.
@@ -238,21 +269,39 @@ def _find_matching_res(self, service_type: str, resource_nametype_path: str) ->
             # An error would be raised because 2 matches of the same length would be found.
             # - /**/file
             # - /file
+            #
+            # In the case where a regex is used, the behavior is changed to search for the exact match in the res_segments.
+            # The length of the match is used to favor a more specific match.
+            # Example:
+            # 1:
+            # - //dir1/dir2//
+            # - //dir1/dir2//dir3// # We favor this path if it matches since it is more specific.
+            # note: It is possible to have multiple dirs in the same segment when using a custom regex that extracts a
+            # display_name containing a path to a target resource.
 
             matched_length_by_res = {}
             matched_groups_by_res = {}
             service_resources = self.services[service_type]
             for res_key, res_segments in service_resources.items():
                 res_regex, named_segments_count = SyncPoint._generate_regex_from_segments(res_segments)
-                matches = re.match(res_regex, resource_nametype_path)
+                resource_nametype_path = SyncPoint._generate_nametype_path_from_segments(res_segments, src_resource_tree)
+                if named_segments_count == -1:
+                    # To be able to match a path anywhere in the resource_nametype_path, we need to use search, but
+                    # only when a regex is provided in the res_segments. This keeps default matching backward compatible.
+                    matches = re.search(res_regex, resource_nametype_path)
+                else:
+                    matches = re.match(res_regex, resource_nametype_path)
                 if matches:
-                    matched_groups = matches.groupdict()
+                    exact_match = matches.group()
+                    matched_groups = matches.groupdict() if named_segments_count != -1 else exact_match
                     if "multi_token" in matched_groups:
                         matched_groups["multi_token"] = SyncPoint._remove_type_from_nametype_path(
                             matched_groups["multi_token"]
                         )
                     matched_groups_by_res[res_key] = matched_groups
-                    matched_length_by_res[res_key] = named_segments_count
+                    # When a custom regex is passed, multiple dirs (/dir1/dir2/dir3/**) can be matched in the same segment,
+                    # so we use the length of the exact match to avoid matching the wrong res_key.
+                    matched_length_by_res[res_key] = named_segments_count if named_segments_count != -1 else len(exact_match)
 
             # Find the longest match
             max_match_len = max(matched_length_by_res.values(), default=0)
@@ -269,43 +318,59 @@ def _find_matching_res(self, service_type: str, resource_nametype_path: str) ->
 
     @staticmethod
     def _create_res_data(target_segments: List[ConfigSegment],
-                         input_matched_groups: Dict[str, str],
+                         input_matched_groups: Collection[str],
                          ) -> List[ResourceSegment]:
         """
         Creates resource data, by replacing any tokens found in the segment names to their actual corresponding values.
         This data includes the name and type of each segments of a full resource path.
+        In the case where a regex is found in the target segment, the data will be formed using the same resource_type
+        for every match in the current segment.
 
         :param target_segments: List containing the name and type info of each segment of the target resource path.
         :param input_matched_groups:
         """
         res_data: List[ResourceSegment] = []
         for segment in target_segments:
-            matched_groups = re.match(NAMED_TOKEN_REGEX, segment["name"])
-            if matched_groups:
-                res_data.append({
-                    "resource_name": input_matched_groups[matched_groups.groups()[0]],
-                    "resource_type": segment["type"]
-                })
-            elif segment["name"] == MULTI_TOKEN:
-                multi_segments = input_matched_groups["multi_token"]
-                # Skip the segment if the multi_token matched 0 times, resulting in an empty string.
+            # Use the regex to create the res_data
+            if segment.get("regex") is not None:
+                regex = segment.get("regex")
+                matches = re.search(regex, input_matched_groups)
+                multi_segments = matches.group()
                 if multi_segments:
                     for seg in multi_segments.split("/"):
-                        if seg:  # Ignore empty splits
+                        if seg:
                             res_data.append({
                                 "resource_name": seg,
                                 "resource_type": segment["type"]
                             })
+
             else:
-                res_data.append({
-                    "resource_name": segment["name"],
-                    "resource_type": segment["type"]
-                })
+                matched_groups = re.match(NAMED_TOKEN_REGEX, segment["name"])
+                if matched_groups:
+                    res_data.append({
+                        "resource_name": input_matched_groups[matched_groups.groups()[0]],
+                        "resource_type": segment["type"]
+                    })
+                elif segment["name"] == MULTI_TOKEN:
+                    multi_segments = input_matched_groups["multi_token"]
+                    # Skip the segment if the multi_token matched 0 times, resulting in an empty string.
+                    if multi_segments:
+                        for seg in multi_segments.split("/"):
+                            if seg:  # Ignore empty splits
+                                res_data.append({
+                                    "resource_name": seg,
+                                    "resource_type": segment["type"]
+                                })
+                else:
+                    res_data.append({
+                        "resource_name": segment["name"],
+                        "resource_type": segment["type"]
+                    })
         return res_data
 
     def _get_resource_full_name_and_type(self,
                                          res_key: str,
-                                         matched_groups: Dict[str, str],
+                                         matched_groups: Collection[str],
                                          ) -> Tuple[str, List[ResourceSegment]]:
         """
         Finds the resource data from the config by using the resource key.
@@ -365,7 +430,7 @@ def _is_in_permissions(target_permission: str,
     def _filter_used_targets(self,
                              target_res_and_permissions: TargetResourcePermissions,
                              input_src_res_key: str,
-                             src_matched_groups: Dict[str, str],
+                             src_matched_groups: Collection[str],
                              input_permission: Permission,
                              ) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
         """
@@ -444,7 +509,7 @@ def _filter_used_targets(self,
     def _get_permission_data(self,
                              user_targets: Dict[str, List[str]],
                              group_targets: Dict[str, List[str]],
-                             src_matched_groups: Dict[str, str],
+                             src_matched_groups: Collection[str],
                              input_permission: Permission) -> PermissionData:
         """
         Formats permissions data to send to Magpie. Output contains, for each target resource key, the resource path
@@ -485,7 +550,7 @@ def _prepare_permissions_to_remove(self,
                                        target_res_and_permissions: TargetResourcePermissions,
                                        input_permission: Permission,
                                        input_src_res_key: str,
-                                       src_matched_groups: Dict[str, str],
+                                       src_matched_groups: Collection[str],
                                        ) -> PermissionData:
         """
         Removes every source resource found in the mappings that has an existing permission that is synced to one of the
@@ -501,7 +566,7 @@ def _prepare_permissions_to_remove(self,
 
     def _find_permissions_to_sync(self,
                                   src_res_key: str,
-                                  src_matched_groups: Dict[str, str],
+                                  src_matched_groups: Collection[str],
                                   input_permission: Permission,
                                   perm_operation: Callable[[List[PermissionConfigItemType]], None],
                                   ) -> PermissionData:
@@ -543,11 +608,7 @@ def sync(self,
         :param permission: Permission to synchronize with others services
         :param src_resource_tree: Resource tree associated with the permission to synchronize
         """
-        resource_nametype_path = ""
-        for res in src_resource_tree:
-            resource_nametype_path += f"/{res['resource_name']}{RES_NAMETYPE_SEPARATOR}{res['resource_type']}"
-
-        src_res_key, src_matched_groups = self._find_matching_res(permission.service_type, resource_nametype_path)
+        src_res_key, src_matched_groups = self._find_matching_res(permission, src_resource_tree)
         if not src_res_key:
             # A matching resource was not found in the sync config, nothing to do.
             return

diff --git a/cowbird/typedefs.py b/cowbird/typedefs.py
@@ -119,7 +119,7 @@
 ConfigList = List[ConfigItem]
 ConfigDict = Dict[str, Union[str, ConfigItem, ConfigList, JSON]]
 ConfigResTokenInfo = TypedDict("ConfigResTokenInfo", {"has_multi_token": bool, "named_tokens": MutableSet[str]})
-ConfigSegment = TypedDict("ConfigSegment", {"name": str, "type": str})
+ConfigSegment = TypedDict("ConfigSegment", {"name": str, "type": str, "field": Optional[str], "regex": Optional[str]})
 
 SyncPointMappingType = List[str]
 SyncPointServicesType = Dict[
@@ -142,12 +142,12 @@
     SyncPermissionConfig,
 ]
 
-ResourceSegment = TypedDict("ResourceSegment", {"resource_name": str, "resource_type": str})
+ResourceSegment = TypedDict("ResourceSegment", {"resource_name": str, "resource_type": str, "resource_display_name": NotRequired[str]})
 ResourceTree = List[
     Dict[
         str,
         # FIXME: replace by a more specific type provided by Magpie directly if eventually implemented
-        #   Only partial fields are provided below (resource_name/resource_type),
+        #   Only partial fields are provided below (resource_name/resource_type/resource_display_name),
         #   because those are the only ones used for now in Cowbird's sync operation.
         #   This actually contains more details such as the resource ID, permission names, etc.
         #   (see the response body of 'GET /magpie/resources/{resource_id}' for exact content).