Update entities_checker_util.py

GoogleCloudPlatform · Oct 13, 2023 · 43c2986 · 43c2986
1 parent 3d56028
commit 43c2986
Showing 1 changed file with 300 additions and 0 deletions.
diff --git a/src/dfcx_scrapi/tools/entities_checker_util.py b/src/dfcx_scrapi/tools/entities_checker_util.py
@@ -1 +1,301 @@
+import logging
+import time
+from typing import Dict, List
+import pandas as pd
+import re
+from dfcx_scrapi.core import scrapi_base
+from dfcx_scrapi.core.intents import Intents
+from dfcx_scrapi.core.flows import Flows
+from dfcx_scrapi.core.pages import Pages
+from dfcx_scrapi.core.entity_types import EntityTypes
+from dfcx_scrapi.core.transition_route_groups import TransitionRouteGroups
 
+# logging config
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)-8s %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+)
+
+class ParametersCheckerUtil(scrapi_base.ScrapiBase):
+    """Utility class for checking DFCX Agent's parameters."""
+
+    def __init__(
+        self,
+        agent_id: str,
+        creds_path: str = None,
+        creds_dict: Dict = None,
+        creds=None,
+        scope=False,
+    ):
+        super().__init__(
+            creds_path=creds_path,
+            creds_dict=creds_dict,
+            creds=creds,
+            scope=scope,
+        )
+      self.agent_id = agent_id
+      if creds_path:
+        self.creds_path = creds_path
+
+    @staticmethod
+    def _get_entity_type_by_parameter_id(parameters, parameter_id):
+        """static method that returns the entity type that is paired with the given parameter id"""
+
+        entity_type = None
+        for parameter in parameters:
+            if parameter.id == parameter_id:
+                entity_type = parameter.entity_type
+                break
+
+        return entity_type
+
+    def _get_tags_in_intents(self) -> pd.DataFrame:
+        """Get all the tag_texts that are referenced to the specific parameter id & entity type id in the training phrases in the intents
+
+        Returns:
+            A dataframe with columns
+            intent_id - the intent name
+            intent - the intent display name
+            training_phrase - the training phrase in the intent
+            tag_text - the subset of the training phrase that is tagged with the specific entity type id
+            parameter_id - parameter id
+            entity_type_id - entity id
+        """
+
+        df = pd.DataFrame({
+                "intent": pd.Series(dtype="str"),
+                "intent_id": pd.Series(dtype="str"),
+                "training_phrase": pd.Series(dtype="str"),
+                "tag_text": pd.Series(dtype="str"),
+                "parameter_id": pd.Series(dtype="str"),
+                "entity_type_id": pd.Series(dtype="str"),
+                })
+
+        if not self._intents_list: 
+            self._intents_list = self._intents.list_intents(agent_id=self.agent_id)
+
+        for intent in self._intents_list:
+            if 'parameters' in intent:
+                for training_phrase in intent.training_phrases:
+                    concat_training_phrase = ''
+                    tag_texts = []
+                    for part in training_phrase.parts:
+                        concat_training_phrase += part.text
+                        if 'parameter_id' in part:
+                            entity_type = self._get_entity_type_by_parameter_id(intent.parameters, part.parameter_id)
+                            tag_texts_set = (part.text, part.parameter_id, entity_type)
+                            tag_texts.append(tag_texts_set)
+                    if tag_texts:
+                        for pair in tag_texts:
+                            temp = pd.DataFrame({
+                                "intent": [intent.display_name],
+                                "intent_id": [intent.name],
+                                "training_phrase": [concat_training_phrase],
+                                "tag_text": [pair[0]],
+                                "parameter_id": [pair[1]],
+                                "entity_type_id": [pair[2]]
+                                })
+                            df = pd.concat([df, temp])
+
+        df = df.reset_index(drop=True)
+
+        return df
+
+    def _get_entity_types_df(self) -> pd.DataFrame:
+        """Get all the entity types and store all the entity values and synonyms in one row
+        
+        Returns:
+            A dataframe with columns
+            entity_type_id
+            entity_type
+            kind
+            entity_values - list of the [entity values]
+            synonyms - list of the [synonyms]
+        """
+        df = pd.DataFrame({
+            "entity_type_id": pd.Series(dtype="str"),
+            "entity_type": pd.Series(dtype="str"),
+            "kind": pd.Series(dtype="str"),
+            "entity_values": pd.Series(dtype="str"),
+            "synonyms": pd.Series(dtype="str")
+            })
+
+        if not self._entity_types_list:
+            self._entity_types_list = self._entity_types.list_entity_types(agent_id = self.agent_id)
+
+        for entity_type in self._entity_types_list:
+            entity_values = []
+            synonyms = []
+            for entity in entity_type.entities:
+                entity_values.append(entity.value)
+                synonyms += list(entity.synonyms)
+            temp = pd.DataFrame({
+                "entity_type_id": [entity_type.name],
+                "entity_type": [entity_type.display_name],
+                "kind": [entity_type.kind.name],
+                "entity_values": [entity_values],
+                "synonyms": [synonyms]})
+            df = pd.concat([df, temp])
+
+        df = df.reset_index(drop=True)
+
+        return df
+
+    def _unpack_nested_entity_types(self, df, target_kind_type): 
+        """Unpacking the nested entity types to the comparable dataframe structure
+        e.g : Nested entity type ->  entity_type : @child_entity_type1 , @child_entity_type2 
+              unpacked entity type -> entity_type: [child1.entity_values, child2.entity_values] : [child1.synonyms, chilld.synonyms]
+
+        Returns:
+            A dataframe with columns
+            entity_type_id
+            entity_type
+            kind
+            entity_values - list of the [entity values]
+            synonyms - list of the [synonyms]
+        """
+        for idx, row in df.iterrows(): 
+            kind = row['kind']
+            if kind == 'KIND_LIST': 
+                entity_values = row['entity_values']
+                new_entity_values = []
+                new_synonyms = []
+                is_nested_entity_type = True
+                for entity_value in entity_values: 
+                    if '@' == entity_value[0] and (df['entity_type'] == entity_value[1::]).any():         
+                        entity_value = entity_value[1::]
+                        child_entity_type_row = df.loc[df['entity_type'] == entity_value]
+                        child_index = child_entity_type_row.index[0]
+                        child_entity_type_kind = child_entity_type_row['kind'][child_index]
+                        if child_entity_type_kind == target_kind_type:
+                            child_entity_values = child_entity_type_row['entity_values'][child_index]
+                            child_entity_synonyms = child_entity_type_row['synonyms'][child_index]
+                            new_entity_values += child_entity_values
+                            new_synonyms += child_entity_synonyms
+                        else: 
+                            is_nested_entity_type = False
+                            break
+                    else: 
+                        is_nested_entity_type = False
+                        break
+
+                if new_entity_values and is_nested_entity_type: 
+                    df.loc[idx, 'entity_values'] = new_entity_values
+                    df.loc[idx, 'synonyms'] = new_synonyms
+                    df.loc[idx, 'kind'] = target_kind_type
+
+        return df 
+
+    def generate_hidden_synonym_tags(self) -> pd.DataFrame: 
+        """ Generate the overall stats that identify the incorrect tags in the training phrases by comparing with the entity type's synonyms 
+            Merges the intents and the entity types dataframes to create the comparable dataframe
+            Check if the tag_text is relevent in the entity type's synonyms 
+            if a tag_text in synonyms then is_hidden = YES else is_hidden = NO
+            
+        Returns:
+            A dataframe with columns
+            intent
+            intent_id
+            training_phrase
+            tag_text
+            parameter_id
+            entity_type_id
+            entity_type
+            kind
+            entity_values
+            synonyms
+            is_hidden
+        """
+        tags_intents = self._get_tags_in_intents()
+        entity_types_mapper = self._get_entity_types_df()
+        entity_types_mapper = self._unpack_nested_entity_types(entity_types_mapper, 'KIND_MAP')
+        hidden_entities = pd.merge(tags_intents, entity_types_mapper, on = 'entity_type_id')
+        hidden_entities = hidden_entities.drop(hidden_entities[~hidden_entities.kind.str.contains('KIND_MAP')].index)
+        hidden_entities = hidden_entities.reset_index(drop=True)
+        hidden_entities['is_hidden'] = pd.Series(None, index=hidden_entities.index)
+
+        for idx, row in hidden_entities.iterrows():
+            synonyms = row['synonyms']
+            tag_text = row['tag_text']
+
+            for synonym in synonyms: 
+                synonym = synonym.lower()
+                tag_text = tag_text.lower()
+                if [sub_synonym for sub_synonym in synonym if sub_synonym.isalnum()] == [sub_tag_text for sub_tag_text in tag_text if sub_tag_text.isalnum()]: 
+                    hidden_entities.loc[idx, 'is_hidden'] = 'NO'
+
+            if pd.isna(hidden_entities.loc[idx, 'is_hidden']): 
+                hidden_entities.loc[idx, 'is_hidden'] = 'YES'
+
+        return hidden_entities
+
+    def generate_hidden_regex_tags(self) -> pd.DataFrame: 
+        """ Generate the overall stats that identify the incorrect tags in the training phrases by comparing with the entity type's regex 
+        
+        Returns:
+            A dataframe with columns
+            intent
+            intent_id
+            training_phrase
+            tag_text
+            parameter_id
+            entity_type_id
+            entity_type
+            kind
+            entity_values
+            synonyms
+            is_hidden
+        """
+        tags_intents = self._get_tags_in_intents()
+        entity_types_mapper = self._get_entity_types_df()
+        entity_types_mapper = self._unpack_nested_entity_types(entity_types_mapper, 'KIND_REGEX')
+        hidden_entities = pd.merge(tags_intents, entity_types_mapper, on = 'entity_type_id')
+        hidden_entities = hidden_entities.drop(hidden_entities[~hidden_entities.kind.str.contains('KIND_REGEX')].index)
+        hidden_entities = hidden_entities.reset_index(drop=True)
+        hidden_entities['is_hidden'] = pd.Series(None, index=hidden_entities.index)
+
+        for idx, row in hidden_entities.iterrows():
+            regexs = row['synonyms']
+            tag_text = row['tag_text']
+            for regex in regexs: 
+                if re.match(regex, tag_text): 
+                    hidden_entities.loc[idx, 'is_hidden'] = 'NO'
+            if pd.isna(hidden_entities.loc[idx, 'is_hidden']): 
+                hidden_entities.loc[idx, 'is_hidden'] = 'YES'
+
+        return hidden_entities
+
+    def space_in_entity_values(self) -> pd.DataFrame: 
+        """ Validating if there is any unnecessary space(s) in the front or/and in the end of the entities   
+            e.g: Phone: "iphone " => should be Phone: "iphone"
+            
+            Returns:
+            A dataframe with columns
+            entity_type_id
+            entity_type
+            kind
+            entity_values 
+            synonyms
+            has_space: if the entity value have the space(s) then YES else NO 
+            entities_with_space: list of the entity values that have the space(s)
+        """
+        entity_types_mapper = self._get_entity_types_df()
+        entity_types_mapper = self._unpack_nested_entity_types(entity_types_mapper, 'KIND_MAP')
+        entity_types_mapper['has_space'] = pd.Series('NO', index=entity_types_mapper.index)
+        entity_types_mapper['entities_with_space'] = pd.Series('NA', index=entity_types_mapper.index)
+
+        for idx, row in entity_types_mapper.iterrows(): 
+            entity_values = row['entity_values']
+            tmp_entity_values = []
+            for entity in entity_values:
+                striped_entity = entity.strip()
+                if not entity == striped_entity:
+                    entity_types_mapper.loc[idx, 'has_space'] = 'YES'
+                    tmp_entity_values.append(entity)
+                    entity_types_mapper.loc[idx, 'entities_with_space'] = tmp_entity_values
+
+        return entity_types_mapper
+
+
+