Skip to content

Commit

Permalink
Update entities_checker_util.py
Browse files Browse the repository at this point in the history
  • Loading branch information
jkshj21 authored Oct 13, 2023
1 parent 3d56028 commit 43c2986
Showing 1 changed file with 300 additions and 0 deletions.
300 changes: 300 additions & 0 deletions src/dfcx_scrapi/tools/entities_checker_util.py
Original file line number Diff line number Diff line change
@@ -1 +1,301 @@
import logging
import time
from typing import Dict, List
import pandas as pd
import re
from dfcx_scrapi.core import scrapi_base
from dfcx_scrapi.core.intents import Intents
from dfcx_scrapi.core.flows import Flows
from dfcx_scrapi.core.pages import Pages
from dfcx_scrapi.core.entity_types import EntityTypes
from dfcx_scrapi.core.transition_route_groups import TransitionRouteGroups

# Logging configuration, executed at import time (module top level):
# INFO threshold, and each record rendered as
# "<YYYY-MM-DD HH:MM:SS> <LEVEL, left-aligned to 8 chars> <message>".
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)-8s %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)

class ParametersCheckerUtil(scrapi_base.ScrapiBase):
    """Utility class for checking DFCX Agent's parameters.

    The public reports cross-reference the parameter tags found in intent
    training phrases with the entity types defined on the agent:

    * ``generate_hidden_synonym_tags`` - tags whose text is not one of the
      synonyms of the (map) entity type they are annotated with.
    * ``generate_hidden_regex_tags`` - tags whose text is not matched by any
      regex of the (regexp) entity type they are annotated with.
    * ``space_in_entity_values`` - entity values with leading/trailing
      whitespace.
    """

    # Names of the entity type kinds as produced by ``entity_type.kind.name``.
    # NOTE: the regexp kind is spelled KIND_REGEXP in the Dialogflow CX API.
    _KIND_MAP = "KIND_MAP"
    _KIND_LIST = "KIND_LIST"
    _KIND_REGEXP = "KIND_REGEXP"

    # Column layout of the two intermediate dataframes.
    _TAG_COLUMNS = [
        "intent",
        "intent_id",
        "training_phrase",
        "tag_text",
        "parameter_id",
        "entity_type_id",
    ]
    _ENTITY_TYPE_COLUMNS = [
        "entity_type_id",
        "entity_type",
        "kind",
        "entity_values",
        "synonyms",
    ]

    def __init__(
        self,
        agent_id: str,
        creds_path: str = None,
        creds_dict: Dict = None,
        creds=None,
        scope=False,
    ):
        """Set up the checker for one agent.

        Args:
            agent_id: id of the agent whose intents and entity types are
                inspected; passed to the list calls.
            creds_path: forwarded to ``ScrapiBase``; also stored on the
                instance when given.
            creds_dict: forwarded to ``ScrapiBase``.
            creds: forwarded to ``ScrapiBase``.
            scope: forwarded to ``ScrapiBase``.
        """
        super().__init__(
            creds_path=creds_path,
            creds_dict=creds_dict,
            creds=creds,
            scope=scope,
        )
        self.agent_id = agent_id
        if creds_path:
            self.creds_path = creds_path

        # Clients used to fetch the agent resources. ScrapiBase is expected
        # to expose the resolved credentials as ``self.creds``.
        self._intents = Intents(creds=self.creds, agent_id=agent_id)
        self._entity_types = EntityTypes(creds=self.creds, agent_id=agent_id)

        # Lazily populated caches so each resource list is fetched once.
        self._intents_list = None
        self._entity_types_list = None

    @staticmethod
    def _get_entity_type_by_parameter_id(parameters, parameter_id):
        """Return the entity type paired with the given parameter id.

        Args:
            parameters: iterable of objects exposing ``id`` and
                ``entity_type`` attributes.
            parameter_id: the id to look up.

        Returns:
            The ``entity_type`` of the first parameter whose ``id`` equals
            ``parameter_id``, or None when there is no such parameter.
        """
        return next(
            (
                parameter.entity_type
                for parameter in parameters
                if parameter.id == parameter_id
            ),
            None,
        )

    @staticmethod
    def _alnum_key(text):
        """Return ``text`` lower-cased with non-alphanumerics removed."""
        return "".join(char for char in text.lower() if char.isalnum())

    @classmethod
    def _matches_any_synonym(cls, tag_text, synonyms):
        """Tell whether ``tag_text`` equals a synonym, ignoring case and
        every non-alphanumeric character."""
        tag_key = cls._alnum_key(tag_text)
        return any(cls._alnum_key(synonym) == tag_key for synonym in synonyms)

    @staticmethod
    def _matches_any_regex(tag_text, patterns):
        """Tell whether any pattern matches at the start of ``tag_text``.

        A pattern that Python's ``re`` module cannot compile is logged and
        treated as a non-match instead of aborting the whole report.
        """
        for pattern in patterns:
            try:
                if re.match(pattern, tag_text):
                    return True
            except re.error as err:
                logging.warning(
                    "Skipping invalid regex %r: %s", pattern, err
                )
        return False

    def _get_tags_in_intents(self) -> pd.DataFrame:
        """Collect every parameter-tagged part of the intents' training
        phrases.

        Only intents that have ``parameters`` set are inspected. The intent
        list is fetched on first use and cached on the instance.

        Returns:
            A dataframe with one row per tagged part and columns
                intent - the intent display name
                intent_id - the intent name
                training_phrase - all part texts of the phrase, concatenated
                tag_text - the text of the tagged part
                parameter_id - parameter id of the tagged part
                entity_type_id - entity type paired with that parameter id
                    in the intent (None when the intent has no such
                    parameter)
        """
        if self._intents_list is None:
            self._intents_list = self._intents.list_intents(
                agent_id=self.agent_id
            )

        rows = []
        for intent in self._intents_list:
            if "parameters" not in intent:
                continue
            for training_phrase in intent.training_phrases:
                parts = training_phrase.parts
                phrase_text = "".join(part.text for part in parts)
                for part in parts:
                    if "parameter_id" not in part:
                        continue
                    rows.append({
                        "intent": intent.display_name,
                        "intent_id": intent.name,
                        "training_phrase": phrase_text,
                        "tag_text": part.text,
                        "parameter_id": part.parameter_id,
                        "entity_type_id":
                            self._get_entity_type_by_parameter_id(
                                intent.parameters, part.parameter_id
                            ),
                    })

        # Build the frame once instead of concatenating inside the loop.
        return pd.DataFrame(rows, columns=self._TAG_COLUMNS)

    def _get_entity_types_df(self) -> pd.DataFrame:
        """Collect the agent's entity types, one row per entity type.

        The entity type list is fetched on first use and cached on the
        instance.

        Returns:
            A dataframe with columns
                entity_type_id - the entity type name
                entity_type - the entity type display name
                kind - name of the entity type's kind enum value
                entity_values - list of the values of all its entities
                synonyms - flat list of the synonyms of all its entities
        """
        if self._entity_types_list is None:
            self._entity_types_list = self._entity_types.list_entity_types(
                agent_id=self.agent_id
            )

        rows = []
        for entity_type in self._entity_types_list:
            entity_values = []
            synonyms = []
            for entity in entity_type.entities:
                entity_values.append(entity.value)
                synonyms.extend(entity.synonyms)
            rows.append({
                "entity_type_id": entity_type.name,
                "entity_type": entity_type.display_name,
                "kind": entity_type.kind.name,
                "entity_values": entity_values,
                "synonyms": synonyms,
            })

        return pd.DataFrame(rows, columns=self._ENTITY_TYPE_COLUMNS)

    def _unpack_nested_entity_types(self, df, target_kind_type):
        """Replace nested (list) entity types by their children's content.

        A KIND_LIST row is unpacked only when every one of its entity values
        has the form ``@<display name>``, each name resolves to a row of
        ``df``, and each of those rows has kind ``target_kind_type``. Its
        ``entity_values`` and ``synonyms`` then become the concatenation of
        the children's lists and its ``kind`` becomes ``target_kind_type``.
        Rows are processed in order, so a list row unpacked earlier can
        itself serve as a child of a later row. Any other row is left as is.

        e.g : nested entity type -> @child_entity_type1, @child_entity_type2
              unpacked -> entity_values: child1 values + child2 values
                          synonyms: child1 synonyms + child2 synonyms

        Args:
            df: dataframe shaped like the result of ``_get_entity_types_df``
                with a unique index; it is modified in place.
            target_kind_type: kind name the children must have.

        Returns:
            The same dataframe object, with columns
                entity_type_id
                entity_type
                kind
                entity_values - list of the [entity values]
                synonyms - list of the [synonyms]
        """
        # Index label of the first row for each display name.
        row_by_display_name = {}
        for label, display_name in df["entity_type"].items():
            row_by_display_name.setdefault(display_name, label)

        for idx, row in df.iterrows():
            if row["kind"] != self._KIND_LIST:
                continue

            unpacked_values = []
            unpacked_synonyms = []
            for entity_value in row["entity_values"]:
                # startswith() also copes with an empty entity value.
                if not entity_value.startswith("@"):
                    break
                child = row_by_display_name.get(entity_value[1:])
                if child is None:
                    break
                # Read the kind live: the child may have been unpacked
                # earlier in this same pass.
                if df.at[child, "kind"] != target_kind_type:
                    break
                unpacked_values += df.at[child, "entity_values"]
                unpacked_synonyms += df.at[child, "synonyms"]
            else:
                # Reached only when no value disqualified the row.
                if unpacked_values:
                    # .at stores the list as one cell; .loc would try to
                    # spread its elements and raise.
                    df.at[idx, "entity_values"] = unpacked_values
                    df.at[idx, "synonyms"] = unpacked_synonyms
                    df.at[idx, "kind"] = target_kind_type

        return df

    def _tags_with_entity_types_of_kind(self, kind) -> pd.DataFrame:
        """Join the training phrase tags with the entity types of ``kind``.

        Nested entity types are unpacked towards ``kind`` first; tags whose
        entity type id is not among the agent's entity types, or whose
        entity type is of another kind, are dropped.
        """
        entity_types = self._unpack_nested_entity_types(
            self._get_entity_types_df(), kind
        )
        merged = pd.merge(
            self._get_tags_in_intents(), entity_types, on="entity_type_id"
        )
        return merged[merged["kind"] == kind].reset_index(drop=True)

    def generate_hidden_synonym_tags(self) -> pd.DataFrame:
        """Flag training phrase tags that are not a synonym of their entity
        type.

        Tags are merged with the map entity types (nested entity types made
        of map entity types are unpacked first). A tag is compared with each
        synonym case-insensitively and ignoring non-alphanumeric characters;
        if a tag_text is in the synonyms then is_hidden = NO else
        is_hidden = YES.

        Returns:
            A dataframe with columns
                intent
                intent_id
                training_phrase
                tag_text
                parameter_id
                entity_type_id
                entity_type
                kind
                entity_values
                synonyms
                is_hidden
        """
        hidden_entities = self._tags_with_entity_types_of_kind(self._KIND_MAP)
        hidden_entities["is_hidden"] = [
            "NO" if self._matches_any_synonym(tag_text, synonyms) else "YES"
            for tag_text, synonyms in zip(
                hidden_entities["tag_text"], hidden_entities["synonyms"]
            )
        ]
        return hidden_entities

    def generate_hidden_regex_tags(self) -> pd.DataFrame:
        """Flag training phrase tags that none of their entity type's
        regexes match.

        Tags are merged with the regexp entity types (nested entity types
        made of regexp entity types are unpacked first). The regexes are
        read from the ``synonyms`` column and applied with ``re.match``; if
        any of them matches the tag_text then is_hidden = NO else
        is_hidden = YES.

        Returns:
            A dataframe with columns
                intent
                intent_id
                training_phrase
                tag_text
                parameter_id
                entity_type_id
                entity_type
                kind
                entity_values
                synonyms
                is_hidden
        """
        hidden_entities = self._tags_with_entity_types_of_kind(
            self._KIND_REGEXP
        )
        hidden_entities["is_hidden"] = [
            "NO" if self._matches_any_regex(tag_text, regexes) else "YES"
            for tag_text, regexes in zip(
                hidden_entities["tag_text"], hidden_entities["synonyms"]
            )
        ]
        return hidden_entities

    def space_in_entity_values(self) -> pd.DataFrame:
        """Find entity values with leading and/or trailing whitespace.

        e.g: Phone: "iphone " => should be Phone: "iphone"

        Returns:
            A dataframe with columns
                entity_type_id
                entity_type
                kind
                entity_values
                synonyms
                has_space: YES if any entity value differs from its
                    stripped form, else NO
                entities_with_space: list of the offending entity values,
                    or NA when there is none
        """
        entity_types_mapper = self._unpack_nested_entity_types(
            self._get_entity_types_df(), self._KIND_MAP
        )

        has_space = []
        entities_with_space = []
        for entity_values in entity_types_mapper["entity_values"]:
            padded = [
                value for value in entity_values if value != value.strip()
            ]
            has_space.append("YES" if padded else "NO")
            entities_with_space.append(padded if padded else "NA")

        entity_types_mapper["has_space"] = has_space
        # An explicit object Series keeps each list as a single cell.
        entity_types_mapper["entities_with_space"] = pd.Series(
            entities_with_space,
            index=entity_types_mapper.index,
            dtype=object,
        )

        return entity_types_mapper



0 comments on commit 43c2986

Please sign in to comment.