Skip to content

Commit

Permalink
add full_text option
Browse files Browse the repository at this point in the history
  • Loading branch information
vitorbellini committed Apr 16, 2024
1 parent d3af2df commit 21a8af3
Show file tree
Hide file tree
Showing 8 changed files with 91 additions and 10 deletions.
1 change: 1 addition & 0 deletions dag_confs/examples_and_tests/all_parameters_example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ dag:
dou_sections:
- SECAO_1
- EDICAO_SUPLEMENTAR
full_text: True
report:
emails:
- [email protected]
Expand Down
4 changes: 4 additions & 0 deletions schemas/ro-dou.json
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@
"type": "boolean",
"description": "description"
},
"full_text": {
"type": "boolean",
"description": "description"
},
"date": {
"type": "string",
"description": "description",
Expand Down
3 changes: 3 additions & 0 deletions src/dou_dag_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ def create_dag(self, specs: DAGConfig, config_file: str) -> DAG:
'is_exact_search': specs.is_exact_search,
'ignore_signature_match': specs.ignore_signature_match,
'force_rematch': specs.force_rematch,
'full_text': specs.full_text,
'department': specs.department,
'result_as_email': result_as_html(specs),
},
Expand Down Expand Up @@ -240,6 +241,7 @@ def perform_searches(
is_exact_search: bool,
ignore_signature_match: bool,
force_rematch: bool,
full_text: bool,
result_as_email: bool,
department: List[str],
**context) -> dict:
Expand All @@ -266,6 +268,7 @@ def perform_searches(
search_date,
department,
ignore_signature_match,
full_text,
get_trigger_date(context, local_time = True)
)

Expand Down
14 changes: 11 additions & 3 deletions src/hooks/inlabs_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def search_text(
self,
search_terms: dict,
ignore_signature_match: bool,
full_text: bool,
conn_id: str = CONN_ID,
) -> dict:
"""Searches the DOU Database with the provided search terms and processes
Expand Down Expand Up @@ -63,7 +64,7 @@ def search_text(

return (
self.TextDictHandler().transform_search_results(
all_results, search_terms["texto"], ignore_signature_match
all_results, search_terms["texto"], ignore_signature_match, full_text
)
if not all_results.empty
else {}
Expand Down Expand Up @@ -160,7 +161,11 @@ def __init__(self, *args, **kwargs):
pass

def transform_search_results(
self, response: pd.DataFrame, text_terms: list, ignore_signature_match: bool
self,
response: pd.DataFrame,
text_terms: list,
ignore_signature_match: bool,
full_text: bool = False,
) -> dict:
"""Transforms and sorts the search results based on the presence
of text terms and signature matching.
Expand All @@ -171,6 +176,8 @@ def transform_search_results(
text_terms (list): The list of text terms used in the search.
ignore_signature_match (bool): Flag to ignore publication
signature content.
full_text (bool): If True, keep the full text content of each
result instead of trimming it. Defaults to False.
Returns:
dict: A dictionary of sorted and processed search results.
Expand Down Expand Up @@ -205,7 +212,8 @@ def transform_search_results(
),
axis=1,
)
df["texto"] = df["texto"].apply(self._trim_text)
if not full_text:
df["texto"] = df["texto"].apply(self._trim_text)
df["display_date_sortable"] = None
df["hierarchyList"] = None

Expand Down
7 changes: 5 additions & 2 deletions src/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,11 @@ class DAGConfig:
is_exact_search: bool
ignore_signature_match: bool
force_rematch: bool
full_text: bool
terms: List[str]
sql: str
conn_id: str
department: List[str]
department: List[str]
emails: List[str]
subject: str
attach_csv: bool
Expand Down Expand Up @@ -113,6 +114,7 @@ def _parse_yaml(self) -> DAGConfig:
is_exact_search = search.get('is_exact_search', True)
ignore_signature_match = search.get('ignore_signature_match', False)
force_rematch = search.get('force_rematch', None)
full_text = search.get('full_text', None)
department = search.get('department', None)
schedule = self._get_safe_schedule(dag, self.DEFAULT_SCHEDULE)
doc_md = dag.get('doc_md', None)
Expand All @@ -137,10 +139,11 @@ def _parse_yaml(self) -> DAGConfig:
is_exact_search=is_exact_search,
ignore_signature_match=ignore_signature_match,
force_rematch=force_rematch,
full_text=full_text,
terms=terms,
sql=sql,
conn_id=conn_id,
department=department,
department=department,
emails=emails,
subject=subject,
attach_csv=attach_csv,
Expand Down
5 changes: 4 additions & 1 deletion src/searchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,7 @@ def exec_search(
search_date: str,
department: List[str],
ignore_signature_match: bool,
full_text: bool,
reference_date: datetime = datetime.now(),
) -> Dict:
"""
Expand All @@ -407,6 +408,7 @@ def exec_search(
department (List[str]): List of departments to filter the search.
ignore_signature_match (bool): Flag to ignore publication
signature content.
full_text (bool): If True, keep the full result text content instead of trimming it.
reference_date (datetime, optional): Reference date for the
search. Defaults to now.
Expand All @@ -426,7 +428,8 @@ def exec_search(

search_results = inlabs_hook.search_text(
search_terms,
ignore_signature_match
ignore_signature_match,
full_text
)

return self._group_results(search_results, terms)
Expand Down
60 changes: 56 additions & 4 deletions tests/inlabs_hook_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ def test_group_to_dict(inlabs_hook, df_in, dict_out):


@pytest.mark.parametrize(
"terms, df_in, dict_out",
"terms, df_in, dict_out, full_text",
[
(
["Pellentesque", "Lorem"],
Expand Down Expand Up @@ -303,12 +303,64 @@ def test_group_to_dict(inlabs_hook, df_in, dict_out):
}
],
},
)
False,
),
(
["Lorem"],
pd.DataFrame(
[
{
"artcategory": "Texto exemplo art_category",
"arttype": "Publicação xxx",
"id": 1,
"assina": "Pessoa 1",
"data": "Brasília/DF, 15 de março de 2024.",
"ementa": "None",
"identifica": "Título da Publicação 1",
"name": "15.03.2024 bsb DOU xxx",
"pdfpage": "http://xxx.gov.br/",
"pubdate": datetime(2024, 3, 15),
"pubname": "DO1",
"subtitulo": "None",
"texto": """Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Phasellus venenatis auctor mauris. Integer id neque quis urna
ultrices iaculis. Donec et enim mauris. Sed vel massa eget est
viverra finibus a et magna. Pellentesque vel elementum
mauris, id semper tellus. Vivamus convallis lacinia ex sed
fermentum. Nulla mollis cursus ipsum vel interdum. Mauris
facilisis posuere elit. Proin consectetur tincidunt urna.
Cras tincidunt nunc vestibulum velit pellentesque facilisis.
Aenean sollicitudin ante elit, vitae vehicula nisi congue id.
Brasília/DF, 15 de março de 2024. Pessoa 1 Analista
""",
"titulo": "None",
},
]
),
{
"Lorem": [
{
"section": "DOU - Seção 1",
"title": "Título da Publicação 1",
"href": "http://xxx.gov.br/",
"abstract": "<%%>Lorem</%%> ipsum dolor sit amet, consectetur adipiscing elit. Phasellus venenatis auctor mauris. Integer id neque quis urna ultrices iaculis. Donec et enim mauris. Sed vel massa eget est viverra finibus a et magna. Pellentesque vel elementum mauris, id semper tellus. Vivamus convallis lacinia ex sed fermentum. Nulla mollis cursus ipsum vel interdum. Mauris facilisis posuere elit. Proin consectetur tincidunt urna. Cras tincidunt nunc vestibulum velit pellentesque facilisis. Aenean sollicitudin ante elit, vitae vehicula nisi congue id. Brasília/DF, 15 de março de 2024. Pessoa 1 Analista",
"date": "15/03/2024",
"id": 1,
"display_date_sortable": None,
"hierarchyList": None,
}
],
},
True,
),
],
)
def test_transform_search_results(inlabs_hook, terms, df_in, dict_out):
def test_transform_search_results(inlabs_hook, terms, df_in, dict_out, full_text):
r = inlabs_hook.TextDictHandler().transform_search_results(
response=df_in, text_terms=terms, ignore_signature_match=False
response=df_in,
text_terms=terms,
ignore_signature_match=False,
full_text=full_text,
)
assert r == dict_out

Expand Down
7 changes: 7 additions & 0 deletions tests/parsers_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def test_hash_dag_id(yaml_parser, dag_id, size, hashed):
"field": "TUDO",
"is_exact_search": True,
"ignore_signature_match": False,
"full_text": None,
"force_rematch": None,
"terms": ["dados abertos",
"governo aberto",
Expand Down Expand Up @@ -69,6 +70,7 @@ def test_hash_dag_id(yaml_parser, dag_id, size, hashed):
"field": "TUDO",
"is_exact_search": True,
"ignore_signature_match": True,
"full_text": True,
"force_rematch": True,
"terms": ["dados abertos",
"governo aberto",
Expand Down Expand Up @@ -99,6 +101,7 @@ def test_hash_dag_id(yaml_parser, dag_id, size, hashed):
"field": "TUDO",
"is_exact_search": True,
"ignore_signature_match": False,
"full_text": None,
"force_rematch": None,
"terms": [],
"sql": ("SELECT 'cloroquina' as TERMO, 'Ações inefetivas' as GRUPO "
Expand Down Expand Up @@ -132,6 +135,7 @@ def test_hash_dag_id(yaml_parser, dag_id, size, hashed):
"field": "TUDO",
"is_exact_search": True,
"ignore_signature_match": False,
"full_text": None,
"force_rematch": None,
"terms": ["cimentodaaroeira"],
"sql": None,
Expand Down Expand Up @@ -160,6 +164,7 @@ def test_hash_dag_id(yaml_parser, dag_id, size, hashed):
"field": "TUDO",
"is_exact_search": True,
"ignore_signature_match": False,
"full_text": None,
"force_rematch": None,
"terms": ["dados abertos",
"governo aberto",
Expand Down Expand Up @@ -196,6 +201,7 @@ def test_hash_dag_id(yaml_parser, dag_id, size, hashed):
"field": "TUDO",
"is_exact_search": True,
"ignore_signature_match": False,
"full_text": None,
"force_rematch": None,
"terms": ["dados abertos"],
"sql": None,
Expand Down Expand Up @@ -225,6 +231,7 @@ def test_hash_dag_id(yaml_parser, dag_id, size, hashed):
"field": "TUDO",
"is_exact_search": True,
"ignore_signature_match": False,
"full_text": None,
"force_rematch": None,
"terms": ["tecnologia", "informação"],
"sql": None,
Expand Down

0 comments on commit 21a8af3

Please sign in to comment.