add full_text option

gestaogovbr · Apr 16, 2024 · 21a8af3 · 21a8af3
1 parent d3af2df
commit 21a8af3
Show file tree

Hide file tree

Showing 8 changed files with 91 additions and 10 deletions.
diff --git a/dag_confs/examples_and_tests/all_parameters_example.yaml b/dag_confs/examples_and_tests/all_parameters_example.yaml
@@ -21,6 +21,7 @@ dag:
     dou_sections:
       - SECAO_1
       - EDICAO_SUPLEMENTAR
+    full_text: True
   report:
     emails:
       - [email protected]

diff --git a/schemas/ro-dou.json b/schemas/ro-dou.json
@@ -109,6 +109,10 @@
               "type": "boolean",
               "description": "description"
             },
+            "full_text": {
+              "type": "boolean",
+              "description": "description"
+            },
             "date": {
               "type": "string",
               "description": "description",

diff --git a/src/dou_dag_generator.py b/src/dou_dag_generator.py
@@ -195,6 +195,7 @@ def create_dag(self, specs: DAGConfig, config_file: str) -> DAG:
                     'is_exact_search': specs.is_exact_search,
                     'ignore_signature_match': specs.ignore_signature_match,
                     'force_rematch': specs.force_rematch,
+                    'full_text': specs.full_text,
                     'department': specs.department,
                     'result_as_email': result_as_html(specs),
                     },
@@ -240,6 +241,7 @@ def perform_searches(
         is_exact_search: bool,
         ignore_signature_match: bool,
         force_rematch: bool,
+        full_text: bool,
         result_as_email: bool,
         department: List[str],
         **context) -> dict:
@@ -266,6 +268,7 @@ def perform_searches(
                 search_date,
                 department,
                 ignore_signature_match,
+                full_text,
                 get_trigger_date(context, local_time = True)
             )
 

diff --git a/src/hooks/inlabs_hook.py b/src/hooks/inlabs_hook.py
@@ -28,6 +28,7 @@ def search_text(
         self,
         search_terms: dict,
         ignore_signature_match: bool,
+        full_text: bool,
         conn_id: str = CONN_ID,
     ) -> dict:
         """Searches the DOU Database with the provided search terms and processes
@@ -63,7 +64,7 @@ def search_text(
 
         return (
             self.TextDictHandler().transform_search_results(
-                all_results, search_terms["texto"], ignore_signature_match
+                all_results, search_terms["texto"], ignore_signature_match, full_text
             )
             if not all_results.empty
             else {}
@@ -160,7 +161,11 @@ def __init__(self, *args, **kwargs):
             pass
 
         def transform_search_results(
-            self, response: pd.DataFrame, text_terms: list, ignore_signature_match: bool
+            self,
+            response: pd.DataFrame,
+            text_terms: list,
+            ignore_signature_match: bool,
+            full_text: bool = False,
         ) -> dict:
             """Transforms and sorts the search results based on the presence
             of text terms and signature matching.
@@ -171,6 +176,8 @@ def transform_search_results(
                 text_terms (list): The list of text terms used in the search.
                 ignore_signature_match (bool): Flag to ignore publication
                     signature content.
+                full_text (bool): If True, keep the full text content
+                    of each result instead of trimming it. Defaults to False.
 
             Returns:
                 dict: A dictionary of sorted and processed search results.
@@ -205,7 +212,8 @@ def transform_search_results(
                 ),
                 axis=1,
             )
-            df["texto"] = df["texto"].apply(self._trim_text)
+            if not full_text:
+                df["texto"] = df["texto"].apply(self._trim_text)
             df["display_date_sortable"] = None
             df["hierarchyList"] = None
 

diff --git a/src/parsers.py b/src/parsers.py
@@ -21,10 +21,11 @@ class DAGConfig:
     is_exact_search: bool
     ignore_signature_match: bool
     force_rematch: bool
+    full_text: bool
     terms: List[str]
     sql: str
     conn_id: str
-    department: List[str]    
+    department: List[str]
     emails: List[str]
     subject: str
     attach_csv: bool
@@ -113,6 +114,7 @@ def _parse_yaml(self) -> DAGConfig:
         is_exact_search = search.get('is_exact_search', True)
         ignore_signature_match = search.get('ignore_signature_match', False)
         force_rematch = search.get('force_rematch', None)
+        full_text = search.get('full_text', None)
         department = search.get('department', None)
         schedule = self._get_safe_schedule(dag, self.DEFAULT_SCHEDULE)
         doc_md = dag.get('doc_md', None)
@@ -137,10 +139,11 @@ def _parse_yaml(self) -> DAGConfig:
             is_exact_search=is_exact_search,
             ignore_signature_match=ignore_signature_match,
             force_rematch=force_rematch,
+            full_text=full_text,
             terms=terms,
             sql=sql,
             conn_id=conn_id,
-            department=department,            
+            department=department,
             emails=emails,
             subject=subject,
             attach_csv=attach_csv,

diff --git a/src/searchers.py b/src/searchers.py
@@ -391,6 +391,7 @@ def exec_search(
         search_date: str,
         department: List[str],
         ignore_signature_match: bool,
+        full_text: bool,
         reference_date: datetime = datetime.now(),
     ) -> Dict:
         """
@@ -407,6 +408,7 @@ def exec_search(
             department (List[str]): List of departments to filter the search.
             ignore_signature_match (bool): Flag to ignore publication
                 signature content.
+            full_text (bool): If True, keep the full result text untrimmed.
             reference_date (datetime, optional): Reference date for the
                 search. Defaults to now.
 
@@ -426,7 +428,8 @@ def exec_search(
 
         search_results = inlabs_hook.search_text(
             search_terms,
-            ignore_signature_match
+            ignore_signature_match,
+            full_text
         )
 
         return self._group_results(search_results, terms)

diff --git a/tests/inlabs_hook_test.py b/tests/inlabs_hook_test.py
@@ -237,7 +237,7 @@ def test_group_to_dict(inlabs_hook, df_in, dict_out):
 
 
 @pytest.mark.parametrize(
-    "terms, df_in, dict_out",
+    "terms, df_in, dict_out, full_text",
     [
         (
             ["Pellentesque", "Lorem"],
@@ -303,12 +303,64 @@ def test_group_to_dict(inlabs_hook, df_in, dict_out):
                     }
                 ],
             },
-        )
+            False,
+        ),
+        (
+            ["Lorem"],
+            pd.DataFrame(
+                [
+                    {
+                        "artcategory": "Texto exemplo art_category",
+                        "arttype": "Publicação xxx",
+                        "id": 1,
+                        "assina": "Pessoa 1",
+                        "data": "Brasília/DF, 15 de março de 2024.",
+                        "ementa": "None",
+                        "identifica": "Título da Publicação 1",
+                        "name": "15.03.2024 bsb DOU xxx",
+                        "pdfpage": "http://xxx.gov.br/",
+                        "pubdate": datetime(2024, 3, 15),
+                        "pubname": "DO1",
+                        "subtitulo": "None",
+                        "texto": """Lorem ipsum dolor sit amet, consectetur adipiscing elit.
+                        Phasellus venenatis auctor mauris. Integer id neque quis urna
+                        ultrices iaculis. Donec et enim mauris. Sed vel massa eget est
+                        viverra finibus a et magna. Pellentesque vel elementum
+                        mauris, id semper tellus. Vivamus convallis lacinia ex sed
+                        fermentum. Nulla mollis cursus ipsum vel interdum. Mauris
+                        facilisis posuere elit. Proin consectetur tincidunt urna.
+                        Cras tincidunt nunc vestibulum velit pellentesque facilisis.
+                        Aenean sollicitudin ante elit, vitae vehicula nisi congue id.
+                        Brasília/DF, 15 de março de 2024.  Pessoa 1  Analista
+                        """,
+                        "titulo": "None",
+                    },
+                ]
+            ),
+            {
+                "Lorem": [
+                    {
+                        "section": "DOU - Seção 1",
+                        "title": "Título da Publicação 1",
+                        "href": "http://xxx.gov.br/",
+                        "abstract": "<%%>Lorem</%%> ipsum dolor sit amet, consectetur adipiscing elit. Phasellus venenatis auctor mauris. Integer id neque quis urna ultrices iaculis. Donec et enim mauris. Sed vel massa eget est viverra finibus a et magna. Pellentesque vel elementum mauris, id semper tellus. Vivamus convallis lacinia ex sed fermentum. Nulla mollis cursus ipsum vel interdum. Mauris facilisis posuere elit. Proin consectetur tincidunt urna. Cras tincidunt nunc vestibulum velit pellentesque facilisis. Aenean sollicitudin ante elit, vitae vehicula nisi congue id. Brasília/DF, 15 de março de 2024. Pessoa 1 Analista",
+                        "date": "15/03/2024",
+                        "id": 1,
+                        "display_date_sortable": None,
+                        "hierarchyList": None,
+                    }
+                ],
+            },
+            True,
+        ),
     ],
 )
-def test_transform_search_results(inlabs_hook, terms, df_in, dict_out):
+def test_transform_search_results(inlabs_hook, terms, df_in, dict_out, full_text):
     r = inlabs_hook.TextDictHandler().transform_search_results(
-        response=df_in, text_terms=terms, ignore_signature_match=False
+        response=df_in,
+        text_terms=terms,
+        ignore_signature_match=False,
+        full_text=full_text,
     )
     assert r == dict_out
 

diff --git a/tests/parsers_test.py b/tests/parsers_test.py
@@ -39,6 +39,7 @@ def test_hash_dag_id(yaml_parser, dag_id, size, hashed):
                 "field": "TUDO",
                 "is_exact_search": True,
                 "ignore_signature_match": False,
+                "full_text": None,
                 "force_rematch": None,
                 "terms": ["dados abertos",
                     "governo aberto",
@@ -69,6 +70,7 @@ def test_hash_dag_id(yaml_parser, dag_id, size, hashed):
                 "field": "TUDO",
                 "is_exact_search": True,
                 "ignore_signature_match": True,
+                "full_text": True,
                 "force_rematch": True,
                 "terms": ["dados abertos",
                     "governo aberto",
@@ -99,6 +101,7 @@ def test_hash_dag_id(yaml_parser, dag_id, size, hashed):
                 "field": "TUDO",
                 "is_exact_search": True,
                 "ignore_signature_match": False,
+                "full_text": None,
                 "force_rematch": None,
                 "terms": [],
                 "sql": ("SELECT 'cloroquina' as TERMO, 'Ações inefetivas' as GRUPO "
@@ -132,6 +135,7 @@ def test_hash_dag_id(yaml_parser, dag_id, size, hashed):
                 "field": "TUDO",
                 "is_exact_search": True,
                 "ignore_signature_match": False,
+                "full_text": None,
                 "force_rematch": None,
                 "terms": ["cimentodaaroeira"],
                 "sql": None,
@@ -160,6 +164,7 @@ def test_hash_dag_id(yaml_parser, dag_id, size, hashed):
                 "field": "TUDO",
                 "is_exact_search": True,
                 "ignore_signature_match": False,
+                "full_text": None,
                 "force_rematch": None,
                 "terms": ["dados abertos",
                     "governo aberto",
@@ -196,6 +201,7 @@ def test_hash_dag_id(yaml_parser, dag_id, size, hashed):
                 "field": "TUDO",
                 "is_exact_search": True,
                 "ignore_signature_match": False,
+                "full_text": None,
                 "force_rematch": None,
                 "terms": ["dados abertos"],
                 "sql": None,
@@ -225,6 +231,7 @@ def test_hash_dag_id(yaml_parser, dag_id, size, hashed):
                 "field": "TUDO",
                 "is_exact_search": True,
                 "ignore_signature_match": False,
+                "full_text": None,
                 "force_rematch": None,
                 "terms": ["tecnologia", "informação"],
                 "sql": None,