diff --git a/dag_confs/examples_and_tests/inlabs_example.yaml b/dag_confs/examples_and_tests/inlabs_example.yaml new file mode 100644 index 0000000..6275666 --- /dev/null +++ b/dag_confs/examples_and_tests/inlabs_example.yaml @@ -0,0 +1,19 @@ +dag: + id: inlabs_example + description: DAG de teste + tags: + - inlabs + schedule: 0 8 * * MON-FRI + owner: + - cdata + search: + sources: + - INLABS + terms: + - tecnologia + - informação + report: + emails: + - destination@economia.gov.br + attach_csv: True + subject: "Teste do Ro-dou" diff --git a/requirements.txt b/requirements.txt index 4b7b704..7c3c49e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ pandas==1.5.2,<2 -unidecode==1.2.0 \ No newline at end of file +unidecode==1.2.0 +html2text==2024.2.26 \ No newline at end of file diff --git a/schemas/ro-dou.json b/schemas/ro-dou.json index 262d08e..3a11b20 100644 --- a/schemas/ro-dou.json +++ b/schemas/ro-dou.json @@ -42,7 +42,7 @@ "description": "description", "items": { "type": "string", - "enum": ["QD", "DOU"] + "enum": ["QD", "DOU", "INLABS"] } }, "territory_id": { @@ -89,9 +89,9 @@ "description": "departamento para filtro na busca", "items": { "type": "string", - "description": "nome do departamento" + "description": "nome do departamento" } - }, + }, "field": { "type": "string", "description": "description", @@ -164,7 +164,9 @@ "description": "description", "format": "uri-reference" } - } + }, + "required": ["webhook"], + "additionalProperties": false }, "discord": { "type": "object", @@ -175,7 +177,9 @@ "description": "description", "format": "uri-reference" } - } + }, + "required": ["webhook"], + "additionalProperties": false }, "emails": { "type": "array", diff --git a/src/dou_dag_generator.py b/src/dou_dag_generator.py index 4329e4d..842ffd2 100755 --- a/src/dou_dag_generator.py +++ b/src/dou_dag_generator.py @@ -31,7 +31,7 @@ get_trigger_date, template_ano_mes_dia_trigger_local_time) from notification.notifier import Notifier from parsers 
import DAGConfig, YAMLParser -from searchers import BaseSearcher, DOUSearcher, QDSearcher +from searchers import BaseSearcher, DOUSearcher, QDSearcher, INLABSSearcher class DouDigestDagGenerator(): """ @@ -53,6 +53,7 @@ def __init__(self, on_retry_callback=None, on_failure_callback=None): self.searchers = { 'DOU': DOUSearcher(), 'QD': QDSearcher(), + 'INLABS': INLABSSearcher(), } self.on_retry_callback = on_retry_callback self.on_failure_callback = on_failure_callback @@ -221,9 +222,8 @@ def perform_searches( **context) -> dict: """Performs the search in each source and merge the results """ - logging.info('Searching for: %s', ', '.join(term_list)) - logging.info( - f'Trigger date: {get_trigger_date(context, local_time=True)}') + logging.info('Searching for: %s', term_list) + logging.info('Trigger date: %s', get_trigger_date(context, local_time=True)) if 'DOU' in sources: dou_result = self.searchers['DOU'].exec_search( @@ -236,6 +236,15 @@ def perform_searches( force_rematch, department, get_trigger_date(context, local_time = True)) + elif 'INLABS' in sources: + inlabs_result = self.searchers['INLABS'].exec_search( + term_list, + dou_sections, + search_date, + department, + ignore_signature_match, + get_trigger_date(context, local_time = True) + ) if 'QD' in sources: qd_result = self.searchers['QD'].exec_search( @@ -252,8 +261,12 @@ def perform_searches( if 'DOU' in sources and 'QD' in sources: return merge_results(qd_result, dou_result) + elif 'INLABS' in sources and 'QD' in sources: + return merge_results(qd_result, inlabs_result) elif 'DOU' in sources: return dou_result + elif 'INLABS' in sources: + return inlabs_result else: return qd_result diff --git a/src/dou_hook.py b/src/hooks/dou_hook.py similarity index 71% rename from src/dou_hook.py rename to src/hooks/dou_hook.py index 59904fd..6535684 100644 --- a/src/dou_hook.py +++ b/src/hooks/dou_hook.py @@ -1,10 +1,11 @@ """ Hook para realizar operações de consultas à API do Diário Oficial da União. 
""" +import sys +import os import logging -from datetime import datetime, timedelta +from datetime import datetime import time -from enum import Enum import json from typing import List import requests @@ -13,42 +14,8 @@ from bs4 import BeautifulSoup - -class Section(Enum): - """Define the section options to be used as parameter in the search""" - - SECAO_1 = "do1" - SECAO_2 = "do2" - SECAO_3 = "do3" - EDICAO_EXTRA = "doe" - EDICAO_EXTRA_1A = "do1_extra_a" - EDICAO_EXTRA_1B = "do1_extra_b" - EDICAO_EXTRA_1D = "do1_extra_d" - EDICAO_EXTRA_2A = "do2_extra_a" - EDICAO_EXTRA_2B = "do2_extra_b" - EDICAO_EXTRA_2D = "do2_extra_d" - EDICAO_EXTRA_3A = "do3_extra_a" - EDICAO_EXTRA_3B = "do3_extra_b" - EDICAO_EXTRA_3D = "do3_extra_d" - EDICAO_SUPLEMENTAR = "do1a" - TODOS = "todos" - - -class SearchDate(Enum): - """Define the search date options to be used as parameter in the search""" - - DIA = "dia" - SEMANA = "semana" - MES = "mes" - ANO = "ano" - - -class Field(Enum): - """Define the search field options to be used as parameter in the search""" - - TUDO = "tudo" - TITULO = "title_pt_BR" - CONTEUDO = "ddm__text__21040__texto_pt_BR" +sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) +from utils.search_domains import SearchDate, Field, Section, calculate_from_datetime class DOUHook(BaseHook): @@ -88,28 +55,6 @@ def _get_query_str(self, term, field, is_exact_search): else: return f"{field.value}-{term}" - def calculate_from_datetime( - self, publish_to_date: datetime, search_date: SearchDate - ): - """ - Calculate parameter `publishFrom` to be passed to the API based - on publishTo parameter and `search_date`. 
Perform especial - calculation to the MES (month) parameter option - """ - if search_date == SearchDate.DIA: - return publish_to_date - - elif search_date == SearchDate.SEMANA: - return publish_to_date - timedelta(days=6) - - elif search_date == SearchDate.MES: - end_prev_month = publish_to_date.replace(day=1) - timedelta(days=1) - publish_from_date = end_prev_month.replace(day=publish_to_date.day) - return publish_from_date - timedelta(days=1) - - elif search_date == SearchDate.ANO: - return publish_to_date - timedelta(days=364) - def _request_page(self, with_retry: bool, payload: dict): try: return requests.get(self.IN_API_BASE_URL, params=payload, timeout=10) @@ -117,7 +62,7 @@ def _request_page(self, with_retry: bool, payload: dict): if with_retry: logging.info("Sleep for 30 seconds before retry requests.get().") time.sleep(30) - return requests.get(self.IN_API_BASE_URL, params=payload, timeout=10) + return requests.get(self.IN_API_BASE_URL, params=payload, timeout=10) def search_text( @@ -141,25 +86,25 @@ def search_text( - A list of dicts of structred results. 
""" - publish_from = self.calculate_from_datetime(reference_date, search_date) + publish_from = calculate_from_datetime(reference_date, search_date) payload = { "q": self._get_query_str(search_term, field, is_exact_search), "exactDate": "personalizado", "publishFrom": publish_from.strftime("%d-%m-%Y"), "publishTo": reference_date.strftime("%d-%m-%Y"), - "sortType": "0", - "s": [section.value for section in sections] + "sortType": "0", + "s": [section.value for section in sections], } page = self._request_page(payload=payload, with_retry=with_retry) soup = BeautifulSoup(page.content, "html.parser") - + # Checks if there is more than one page of results pagination_tag = soup.find( 'button', id='lastPage' ) - + if (pagination_tag) is not None: # Get the number of pages in the pagination bar number_pages = int(pagination_tag.text.strip()) @@ -174,11 +119,11 @@ def search_text( # Loop for each page of result for page_num in range(number_pages): logging.info("Searching in page %s", str(page_num + 1)) - + # If there is more than one page add extra payload params and reload the page if page_num > 0: # The id is needed for pagination to work because it requires - # passing the last id from the previous item page in request URL + # passing the last id from the previous item page in request URL # Delta is the number of records per page. By now is restricted up to 20. 
payload.update({ "id": item["id"], @@ -188,11 +133,11 @@ def search_text( "currentPage": page_num, }) page = self._request_page(payload=payload, with_retry=with_retry) - soup = BeautifulSoup(page.content, "html.parser") + soup = BeautifulSoup(page.content, "html.parser") script_tag = soup.find( "script", id="_br_com_seatecnologia_in_buscadou_BuscaDouPortlet_params" - ) + ) search_results = json.loads(script_tag.contents[0])["jsonArray"] if search_results: diff --git a/src/hooks/inlabs_hook.py b/src/hooks/inlabs_hook.py new file mode 100644 index 0000000..7f967e3 --- /dev/null +++ b/src/hooks/inlabs_hook.py @@ -0,0 +1,355 @@ +"""Apache Airflow Hook to execute DOU searches from INLABS source. +""" + +import re +from datetime import datetime, timedelta, date +import pandas as pd +import unicodedata +import html2text + +from airflow.hooks.base import BaseHook +from airflow.providers.postgres.hooks.postgres import PostgresHook + + +class INLABSHook(BaseHook): + """A custom Apache Airflow Hook designed for executing searches via + the DOU Postgres Database provided by INLABS. + + Attributes: + CONN_ID (str): DOU INLABS Database Airflow conn id. + """ + + CONN_ID = "inlabs_db" + + def __init__(self, *args, **kwargs): + pass + + def search_text( + self, + search_terms: dict, + ignore_signature_match: bool, + conn_id: str = CONN_ID, + ) -> dict: + """Searches the DOU Database with the provided search terms and processes + the results. + + Args: + search_terms (dict): A dictionary containing the search + parameters. + ignore_signature_match (bool): Flag to ignore publication + signature content. + conn_id (str): DOU Database Airflow conn id + + Returns: + dict: A dictionary of processed search results. 
+ """ + + hook = PostgresHook(conn_id) + + # Fetching results for main search terms + main_search_queries = self._generate_sql(search_terms) + hook.run(main_search_queries["create_extension"], autocommit=True) + main_search_results = hook.get_pandas_df(main_search_queries["select"]) + + # Fetching results for yesterday extra search terms + extra_search_terms = self._adapt_search_terms_to_extra(search_terms) + extra_search_queries = self._generate_sql(extra_search_terms) + extra_search_results = hook.get_pandas_df(extra_search_queries["select"]) + + # Combining main and extra search results + all_results = pd.concat( + [main_search_results, extra_search_results], ignore_index=True + ) + + return ( + self.TextDictHandler().transform_search_results( + all_results, search_terms["texto"], ignore_signature_match + ) + if not all_results.empty + else {} + ) + + @staticmethod + def _generate_sql(payload: dict) -> str: + """Generates SQL query based on a dictionary of lists. The + dictionary key is the table column and the dictionary values + are a list of the terms to filter. + + Args: + payload (dict): A dictionary containing search parameters. + example = { + "texto": ["Termo 1", "Termo 2"], + "pubdate": ["2024-04-01", "2024-04-01"] + "pubname": ["DO1"] + } + + Returns: + str: The generated SQL query. 
+ """ + + allowed_keys = [ + "name", + "pubname", + "artcategory", + "identifica", + "titulo", + "subtitulo", + "texto", + ] + filtered_dict = {k: payload[k] for k in payload if k in allowed_keys} + + pub_date = payload.get("pubdate", [date.today().strftime("%Y-%m-%d")]) + pub_date_from = pub_date[0] + try: + pub_date_to = pub_date[1] + except IndexError: + pub_date_to = pub_date_from + + query = f"SELECT * FROM dou_inlabs.article_raw WHERE (pubdate BETWEEN '{pub_date_from}' AND '{pub_date_to}')" + + conditions = [] + for key, values in filtered_dict.items(): + key_conditions = " OR ".join( + [ + rf"dou_inlabs.unaccent({key}) ~* dou_inlabs.unaccent('\y{value}\y')" + for value in values + ] + ) + conditions.append(f"({key_conditions})") + + if conditions: + query = f"{query} AND {' AND '.join(conditions)}" + + queries = { + "create_extension": "CREATE EXTENSION IF NOT EXISTS unaccent SCHEMA dou_inlabs", + "select": query, + } + + return queries + + @staticmethod + def _adapt_search_terms_to_extra(payload: dict) -> dict: + """Modifies payload dictionary by subtracting one day of `pubdate` + and adding `E` (for extra publication) on `pubname`. + + Args: + payload (dict): A dictionary containing search parameters. + + Returns: + dict: The modified payload dictionary with adapted search terms. + """ + + payload["pubdate"] = [ + (datetime.strptime(date, "%Y-%m-%d") - timedelta(days=1)).strftime( + "%Y-%m-%d" + ) + for date in payload["pubdate"] + ] + payload["pubname"] = [ + s if s.endswith("E") else s + "E" for s in payload["pubname"] + ] + + return payload + + class TextDictHandler: + """Handles the transformation and organization of text search + results from the DOU Database. + """ + + def __init__(self, *args, **kwargs): + pass + + def transform_search_results( + self, response: pd.DataFrame, text_terms: list, ignore_signature_match: bool + ) -> dict: + """Transforms and sorts the search results based on the presence + of text terms and signature matching. 
+ + Args: + response (pd.DataFrame): The dataframe of search results + from the Database. + text_terms (list): The list of text terms used in the search. + ignore_signature_match (bool): Flag to ignore publication + signature content. + + Returns: + dict: A dictionary of sorted and processed search results. + """ + + df = response.copy() + df.dropna(subset=["identifica"], inplace=True) + df["pubname"] = df["pubname"].apply(self._rename_section) + df["identifica"] = df["identifica"].apply(self._remove_html_tags) + df["pubdate"] = df["pubdate"].dt.strftime("%d/%m/%Y") + df["texto"] = df["texto"].apply(self._remove_html_tags) + df["matches"] = df["texto"].apply(self._find_matches, keys=text_terms) + df["matches_assina"] = df.apply( + lambda row: self._normalize(row["matches"]) + in self._normalize(row["assina"]), + axis=1, + ) + df["count_assina"] = df.apply( + lambda row: ( + row["texto"].count(row["assina"]) + if row["assina"] is not None + else 0 + ), + axis=1, + ) + df["texto"] = df.apply( + lambda row: self._highlight_terms( + row["matches"].split(", "), row["texto"] + ), + axis=1, + ) + df["texto"] = df["texto"].apply(self._trim_text) + df["display_date_sortable"] = None + df["hierarchyList"] = None + + if ignore_signature_match: + df = df[~((df["matches_assina"]) & (df["count_assina"] == 1))] + + cols_rename = { + "pubname": "section", + "identifica": "title", + "pdfpage": "href", + "texto": "abstract", + "pubdate": "date", + "id": "id", + "display_date_sortable": "display_date_sortable", + "hierarchyList": "hierarchyList", + } + df.rename(columns=cols_rename, inplace=True) + cols_output = list(cols_rename.values()) + + return ( + {} + if df.empty + else self._group_to_dict( + df.sort_values(by="matches"), "matches", cols_output + ) + ) + + @staticmethod + def _rename_section(section: str) -> str: + """Rename DOU Section for formatted text to notifications. 
+ + Example: + DO1 -> DOU - Seção 1 + DO2E -> DOU - Seção 2 Extra + """ + + # section[:2] = DO + return section[:2] + "U - Seção " + section[2:].replace("E", " Extra") + + @staticmethod + def _remove_html_tags(text) -> str: + if isinstance(text, str): + text = html2text.HTML2Text().handle(text).replace("\n", " ").strip() + text = re.sub(r"\s+", " ", text) + return text + return "" + + def _find_matches(self, text: str, keys: list) -> list: + """Find keys that match the text, considering normalization + for matching and ensuring exact matches. + + Args: + text (str): The text in which to search for keys. + keys (list): A list of keys to be searched for in the text. + It's assumed that keys are strings. + + Returns: + list: A sorted list of unique keys found in the text. + """ + + normalized_text = self._normalize(text) + matches = [ + key + for key in keys + if re.search( + r"\b" + re.escape(self._normalize(key)) + r"\b", + normalized_text, + re.IGNORECASE, + ) + ] + + return ", ".join(sorted(set(matches))) + + @staticmethod + def _normalize(text: str) -> str: + """Normalize text by removing accents and converting to + lowercase. + + Parameters: + text (str): The text to normalize. + + Returns: + str: The normalized ASCII string. + """ + + return ( + unicodedata.normalize("NFKD", text) + .encode("ascii", "ignore") + .decode("ascii") + .lower() + if isinstance(text, str) + else "" + ) + + @staticmethod + def _highlight_terms(terms: list, text: str) -> str: + """Wrap `terms` values in `text` with `<%%>` and `%%>`. + + Args: + terms (list): List of terms to be wrapped on text. + text (str): String content to be updated with wrapped + `terms`. + + Returns: + str: `text` with values on `terms` wrapped with `<%%>` + and `%%>`. 
+ """ + + escaped_terms = [re.escape(term) for term in terms] + pattern = rf"\b({'|'.join(escaped_terms)})\b" + highlighted_text = re.sub( + pattern, r"<%%>\1%%>", text, flags=re.IGNORECASE + ) + + return highlighted_text + + @staticmethod + def _trim_text(text: str) -> str: + """Get a len(x) string and returns len(400) keeping `<%%>` + at the center. + """ + + parts = text.split("<%%>", 1) + return ( + "(...) " + parts[0][-200:] + "<%%>" + parts[1][:200] + " (...)" + if len(parts) > 1 + else text[:400] + " (...)" + ) + + @staticmethod + def _group_to_dict(df: pd.DataFrame, group_column: str, cols: list) -> dict: + """Convert DataFrame grouped by a column to a dictionary. + + Args: + df (pd.DataFrame): Input dataframe to transform. + group_column (str): The dataframe column to group_by. + cols (list): Filter of the cols that will remain on the + output dataframe. + + Returns: + dict: Dictionary with keys as unique values of group_column + and values as lists of dictionaries representing the + selected columns. 
+ """ + + return ( + df.groupby(group_column) + .apply(lambda x: x[cols].apply(lambda y: y.to_dict(), axis=1).tolist()) + .to_dict() + ) diff --git a/src/searchers.py b/src/searchers.py index 6fcc194..9b5260f 100644 --- a/src/searchers.py +++ b/src/searchers.py @@ -11,17 +11,23 @@ from abc import ABC from datetime import datetime, timedelta from random import random -from typing import Dict, List, Tuple -from urllib.parse import urljoin +from typing import Dict, List, Tuple, Union import string import pandas as pd import requests - from unidecode import unidecode sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) -from dou_hook import DOUHook, Field, SearchDate, Section +from hooks.dou_hook import DOUHook +from hooks.inlabs_hook import INLABSHook +from utils.search_domains import ( + Field, + SearchDate, + Section, + SectionINLABS, + calculate_from_datetime, +) class BaseSearcher(ABC): @@ -52,27 +58,28 @@ def _group_results(self, search_results: dict, term_list: Dict[list, str]) -> di return grouped_result - def _group_by_term_group(self, search_results: dict, term_n_group: str) -> dict: + @staticmethod + def _group_by_term_group(search_results: dict, term_n_group: str) -> dict: """Rebuild the dict grouping the results based on term_n_group mapping """ dict_struct = ast.literal_eval(term_n_group) terms, groups = dict_struct.values() term_group_map = dict(zip(terms.values(), groups.values())) - groups = sorted(list(set(term_group_map.values()))) - - grouped_result = { - g1: { - t: search_results[t] - for (t, g2) in sorted(term_group_map.items()) - if t in search_results and g1 == g2 - } - for g1 in groups - } - # Clear empty groups - trimmed_result = {k: v for k, v in grouped_result.items() if v} - return trimmed_result + grouped_result = {} + for k, v in search_results.items(): + group = term_group_map[k.split(",")[0]] + update = {k: v} + + if group in grouped_result: + grouped_result[group].update(update) + else: + grouped_result[group] = update + + 
sorted_dict = {key: grouped_result[key] for key in sorted(grouped_result)} + + return sorted_dict def _really_matched(self, search_term: str, abstract: str) -> bool: """Verifica se o termo encontrado pela API realmente é igual ao @@ -368,3 +375,115 @@ def _build_query_payload(search_term: str, reference_date: datetime) -> List[tup ("published_until", reference_date.strftime("%Y-%m-%d")), ("querystring", f'"{search_term}"'), ] + + +class INLABSSearcher(BaseSearcher): + """ + A searcher class that interfaces with an Airflow INLABSHook to perform + DOU searches with various filters such as `terms`, `sections`, `dates`, + and `departments`. + """ + + def exec_search( + self, + terms: Union[List[str], str], + dou_sections: List[str], + search_date: str, + department: List[str], + ignore_signature_match: bool, + reference_date: datetime = datetime.now(), + ) -> Dict: + """ + Execute a search with given parameters, applying filters and + transforming terms as needed. + + Args: + terms (Union[List[str], str]): Search terms as a List or a + string formatted as a dict (when from sql query). + dou_sections (List[str]): List of DOU sections to filter the search. + dou_sections examples: SECAO_1, SECAO_3D, EDICAO_EXTRA_1 + search_date (str): Date interval filter. + search_date examples: DIA, SEMANA, MES, ANO + department (List[str]): List of departments to filter the search. + ignore_signature_match (bool): Flag to ignore publication + signature content. + reference_date (datetime, optional): Reference date for the + search. Defaults to now. + + Returns: + Dict: Grouped search results. 
+ """ + + inlabs_hook = INLABSHook() + search_terms = self._prepare_search_terms(terms) + search_terms = self._apply_filters( + search_terms, + dou_sections, + department, + reference_date, + search_date + ) + + search_results = inlabs_hook.search_text( + search_terms, + ignore_signature_match + ) + + return self._group_results(search_results, terms) + + def _prepare_search_terms(self, terms: Union[List[str], str]) -> Dict: + """Prepare search terms based on input terms. + + Args: + terms (Union[List[str], str]): Can be one of: + String formatted as dictionary when comes from a database + query + List when comes from `terms` key of the .yaml + Returns: + Dict: Formatted as {"texto": List of terms} + """ + + if isinstance(terms, List): + return {"texto": terms} + return {"texto": self._split_sql_terms(json.loads(terms))} + + def _apply_filters( + self, + search_terms: Dict, + sections: List[str], + department: List[str], + reference_date: datetime, + search_date: str + ): + """Apply `sections`, `departments` and `date` filters to the + search_terms dictionary.""" + + if "TODOS" not in sections: + search_terms["pubname"] = self._parse_sections(sections) + if department: + search_terms["artcategory"] = department + publish_from = calculate_from_datetime(reference_date, SearchDate[search_date]).strftime("%Y-%m-%d") + publish_to = reference_date.strftime("%Y-%m-%d") + search_terms["pubdate"] = [publish_from, publish_to] + + return search_terms + + @staticmethod + def _split_sql_terms(terms: Dict) -> List: + """Split SQL terms into a list, removing duplicates. + Get only the values from the first key of the Dict.""" + + first_key = next(iter(terms)) + return list(set(terms[first_key].values())) + + @staticmethod + def _parse_sections(sections: List) -> List: + """Parse DOU section codes into a list of section names based on + SectionINLABS class. Avoid duplicates. 
+ + Example: + the section ["SECAO_1", "SECAO_3", "EDICAO_EXTRA_3D", "EDICAO_EXTRA", + "EDICAO_EXTRA_1"] outputs ['DO1E', 'DO3E', 'DO1', 'DO3'] + """ + + return list({SectionINLABS[section].value for section in sections}) diff --git a/src/utils/search_domains.py b/src/utils/search_domains.py new file mode 100644 index 0000000..22d7ef6 --- /dev/null +++ b/src/utils/search_domains.py @@ -0,0 +1,80 @@ +from enum import Enum +from datetime import datetime, timedelta + +class Section(Enum): + """Define the section options to be used as parameter in the search""" + + SECAO_1 = "do1" + SECAO_2 = "do2" + SECAO_3 = "do3" + EDICAO_EXTRA = "doe" + EDICAO_EXTRA_1A = "do1_extra_a" + EDICAO_EXTRA_1B = "do1_extra_b" + EDICAO_EXTRA_1D = "do1_extra_d" + EDICAO_EXTRA_2A = "do2_extra_a" + EDICAO_EXTRA_2B = "do2_extra_b" + EDICAO_EXTRA_2D = "do2_extra_d" + EDICAO_EXTRA_3A = "do3_extra_a" + EDICAO_EXTRA_3B = "do3_extra_b" + EDICAO_EXTRA_3D = "do3_extra_d" + EDICAO_SUPLEMENTAR = "do1a" + TODOS = "todos" + +class SectionINLABS(Enum): + """Define the section options to be used as parameter in the search""" + + SECAO_1 = "DO1" + SECAO_2 = "DO2" + SECAO_3 = "DO3" + EDICAO_EXTRA = "DO1E" + EDICAO_EXTRA_1 = "DO1E" + EDICAO_EXTRA_2 = "DO2E" + EDICAO_EXTRA_3 = "DO3E" + EDICAO_EXTRA_1A = "DO1E" + EDICAO_EXTRA_1B = "DO1E" + EDICAO_EXTRA_1D = "DO1E" + EDICAO_EXTRA_2A = "DO2E" + EDICAO_EXTRA_2B = "DO2E" + EDICAO_EXTRA_2D = "DO2E" + EDICAO_EXTRA_3A = "DO3E" + EDICAO_EXTRA_3B = "DO3E" + EDICAO_EXTRA_3D = "DO3E" + EDICAO_SUPLEMENTAR = "DO1E" + +class SearchDate(Enum): + """Define the search date options to be used as parameter in the search""" + + DIA = "dia" + SEMANA = "semana" + MES = "mes" + ANO = "ano" + + +class Field(Enum): + """Define the search field options to be used as parameter in the search""" + + TUDO = "tudo" + TITULO = "title_pt_BR" + CONTEUDO = "ddm__text__21040__texto_pt_BR" + + +def calculate_from_datetime(publish_to_date: datetime, search_date: SearchDate): + """ + Calculate 
parameter `publishFrom` to be passed to the API based + on publishTo parameter and `search_date`. Perform especial + calculation to the MES (month) parameter option + """ + + if search_date == SearchDate.DIA: + return publish_to_date + + elif search_date == SearchDate.SEMANA: + return publish_to_date - timedelta(days=6) + + elif search_date == SearchDate.MES: + end_prev_month = publish_to_date.replace(day=1) - timedelta(days=1) + publish_from_date = end_prev_month.replace(day=publish_to_date.day) + return publish_from_date - timedelta(days=1) + + elif search_date == SearchDate.ANO: + return publish_to_date - timedelta(days=364) \ No newline at end of file diff --git a/tests-requirements.txt b/tests-requirements.txt index d09afca..9139bde 100644 --- a/tests-requirements.txt +++ b/tests-requirements.txt @@ -9,4 +9,5 @@ ijson==3.0.4 openpyxl==3.0.7 jsonschema==4.21.1 PyYAML==6.0.1 -requests==2.31.0 \ No newline at end of file +requests==2.31.0 +html2text==2024.2.26 \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 37ccf26..91cb470 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,7 +9,8 @@ from dags.ro_dou_src.dou_dag_generator import (DouDigestDagGenerator, SearchResult) from dags.ro_dou_src.parsers import YAMLParser -from dags.ro_dou_src.searchers import DOUSearcher +from dags.ro_dou_src.searchers import DOUSearcher, INLABSSearcher +from dags.ro_dou_src.hooks.inlabs_hook import INLABSHook TEST_AIRFLOW_HOME = '/opt/airflow' @@ -50,6 +51,14 @@ def yaml_parser()-> YAMLParser: def dou_searcher()-> DOUSearcher: return DOUSearcher() +@pytest.fixture() +def inlabs_searcher()-> INLABSSearcher: + return INLABSSearcher() + +@pytest.fixture() +def inlabs_hook()-> INLABSHook: + return INLABSHook() + @pytest.fixture() def report_example() -> dict: report = { diff --git a/tests/inlabs_hook_test.py b/tests/inlabs_hook_test.py new file mode 100644 index 0000000..b2d5f3a --- /dev/null +++ b/tests/inlabs_hook_test.py @@ -0,0 +1,380 @@ 
+import pytest +import pandas as pd +from datetime import datetime + + +@pytest.mark.parametrize( + "data_in, query_out", + [ + ( + { + "texto": ["term1", "term2"], + "pubname": ["DO1"], + "pubdate": ["2024-04-01", "2024-04-02"], + }, + "SELECT * FROM dou_inlabs.article_raw WHERE (pubdate BETWEEN '2024-04-01' AND '2024-04-02') AND (dou_inlabs.unaccent(texto) ~* dou_inlabs.unaccent('\\yterm1\\y') OR dou_inlabs.unaccent(texto) ~* dou_inlabs.unaccent('\\yterm2\\y')) AND (dou_inlabs.unaccent(pubname) ~* dou_inlabs.unaccent('\\yDO1\\y'))", + ), + ], +) +def test_generate_sql(inlabs_hook, data_in, query_out): + assert inlabs_hook._generate_sql(data_in)["select"] == query_out + + +@pytest.mark.parametrize( + "data_in, data_out", + [ + ( + { + "texto": ["term1", "term2"], + "pubname": ["DO1"], + "pubdate": ["2024-04-01", "2024-04-02"], + }, + { + "texto": ["term1", "term2"], + "pubname": ["DO1E"], + "pubdate": ["2024-03-31", "2024-04-01"], + }, + ), + ], +) +def test_adapt_search_terms_to_extra(inlabs_hook, data_in, data_out): + assert inlabs_hook._adapt_search_terms_to_extra(data_in) == data_out + + +@pytest.mark.parametrize( + "text, keys, matches", + [ + ( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + ["lorem", "sit", "not_find"], + "lorem, sit", + ), + ], +) +def test_find_matches(inlabs_hook, text, keys, matches): + assert inlabs_hook.TextDictHandler()._find_matches(text, keys) == matches + + +@pytest.mark.parametrize( + "text_in, text_out", + [ + ("çãAî é", "caai e"), + ], +) +def test_normalize(inlabs_hook, text_in, text_out): + assert inlabs_hook.TextDictHandler()._normalize(text_in) == text_out + + +@pytest.mark.parametrize( + "pub_name_in, pub_name_out", + [ + ("DO1", "DOU - Seção 1"), + ("DO2", "DOU - Seção 2"), + ("DO3", "DOU - Seção 3"), + ("DOE", "DOU - Seção Extra"), + ("DO1E", "DOU - Seção 1 Extra"), + ], +) +def test_rename_section(inlabs_hook, pub_name_in, pub_name_out): + assert inlabs_hook.TextDictHandler()._rename_section(pub_name_in) 
== pub_name_out + + +@pytest.mark.parametrize( + "texto_in, texto_out", + [ + ( # texto_in + """ +
Título da Publicação
+Título da Publicação 2
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. + Phasellus venenatis auctor mauris.
+Brasília/DF, 15 de março de 2024.
+Pessoa 1
+Analista
+ """, + # texto_out + ( + "Título da Publicação Título da Publicação 2 Lorem ipsum dolor sit amet, " + "consectetur adipiscing elit. Phasellus venenatis auctor mauris. " + "Brasília/DF, 15 de março de 2024. Pessoa 1 Analista" + ), + ) + ], +) +def test_remove_html_tags(inlabs_hook, texto_in, texto_out): + print(inlabs_hook.TextDictHandler()._remove_html_tags(texto_in)) + assert inlabs_hook.TextDictHandler()._remove_html_tags(texto_in) == texto_out + + +@pytest.mark.parametrize( + "term, texto_in, texto_out", + [ + ( + ["elementum"], + "Pellentesque vel elementum mauris, id semper tellus.", + "Pellentesque vel <%%>elementum%%> mauris, id semper tellus.", + ), + ( + ["elementum", "tellus"], + "Pellentesque vel elementum mauris, id semper tellus.", + "Pellentesque vel <%%>elementum%%> mauris, id semper <%%>tellus%%>.", + ), + ], +) +def test_highlight_terms(inlabs_hook, term, texto_in, texto_out): + assert inlabs_hook.TextDictHandler()._highlight_terms(term, texto_in) == texto_out + + +@pytest.mark.parametrize( + "texto_in, texto_out", + [ + ( # texto_in + """ + Lorem ipsum dolor sit amet, consectetur adipiscing elit. + Phasellus venenatis auctor mauris. Integer id neque quis urna + ultrices iaculis. Donec et enim mauris. Sed vel massa eget est + viverra finibus a et magna. <%%>Pellentesque%%> vel elementum + mauris, id semper tellus. Vivamus convallis lacinia ex sed + fermentum. Nulla mollis cursus ipsum vel interdum. Mauris + facilisis posuere elit. Proin consectetur tincidunt urna. + Cras tincidunt nunc vestibulum velit pellentesque facilisis. + Aenean sollicitudin ante elit, vitae vehicula nisi congue id. + Brasília/DF, 15 de março de 2024. Pessoa 1 Analista + """, + # texto_out + ( + """(...) cing elit. + Phasellus venenatis auctor mauris. Integer id neque quis urna + ultrices iaculis. Donec et enim mauris. Sed vel massa eget est + viverra finibus a et magna. <%%>Pellentesque%%> vel elementum + mauris, id semper tellus. 
Vivamus convallis lacinia ex sed + fermentum. Nulla mollis cursus ipsum vel interdum. Mauris + facilisis posue (...)""" + ), + ), + ], +) +def test_trim_text(inlabs_hook, texto_in, texto_out): + print(inlabs_hook.TextDictHandler()._trim_text(texto_in)) + assert inlabs_hook.TextDictHandler()._trim_text(texto_in) == texto_out + + +@pytest.mark.parametrize( + "df_in, dict_out", + [ + ( + # df_in + pd.DataFrame( + [ + { + "title": "Título da Publicação 1", + "href": "http://xxx.gov.br/", + "date": "2024-03-15", + "section": "DO1", + "abstract": "Lorem ipsum dolor sit amet.", + "matches": "Lorem", + }, + { + "title": "Título da Publicação 2", + "href": "http://xxx.gov.br/", + "date": "2024-03-15", + "section": "DO1", + "abstract": "Pellentesque vel elementum mauris.", + "matches": "Pellentesque", + }, + { + "title": "Título da Publicação 3", + "href": "http://xxx.gov.br/", + "date": "2024-03-15", + "section": "DO1", + "abstract": "Dolor sit amet, lórem consectetur adipiscing elit.", + "matches": "Lorem", + }, + ] + ), + # dict_out + { + "Lorem": [ + { + "title": "Título da Publicação 1", + "href": "http://xxx.gov.br/", + "date": "2024-03-15", + "section": "DO1", + "abstract": "Lorem ipsum dolor sit amet.", + }, + { + "title": "Título da Publicação 3", + "href": "http://xxx.gov.br/", + "date": "2024-03-15", + "section": "DO1", + "abstract": "Dolor sit amet, lórem consectetur adipiscing elit.", + }, + ], + "Pellentesque": [ + { + "title": "Título da Publicação 2", + "href": "http://xxx.gov.br/", + "date": "2024-03-15", + "section": "DO1", + "abstract": "Pellentesque vel elementum mauris.", + } + ], + }, + ), + ], +) +def test_group_to_dict(inlabs_hook, df_in, dict_out): + cols = [ + "section", + "title", + "href", + "abstract", + "date", + ] + r = inlabs_hook.TextDictHandler()._group_to_dict(df_in, "matches", cols) + + assert r == dict_out + + +@pytest.mark.parametrize( + "terms, df_in, dict_out", + [ + ( + ["Pellentesque", "Lorem"], + pd.DataFrame( + [ + { + 
"artcategory": "Texto exemplo art_category", + "arttype": "Publicação xxx", + "id": 1, + "assina": "Pessoa 1", + "data": "Brasília/DF, 15 de março de 2024.", + "ementa": "None", + "identifica": "Título da Publicação 1", + "name": "15.03.2024 bsb DOU xxx", + "pdfpage": "http://xxx.gov.br/", + "pubdate": datetime(2024, 3, 15), + "pubname": "DO1", + "subtitulo": "None", + "texto": "Lorem ipsum dolor sit amet.", + "titulo": "None", + }, + { + "artcategory": "Texto exemplo art_category", + "arttype": "Publicação xxx", + "id": 2, + "assina": "Pessoa 2", + "data": "Brasília/DF, 15 de março de 2024.", + "ementa": "None", + "identifica": "Título da Publicação 2", + "name": "15.03.2024 bsb DOU xxx", + "pdfpage": "http://xxx.gov.br/", + "pubdate": datetime(2024, 3, 15), + "pubname": "DO1", + "subtitulo": "None", + "texto": "Dolor sit amet, consectetur adipiscing elit. Pellentesque.", + "titulo": "None", + }, + ] + ), + { + "Lorem": [ + { + "section": "DOU - Seção 1", + "title": "Título da Publicação 1", + "href": "http://xxx.gov.br/", + "abstract": "(...) <%%>Lorem%%> ipsum dolor sit amet. (...)", + "date": "15/03/2024", + "id": 1, + "display_date_sortable": None, + "hierarchyList": None, + } + ], + "Pellentesque": [ + { + "section": "DOU - Seção 1", + "title": "Título da Publicação 2", + "href": "http://xxx.gov.br/", + "abstract": "(...) Dolor sit amet, consectetur adipiscing elit. <%%>Pellentesque%%>. 
(...)", + "date": "15/03/2024", + "id": 2, + "display_date_sortable": None, + "hierarchyList": None, + } + ], + }, + ) + ], +) +def test_transform_search_results(inlabs_hook, terms, df_in, dict_out): + r = inlabs_hook.TextDictHandler().transform_search_results( + response=df_in, text_terms=terms, ignore_signature_match=False + ) + assert r == dict_out + + +@pytest.mark.parametrize( + "terms, df_in, dict_out", + [ + ( # terms + ["Pellentesque", "Pessoa 1"], + # df_in + pd.DataFrame( + [ + { + "artcategory": "Texto exemplo art_category", + "arttype": "Publicação xxx", + "id": 1, + "assina": "Pessoa 1", + "data": "Brasília/DF, 15 de março de 2024.", + "ementa": "None", + "identifica": "Título da Publicação", + "name": "15.03.2024 bsb DOU xxx", + "pdfpage": "http://xxx.gov.br/", + "pubdate": datetime(2024, 3, 15), + "pubname": "DO1", + "subtitulo": "None", + "texto": "Pessoa 1 ipsum dolor sit amet.", + "titulo": "None", + }, + { + "artcategory": "Texto exemplo art_category", + "arttype": "Publicação xxx", + "id": 2, + "assina": "Pessoa 2", + "data": "Brasília/DF, 15 de março de 2024.", + "ementa": "None", + "identifica": "Título da Publicação", + "name": "15.03.2024 bsb DOU xxx", + "pdfpage": "http://xxx.gov.br/", + "pubdate": datetime(2024, 3, 15), + "pubname": "DO1", + "subtitulo": "None", + "texto": "Pellentesque Phasellus venenatis auctor mauris.", + "titulo": "None", + }, + ] + ), + # dict_out + { + "Pellentesque": [ + { + "section": "DOU - Seção 1", + "title": "Título da Publicação", + "href": "http://xxx.gov.br/", + "abstract": "(...) <%%>Pellentesque%%> Phasellus venenatis auctor mauris. 
(...)", + "date": "15/03/2024", + "id": 2, + "display_date_sortable": None, + "hierarchyList": None, + } + ] + }, + ) + ], +) +def test_ignore_signature(inlabs_hook, terms, df_in, dict_out): + r = inlabs_hook.TextDictHandler().transform_search_results( + response=df_in, text_terms=terms, ignore_signature_match=True + ) + assert r == dict_out diff --git a/tests/inlabs_searcher_test.py b/tests/inlabs_searcher_test.py new file mode 100644 index 0000000..e20f02b --- /dev/null +++ b/tests/inlabs_searcher_test.py @@ -0,0 +1,140 @@ +"""INLABS Searcher unit tests +""" + +from datetime import datetime +from collections import Counter +import pytest + + +@pytest.mark.parametrize( + "search_terms, sections, department, reference_date, search_date, filters_applyed", + [ + ( + {"texto": ["a", "b"]}, + ["SECAO_2"], + ["Ministério"], + datetime.now(), + "DIA", + { + "texto": ["a", "b"], + "pubname": ["DO2"], + "artcategory": ["Ministério"], + "pubdate": [ + datetime.now().strftime("%Y-%m-%d"), + datetime.now().strftime("%Y-%m-%d"), + ], + }, + ), + ], +) +def test_apply_filters( + inlabs_searcher, + search_terms, + sections, + department, + reference_date, + search_date, + filters_applyed, +): + assert ( + inlabs_searcher._apply_filters( + search_terms, sections, department, reference_date, search_date + ) + == filters_applyed + ) + + +@pytest.mark.parametrize( + "terms, search_terms", + [ + (["a", "b", "c"], {"texto": ["a", "b", "c"]}), + ( + '{"termo": {"0": "Pessoa 0","1": "Pessoa 1"}, "termo_group": {"0": "Grupo 1","1": "Grupo 2"}}', + {"texto": ["Pessoa 0", "Pessoa 1"]}, + ), + ], +) +def test_prepare_search_terms(inlabs_searcher, terms, search_terms): + search_terms_return = inlabs_searcher._prepare_search_terms(terms) + assert set(search_terms_return.keys()) == set( + search_terms.keys() + ), "The dictionaries do not have the same keys." 
+ + for key in search_terms_return: + assert Counter(search_terms_return[key]) == Counter( + search_terms[key] + ), f"The lists under the key '{key}' do not have the same content." + + +@pytest.mark.parametrize( + "raw_sections, parsed_sections", + [ + (["SECAO_1"], ["DO1"]), + (["SECAO_2"], ["DO2"]), + (["SECAO_3"], ["DO3"]), + (["SECAO_1", "EDICAO_EXTRA"], ["DO1", "DO1E"]), + ( + ["SECAO_2", "EDICAO_EXTRA_1A", "EDICAO_EXTRA_2B", "EDICAO_EXTRA_3D"], + ["DO2", "DO1E", "DO2E", "DO3E"], + ), + ], +) +def test_parse_sections(inlabs_searcher, raw_sections, parsed_sections): + assert sorted(inlabs_searcher._parse_sections(raw_sections)) == sorted( + parsed_sections + ) + + +@pytest.mark.parametrize( + "sql_terms, sql_splitted_terms", + [ + ( # sql_terms + { + "termo": { + "0": "Pessoa 0", + "1": "Pessoa 1", + "2": "Pessoa 2", + "3": "Pessoa 3", + "4": "Pessoa 4", + "5": "Pessoa 5", + "6": "Pessoa 6", + "7": "Pessoa 7", + "8": "Pessoa 8", + "9": "Pessoa 9", + "10": "Pessoa 10", + }, + "termo_group": { + "0": "Grupo 1", + "1": "Grupo 2", + "2": "Grupo 2", + "3": "Grupo 1", + "4": "Grupo 3", + "5": "Grupo 2", + "6": "Grupo 1", + "7": "Grupo 1", + "8": "Grupo 1", + "9": "Grupo 2", + "10": "Grupo 1", + }, + }, + # sql_splitted_terms + [ + "Pessoa 0", + "Pessoa 1", + "Pessoa 2", + "Pessoa 3", + "Pessoa 4", + "Pessoa 5", + "Pessoa 6", + "Pessoa 7", + "Pessoa 8", + "Pessoa 9", + "Pessoa 10", + ], + ), + ], +) +def test_split_sql_terms(inlabs_searcher, sql_terms, sql_splitted_terms): + assert sorted(inlabs_searcher._split_sql_terms(sql_terms)) == sorted( + sql_splitted_terms + ) diff --git a/tests/parsers_test.py b/tests/parsers_test.py index 58cd57b..6f3886b 100644 --- a/tests/parsers_test.py +++ b/tests/parsers_test.py @@ -75,7 +75,7 @@ def test_hash_dag_id(yaml_parser, dag_id, size, hashed): "lei de acesso à informação"], "sql": None, "conn_id": None, - "department": None, + "department": None, "emails": ["dest1@economia.gov.br", "dest2@economia.gov.br"], "subject": "Assunto 
do Email", "attach_csv": True, @@ -108,7 +108,7 @@ def test_hash_dag_id(yaml_parser, dag_id, size, hashed): "UNION SELECT 'uso de máscara' as TERMO, 'Ações efetivas' as GRUPO " "UNION SELECT 'distanciamento social' as TERMO, 'Ações efetivas' as GRUPO\n"), "conn_id": "example_database_conn", - "department": None, + "department": None, "emails": ["destination@economia.gov.br"], "subject": "[String] com caracteres especiais deve estar entre aspas", "attach_csv": True, @@ -136,7 +136,7 @@ def test_hash_dag_id(yaml_parser, dag_id, size, hashed): "terms": ["cimentodaaroeira"], "sql": None, "conn_id": None, - "department": None, + "department": None, "emails": ["destination@economia.gov.br"], "subject": 'Teste do Ro-dou', "attach_csv": False, @@ -166,7 +166,7 @@ def test_hash_dag_id(yaml_parser, dag_id, size, hashed): "lei de acesso à informação"], "sql": None, "conn_id": None, - "department": None, + "department": None, "emails": ["destination@economia.gov.br"], "subject": "Teste do Ro-dou", "attach_csv": False, @@ -214,7 +214,35 @@ def test_hash_dag_id(yaml_parser, dag_id, size, hashed): "dag_tags": {"dou", "generated_dag"}, "owner": "", } - ), + ), + ("inlabs_example.yaml", + { + "dag_id": "inlabs_example", + "sources": ["INLABS"], + "territory_id": None, + "dou_sections": ["TODOS"], + "search_date": "DIA", + "field": "TUDO", + "is_exact_search": True, + "ignore_signature_match": False, + "force_rematch": None, + "terms": ["tecnologia", "informação"], + "sql": None, + "conn_id": None, + "department": None, + "emails": ["destination@economia.gov.br"], + "subject": "Teste do Ro-dou", + "attach_csv": True, + "discord_webhook": None, + "slack_webhook": None, + "schedule": "0 8 * * MON-FRI", + "description": "DAG de teste", + "skip_null": True, + "doc_md": None, + "dag_tags": {"dou", "generated_dag", "inlabs"}, + "owner": "cdata", + } + ), ]) def test_parse(filepath, result_tuple): filepath = os.path.join(DouDigestDagGenerator().YAMLS_DIR,